Skip to content

Commit

Permalink
Fix embeddings pipeline (huggingface#36)
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Mar 20, 2023
1 parent f10106a commit 851815b
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 36 deletions.
96 changes: 61 additions & 35 deletions src/pipelines.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ const {
env
} = require('./env.js');

const { Tensor } = require("./tensor_utils.js");


class Pipeline extends Callable {
constructor(task, tokenizer, model) {
Expand Down Expand Up @@ -275,17 +277,68 @@ class TextGenerationPipeline extends Pipeline {


class EmbeddingsPipeline extends Pipeline {
async _call(texts) {
let [inputs, outputs] = await super._call(texts);
// Should only be used with sentence-transformers
// If you want to get the raw outputs from the model,
// use `AutoModel.from_pretrained(...)`

_mean_pooling(last_hidden_state, attention_mask) {
// last_hidden_state: [batchSize, seqLength, embedDim]
// attention_mask: [batchSize, seqLength]

let shape = [last_hidden_state.dims[0], last_hidden_state.dims[2]];
let returnedData = new last_hidden_state.data.constructor(shape[0] * shape[1])
let [batchSize, seqLength, embedDim] = last_hidden_state.dims;

let outIndex = 0;
for (let i = 0; i < batchSize; ++i) {
let offset = i * embedDim * seqLength;

for (let k = 0; k < embedDim; ++k) {
let sum = 0;
let count = 0;

let attnMaskOffset = i * seqLength;
let offset2 = offset + k;
// Pool over all words in sequence
for (let j = 0; j < seqLength; ++j) {
// index into attention mask
let attn = Number(attention_mask.data[attnMaskOffset + j]);

count += attn;
sum += last_hidden_state.data[offset2 + j * embedDim] * attn;
}

let avg = sum / count;
returnedData[outIndex++] = avg;
}
}

// Get embedding from outputs. This is typically indexed with some number.
delete outputs['last_hidden_state'];
let embeddingsTensor = Object.values(outputs)[0];
return new Tensor(
last_hidden_state.type,
returnedData,
shape
)
}

_normalize(tensor) {
// Normalise tensors along dim=1
// NOTE: only works for tensors of shape [batchSize, embedDim]
// Operates in-place
for (let batch of tensor) {
let norm = Math.sqrt(batch.data.reduce((a, b) => a + b * b))

for (let i = 0; i < batch.data.length; ++i) {
batch.data[i] /= norm;
}
}
return tensor;
}

// TODO - return as tensor?
let embeddings = reshape(embeddingsTensor.data, embeddingsTensor.dims);
async _call(texts) {
let [inputs, outputs] = await super._call(texts);

return embeddings
// Perform mean pooling, followed by a normalization step
return this._normalize(this._mean_pooling(outputs.last_hidden_state, inputs.attention_mask));
}

cos_sim(arr1, arr2) {
Expand Down Expand Up @@ -781,33 +834,6 @@ async function pipeline(

}

function reshape(data, dimensions) {
    // Re-nest a flat (possibly typed) array into an n-dimensional plain JS
    // array whose shape is given by `dimensions`.

    // The target shape must account for every element exactly once.
    const expected = dimensions.reduce((product, dim) => product * dim);
    if (data.length !== expected) {
        throw Error(`cannot reshape array of size ${data.length} into shape (${dimensions})`);
    }

    // Build the nesting from the innermost axis outwards: each pass groups
    // the current items into runs of length `dimensions[axis]`.
    let nested = data;
    for (let axis = dimensions.length - 1; axis >= 0; --axis) {
        const size = dimensions[axis];
        const groups = [[]];
        for (const item of nested) {
            const current = groups[groups.length - 1];
            if (current.length < size) {
                current.push(item);
            } else {
                groups.push([item]);
            }
        }
        nested = groups;
    }

    // After the outermost pass, everything sits inside a single wrapper.
    return nested[0];
}

function product(...a) {
// Cartesian product of items
Expand Down
36 changes: 36 additions & 0 deletions src/tensor_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,43 @@ class Tensor extends ONNX.Tensor {
return new Tensor(this.type, data, iterDims);
}

tolist() {
// Convert tensor data to a n-dimensional JS list
return reshape(this.data, this.dims)
}

// TODO add .slice()
}


function reshape(data, dimensions) {
    // Re-nest a flat (possibly typed) array into an n-dimensional plain JS
    // array whose shape is given by `dimensions`.

    // The target shape must account for every element exactly once.
    const expected = dimensions.reduce((product, dim) => product * dim);
    if (data.length !== expected) {
        throw Error(`cannot reshape array of size ${data.length} into shape (${dimensions})`);
    }

    // Build the nesting from the innermost axis outwards: each pass groups
    // the current items into runs of length `dimensions[axis]`.
    let nested = data;
    for (let axis = dimensions.length - 1; axis >= 0; --axis) {
        const size = dimensions[axis];
        const groups = [[]];
        for (const item of nested) {
            const current = groups[groups.length - 1];
            if (current.length < size) {
                current.push(item);
            } else {
                groups.push([item]);
            }
        }
        nested = groups;
    }

    // After the outermost pass, everything sits inside a single wrapper.
    return nested[0];
}

function transpose(tensor, axes) {
// Calculate the new shape of the transposed array
// and the stride of the original array
Expand Down Expand Up @@ -115,6 +149,8 @@ function cat(tensors) {
return new Tensor(tensorType, data, tensorShape)
}



module.exports = {
Tensor,
transpose,
Expand Down
5 changes: 4 additions & 1 deletion tests/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ async function embeddings() {
// Run sentences through embedder
let output = await embedder(sentences)

// Convert Tensor to JS list
output = output.tolist();

// Compute pairwise cosine similarity
// for (let i = 0; i < sentences.length; ++i) {
// for (let j = i + 1; j < sentences.length; ++j) {
Expand All @@ -81,7 +84,7 @@ async function embeddings() {

return isDeepEqual(
pairwiseScores,
[0.8195198760573937, 0.6200714107649917, 0.5930511190112736]
[0.5022028979523243, 0.11238511059270409, 0.09594821582314679]
)
}

Expand Down

0 comments on commit 851815b

Please sign in to comment.