Skip to content

Commit

Permalink
Fix embeddings pipeline (huggingface#36)
Browse files Browse the repository at this point in the history
  • Loading branch information
xenova committed Mar 20, 2023
1 parent f10106a commit 851815b
Show file tree
Hide file tree
Showing 3 changed files with 101 additions and 36 deletions.
96 changes: 61 additions & 35 deletions src/pipelines.js
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,8 @@ const {
env
} = require('./env.js');

const { Tensor } = require("./tensor_utils.js");


class Pipeline extends Callable {
constructor(task, tokenizer, model) {
Expand Down Expand Up @@ -275,17 +277,68 @@ class TextGenerationPipeline extends Pipeline {


class EmbeddingsPipeline extends Pipeline {
async _call(texts) {
let [inputs, outputs] = await super._call(texts);
// Should only be used with sentence-transformers
// If you want to get the raw outputs from the model,
// use `AutoModel.from_pretrained(...)`

_mean_pooling(last_hidden_state, attention_mask) {
// last_hidden_state: [batchSize, seqLength, embedDim]
// attention_mask: [batchSize, seqLength]

let shape = [last_hidden_state.dims[0], last_hidden_state.dims[2]];
let returnedData = new last_hidden_state.data.constructor(shape[0] * shape[1])
let [batchSize, seqLength, embedDim] = last_hidden_state.dims;

let outIndex = 0;
for (let i = 0; i < batchSize; ++i) {
let offset = i * embedDim * seqLength;

for (let k = 0; k < embedDim; ++k) {
let sum = 0;
let count = 0;

let attnMaskOffset = i * seqLength;
let offset2 = offset + k;
// Pool over all words in sequence
for (let j = 0; j < seqLength; ++j) {
// index into attention mask
let attn = Number(attention_mask.data[attnMaskOffset + j]);

count += attn;
sum += last_hidden_state.data[offset2 + j * embedDim] * attn;
}

let avg = sum / count;
returnedData[outIndex++] = avg;
}
}

// Get embedding from outputs. This is typically indexed with some number.
delete outputs['last_hidden_state'];
let embeddingsTensor = Object.values(outputs)[0];
return new Tensor(
last_hidden_state.type,
returnedData,
shape
)
}

_normalize(tensor) {
// Normalise tensors along dim=1
// NOTE: only works for tensors of shape [batchSize, embedDim]
// Operates in-place
for (let batch of tensor) {
let norm = Math.sqrt(batch.data.reduce((a, b) => a + b * b))

for (let i = 0; i < batch.data.length; ++i) {
batch.data[i] /= norm;
}
}
return tensor;
}

// TODO - return as tensor?
let embeddings = reshape(embeddingsTensor.data, embeddingsTensor.dims);
async _call(texts) {
let [inputs, outputs] = await super._call(texts);

return embeddings
// Perform mean pooling, followed by a normalization step
return this._normalize(this._mean_pooling(outputs.last_hidden_state, inputs.attention_mask));
}

cos_sim(arr1, arr2) {
Expand Down Expand Up @@ -781,33 +834,6 @@ async function pipeline(

}

function reshape(data, dimensions) {
    // Re-nest a flat (possibly typed) array into an n-dimensional plain JS
    // array whose shape is given by `dimensions`.

    // The target shape must account for every element exactly once.
    const expected = dimensions.reduce((product, dim) => product * dim);
    if (data.length !== expected) {
        throw Error(`cannot reshape array of size ${data.length} into shape (${dimensions})`);
    }

    // Build the nesting from the innermost axis outwards: each pass groups
    // the current items into runs of length `dimensions[axis]`.
    let nested = data;
    for (let axis = dimensions.length - 1; axis >= 0; --axis) {
        const size = dimensions[axis];
        const groups = [[]];
        for (const item of nested) {
            const current = groups[groups.length - 1];
            if (current.length < size) {
                current.push(item);
            } else {
                groups.push([item]);
            }
        }
        nested = groups;
    }

    // After the outermost pass, everything sits inside a single wrapper.
    return nested[0];
}

function product(...a) {
// Cartesian product of items
Expand Down
36 changes: 36 additions & 0 deletions src/tensor_utils.js
Original file line number Diff line number Diff line change
Expand Up @@ -51,9 +51,43 @@ class Tensor extends ONNX.Tensor {
return new Tensor(this.type, data, iterDims);
}

tolist() {
// Convert tensor data to a n-dimensional JS list
return reshape(this.data, this.dims)
}

// TODO add .slice()
}


function reshape(data, dimensions) {
    // Re-nest a flat (possibly typed) array into an n-dimensional plain JS
    // array whose shape is given by `dimensions`.

    // The target shape must account for every element exactly once.
    const expected = dimensions.reduce((product, dim) => product * dim);
    if (data.length !== expected) {
        throw Error(`cannot reshape array of size ${data.length} into shape (${dimensions})`);
    }

    // Build the nesting from the innermost axis outwards: each pass groups
    // the current items into runs of length `dimensions[axis]`.
    let nested = data;
    for (let axis = dimensions.length - 1; axis >= 0; --axis) {
        const size = dimensions[axis];
        const groups = [[]];
        for (const item of nested) {
            const current = groups[groups.length - 1];
            if (current.length < size) {
                current.push(item);
            } else {
                groups.push([item]);
            }
        }
        nested = groups;
    }

    // After the outermost pass, everything sits inside a single wrapper.
    return nested[0];
}

function transpose(tensor, axes) {
// Calculate the new shape of the transposed array
// and the stride of the original array
Expand Down Expand Up @@ -115,6 +149,8 @@ function cat(tensors) {
return new Tensor(tensorType, data, tensorShape)
}



module.exports = {
Tensor,
transpose,
Expand Down
5 changes: 4 additions & 1 deletion tests/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,9 @@ async function embeddings() {
// Run sentences through embedder
let output = await embedder(sentences)

// Convert Tensor to JS list
output = output.tolist();

// Compute pairwise cosine similarity
// for (let i = 0; i < sentences.length; ++i) {
// for (let j = i + 1; j < sentences.length; ++j) {
Expand All @@ -81,7 +84,7 @@ async function embeddings() {

return isDeepEqual(
pairwiseScores,
[0.8195198760573937, 0.6200714107649917, 0.5930511190112736]
[0.5022028979523243, 0.11238511059270409, 0.09594821582314679]
)
}

Expand Down

0 comments on commit 851815b

Please sign in to comment.