In [7]:
!pip install -q transformers torch datasets einops

[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/43.2 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m43.2/43.2 kB[0m [31m1.4 MB/s[0m eta [36m0:00:00[0m
[?25h

In [2]:
import json
import time

import datasets
import torch
import torch.nn.functional as F
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

In [3]:
# Read the raw pokemon records from disk; the file is opened in binary mode
# and its bytes are handed to the JSON parser.
with open("all_pokemon_data.json", "rb") as pokemon_file:
    data = json.loads(pokemon_file.read())

# Number of records loaded.
len(data)

1302

In [4]:
# Peek at the first 100 characters of the first record's repr.
str(data[0])[:100]

"{'id': 1, 'name': 'bulbasaur', 'height': 7, 'weight': 69, 'base_experience': 64, 'pokemon_v2_pokemon"

Minify each pokemon text to save tokens (via [Stack Overflow](https://stackoverflow.com/questions/33233313/python-json-dumpsval-to-output-minified-json)), and eliminate redundant `pokemon_v2_` labels.


In [5]:
def minify_pokemon(pokemon_json):
    """Serialize one pokemon record to compact JSON text.

    The record is dumped with ``","`` and ``":"`` as separators (no padding
    whitespace), then every occurrence of the ``pokemon_v2_`` substring is
    removed from the resulting string, wherever it appears.

    Args:
        pokemon_json: A JSON-serializable object describing one pokemon.

    Returns:
        The minified JSON string with ``pokemon_v2_`` stripped out.
    """
    compact_separators = (",", ":")
    minified = json.dumps(pokemon_json, separators=compact_separators)
    return minified.replace("pokemon_v2_", "")


# Same record as before, now minified: show its first 100 characters.
minify_pokemon(data[0])[:100]

'{"id":1,"name":"bulbasaur","height":7,"weight":69,"base_experience":64,"pokemontypes":[{"type":{"nam'

Set up embedding model.


In [14]:
# Use the GPU only when CUDA is actually available, so the notebook still runs
# (more slowly) on CPU-only machines instead of failing at `.to("cuda")`.
gpu = torch.cuda.is_available()

# Single source of truth for the checkpoint used by both tokenizer and model.
EMBEDDING_MODEL_NAME = "nomic-ai/nomic-embed-text-v1.5"

# NOTE(review): model_max_length=8192 together with rotary_scaling_factor=2
# presumably enables the model's long-context mode — confirm against the
# model card.
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_MODEL_NAME, model_max_length=8192)

# NOTE(security): trust_remote_code=True executes Python code shipped in the
# model repository. Consider pinning a specific `revision` that has been reviewed.
text_model = AutoModel.from_pretrained(
    EMBEDDING_MODEL_NAME, trust_remote_code=True, rotary_scaling_factor=2
)
if gpu:
    text_model.to("cuda")

# Switch to inference mode; as the last expression this also displays the model.
text_model.eval()



NomicBertModel(
  (embeddings): NomicBertEmbeddings(
    (word_embeddings): Embedding(30528, 768)
    (token_type_embeddings): Embedding(2, 768)
  )
  (emb_drop): Dropout(p=0.0, inplace=False)
  (emb_ln): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
  (encoder): NomicBertEncoder(
    (layers): ModuleList(
      (0-11): 12 x NomicBertBlock(
        (attn): NomicBertAttention(
          (rotary_emb): NomicBertDynamicNTKRotaryEmbedding()
          (Wqkv): Linear(in_features=768, out_features=2304, bias=False)
          (out_proj): Linear(in_features=768, out_features=768, bias=False)
          (drop): Dropout(p=0.0, inplace=False)
        )
        (mlp): NomciBertGatedMLP(
          (fc11): Linear(in_features=768, out_features=3072, bias=False)
          (fc12): Linear(in_features=768, out_features=3072, bias=False)
          (fc2): Linear(in_features=3072, out_features=768, bias=False)
          (norm): Identity()
        )
        (dropout1): Dropout(p=0.0, inplace=False)
    

In [19]:
def mean_pooling(model_output, attention_mask):
    """Average token embeddings along dimension 1, ignoring masked-out tokens.

    Args:
        model_output: Indexable model result whose first element holds the
            per-token embeddings (presumably shaped (batch, tokens, hidden)
            — confirm against the model).
        attention_mask: Mask tensor; after gaining a trailing dimension it is
            expanded to the shape of the token embeddings and used as weights.

    Returns:
        The mask-weighted sum of the token embeddings over dimension 1 divided
        by the mask sum over the same dimension.
    """
    hidden_states = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(hidden_states.size()).float()

    weighted_sum = torch.sum(hidden_states * mask, 1)
    # Clamp the denominator so a fully masked row does not divide by zero.
    token_counts = torch.clamp(mask.sum(1), min=1e-9)
    return weighted_sum / token_counts


def get_embeddings(text: "str | list[str]") -> torch.Tensor:
    """Embed a single text or a batch of texts with the loaded model.

    The input is tokenized (with padding and truncation), moved to the GPU
    when the module-level ``gpu`` flag is set, run through ``text_model``,
    mean-pooled using the attention mask and finally L2-normalized along
    dimension 1.

    Args:
        text: One string or a list of strings. Callers in this notebook add
            the ``search_query: `` / ``search_document: `` prefix themselves.

    Returns:
        A 2-D tensor with one normalized embedding per row; a single string
        still yields a batch of one, so index ``[0]`` to get its vector.
    """
    encoded_input = tokenizer(text, padding=True, truncation=True, return_tensors="pt")

    if gpu:
        encoded_input = encoded_input.to("cuda")

    # Inference only: keep the forward pass and the pooling out of autograd.
    with torch.no_grad():
        model_output = text_model(**encoded_input)
        pooled = mean_pooling(model_output, encoded_input["attention_mask"])
        return F.normalize(pooled, p=2, dim=1)

In [20]:
# A one-element batch holding a sample query (note the "search_query: " prefix).
sentences = ["search_query: What are cute animals to cuddle with?"]

# Sanity check: display the shape of the returned embedding batch.
get_embeddings(sentences).size()

torch.Size([1, 768])

The traditional methods of saving data to a CSV will not work for embeddings, as they consume a _lot_ of space (about 34KB per embedding). Therefore, the best way to save the embeddings is as a Parquet file, where the embeddings are saved internally as numbers and with the option for better control (such as using `float32` instead of `float64`). This results in about 9KB per embedding; a notable reduction.

The downside is that all embeddings must be stored in memory first (as a list of dicts) instead of writing embeddings to disk as you get them.


In [21]:
# Embed every pokemon one at a time and collect the results in memory.
poke_dicts = []

for poke in tqdm(data):
    # Documents are embedded with the "search_document: " prefix.
    document_text = "search_document: " + minify_pokemon(poke)
    # Row 0 is the only embedding in the batch; convert it to a NumPy array.
    embed = get_embeddings(document_text)[0].cpu().detach().numpy()
    record = {"id": poke["id"], "name": poke["name"], "text_embedding": embed}
    poke_dicts.append(record)

  0%|          | 0/1302 [00:00<?, ?it/s]

Manually specify the features we want, to reduce final dataset size and make it slightly more efficient.


In [24]:
# Fixed-length float32 sequence used for the embedding column.
embedding_feature = datasets.Sequence(
    feature=datasets.Value(dtype="float32"), length=768
)

# Explicit schema for the dataset: one typed entry per column.
features = datasets.Features(
    {
        "id": datasets.Value(dtype="int32"),
        "name": datasets.Value(dtype="string"),
        "text_embedding": embedding_feature,
    }
)

features

{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'text_embedding': Sequence(feature=Value(dtype='float32', id=None), length=768, id=None)}

In [25]:
# Build the dataset from the in-memory list of records, passing the explicit
# `features` schema instead of letting the column types be inferred.
dataset = datasets.Dataset.from_list(poke_dicts, features=features)
dataset

Dataset({
    features: ['id', 'name', 'text_embedding'],
    num_rows: 1302
})

In [27]:
# First ten components of the first row's embedding.
dataset[0]["text_embedding"][:10]

[0.023699194192886353,
 0.098256915807724,
 -0.1136355847120285,
 -0.02942875400185585,
 0.027495862916111946,
 -0.029112081974744797,
 -0.01889587566256523,
 -0.03170907497406006,
 0.02622426487505436,
 -0.05863893777132034]

In [28]:
# Display the schema carried by the freshly built dataset.
dataset.features

{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'text_embedding': Sequence(feature=Value(dtype='float32', id=None), length=768, id=None)}

In [29]:
# Write the dataset to a gzip-compressed Parquet file.
# NOTE(review): the displayed integer is presumably the number of bytes written — confirm.
dataset.to_parquet("text_embeddings.parquet", compression="gzip")

Creating parquet from Arrow format:   0%|          | 0/2 [00:00<?, ?ba/s]

4022387

Reload dataset to confirm.


In [30]:
# Load the Parquet file back from disk and select the "train" entry of the
# returned mapping so it can be compared with the in-memory dataset.
dataset_2 = load_dataset("parquet", data_files="text_embeddings.parquet")["train"]
dataset_2

Generating train split: 0 examples [00:00, ? examples/s]

Dataset({
    features: ['id', 'name', 'text_embedding'],
    num_rows: 1302
})

In [32]:
# First ten components of the first reloaded embedding, for comparison with
# the values shown before saving.
dataset_2[0]["text_embedding"][:10]

[0.023699194192886353,
 0.098256915807724,
 -0.1136355847120285,
 -0.02942875400185585,
 0.027495862916111946,
 -0.029112081974744797,
 -0.01889587566256523,
 -0.03170907497406006,
 0.02622426487505436,
 -0.05863893777132034]

In [33]:
# Display the schema of the reloaded dataset.
dataset_2.features

{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'text_embedding': Sequence(feature=Value(dtype='float32', id=None), length=768, id=None)}