In [1]:
import os
import re
import json
import csv
import time
from tqdm import tqdm

import openai
from dotenv import load_dotenv
import datasets

# Pull variables from a local .env file into the process environment.
load_dotenv()

# Read the key once, fail fast if it is missing, then hand it to the client.
api_key = os.getenv("OPENAI_API_KEY")
assert api_key, "No OPENAI_API_KEY defined in .env."

openai.api_key = api_key


In [2]:
# Read the raw bytes and let the json module decode them.
with open("all_pokemon_data.json", "rb") as pokemon_file:
    data = json.loads(pokemon_file.read())

# Number of top-level records that were loaded.
len(data)

1154

In [3]:
# Peek at the first 100 characters of the first record's Python repr.
str(data[0])[:100]

"{'id': 1, 'name': 'bulbasaur', 'height': 7, 'weight': 69, 'base_experience': 64, 'pokemon_v2_pokemon"

Minify each pokemon text to save tokens (via [Stack Overflow](https://stackoverflow.com/questions/33233313/python-json-dumpsval-to-output-minified-json)), and eliminate redundant `pokemon_v2_` labels. 

In [4]:
def minify_pokemon(pokemon_json):
    """Serialize a pokemon record to compact JSON with `pokemon_v2_` removed.

    The record is dumped with `,` and `:` as separators (no padding
    whitespace), then every occurrence of the substring `pokemon_v2_` is
    removed from the resulting text, wherever it appears.

    Args:
        pokemon_json: Any JSON-serializable object (here, one pokemon record).

    Returns:
        The minified JSON string.
    """
    compact = json.dumps(pokemon_json, separators=(",", ":"))
    return compact.replace("pokemon_v2_", "")

# Peek at the first 100 characters of the minified first record.
minify_pokemon(data[0])[:100]

'{"id":1,"name":"bulbasaur","height":7,"weight":69,"base_experience":64,"pokemontypes":[{"type":{"nam'

The traditional methods of saving data to a CSV will not work for embeddings, as they consume a *lot* of space (about 34KB per embedding). Therefore, the best way to save the embeddings is as a Parquet file where the embeddings are saved internally as numbers, and with the option for better control (such as using `float32` instead of `float64`). This results in about 9KB per embedding; a notable reduction.

The downside is that all embeddings must be stored in memory first (as a list of dicts) instead of writing embeddings to disk as you get them.

In [5]:
max_request_char = 25000  # to avoid going over 8191 token input limit: required for Mew/Arceus
max_retries = 5  # attempts per pokemon before the error is re-raised
retry_base_delay = 1.0  # seconds; doubled after every failed attempt

# Transient failures worth retrying. Authentication and invalid-request errors
# are deliberately absent so that they surface immediately.
retryable_errors = (
    openai.error.RateLimitError,
    openai.error.APIError,
    openai.error.APIConnectionError,
    openai.error.ServiceUnavailableError,
    openai.error.Timeout,
)


def embed_with_retry(text):
    """Request an embedding for `text`, retrying transient API failures.

    All results are held in memory until the loop finishes, so a single
    rate-limit or connection hiccup would otherwise discard every embedding
    fetched so far. Retries use exponential backoff.

    Args:
        text: The (already truncated) string to embed.

    Returns:
        The response from `openai.Embedding.create`.

    Raises:
        The last retryable error once `max_retries` attempts have failed;
        any non-retryable error immediately.
    """
    for attempt in range(max_retries):
        try:
            return openai.Embedding.create(
                input=[text], model="text-embedding-ada-002"
            )
        except retryable_errors:
            if attempt == max_retries - 1:
                raise
            time.sleep(retry_base_delay * 2**attempt)


# Reset on every run so re-executing the cell never appends duplicates.
poke_dicts = []

for poke in tqdm(data):
    # Truncate by characters as a cheap guard against the token input limit.
    r = embed_with_retry(minify_pokemon(poke)[:max_request_char])
    poke_dicts.append(
        {
            "id": poke["id"],
            "name": poke["name"],
            "num_tokens": r["usage"]["total_tokens"],
            "embedding": r["data"][0]["embedding"],
        }
    )
    time.sleep(0.1)  # small pause between requests


100%|██████████| 1154/1154 [06:15<00:00,  3.07it/s]


Manually specify the features we want, to reduce final dataset size and make it slightly more efficient.

In [6]:
# Fixed-length float32 vector column; 1536 is the declared length of each
# embedding stored in the dataset.
embedding_feature = datasets.Sequence(
    feature=datasets.Value(dtype="float32"), length=1536
)

# Explicit schema: 32-bit integers and floats instead of inferred wider types.
features = datasets.Features(
    {
        "id": datasets.Value(dtype="int32"),
        "name": datasets.Value(dtype="string"),
        "num_tokens": datasets.Value(dtype="int32"),
        "embedding": embedding_feature,
    }
)

features


{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'num_tokens': Value(dtype='int32', id=None),
 'embedding': Sequence(feature=Value(dtype='float32', id=None), length=1536, id=None)}

In [7]:
# Build a Dataset from the list of per-pokemon dicts, applying the explicit
# schema in `features` instead of letting column types be inferred.
dataset = datasets.Dataset.from_list(poke_dicts, features=features)
dataset


Dataset({
    features: ['id', 'name', 'num_tokens', 'embedding'],
    num_rows: 1154
})

In [8]:
# First ten components of the first row's embedding.
dataset[0]["embedding"][:10]

[0.0020267923828214407,
 0.012879939749836922,
 -8.06017778813839e-05,
 -0.007401061709970236,
 -0.03888826444745064,
 0.013899873942136765,
 0.015795905143022537,
 -0.031696420162916183,
 -0.01627971976995468,
 -0.04419715330004692]

In [9]:
# Show the schema actually attached to the dataset.
dataset.features

{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'num_tokens': Value(dtype='int32', id=None),
 'embedding': Sequence(feature=Value(dtype='float32', id=None), length=1536, id=None)}

In [10]:
# Write the dataset to a Parquet file in the working directory. The displayed
# return value is presumably the number of bytes written — TODO confirm
# against the `datasets` documentation.
dataset.to_parquet("embeddings.parquet")

14672952

Reload dataset to confirm.

In [11]:
# Read the Parquet file back into a separate Dataset to check the round trip.
dataset_2 = datasets.Dataset.from_parquet("embeddings.parquet")
dataset_2

Using custom data configuration default-142e43c20aee2ea2


Downloading and preparing dataset parquet/default to /Users/maxwoolf/.cache/huggingface/datasets/parquet/default-142e43c20aee2ea2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec...


Downloading data files: 100%|██████████| 1/1 [00:00<00:00, 4181.76it/s]
Extracting data files: 100%|██████████| 1/1 [00:00<00:00, 860.02it/s]
                                                        

Dataset parquet downloaded and prepared to /Users/maxwoolf/.cache/huggingface/datasets/parquet/default-142e43c20aee2ea2/0.0.0/2a3b91fbd88a2c90d1dbbb32b460cf621d31bd5b05b934492fdef7d8d6f236ec. Subsequent calls will reuse this data.




Dataset({
    features: ['id', 'name', 'num_tokens', 'embedding'],
    num_rows: 1154
})

In [12]:
# First ten components of the first reloaded embedding, for comparison with
# the values shown before the file was written.
dataset_2[0]["embedding"][:10]

[0.0020267923828214407,
 0.012879939749836922,
 -8.06017778813839e-05,
 -0.007401061709970236,
 -0.03888826444745064,
 0.013899873942136765,
 0.015795905143022537,
 -0.031696420162916183,
 -0.01627971976995468,
 -0.04419715330004692]

In [13]:
# Show the schema of the reloaded dataset.
dataset_2.features

{'id': Value(dtype='int32', id=None),
 'name': Value(dtype='string', id=None),
 'num_tokens': Value(dtype='int32', id=None),
 'embedding': Sequence(feature=Value(dtype='float32', id=None), length=1536, id=None)}