In [1]:
import json
import time

import datasets
import torch
import torch.nn.functional as F
from datasets import load_dataset
from tqdm.auto import tqdm
from transformers import AutoModel, AutoTokenizer

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the full Pokemon dump; the file is opened in binary mode and the raw
# bytes are handed to the JSON parser.
with open("all_pokemon_data.json", "rb") as f:
    data = json.loads(f.read())

# Number of top-level records in the dump.
len(data)

1302

In [3]:
str(data[0])[0:100]

"{'id': 1, 'name': 'bulbasaur', 'height': 7, 'weight': 69, 'base_experience': 64, 'pokemon_v2_pokemon"

In [6]:
# NOTE(review): index 24 is presumably Pikachu (data[0] has id 1, so this
# should be id 25) — confirm against the data.
pikachu = data[24]

# Save a pretty-printed copy of this one record for manual inspection.
with open("pikachu_example_raw.json", "w") as f:
    json.dump(pikachu, f, indent=2)

Minify each Pokémon's JSON to save tokens (technique via [Stack Overflow](https://stackoverflow.com/questions/33233313/python-json-dumpsval-to-output-minified-json)), and strip the redundant `pokemon_v2_` prefix.


In [7]:
def minify_pokemon(pokemon_json):
    """Serialize a record to compact JSON with ``pokemon_v2_`` stripped out.

    Args:
        pokemon_json: A JSON-serializable object (one record from ``data``).

    Returns:
        The JSON text with no whitespace after ``,`` or ``:`` and with every
        occurrence of the substring ``pokemon_v2_`` removed.
    """
    compact = json.dumps(pokemon_json, separators=(",", ":"))
    # The replacement runs over the whole serialized string, so it removes the
    # substring wherever it appears, in keys and in string values alike.
    return compact.replace("pokemon_v2_", "")


minify_pokemon(data[0])[0:100]

'{"id":1,"name":"bulbasaur","height":7,"weight":69,"base_experience":64,"pokemontypes":[{"type":{"nam'

Load the tokenizer for the embedding model (`nomic-ai/nomic-embed-text-v1.5`) so we can count tokens.


In [9]:
# Tokenizer for the nomic-embed-text-v1.5 checkpoint, configured with a maximum
# length of 8192 tokens.
tokenizer = AutoTokenizer.from_pretrained(
    "nomic-ai/nomic-embed-text-v1.5", model_max_length=8192
)

In [14]:
def get_token_length(text):
    """Return how many token ids the module-level ``tokenizer`` produces for ``text``.

    Args:
        text: The text to tokenize.

    Returns:
        The size of the sequence axis of the encoded ``input_ids`` array.

    Note:
        Truncation is enabled, so the result is presumably capped at the
        tokenizer's configured maximum length — confirm if exact lengths of
        very long inputs matter.
    """
    batch = tokenizer(text, padding=True, truncation=True, return_tensors="np")
    # input_ids is 2-D here; the last axis is the sequence length.
    return batch["input_ids"].shape[-1]


get_token_length("I am a pony.")

7

In [15]:
# Token count of every minified record under the embedding model's tokenizer.
token_counts = [get_token_length(minify_pokemon(pokemon)) for pokemon in data]

In [16]:
token_counts[:10]

[5052, 4940, 5260, 5678, 5439, 6401, 5648, 5416, 5912, 3984]

In [18]:
import csv

# Export the per-record token counts as a one-column CSV with a header row.
# newline="" hands line-ending control to the csv writer; without it, platforms
# that translate "\n" end up with an extra blank line after every row.
with open("token_counts.csv", "w", newline="") as f:
    w = csv.writer(f)
    w.writerow(["counts"])
    # Each count becomes its own single-cell row.
    w.writerows([count] for count in token_counts)

In [19]:
import numpy as np

# Mean token count per record (embedding-model tokenizer).
np.mean(token_counts)

3919.6136712749617

In [21]:
# Median token count per record (embedding-model tokenizer).
np.median(token_counts)

3781.0

In [22]:
import tiktoken

# Load the o200k_base encoding by name and sanity-check it with an
# encode/decode round trip.
enc = tiktoken.get_encoding("o200k_base")
assert enc.decode(enc.encode("hello world")) == "hello world"

# Rebind `enc` to the encoding tiktoken associates with the "gpt-4o" model name;
# this is the object the later cells use.
enc = tiktoken.encoding_for_model("gpt-4o")

In [23]:
enc

<Encoding 'o200k_base'>

In [34]:
def get_token_length_gpt(text):
    """Return the number of tokens the module-level tiktoken ``enc`` yields for ``text``.

    Args:
        text: The string to encode.

    Returns:
        The length of the token list returned by ``enc.encode(text)``.
    """
    return len(enc.encode(text))


get_token_length_gpt("I am a pony.")

5

In [35]:
# Token count of every minified record under the tiktoken encoding in `enc`.
token_counts_gpt = [get_token_length_gpt(minify_pokemon(pokemon)) for pokemon in data]

In [36]:
token_counts_gpt[:10]

[2827, 2855, 2944, 3164, 3014, 3470, 3085, 3020, 3240, 2421]

In [37]:
# Median token count per record (tiktoken encoding).
np.median(token_counts_gpt)

2010.5