# Dataset Visualization

* Token distribution
* Embedding clustering

In [None]:
import os
import json
import tiktoken
import numpy as np
import matplotlib.pyplot as plt

In [None]:
# Move the working directory up one level; relative paths used further down
# (e.g. "images/...") are resolved against it.
# NOTE(review): the project imports below presumably rely on this directory
# being the repository root — confirm.
# Caution: the path is relative, so re-running this cell climbs one more
# directory each time.
os.chdir("../")
from modules.chromadb_handler import ChromaDBHandler
from utils.jsons import load_json

## 1. Load Encoding

In [None]:
# Load a tiktoken encoding by its name.
encoding = tiktoken.get_encoding("cl100k_base")

Use tiktoken.encoding_for_model() to automatically load the correct encoding for a given model name.

In [None]:
# Look the encoding up by model name instead. This rebinds `encoding`,
# replacing the object assigned in the previous cell.
encoding = tiktoken.encoding_for_model("gpt-3.5-turbo")

## 2. Turn text into tokens with encoding.encode()

The .encode() method converts a text string into a list of token integers.

In [None]:
# Display the token integers produced for a short sample string.
encoding.encode("tiktoken is great!")

Count tokens by counting the length of the list returned by .encode().

In [None]:
def num_tokens_from_string(string: str, encoding_name: str) -> int:
    """Count the tokens `string` produces under a named tiktoken encoding.

    Args:
        string: Text to tokenize.
        encoding_name: Name handed to ``tiktoken.get_encoding``
            (for example ``"cl100k_base"``).

    Returns:
        Length of the token list returned by ``encode`` for `string`.
    """
    # Local name differs from the notebook-level `encoding` variable on purpose.
    tokenizer = tiktoken.get_encoding(encoding_name)
    return len(tokenizer.encode(string))

In [None]:
# Quick check of the helper on the same sample string used above.
num_tokens_from_string("tiktoken is great!", "cl100k_base")

## 3. Loading Golden Dataset

In [None]:
# Grab the `collection` attribute of the project's ChromaDB handler.
# NOTE(review): the two arguments are presumably the storage directory and the
# collection name — confirm against modules/chromadb_handler.
trials = ChromaDBHandler("data/collections/", 'ctrials').collection
# Display the collection's count (presumably the number of stored records).
trials.count()

In [None]:
# Load the annotation file; its top-level keys are used as document ids below.
# NOTE(review): the file name suggests 500 random samples drawn with seed 42 —
# confirm with whoever produced the file.
dataset = load_json("data/raw/random_t_annotation_500_42.json")

In [None]:
# Keep only the ids, i.e. the top-level keys of the loaded annotation mapping
dataset_ids = list(dataset)
len(dataset_ids)

In [None]:
# Fetch both the documents and the embeddings for the dataset ids in a single
# call; the embeddings are reused in the embedding-visualization section.
dataset_data = trials.get(ids=dataset_ids, include = ['documents', 'embeddings'])
# Keep the document texts under their own name for the token counting below.
dataset_docs = dataset_data['documents']

In [None]:
# Inspect the first retrieved document.
dataset_docs[0]

## 4. Counting tokens for our Golden dataset

In [None]:
# One token count per document, in the same order as `dataset_docs`.
tokens = [num_tokens_from_string(doc, "cl100k_base") for doc in dataset_docs]

In [None]:
# Number of token counts computed (one per entry in `dataset_docs`).
len(tokens)

## 5. Visualize token distribution

In [None]:
# Create the output directory for the figures saved below. `exist_ok=True`
# keeps the cell safe to re-run and avoids the check-then-create race of
# testing `os.path.exists` first.
os.makedirs("images", exist_ok=True)

In [None]:
# Histogram of the per-document token counts, drawn with the ggplot style.
plt.style.use('ggplot')
hist_fig, hist_ax = plt.subplots(figsize=(10, 6))  # 10 x 6 inch canvas
hist_ax.hist(tokens, bins=90, color="steelblue", ec="black", linewidth=0.5)
hist_ax.set_title('Histogram of Token Count')
hist_ax.set_xlabel('Token count')
hist_ax.set_ylabel('Frequency')
# Write the PNG into the images/ directory, then display the figure.
hist_fig.savefig('images/token_count_hist.png')
plt.show()

In [None]:
# Smallest token count among the documents.
min(tokens)

In [None]:
# Documents with fewer than 100 tokens. Reuses the per-document counts already
# held in `tokens` (same order as `dataset_docs`) instead of tokenizing every
# document a second time.
short_docs = [doc for doc, n_tokens in zip(dataset_docs, tokens) if n_tokens < 100]

In [None]:
# How many documents fall below the 100-token threshold.
len(short_docs)

## 6. Visualize Embeddings

In [None]:
from umap import UMAP
import plotly.express as px
import plotly.io as pio
# Make 'notebook' the default plotly renderer for figures shown in this session.
pio.renderers.default = 'notebook'

In [None]:
# Load embeddings
# These were fetched together with the documents in the earlier `trials.get`
# call. Presumably the array is (n_docs, embedding_dim) — TODO confirm.
dataset_embs = np.array(dataset_data['embeddings'])

In [None]:
# Reduce dimensionality of embeddings
# Projects to 2 components with UMAP (cosine metric, 10 neighbours,
# min_dist=0.1).
# NOTE(review): no random_state is passed, so the layout is presumably not
# reproducible between runs — confirm whether a fixed seed is wanted.
projections = UMAP(n_neighbors=10, n_components=2, min_dist=0.1, metric='cosine').fit_transform(dataset_embs)

In [None]:
# Scatter plot of the 2-component projection: column 0 on x, column 1 on y.
# The figure is only assigned here, not displayed.
fig = px.scatter(projections, x=0, y=1)

In [None]:
# Export the scatter plot as a JPEG into the images/ directory created in the
# token-distribution section.
# NOTE(review): plotly static image export presumably needs the optional
# `kaleido` package installed — confirm in the environment.
fig.write_image("images/umap_10neig_2cmp.jpeg")

In [None]:
# Inspect another sample document (index 5).
dataset_docs[5]