## Tokenizer + BERT → Data-processing → OpenAI

Using an isolated environment (for example, one managed by conda) is recommended. This notebook was last run without issues on Python 3.8.18.

In [1]:
!pip install --upgrade accelerate transformers

Collecting accelerate
  Downloading accelerate-0.24.1-py3-none-any.whl (261 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m261.4/261.4 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hCollecting transformers
  Downloading transformers-4.35.0-py3-none-any.whl (7.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m7.9/7.9 MB[0m [31m1.0 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m0m
Collecting huggingface-hub
  Downloading huggingface_hub-0.19.0-py3-none-any.whl (311 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m311.2/311.2 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting regex!=2019.12.17
  Downloading regex-2023.10.3-cp39-cp39-macosx_11_0_arm64.whl (291 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m291.0/291.0 kB[0m [31m1.6 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
Collecting tokenizers<0.15,>=0.14
  Downloading tokenizers-0.14.1-cp39-cp39-macosx_11

In [1]:
from transformers import AutoTokenizer, DistilBertModel
import torch

# Load the tokenizer and the DistilBERT encoder from the same pretrained checkpoint.
#   model card: https://huggingface.co/distilbert-base-uncased
#   API docs:   https://huggingface.co/docs/transformers/v4.35.0/en/model_doc/distilbert
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)

Some weights of the model checkpoint at distilbert-base-uncased were not used when initializing DistilBertModel: ['vocab_layer_norm.weight', 'vocab_projector.weight', 'vocab_transform.bias', 'vocab_layer_norm.bias', 'vocab_projector.bias', 'vocab_transform.weight']
- This IS expected if you are initializing DistilBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DistilBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [2]:
# Read the COCO class names from 'coco-classes.json'
# (the recorded output of this cell shows a flat list of 80 strings).
import json

with open('coco-classes.json') as classes_file:
    coco_classes = json.loads(classes_file.read())

# preview the names and how many there are
print(coco_classes)
print(len(coco_classes))

['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
80


### Tokenizer + BERT

In [3]:
# get one embedding per class from the model's final hidden state
# ❗️ note: I am only getting the embedding for the first token in each class,
#          so a multi-token class is represented by its first token only
# ❓ question: are we interested in the final contextual embedding for each class? currently, we're looking at the final hidden state.
embeddings = []
with torch.no_grad():  # inference only: don't build an autograd graph for every forward pass
    for class_name in coco_classes:
        # encode the class name and add a leading batch dimension of size 1
        input_ids = torch.tensor(tokenizer.encode(class_name)).unsqueeze(0)
        outputs = model(input_ids)
        last_hidden_states = outputs[0]
        # skip the first token, which is the [CLS] token, and keep the token right after it
        embeddings.append(last_hidden_states[0][1].tolist())

In [4]:
# sanity check: how many embeddings were collected
n_embeddings = len(embeddings)
print(n_embeddings)

80


In [5]:
import numpy as np

# Round every value to 3 decimal places in a single vectorized call.
# `.tolist()` converts back to nested lists of plain Python floats; wrapping the
# array in `list(...)` would instead keep NumPy scalars, which NumPy >= 2 prints
# as `np.float64(...)` when the lists are later formatted into text.
embeddings = np.around(np.asarray(embeddings), 3).tolist()

#### table of embeddings

In [6]:
# Write one "<class>: <embedding prefix>" line per class to output.txt.
# ❗️ note: only taking first 10 axes for now due to context window length
with open("output.txt", "w") as text_file:
    text_file.writelines(
        f"{name}: {vector[:10]}\n"
        for name, vector in zip(coco_classes, embeddings)
    )

## visualization

In [9]:
import plotly.express as px
from sklearn.decomposition import PCA
import pandas as pd

In [74]:
# Category codes suggested by ChatGPT-4, one per COCO class, in class order:
#   1 = transportation/traffic
#   2 = animals/living beings
#   3 = household items/personal belongings
# Written as consecutive runs so the grouping is easy to read and check.
categories = (
    [2]         # first class
    + [1] * 12  # next 12 classes
    + [3]       # next class
    + [2] * 10  # next 10 classes
    + [3] * 56  # remaining 56 classes
)
# human-readable label for each category code
cat_dict = {1: 'transport', 2: 'living', 3: 'household'}

#### PCA on original embeddings

In [112]:
# Turn the list of embeddings into a DataFrame: one row per class, one column per axis.
df = pd.DataFrame(data=embeddings)

# quick structural check: shape first, then the first 5 rows
print(df.shape)
df.head()

(80, 768)


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,758,759,760,761,762,763,764,765,766,767
0,-0.375,-0.251,-0.046,-0.12,-0.02,0.713,-0.144,0.427,-0.022,-0.458,...,0.678,-0.576,0.6,-0.078,-0.002,-0.154,0.148,0.055,0.459,-0.115
1,0.998,0.142,-0.379,-0.093,0.538,0.109,-0.232,0.187,-0.462,-0.299,...,0.184,-0.021,0.064,-0.084,0.654,-0.15,-0.194,0.013,0.042,-0.664
2,0.378,-0.014,-0.001,-0.091,0.484,0.287,-0.126,0.121,0.297,-0.319,...,0.229,-0.367,0.22,-0.126,0.479,-0.342,-0.312,-0.393,0.188,-0.158
3,0.97,0.256,-0.249,-0.156,0.342,-0.322,-0.142,-0.129,-0.132,-0.227,...,0.381,0.035,0.134,-0.324,0.404,-0.324,-0.041,-0.021,0.081,-0.858
4,0.187,0.095,-0.452,0.072,0.378,-0.166,-0.381,0.118,0.108,-0.644,...,0.415,0.053,0.227,-0.219,0.252,0.098,0.132,0.044,0.553,-0.133


In [113]:
# Perform PCA, projecting the embeddings onto 3 components for the 3-D scatter plot.
# random_state is fixed for reproducibility: with this many features and so few
# components, scikit-learn's default 'auto' solver may choose the randomized SVD,
# whose result otherwise varies from run to run.
pca = PCA(n_components=3, random_state=0)
components = pca.fit_transform(df)

# percentage of the total variance retained by the 3 components (used in the plot title)
total_var = pca.explained_variance_ratio_.sum() * 100

In [114]:
# Assemble the plotting frame: word, category label, then the PCA components.
# Component columns get string names ('0', '1', ...) so the plot can refer to them by name.
pc_names = [str(k) for k in range(components.shape[1])]
category_labels = [cat_dict[code] for code in categories]

new_df = pd.DataFrame(components, columns=pc_names)
new_df.insert(0, 'word', coco_classes)
new_df.insert(1, 'category', category_labels)
new_df.head()

Unnamed: 0,word,category,0,1,2
0,person,living,-0.536402,2.395109,0.348883
1,bicycle,transport,0.319043,1.262819,2.123303
2,car,transport,0.736521,2.077732,0.465381
3,motorcycle,transport,-0.568163,0.564398,1.689769
4,airplane,transport,-2.447619,0.312609,0.960944


In [124]:
# 3-D scatter of the PCA components, coloured by category.
# Hover shows only the word; the category and raw coordinates are hidden.
pca_hover = {'word': True, 'category': False, '0': False, '1': False, '2': False}
pca_title = f'COCO Class Embeddings (Total Explained Variance: {total_var:.2f}%)'

fig = px.scatter_3d(
    new_df,
    x='0',
    y='1',
    z='2',
    color='category',
    title=pca_title,
    hover_data=pca_hover,
)
fig.show()

#### now using axes from chatgpt

In [122]:
# Build a frame from the 3 embedding axes chosen by ChatGPT.
gpt_axes = [0, 4, 5]
selected = df.loc[:, gpt_axes]

# standardize each axis: subtract the column mean, divide by the column standard deviation
gpt_df = selected.sub(selected.mean()).div(selected.std())

# use string column names so the plot can refer to the axes by name
gpt_df.columns = gpt_df.columns.astype(str)

# prepend the word and its category label
gpt_labels = [cat_dict[code] for code in categories]
gpt_df.insert(0, 'word', coco_classes)
gpt_df.insert(1, 'category', gpt_labels)

# preview the result
gpt_df.head()

Unnamed: 0,word,category,0,4,5
0,person,living,-1.927416,-0.883672,2.323592
1,bicycle,transport,1.954417,0.760643,-0.002744
2,car,transport,0.201513,0.601516,0.682832
3,motorcycle,transport,1.875253,0.18307,-1.662763
4,airplane,transport,-0.338494,0.289155,-1.061921


In [123]:
# 3-D scatter of the three ChatGPT-chosen axes, coloured by category.
# Hover shows only the word; the category and raw coordinates are hidden.
gpt_hover = {'word': True, 'category': False, '0': False, '4': False, '5': False}

fig = px.scatter_3d(
    gpt_df,
    x='0',
    y='4',
    z='5',
    color='category',
    title='COCO Class Embeddings (GPT4)',
    hover_data=gpt_hover,
)
fig.show()