## Tokenizer + BERT → Data-processing → OpenAI

Running this notebook in an isolated environment (e.g. conda) is recommended. It was last run without issues on Python 3.8.18.

In [None]:
!pip install --upgrade accelerate transformers

In [1]:
from transformers import AutoTokenizer, DistilBertModel
import torch

# Tokenizer and encoder are loaded from the same pretrained checkpoint.
checkpoint = "distilbert-base-uncased"
tokenizer = AutoTokenizer.from_pretrained(checkpoint)
model = DistilBertModel.from_pretrained(checkpoint)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Load the COCO class names from 'coco-classes.json'.
import json

# Explicit UTF-8 so the file is decoded the same way on every platform
# (the default text encoding of open() is platform-dependent).
with open('coco-classes.json', encoding='utf-8') as f:
  coco_classes = json.load(f)

# Show the loaded names and how many there are.
print(coco_classes)
print(len(coco_classes))

['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
80


In [3]:
# Build a single input string: all class names separated by ". "
separator = '. '
coco_classes_string = separator.join(coco_classes)
print(coco_classes_string)

person. bicycle. car. motorcycle. airplane. bus. train. truck. boat. traffic light. fire hydrant. stop sign. parking meter. bench. bird. cat. dog. horse. sheep. cow. elephant. bear. zebra. giraffe. backpack. umbrella. handbag. tie. suitcase. frisbee. skis. snowboard. sports ball. kite. baseball bat. baseball glove. skateboard. surfboard. tennis racket. bottle. wine glass. cup. fork. knife. spoon. bowl. banana. apple. sandwich. orange. broccoli. carrot. hot dog. pizza. donut. cake. chair. couch. potted plant. bed. dining table. toilet. tv. laptop. mouse. remote. keyboard. cell phone. microwave. oven. toaster. sink. refrigerator. book. clock. vase. scissors. teddy bear. hair drier. toothbrush


### Tokenizer + BERT

In [4]:
# Tokenize the joined class string; return_tensors="pt" yields PyTorch tensors.
inputs = tokenizer(coco_classes_string, return_tensors="pt")

# Inference only: disable autograd so no computation graph is recorded
# for the forward pass (the values produced are unchanged).
with torch.no_grad():
  outputs = model(**inputs)

# Hidden states of the last layer; indexed per token further down.
last_hidden_states = outputs.last_hidden_state

In [5]:
# Show, in order: the tokenizer output, the raw model output, and the shape of
# last_hidden_states (= outputs.last_hidden_state).
for item in (inputs, outputs, last_hidden_states.shape):
  print(item)

{'input_ids': tensor([[  101,  2711,  1012, 10165,  1012,  2482,  1012,  9055,  1012, 13297,
          1012,  3902,  1012,  3345,  1012,  4744,  1012,  4049,  1012,  4026,
          2422,  1012,  2543, 26018,  3372,  1012,  2644,  3696,  1012,  5581,
          8316,  1012,  6847,  1012,  4743,  1012,  4937,  1012,  3899,  1012,
          3586,  1012,  8351,  1012, 11190,  1012, 10777,  1012,  4562,  1012,
         29145,  1012, 21025, 27528,  7959,  1012, 13383,  1012, 12977,  1012,
          2192, 16078,  1012,  5495,  1012, 15940,  1012, 10424,  2483, 11306,
          1012,  8301,  2015,  1012,  4586,  6277,  1012,  2998,  3608,  1012,
         20497,  1012,  3598,  7151,  1012,  3598, 15913,  1012, 17260,  6277,
          1012, 14175,  6277,  1012,  5093, 14513,  3388,  1012,  5835,  1012,
          4511,  3221,  1012,  2452,  1012,  9292,  1012,  5442,  1012, 15642,
          1012,  4605,  1012, 15212,  1012,  6207,  1012, 11642,  1012,  4589,
          1012, 22953, 21408,  3669,  

In [6]:
# Token ids of the (single) input sequence
token_ids = inputs['input_ids'][0]
print("Length of inputs string: ", inputs['input_ids'].shape)

# One line per position: index, token id, decoded token
for i, token_id in enumerate(token_ids):
    print(f'{i}: {token_id}: {tokenizer.decode([token_id])}')

Length of inputs string:  torch.Size([1, 194])
0: 101: [CLS]
1: 2711: person
2: 1012: .
3: 10165: bicycle
4: 1012: .
5: 2482: car
6: 1012: .
7: 9055: motorcycle
8: 1012: .
9: 13297: airplane
10: 1012: .
11: 3902: bus
12: 1012: .
13: 3345: train
14: 1012: .
15: 4744: truck
16: 1012: .
17: 4049: boat
18: 1012: .
19: 4026: traffic
20: 2422: light
21: 1012: .
22: 2543: fire
23: 26018: hydra
24: 3372: ##nt
25: 1012: .
26: 2644: stop
27: 3696: sign
28: 1012: .
29: 5581: parking
30: 8316: meter
31: 1012: .
32: 6847: bench
33: 1012: .
34: 4743: bird
35: 1012: .
36: 4937: cat
37: 1012: .
38: 3899: dog
39: 1012: .
40: 3586: horse
41: 1012: .
42: 8351: sheep
43: 1012: .
44: 11190: cow
45: 1012: .
46: 10777: elephant
47: 1012: .
48: 4562: bear
49: 1012: .
50: 29145: zebra
51: 1012: .
52: 21025: gi
53: 27528: ##raf
54: 7959: ##fe
55: 1012: .
56: 13383: backpack
57: 1012: .
58: 12977: umbrella
59: 1012: .
60: 2192: hand
61: 16078: ##bag
62: 1012: .
63: 5495: tie
64: 1012: .
65: 15940: suitcase
66: 1

### Data-processing

In [7]:
# Look the special-token ids up from the tokenizer instead of hard-coding them
# (previously written as the literals 101, 102 and 1012).
cls_token_id = tokenizer.cls_token_id
sep_token_id = tokenizer.sep_token_id
period_token_id = tokenizer.convert_tokens_to_ids('.')

pooled_embeddings = []  # one mean-pooled vector per class
span_states = []        # hidden states of the tokens of the class currently being read

# ❗️ Making the assumption that the mean of the token embeddings of the word is the word embedding

# Walk over the token ids together with their hidden states.
for token_id, hidden_state in zip(inputs['input_ids'][0].tolist(), last_hidden_states[0]):
  # [CLS] does not belong to any class name
  if token_id == cls_token_id:
    continue
  if token_id in (period_token_id, sep_token_id):
    # A '.' or [SEP] closes the current class name. Guard against an empty span
    # (e.g. a trailing '.' directly followed by [SEP]), which would otherwise
    # make torch.stack fail on an empty list.
    if span_states:
      pooled_embeddings.append(torch.stack(span_states).mean(dim=0))
      span_states = []
  else:
    # Ordinary (sub-)word token: collect its hidden state for the current class
    span_states.append(hidden_state)

class_embeddings = torch.stack(pooled_embeddings)

# Later cells pair row i with coco_classes[i], so the counts must agree.
assert class_embeddings.shape[0] == len(coco_classes), (
  f"expected {len(coco_classes)} class embeddings, got {class_embeddings.shape[0]}"
)
print(class_embeddings.shape)


torch.Size([80, 768])


#### (A) One slice in dictionary

In [11]:
def dict_of_slice_n(class_embeddings, n, class_names=None):
  """Map each class name to its value in embedding dimension `n`.

  Args:
    class_embeddings: 2-D tensor indexed as [class, dimension].
    n: index of the embedding dimension (column) to extract.
    class_names: names for the rows of `class_embeddings`, in row order.
      Defaults to the notebook-level `coco_classes` list.

  Returns:
    dict of {class name: value rounded to 3 decimal places}.

  Raises:
    ValueError: if the number of names differs from the number of rows.

  Side effect: prints the shape of the extracted column.
  """
  if class_names is None:
    class_names = coco_classes

  # Get the nth slice across all class embeddings
  slice_n = class_embeddings[:, n]
  print(slice_n.shape)

  # Fail loudly instead of silently mislabelling or truncating values.
  if len(class_names) != slice_n.shape[0]:
    raise ValueError(
      f"got {len(class_names)} class names for {slice_n.shape[0]} embeddings"
    )

  # tolist() converts to plain Python floats; precision is reduced to 3 decimals.
  return {name: round(value, 3) for name, value in zip(class_names, slice_n.tolist())}

# Dimension 0 of every class embedding, keyed by class name
slice_dict = dict_of_slice_n(class_embeddings, 0)
# Entry count first, then the dictionary itself, each on its own line
print(len(slice_dict), slice_dict, sep='\n')

torch.Size([80])
80
{'person': -0.284, 'bicycle': 0.556, 'car': 0.397, 'motorcycle': 0.64, 'airplane': 0.222, 'bus': 0.115, 'train': 0.386, 'truck': 0.255, 'boat': 0.442, 'traffic light': 0.184, 'fire hydrant': -0.011, 'stop sign': 0.023, 'parking meter': 0.034, 'bench': 0.004, 'bird': 0.079, 'cat': -0.253, 'dog': -0.037, 'horse': 0.06, 'sheep': -0.105, 'cow': 0.007, 'elephant': -0.166, 'bear': -0.127, 'zebra': 0.18, 'giraffe': -0.069, 'backpack': 0.276, 'umbrella': 0.015, 'handbag': -0.123, 'tie': 0.459, 'suitcase': 0.074, 'frisbee': -0.44, 'skis': -0.192, 'snowboard': 0.06, 'sports ball': -0.174, 'kite': 0.126, 'baseball bat': -0.205, 'baseball glove': -0.17, 'skateboard': 0.158, 'surfboard': 0.225, 'tennis racket': 0.11, 'bottle': 0.146, 'wine glass': 0.153, 'cup': -0.302, 'fork': 0.089, 'knife': -0.091, 'spoon': -0.2, 'bowl': -0.163, 'banana': -0.295, 'apple': -0.398, 'sandwich': -0.006, 'orange': -0.513, 'broccoli': -0.399, 'carrot': -0.638, 'hot dog': -0.263, 'pizza': -0.273, 'do

#### (B) Multiple slices, list of lists

In [18]:
# Number of leading embedding dimensions to keep
select_n = 5
leading_dims = class_embeddings[:, :select_n]
print(leading_dims.shape)

# Transpose so that each row holds one dimension's values across all classes,
# round every value to 3 decimal places, and put the class labels in front
# as the first row.
slice_0 = [coco_classes] + [
  [round(value, 3) for value in dim_values]
  for dim_values in leading_dims.T.tolist()
]
print(len(slice_0))

# Print one row per line
for row in slice_0:
  print(row)


torch.Size([80, 5])
6
['person', 'bicycle', 'car', 'motorcycle', 'airplane', 'bus', 'train', 'truck', 'boat', 'traffic light', 'fire hydrant', 'stop sign', 'parking meter', 'bench', 'bird', 'cat', 'dog', 'horse', 'sheep', 'cow', 'elephant', 'bear', 'zebra', 'giraffe', 'backpack', 'umbrella', 'handbag', 'tie', 'suitcase', 'frisbee', 'skis', 'snowboard', 'sports ball', 'kite', 'baseball bat', 'baseball glove', 'skateboard', 'surfboard', 'tennis racket', 'bottle', 'wine glass', 'cup', 'fork', 'knife', 'spoon', 'bowl', 'banana', 'apple', 'sandwich', 'orange', 'broccoli', 'carrot', 'hot dog', 'pizza', 'donut', 'cake', 'chair', 'couch', 'potted plant', 'bed', 'dining table', 'toilet', 'tv', 'laptop', 'mouse', 'remote', 'keyboard', 'cell phone', 'microwave', 'oven', 'toaster', 'sink', 'refrigerator', 'book', 'clock', 'vase', 'scissors', 'teddy bear', 'hair drier', 'toothbrush']
[-0.284, 0.556, 0.397, 0.64, 0.222, 0.115, 0.386, 0.255, 0.442, 0.184, -0.011, 0.023, 0.034, 0.004, 0.079, -0.253, -

### OpenAI API (optional)

It might be simpler to just use the GUI. If we want to process the model's output programmatically, the prompt will likely need careful engineering to get a predictable format; OpenAI's JSON output feature may be worth looking into.

In [14]:
# install OpenAI api
!pip install --upgrade openai

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [16]:
import os

import openai

# Prefer the OPENAI_API_KEY environment variable so the real key never has to be
# written into the notebook; the placeholder string is only a fallback to edit by hand.
my_api_key = os.environ.get("OPENAI_API_KEY", "your api key here")
openai.api_key = my_api_key

In [21]:
# Number of embedding dimensions actually sent: slice_0[0] is the label row,
# every following row is one dimension. The prompt previously hard-coded "50".
num_dimensions = len(slice_0) - 1

prompt = f"For the following lists, the first list contains words that have been put into DistilBERT – let us call this a label. Each of the subsequent lists contains the embedding value from DistilBERT for one dimension (out of 768) across the labels. By comparing the values for each label for each list, please interpret the likely concepts that each list, that is the dimension/axis of the embedding, encodes. Each of the {num_dimensions} rows should encode a different concept. \n\n First, count the number of lists excluding the first (the labels list) and report on the number. \n '''There are <N> dimensions''' \n\n Then, the main output should take this form for row 'n', from 'Row 1' to 'Row N': \n '''Row n: <encoded concept>. <one sentence rationale for interpretation>''' \n\n {slice_0}"

# NOTE: `openai.ChatCompletion` is the pre-1.0 SDK interface; it is not available
# in openai>=1.0.
completion = openai.ChatCompletion.create(
  model="gpt-3.5-turbo",
  messages=[
    # {"role": "system", "content": "You are a helpful assistant."},
    {"role": "user", "content": prompt}
  ]
)

print(completion.choices[0].message)

# Append the stringified reply to the log file; the trailing newline keeps the
# entries of successive runs from running together.
with open("output.txt", "a", encoding="utf-8") as f:
  f.write(str(completion.choices[0].message) + "\n")

{
  "role": "assistant",
  "content": "There are 6 dimensions.\n\nRow 1: 3D shape. The values for this dimension vary significantly across the labels, indicating that it encodes information about the three-dimensional shape of the objects.\nRow 2: Mobility. The values for this dimension are mostly positive, suggesting that it encodes information about the mobility or movement associated with the objects.\nRow 3: Edibility. The values for this dimension are a mix of positive and negative, but they are generally low, indicating that it encodes information about the edibility of the objects.\nRow 4: Size. The values for this dimension range from negative to positive, suggesting that it encodes information about the size or scale of the objects.\nRow 5: Consumer goods. The values for this dimension are mostly negative, indicating that it encodes information about whether the objects are commonly used consumer goods.\nRow 6: Natural vs. Man-made. The values for this dimension vary significa