# Investigating whether GPT embeddings are correct

In [2]:
print('Notebook is working.')
%load_ext autoreload
%autoreload 2
import os
os.environ['TRANSFORMERS_CACHE'] = '/atlas/u/pkalluri/.cache'
import sys
sys.path.insert(0, os.path.abspath('../../..'))  # distances --> vis-wiki --> analysis-and-vis --> src
# This form of import is reasonable research practice because many directories may want to use the same utils,
# but note that this is a bad practice for publishing packages because directories should be modular, with all utils inside them.
# Consider changing before publicly publishing code.
# load
from utils import references as refs
import pickle
import numpy as np
from utils.Token import Token
from utils.misc_util import select_layers
from collections import Counter
# process
import random
from utils.ModelType import ModelType, get_generic, berts, gpts
from utils.MyModel import MyModel
# calculate distances
from utils.acts_util import get_angles, get_euclidean_distances
import pandas as pd
# vis
import plotly.express as px
from utils.plotly_util import get_error_bands, combine_figs
from IPython.display import HTML as html_print

Notebook is working.


In [3]:
# Helpers
def color_str(s, color='black'):
    """Wrap `s` in an HTML `<text>` tag whose background is `color`.

    Args:
        s: Text to wrap; inserted as-is (no HTML escaping is applied).
        color: CSS color used for the tag's background.

    Returns:
        The HTML markup as a plain string.
    """
    return "<text style=background:{}>{}</text>".format(color, s)


def print_color(s, color):
    """Return an HTML display object showing `str(s)` on a `color` background.

    Nothing is printed here: the returned object is rendered by the notebook
    when it is the last expression of a cell (or is passed to `display`).
    """
    return html_print(color_str(str(s), color=color))


def print_pink(s):
    """Shorthand for `print_color(s, color='pink')`."""
    return print_color(s, color='pink')


def print_green(s):
    """Shorthand for `print_color(s, color='green')`."""
    # Fix: this helper used to pass 'pink', making it indistinguishable from `print_pink`.
    return print_color(s, color='green')

## Params

In [4]:
# Model and data params: everything a reader may want to tune lives in this cell.
n_layers = None  # None results in analyzing all layers
model_type = ModelType.gpt2
# Location of the saved tokens and activations for the chosen model.
# NOTE(review): absolute, user-specific path; adjust when running on another machine.
dataset_dir = f"/atlas/u/pkalluri/bert-vis/big-data/{get_generic(model_type)}/{model_type.value}/"

In [5]:
# Load the model plus the saved dataset (tokens and their activations).
model = MyModel(model_type=model_type)
dataset_dir = os.path.abspath(dataset_dir)
# Read the pickled (doc, pos) pairs inside a `with` block so the file handle is
# closed as soon as loading finishes (the bare `open(...)` was never closed).
# NOTE(review): pickle can execute arbitrary code while loading; only use trusted files.
with open(os.path.join(dataset_dir, refs.toks_fn), 'rb') as toks_file:
    dataset_toks = [Token(doc, pos, model_type) for doc, pos in pickle.load(toks_file)]
# How many times each token type occurs in the dataset.
types_counts = Counter(tok.type for tok in dataset_toks)
dataset_acts = np.load(os.path.join(dataset_dir, refs.acts_fn))
# `list(dataset_acts)` yields the names stored in the activations file;
# presumably one per layer (e.g. 'arr_0') -- TODO confirm.
layers = select_layers(list(dataset_acts), n_layers)

## Minimal investigation

In [6]:
# Handy for locating short sentences in the dataset: list (dataset index, token text)
# for every token whose `len` attribute is below 6.
# NOTE(review): `tok.len` looks like the length of the token's document -- confirm in utils.Token.
[
    (dataset_idx, candidate.text)
    for dataset_idx, candidate in enumerate(dataset_toks)
    if candidate.len < 6
]

[(1799, '[[August]] 1'),
 (1800, 'August[[ 1]]'),
 (2137, '[[<]]br>'),
 (2138, '<[[br]]>'),
 (2139, '<br[[>]]'),
 (8936, '[[<]]br>'),
 (8937, '<[[br]]>'),
 (8938, '<br[[>]]'),
 (11905, '[[There]] is 1 street.'),
 (11906, 'There[[ is]] 1 street.'),
 (11907, 'There is[[ 1]] street.'),
 (11908, 'There is 1[[ street]].'),
 (11909, 'There is 1 street[[.]]'),
 (13048, '[[R]]. Cohen)'),
 (13049, 'R[[.]] Cohen)'),
 (13050, 'R.[[ Cohen]])'),
 (13051, 'R. Cohen[[)]]'),
 (13263, '[[19]]44 in comics'),
 (13264, '19[[44]] in comics'),
 (13265, '1944[[ in]] comics'),
 (13266, '1944 in[[ comics]]'),
 (13311, '[[Ac]]ademy.'),
 (13312, 'Ac[[ad]]emy.'),
 (13313, 'Acad[[emy]].'),
 (13314, 'Academy[[.]]'),
 (14800, '[[e]].g.'),
 (14801, 'e[[.]]g.'),
 (14802, 'e.[[g]].'),
 (14803, 'e.g[[.]]'),
 (15236, '[[<]]br>'),
 (15237, '<[[br]]>'),
 (15238, '<br[[>]]'),
 (16386, '[[The]] six ships were:'),
 (16387, 'The[[ six]] ships were:'),
 (16388, 'The six[[ ships]] were:'),
 (16389, 'The six ships[[ were]]:'),
 (

In [57]:
tokenizer = model.tokenizer  # for convenience


def my_encode(doc):
    """Turn a document, given as a list of token strings, into one flat list of ids.

    Each string in `doc` is passed to `tokenizer.encode` on its own and the
    resulting ids are concatenated in document order.

    Args:
        doc: Iterable of token strings, e.g. ['R', 'an', 'ney', ' was'].

    Returns:
        A flat list containing the ids of every token string, in order.
    """
    # Encode each string exactly once. The former `if tokenizer.encode(type_)`
    # filter could never drop anything (an empty encoding already yields no ids)
    # and re-ran the tokenizer once per emitted id.
    return [id_ for type_ in doc for id_ in tokenizer.encode(type_)]


# Backward-compatible alias: the helper was originally defined under this name
# (although it encodes), while a later cell calls `my_encode`.
my_decode = my_encode

In [61]:
# Pick one token of the dataset and read its stored activation back from file.
layer = 'arr_0'
tok_id = 200  # alternative example: tok_id = 19513
tok, tok_act = dataset_toks[tok_id], dataset_acts[layer][tok_id]
print('Loading from file')
print(tok.doc)
# Only the first four values are shown; that is enough to compare by eye with the cells below.
print_color(tok_act[:4], 'pink')

Loading from file
['R', 'an', 'ney', ' was', ' selected', ' the', ' first', ' president', ' of', ' the', ' association', '.']


In [62]:
# Recompute this activation starting from the untokenized string.
# The tokenization of middle tokens is printed in a strange format, but the activation matches!
doc_str = ''.join(tok.doc)
print("My input: ", doc_str)
print()

# Step 1, which the model also does internally: turn the string into input ids.
encoded = model.tokenizer(doc_str, return_tensors="pt")
tokenization = encoded['input_ids'][0].tolist()
print("Model input ids: ", tokenization)
print()

# Now let the model run the full process on the raw string and look at the same layer.
_, doc_acts = model.get_doc_acts(doc_str)
_doc_acts = doc_acts[layer]
# Show the first four values of the activation at this token's position in the doc.
print_color(_doc_acts[tok.pos][:4], 'pink')

My input:  Ranney was selected the first president of the association.

Model input ids:  [49, 272, 1681, 373, 6163, 262, 717, 1893, 286, 262, 8112, 13]



In [63]:
# Do the already-tokenized ("normal") tokens work as well?
doc_tokens = tok.doc
print("My input: ", doc_tokens)
print()

# What the model does internally: the ids obtained for the list of token strings.
# NOTE(review): relies on `my_encode` having been defined by an earlier cell.
doc_token_ids = my_encode(doc_tokens)
print("Model input ids: ", doc_token_ids)
print()

# Activation computed directly from the Token object, restricted to the chosen layer.
tok_act = model.get_tok_acts(tok)[layer]
print_color(tok_act[:4], color='pink')

My input:  ['R', 'an', 'ney', ' was', ' selected', ' the', ' first', ' president', ' of', ' the', ' association', '.']

Model input ids:  [49, 272, 1681, 373, 6163, 262, 717, 1893, 286, 262, 8112, 13]



In [None]:
# Build the subcorpora of interest by applying every filter to every chosen token.
# A subcorpus is the list of dataset indices that pass one filter (e.g. "same type")
# relative to one chosen token (e.g. "...caught..."); it holds at most `n_samples` entries.
subcorpora = {
    filter_name: {chosen_id: [] for chosen_id in tok_ids}
    for filter_name in filters
}
# Single pass over the dataset, offering each candidate to every (token, filter) pair.
for candidate_tok_id, candidate_tok in enumerate(dataset_toks):
    for tok_id in tok_ids:
        tok = dataset_toks[tok_id]
        for filter_, f in filters.items():
            subcorpus = subcorpora[filter_][tok_id]
            if len(subcorpus) >= n_samples:
                # This subcorpus is already full; the filter is not evaluated.
                continue
            if f(tok, candidate_tok):
                # Candidate is relevant to this subcorpus: keep its dataset index.
                subcorpus.append(candidate_tok_id)

## Create custom corpora to compare to 
E.g., you may be interested in comparing this token to the same type dropped into random contexts.