# Analyze distances from tokens to regions of interest

In [None]:
print('Notebook is working.')
%load_ext autoreload
%autoreload 2
import os
os.environ['TRANSFORMERS_CACHE'] = '/atlas/u/pkalluri/.cache'
import sys
sys.path.insert(0, os.path.abspath('../../..'))  # distances --> vis-wiki --> analysis-and-vis --> src
# This form of import is reasonable research practice because many directories may want to use the same utils,
# but note that this is a bad practice for publishing packages because directories should be modular, with all utils inside them.
# Consider changing before publicly publishing code.
# load
from utils import references as refs
import pickle
import numpy as np
from utils.Token import Token
from utils.misc_util import select_layers
from collections import Counter
# process
import random
from utils.ModelType import ModelType, get_generic, berts, gpts
from utils.MyModel import MyModel
# calculate distances
from utils.acts_util import get_angles, get_euclidean_distances
import pandas as pd
# vis
import plotly.express as px
from utils.plotly_util import get_error_bands, combine_figs

## Params

In [None]:
# Model and data params
# Which model to analyze. The same value is passed to MyModel and Token in later cells.
model_type = ModelType.bert_base_cased
# Directory the dataset's token and activation files are read from.
# The path is assembled from get_generic(model_type) and model_type.value.
dataset_dir = f"/atlas/u/pkalluri/bert-vis/big-data/{get_generic(model_type)}/{model_type.value}/"

In [None]:
# Other params

# Number of center tokens to analyze (sampled at random from the tokens that pass `filt`)
n_tokens = 10

# Any center tokens, or specific kinds of tokens? e.g. Do you want to know if frequent tokens get farther from their type balls?
# '' or None selects any non-special token; the other accepted values are listed on the right.
filt = ''  # None or 'frequent.__' or 'partial' or 'top'

# How many samples would you like to compare to? i.e. How big would you like the subcorpora and custom corpora to be?
# Used as the cap on each subcorpus and as the size of each custom corpus.
n_samples = 10

# Passed to select_layers together with the layer names found in the activations file.
n_layers = None  # None results in analyzing all layers

# Where to save the data (joined onto dataset_dir when the CSV is written)
output_dir = 'distances'

# Tag to attach to saved data. e.g. to indicate something special about this run.
# It is inserted verbatim just before '.csv' in the output filename.
tag = ''

## Set up model and data, and choose center tokens

In [None]:
# Load the model wrapper, the dataset tokens, and the dataset activations.
model = MyModel(model_type=model_type)
dataset_dir = os.path.abspath(dataset_dir)
# The tokens file is a pickled iterable of (doc, pos) pairs; a context manager
# guarantees the file handle is closed once the pickle has been read.
with open(os.path.join(dataset_dir, refs.toks_fn), 'rb') as toks_file:
    dataset_toks = [Token(doc, pos, model_type) for doc, pos in pickle.load(toks_file)]
# How often each token type occurs in the dataset.
types_counts = Counter(tok.type for tok in dataset_toks)
# Activations are indexed by layer name; list(dataset_acts) yields those names.
dataset_acts = np.load(os.path.join(dataset_dir, refs.acts_fn))
layers = select_layers(list(dataset_acts), n_layers)

In [None]:
# Restrict, constructing a smaller corpus
# `f` is the predicate deciding whether a dataset token may be used as a center token.
if filt in ['', ' ', None]:
    f = lambda tok: not tok.is_special
elif filt == 'partial':
    f = lambda tok: tok.is_partial
elif filt.startswith('frequent'):  # e.g. 'frequent.9' gets types around the 90th percentile
    # The text after 'frequent' is a fraction of the vocabulary; slice bounds must be ints.
    start = int(float(filt.split('frequent')[1]) * len(types_counts))
    # most_common() yields (type, count) pairs from most to least frequent. Reverse it so that
    # index 0 is the rarest type, and keep only the type itself so membership tests can match.
    valid_types = {type_ for type_, _ in types_counts.most_common()[::-1][start:start + n_samples]}
    f = lambda tok: tok.type in valid_types and not tok.is_special
elif filt == 'top':
    top_n, _ = zip(*types_counts.most_common(n_tokens + 2))
    f = lambda tok: tok.type in top_n and not tok.is_special
else:
    # Fail here with a clear message instead of a NameError on `f` further down.
    raise ValueError(f"Unrecognized filt {filt!r}; expected '', None, 'partial', 'frequent<fraction>' or 'top'.")
# Note: these tokens are all unique - however there may be duplicates of the same type
candidate_ids = [tok_id for tok_id, tok in enumerate(dataset_toks) if f(tok)]
# print(candidate_ids)
tok_ids = random.sample(candidate_ids, n_tokens)  # Sampling from the valid corpus
print('Types: ', ' '.join([dataset_toks[tok_id].type for tok_id in tok_ids]))

## Choose subcorpora to compare to
Choose subcorpora of entire dataset, to see how far away a token is from this subcorpus.

e.g. you might be interested in how far away tokens are from their type ball (tokens of the same type).

In [None]:
# Define
# Each filter takes (main token, candidate token) and says whether the candidate
# belongs in the main token's subcorpus for that filter. Dict order is the order
# in which the filters are later applied.
filters = {
    'Random tokens': (
        lambda main_tok, candidate_tok: not candidate_tok.is_edge),
    'Same type, in naturally occurring contexts': (
        lambda main_tok, candidate_tok:
            main_tok != candidate_tok and main_tok.same_type(candidate_tok)),
    'Same successor, in naturally occurring contexts': (
        lambda main_tok, candidate_tok:
            main_tok.same_next(candidate_tok)
            and not main_tok.same_type(candidate_tok)
            and not candidate_tok.is_edge),
    # NOTE(review): unlike the 'Same type' filter, this one does not exclude the
    # main token itself - confirm whether that is intended.
    'Same bigram, natural': (
        lambda main_tok, candidate_tok:
            main_tok.same_type(candidate_tok)
            and main_tok.same_next(candidate_tok)
            and not candidate_tok.is_edge),
    'Partner in bigram, natural': (
        lambda main_tok, candidate_tok:
            candidate_tok.prev == main_tok.type
            and main_tok.next == candidate_tok.type
            and not candidate_tok.is_edge),
}

In [None]:
# Apply each filter, constructing the subcorpora of interest
# subcorpora[filter name][center token id] -> ids of matching dataset tokens, capped at n_samples
# e.g. a subcorpus is filter ("same type") as this token ("...caught...")
subcorpora = {}
for filter_ in filters:
    subcorpora[filter_] = {tok_id: [] for tok_id in tok_ids}

# Single pass over the dataset, offering every token to every (center token, filter) pair
for candidate_tok_id, candidate_tok in enumerate(dataset_toks):
    for tok_id in tok_ids:
        tok = dataset_toks[tok_id]
        for filter_, f in filters.items():
            subcorpus = subcorpora[filter_][tok_id]
            if len(subcorpus) >= n_samples:
                # this subcorpus is already full
                continue
            if f(tok, candidate_tok):
                subcorpus.append(candidate_tok_id)

## Create custom corpora to compare to 
e.g. You may be interested in comparing this token to the same type dropped into random contexts

In [None]:
# Define
# Each recipe takes a center token and returns a list of constructed tokens to compare it to.
recipes = {}

# One shared draw (with replacement) of non-special, non-edge dataset tokens,
# reused by every "random contexts" recipe.
eligible_toks = [tok for tok in dataset_toks if not tok.is_special and not tok.is_edge]
random_toks = random.choices(eligible_toks, k=n_samples)


def _same_type_in_random_contexts(tok):
    return tok.in_contexts(random_toks)


def _same_successor_in_random_contexts(tok):
    return [rand_tok.replace(tok.next, pos=rand_tok.pos+1) for rand_tok in random_toks]


def _same_bigram_in_random_contexts(tok):
    return [rand_tok.replace(tok.type).replace(tok.next, pos=rand_tok.pos+1) for rand_tok in random_toks]


recipes['Same type, dropped into random contexts'] = _same_type_in_random_contexts
recipes['Same successor, dropped into random contexts'] = _same_successor_in_random_contexts
recipes['Same bigram, dropped into random contexts'] = _same_bigram_in_random_contexts

# One shared draw (with replacement) of non-special types, reused by every "random types" recipe.
ordinary_types = [type_ for type_ in types_counts if not Token.is_type_special(type_, model_type)]
random_types = random.choices(ordinary_types, k=n_samples)


def _random_types_in_same_context(tok):
    return tok.with_types(random_types)


recipes['Random types, dropped into same context'] = _random_types_in_same_context


def random_types_in_natural_contexts(tok):
    """Draw non-edge dataset tokens of tok's type and put one of random_types in each.

    A fresh draw of n_samples tokens (with replacement) is made on every call.
    """
    same_type_toks = [candidate_tok for candidate_tok in dataset_toks
                      if candidate_tok.same_type(tok) and not candidate_tok.is_edge]
    natural_toks = random.choices(same_type_toks, k=n_samples)
    swapped = []
    for natural_tok, random_type in zip(natural_toks, random_types):
        swapped.append(natural_tok.replace(random_type))
    return swapped


recipes['Random types, dropped in naturally occurring context'] = random_types_in_natural_contexts

In [None]:
# Apply recipes, to construct the custom corpora to compare to
print('Constructing corpora...')
custom_corpora = {}
for recipe, f in recipes.items():
    print(recipe)
    # a custom corpus is e.g. "tokens in doc" of this specific doc
    corpus_by_tok = {}
    for tok_id in tok_ids:
        corpus_by_tok[tok_id] = f(dataset_toks[tok_id])
    custom_corpora[recipe] = corpus_by_tok

print('\nPulling activations...')
custom_corpora_acts = {}
for recipe in recipes:
    print(recipe)
    # Run the model on every constructed corpus, keyed by center token id
    acts_by_tok = {}
    for tok_id, custom_toks in custom_corpora[recipe].items():
        acts_by_tok[tok_id] = model.get_toks_acts(custom_toks)
    custom_corpora_acts[recipe] = acts_by_tok

## Calculating distances to tokens of interest

In [None]:
# Params
# Distance function applied between a set of activations and one center activation.
# get_euclidean_distances is imported alongside get_angles as an alternative.
get_distances = get_angles # metric
# NOTE: this resets n_layers to None regardless of the value chosen in the "Other params" cell.
n_layers=None
# Recompute the layers to analyze from the layer names in the activations file.
layers = select_layers(list(dataset_acts), n_layers)

In [None]:
# Calculate
# Builds `distances`, a long-format table with one row per
# (layer, main token, comparison key, other token). `key` is the filter or recipe name.
distance_columns = ['other token', 'distance', 'layer', 'key', 'main token']
# Per-corpus frames are collected here and concatenated once at the end;
# concatenating inside the loop would re-copy the growing table on every step.
distance_frames = []
for layer in layers:
    print(layer)
    _dataset_acts = dataset_acts[layer]
    for tok_id in tok_ids:
        tok = dataset_toks[tok_id]
        _tok_act = _dataset_acts[tok_id]
        # filters on dataset
        for filter_ in filters:
            subcorpus_ids = subcorpora[filter_][tok_id]
            if len(subcorpus_ids) == 0:
                # No dataset token matched this filter for this center token: nothing to measure.
                continue
            distance_frames.append(pd.DataFrame({
                'other token': [dataset_toks[id_] for id_ in subcorpus_ids],
                'distance': get_distances(_dataset_acts[subcorpus_ids], _tok_act),
                'layer': [layer] * len(subcorpus_ids),
                'key': [filter_] * len(subcorpus_ids),
                'main token': [tok] * len(subcorpus_ids)}))
        # custom corpora
        for recipe in recipes:
            custom_corpus = custom_corpora[recipe][tok_id]
            if len(custom_corpus) == 0:
                continue
            _custom_corpus_acts = custom_corpora_acts[recipe][tok_id][layer]
            distance_frames.append(pd.DataFrame({
                'other token': custom_corpus,
                'distance': get_distances(_custom_corpus_acts, _tok_act),
                'layer': [layer] * len(custom_corpus),
                'key': [recipe] * len(custom_corpus),
                'main token': [tok] * len(custom_corpus)}))

if distance_frames:
    distances = pd.concat(distance_frames, ignore_index=True)
else:
    # pd.concat raises on an empty list; keep the expected columns so later cells can still index them.
    distances = pd.DataFrame(columns=distance_columns)

In [None]:
# Helper for turning a token into wrapped display text
def tok_text(tok, k=5):
    """Return tok's text split on single spaces and regrouped k words per line.

    Lines are joined with '<br>'. A plain ``str`` argument is treated as having
    no text and yields an empty string.
    """
    text = "" if type(tok) is str else tok.text
    words = text.split(' ')
    wrapped_lines = []
    for start in range(0, len(words), k):
        wrapped_lines.append(' '.join(words[start:start + k]))
    return '<br>'.join(wrapped_lines)
# Add display-friendly text columns for both tokens in each row.
distances['main token text'] = distances['main token'].apply(tok_text)
distances['other token text'] = distances['other token'].apply(tok_text)
# Layer names containing '_' are shortened to the piece after the first '_'; others are kept as-is.
distances['layer'] = distances['layer'].apply(
    lambda name: name.split('_')[1] if '_' in name else name)
# Save
if not n_layers:
    # None means every layer was analyzed; record the actual count in the filename.
    n_layers = len(list(dataset_acts))
out_dir_path = os.path.join(dataset_dir, output_dir)
# makedirs also creates missing parents and does not fail if the directory already exists.
os.makedirs(out_dir_path, exist_ok=True)
save_path = os.path.join(out_dir_path,
    f'types_vs_contexts_{n_tokens}{filt}tokens_to_{n_samples}neighbors_across_{n_layers}layers{tag}.csv')
distances.to_csv(save_path)


# Visualize

## Set up

In [None]:
# Params
# NOTE(review): in the visible cells data_path is only assigned, never read back from;
# the plots use the in-memory `distances` table.
data_path = '' # Path to data. If empty, defaults to above created save path.
bands = True  # Add quartile bands around the lines
# Which comparison keys to plot. Each entry must equal a filter or recipe name,
# since those names are what the `key` column of `distances` contains.
# 'Random tokens' is used for normalization whether or not it is listed here.
keys = [
#     'Random tokens', 
    'Same type, in naturally occurring contexts',
#     'Same predecessor, in naturally occurring contexts',
#     'Same successor, in naturally occurring contexts',
    'Same type, dropped into random contexts',
    'Random types, dropped into same context',
#     'Random types, dropped in naturally occurring context'
    ]

In [None]:
layers = distances['layer'].unique().tolist()
data_path = data_path or save_path

# Normalize every distance by the median 'Random tokens' distance of its own layer.
for layer in layers:
    in_layer = distances.layer == layer
    median_rand = distances.loc[in_layer & (distances.key == 'Random tokens'), 'distance'].median()
    distances.loc[in_layer, 'normalized distance'] = distances.loc[in_layer, 'distance'] / median_rand


def _quantile_by_layer(q):
    """Return a table with a `layer` column plus one column per key holding the
    q-th quantile of the normalized distances for that (layer, key) pair."""
    per_key = {}
    for key in keys:
        per_key[key] = [
            distances[(distances.layer == layer) & (distances.key == key)]['normalized distance'].quantile(q)
            for layer in layers]
    return pd.concat([pd.DataFrame(dict(layer=layers)), pd.DataFrame(per_key)], axis=1)


median = _quantile_by_layer(.5)
q1 = _quantile_by_layer(.25)
q3 = _quantile_by_layer(.75)

## Line graph

In [None]:
# Median normalized distance per layer, one line per key.
title = ''
lines = px.line(median, x='layer', y=keys, width=900, height=500, labels={"variable": ""}, title=title).update_traces(line=dict(width=3))
figs = [lines]
# Honor the `bands` param. The band figure gets its own name so the flag is not
# overwritten and keeps its meaning if this cell is re-run.
if bands:
    error_bands = get_error_bands(keys, q1, q3, layers)
    figs.append(error_bands)

# Tweaks
# White line at y=1, the level of the per-layer median 'Random tokens' distance after normalization.
upper_bound = px.line(x=layers, y=[1,]*len(layers), color_discrete_sequence=['white',])
figs.append(upper_bound)
fig = combine_figs(figs)
fig.update_yaxes(title='Distance', range=[0, 1.1], showticklabels=False, showgrid=False, nticks=2)
fig.update_xaxes(title=f'Layers in {get_generic(model_type).upper()}', showticklabels=True, tickvals=layers)
fig.update_layout(title_x=0.1, title_y=.85)
fig

In [None]:
# Box plot of normalized distances per layer, colored by key.
# With split=True there is one facet row per main token.
split = False
n_rows = len(distances['main token'].unique()) if split else 1
facet_rows = 'main token' if split else None
fig2 = px.box(
    distances[(distances.key.isin(keys))],
    x='layer',
    y='normalized distance',
    color='key',
    title=title,
    hover_name='other token text',
    hover_data={col: False for col in ['layer', 'key', 'normalized distance']},
    width=800,
    height=n_rows*500,
    facet_row=facet_rows,
    boxmode='overlay',
    notched=False,
    points=False,
)
upper_bound = px.line(x=layers, y=[1,]*len(layers), color_discrete_sequence=['white',])
fig2 = combine_figs([fig2, upper_bound])
fig2.update_yaxes(title='Distance', range=[0,1.1], showticklabels=False, showgrid=False, nticks=2)
# `model_name` is never defined in this notebook; use the same label as the line graph.
fig2.update_xaxes(title=f'Layers in {get_generic(model_type).upper()}', showticklabels=True, tickvals=layers)
fig2.update_layout(title_x=0.1, title_y=.85)
fig2