# Analyze my region

## Load imports

In [None]:
print('Active notebook.')
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.insert(0, os.path.abspath('../../..'))  # distances --> vis-wiki --> analysis-and-vis --> src
# This form of import is reasonable research practice because many directories may want to use the same utils,
# but note that this is a bad practice for publishing packages because directories should be modular, with all utils inside them.
# Consider changing before publicly publishing code.
# load
from utils import references as refs
import pickle
import numpy as np
from utils.Token import Token
from utils.misc_util import select_layers
from collections import Counter
# process
import random
random.seed(0)
from utils.ModelType import ModelType, get_generic, berts, gpts
from utils.MyModel import MyModel
# calculate distances
from utils.acts_util import get_angles, get_euclidean_distances, spherize
import pandas as pd
from utils.FastNearestNeighbors import FastNearestNeighbors
# vis
import plotly.express as px
from plotly.colors import qualitative as color_sequences

## Set parameters

In [None]:
# YOU MUST SET THESE TO YOUR PATHS TO POINT TO YOUR DATA LOCATIONS
# Which model's data to analyze; it also selects the dataset sub-directory built below.
model_type = ModelType.bert_base_cased
# Directory holding the tokens and activations files that the "Read corpus" cell loads.
dataset_dir = f"/atlas/u/pkalluri/bert-vis/big-data/{get_generic(model_type)}/{model_type.value}/"
# Text file of GloVe embeddings, parsed later by get_glove_embs.
glove_file = f"/atlas/u/pkalluri/bert-vis/big-data/glove/glove.840B.300d.txt"
# NOTE(review): presumably the download cache for the transformers library. It is set after the
# imports above have run - confirm the library reads this variable lazily.
os.environ['TRANSFORMERS_CACHE'] = '/atlas/u/pkalluri/.cache'

In [None]:
# YOU CAN LEAVE THESE PARAMETERS AS THE DEFAULTS IF DESIRED
# Number of regions to analyze (one center token is sampled per region)
n_regions = 10

# How many samples would you like to compare to? (akin to how big of a region)
n_neighbors = 100

# Token filter. Any center tokens or specific kinds of tokens?
# Filter can be: None, top (top tokens only), partial (partial words), or a range a-b, e.g. 10-20 (tokens between the 10th and 20th percentile)
filt = '4-5'

n_layers = None  # None results in analyzing all layers

# Number of GLOVE bands to split and investigate
n_bands=10

# # Where to save the data, relative to the data directory
# output_dir = 'distances'
# # Tag to attach to saved data. e.g. to indicate something special about this run.
# tag = ''

## Get tokens and neighborhoods

In [None]:
# Read corpus
model = MyModel(model_type=model_type)
dataset_dir = os.path.abspath(dataset_dir)
# NOTE: unpickling can execute arbitrary code; only point dataset_dir at data you trust.
# The context manager closes the tokens file as soon as it has been read.
with open(os.path.join(dataset_dir, refs.toks_fn), 'rb') as toks_file:
    # Each pickled entry is unpacked as a (doc, pos) pair and wrapped in a Token.
    dataset_toks = [Token(doc, pos, model_type) for doc, pos in pickle.load(toks_file)]
# Activations are indexed by layer name further down (dataset_acts[layer]).
dataset_acts = np.load(os.path.join(dataset_dir, refs.acts_fn))
# How often each word occurs in the corpus; used by the percentile token filter.
words_counts = Counter(tok.word for tok in dataset_toks)
layers = select_layers(list(dataset_acts), n_layers)

In [None]:
# Filter tokens
# Build the predicate `f` deciding which corpus tokens may be sampled as region centers.
# NOTE(review): the word-based branches test `tok.type` against words taken from `words_counts`
# (which is keyed by `tok.word`) - confirm that Token.type and Token.word agree.
if filt in ['', ' ', None]:
    f = lambda tok: True
elif filt == 'partial':
    f = lambda tok: tok.is_partial
elif filt == 'top':
    # Most frequent words only. A set makes the per-token membership test O(1).
    top_n = {word for word, _ in words_counts.most_common(n_regions + 10)}
    f = lambda tok: tok.type in top_n
elif filt[0].isdigit():  # e.g. 10-20: words ranked between the 10th and 20th frequency percentile
    top_bound, bottom_bound = filt.split('-')
    top_bound = int(int(top_bound) / 100 * len(words_counts))  # smaller number (nearer to top 1st percentile)
    bottom_bound = int(int(bottom_bound) / 100 * len(words_counts))  # larger number (nearer to bottom words)
    # Built without zip(*...) unpacking so an empty slice yields an empty set rather than an unpacking error.
    valid_words = {word for word, _ in words_counts.most_common()[top_bound:bottom_bound]}
    f = lambda tok: tok.type in valid_words
else:
    # Fail here with a clear message instead of a NameError on `f` further down.
    raise ValueError(f'Unrecognized token filter: {filt!r}')
# Note: these tokens are all unique - however there may be duplicates of the same type
candidate_ids = [tok_id for tok_id, tok in enumerate(dataset_toks) if f(tok) and tok.is_valid]
toks_ids = random.sample(candidate_ids, n_regions)  # Sampling from the valid corpus
print('Words: ', ' '.join([dataset_toks[tok_id].word for tok_id in toks_ids]))
# for id_ in toks_ids: print(words_counts[dataset_toks[id_].word])

In [None]:
# get KNNs
# For every selected layer, find the n_neighbors nearest corpus tokens of each sampled center token
# and record them in `neighborhoods`, one row per (layer, center token, neighbor).

# Normally commented out. Overrides earlier set number of layers. Useful for debugging
# n_layers = 6
# layers = select_layers(list(dataset_acts), n_layers)

neighborhood_columns = ['layer', 'token_id', 'neighbor_rank', 'neighbor_id']
# Frames are collected here and concatenated once after the loop; concatenating inside the loop
# would re-copy every previously gathered row on each iteration.
neighborhood_frames = []
# dataset_neighbors = {}
for layer in layers:
    print(f'Analyzing layer {layer}.')
    _acts = spherize(dataset_acts[layer])
    # spherizing implicitly changes the distance metric to cosine distance
    print('Fitting nearest neighbors model.')
    knn_model = FastNearestNeighbors().fit(_acts)
    toks_acts = [_acts[tok_id] for tok_id in toks_ids]
    del _acts  # release this layer's activations before the next layer is processed
    print('Finding neighbors.')
    _, neighborhoods_ids = knn_model.kneighbors(toks_acts, n_neighbors=n_neighbors, return_distance=True)
    del knn_model
    # Keep only the part of the layer name after the underscore (e.g. the index of an 'arr_3'-style name).
    layer_number = layer.split('_')[1]
    for tok_id, neighborhood_ids in zip(toks_ids, neighborhoods_ids):
        neighborhood_frames.append(pd.DataFrame({
            'layer': layer_number,
            'token_id': tok_id,
            'neighbor_rank': list(range(n_neighbors)),
            'neighbor_id': neighborhood_ids,
        }, columns=neighborhood_columns))

if neighborhood_frames:
    neighborhoods = pd.concat(neighborhood_frames, ignore_index=True)
else:
    # No layers or no center tokens: pd.concat rejects an empty list, so build the empty table directly.
    neighborhoods = pd.DataFrame(columns=neighborhood_columns)

## Tag neighbors with interesting traits
e.g. Is this neighbor the same word? Is it close in GLOVE space?

In [None]:
# Tag with interesting traits for upcoming visualization
# The same three columns (Token object, its text, its word) are derived twice:
# once for the center token ('' prefix) and once for the neighbor ('neighbor_' prefix).
for _prefix, _id_column in (('', 'token_id'), ('neighbor_', 'neighbor_id')):
    _token_column = f'{_prefix}token'
    neighborhoods[_token_column] = neighborhoods[_id_column].map(lambda tok_id: dataset_toks[tok_id])
    neighborhoods[f'{_prefix}text'] = neighborhoods[_token_column].map(lambda tok: tok.text)
    neighborhoods[f'{_prefix}word'] = neighborhoods[_token_column].map(lambda tok: tok.word)
# neighborhoods['word_count'] = neighborhoods.neighbor_token.apply(lambda tok: words_counts[tok.word])
def get_pair_text(pair, k=5):
    """Build the hover label for a (token, neighbor) row.

    Each side's text is split on single spaces and re-joined with an HTML
    line break after every `k` words; the two sides are separated by ' <--> '.

    pair: object exposing `.text` and `.neighbor_text` strings.
    k: number of words per wrapped line.
    """
    def wrap(text):
        pieces = text.split(' ')
        lines = []
        for start in range(0, len(pieces), k):
            lines.append(' '.join(pieces[start:start + k]))
        return '<br>'.join(lines)

    left = wrap(pair.text)
    right = wrap(pair.neighbor_text)
    return f'{left} <--> {right}'
# Hover label for every row, built row-wise by get_pair_text.
neighborhoods['pair'] = neighborhoods.apply(get_pair_text, axis=1)
def get_key(pair):
    """Classify how a neighbor relates to its center token.

    Checks are made in priority order and the first match wins:
    equal words -> 'Same word'; equal `.doc` -> 'Same sentence';
    equal `.prev` -> 'Same prev'; anything else -> 'Other'.
    """
    if pair.word == pair.neighbor_word:
        return 'Same word'
    center, neighbor = pair.token, pair.neighbor_token
    if center.doc == neighbor.doc:
        return 'Same sentence'
    if center.prev == neighbor.prev:
        return 'Same prev'
    return 'Other'
# Relationship category of every row ('Same word', 'Same sentence', 'Same prev' or 'Other'); used to color the plots.
neighborhoods['key'] = neighborhoods.apply(get_key, axis=1)

In [None]:
# Tag with glove distance
def get_glove_embs(fp, n_glove_embs=10000):
    """Load the first `n_glove_embs` embeddings from a GloVe text file.

    Each line is expected to hold a word followed by its space-separated
    float components.

    fp: path of the GloVe text file.
    n_glove_embs: maximum number of lines to read from the top of the file.

    Returns (gloveID_to_word, word_to_gloveID, embs): the list of words in
    file order, the reverse word -> row index mapping, and an array with one
    embedding row per word (an empty array when nothing was read).
    """
    from itertools import islice

    gloveID_to_word = []
    word_to_gloveID = {}
    rows = []
    # islice stops after n_glove_embs lines, so the rest of a multi-gigabyte file is never read;
    # the context manager closes the file afterwards.
    with open(fp, encoding='utf-8') as glove_lines:
        for gloveID, word_emb in enumerate(islice(glove_lines, n_glove_embs)):
            if gloveID % 1000 == 0: print(f'Processing {gloveID}.')
            word, emb = word_emb.split(' ', 1)
            gloveID_to_word.append(word)
            word_to_gloveID[word] = gloveID
            rows.append([float(val) for val in emb.split()])
    # Build the array once; appending row by row re-copies the array each time, and testing
    # `embs.any()` to detect "still empty" would discard an all-zero first embedding.
    embs = np.array(rows)
    return gloveID_to_word, word_to_gloveID, embs
# Load the embeddings once; the helper functions below read these module-level tables.
gloveID_to_word, word_to_gloveID, glove_embs = get_glove_embs(glove_file)
def get_glove_emb(word):
    """Return the loaded glove embedding row for `word`.

    The lookup goes through the module-level `word_to_gloveID` and
    `glove_embs` tables; a word missing from `word_to_gloveID` fails the lookup.
    """
    row = word_to_gloveID[word]
    return glove_embs[row]
def get_glove_distance(word1, word2, default=0):
    """
    Get glove distance (Euclidean norm of the embedding difference) between two words.

    word1, word2: the words to compare.
    default: value returned when either word is not in the loaded glove embeddings.
    """
    if word1 in word_to_gloveID and word2 in word_to_gloveID:
        # A norm is never negative, so no absolute value is needed.
        return np.linalg.norm(get_glove_emb(word1) - get_glove_emb(word2))
    # Honor the caller's fallback instead of always returning 0.
    return default

In [None]:
# Glove distance between each center word and its neighbor's word.
# get_glove_distance is called without `default`, so pairs with a word missing from the loaded embeddings get 0.
neighborhoods['glove_distance'] = neighborhoods.apply(
    lambda pair: get_glove_distance(pair.word, pair.neighbor_word), axis=1)

In [None]:
# Tag with glove distance band
# Band upper bounds: 0 first, then the 1/n_bands, 2/n_bands, ..., 1 quantiles of the non-zero glove distances.
_nonzero_glove_distances = neighborhoods.glove_distance[neighborhoods.glove_distance != 0]
thresholds = [0]
thresholds.extend(_nonzero_glove_distances.quantile(band / n_bands) for band in range(1, n_bands + 1))
def get_glove_band(glove_dist):
    """Return the index of the first entry of `thresholds` that is >= glove_dist.

    Returns None when glove_dist is larger than every threshold.
    """
    matching_bands = (band for band, upper in enumerate(thresholds) if glove_dist <= upper)
    return next(matching_bands, None)
# Band index of every row; a glove distance of 0 lands in band 0 because the first threshold is 0.
neighborhoods['glove_band'] = neighborhoods.glove_distance.apply(get_glove_band)

In [None]:
# IN PROGRESS: TAG BY HOW CLOSE NEIGHBORS ARE IN WORD EMBEDDING DISTANCE OR LAYER 0 DISTANCE
# def get_init_distance(tok_id, neighbor_id, data=neighborhoods):
#     init_distance = data[(data.layer=='arr_0') & 
#                          (data.token_id==tok_id) & 
#                          (data.neighbor_id==neighbor_id)].reset_index().neighbor_rank
#     if len(init_distance) != 0:
#         return init_distance[0]
#     else:
#         return 101
# neighborhoods['initial_distance'] = neighborhoods.apply(
#     lambda pair: get_init_distance(pair.token_id, pair.neighbor_id), axis=1)
# def get_pre_distance(pair):
#     init_distance = view[(view.layer=='arr_0') & 
#                          (view.token_id==pair.token_id) & 
#                          (view.neighbor_id==pair.neighbor_id)].reset_index().neighbor_rank
#     if len(init_distance) != 0:
#         return init_distance[0]
#     else:
#         return 101
# neighborhoods['initial_distance'] = neighborhoods.apply(get_init_distance, axis=1)

## Visualizations

### Simple visualization of approaching tokens

In [None]:
# Box plot of neighbor ranks per layer, one box per relationship category (`key`).
px.box(neighborhoods, x='layer', y='neighbor_rank', color='key', hover_name='pair', 
       width=800, height=300, labels=dict(neighbor_rank='', layer='Layer'))

In [None]:
# Strip plot of every neighbor: x is the layer, y the neighbor rank, color the relationship category.
fig = px.strip(neighborhoods, x='layer', y='neighbor_rank', color='key', hover_name='pair', 
               stripmode='overlay', labels=dict(neighbor_rank='', layer='Layer'))
# Enlarge the markers; as the cell's last expression this call is also what gets displayed.
fig.update_traces(marker=dict(symbol='circle', size=10))

In [None]:
# If you want to see each word separately.
# This is useful because different words (especially different frequency bands) act very differently.
fig = px.strip(neighborhoods, x='layer', y='neighbor_rank', color='key', facet_row='word',
       hover_name='pair', width=800, height=1200, labels=dict(neighbor_rank='', layer='Layer'))
# Shorten each facet annotation to the quoted text that follows its '='.
fig.for_each_annotation(lambda a: a.update(text='\"'+a.text.split('=')[1]+'\"'))

### Heatmap of approaching tokens

In [None]:
# Position of each sampled center token in toks_ids.
token_i = dict(zip(toks_ids, range(len(toks_ids))))
# Horizontal plot position: the layer number plus a .08 step per center token,
# so each center token gets its own column of points within a layer.
neighborhoods['x'] = neighborhoods.layer.astype(float) + .08 * neighborhoods.token_id.map(token_i)

In [None]:
def key_view(max_rank=n_neighbors, data=neighborhoods, height=None):
    """Scatter the neighbors ranked below `max_rank`, colored by relationship category.

    Unlike a strip plot, every column of points belongs to a single center
    token (via the precomputed 'x' column), so it stays visible which token
    each colored point belongs to. This combines the summary and the
    per-word visualizations.

    max_rank: only rows with neighbor_rank < max_rank are drawn.
    data: table of neighbors to draw from.
    height: figure height; falls back to max_rank when not given (or falsy).

    Note: the defaults for max_rank and data are captured when this cell is run.
    """
    nearest = data[data.neighbor_rank < max_rank]
    figure_height = height or max_rank
    y_upper = max(max_rank, 100)
    fig = px.scatter(
        nearest, x='x', y='neighbor_rank', color='key', hover_name='pair',
        height=figure_height, template='plotly_white',
        category_orders={'key': ['Same word', 'Other', 'Same sentence']},
        opacity=1, labels={'neighbor_rank': '', 'x': 'Layer'},
        symbol_sequence=['circle'], range_y=[0, y_upper],
    )
    fig.update_traces(marker={'size': 7})
    return fig

In [None]:
key_view(10)

In [None]:
key_view(100)

In [None]:
key_view(n_neighbors, height=500)

### Heatmap of which glove distances are approaching

In [None]:
def glove_view(max_rank=n_neighbors, data=neighborhoods):
    """Scatter the neighbors ranked below `max_rank`, colored by glove distance.

    max_rank: only rows with neighbor_rank < max_rank are drawn.
    data: table of neighbors to draw from.

    Note: the defaults are captured when this cell is run.
    """
    miniview = data[(data.neighbor_rank < max_rank)]
    # The x axis plots the 'x' column, so that is the column to label 'Layer' (as key_view does);
    # the previous 'layerish' / 'glove_distance_v2' labels matched no plotted column.
    fig = px.scatter(miniview, x='x', y='neighbor_rank', color='glove_distance', hover_name='pair', template='plotly_white',
         labels=dict(neighbor_rank='', x='Layer', glove_distance='Glove distance'),
         color_continuous_scale=px.colors.sequential.Rainbow[::-1], symbol_sequence=['circle'], range_y=[0,max(max_rank,100)])
    return fig

In [None]:
glove_view(10)

In [None]:
glove_view(100)

In [None]:
glove_view(n_neighbors)

### Heatmap of which glove bands are approaching

In [None]:
def band_view(max_rank=n_neighbors, data=neighborhoods):
    """Scatter the neighbors ranked below `max_rank`, colored by glove band.

    max_rank: only rows with neighbor_rank < max_rank are drawn.
    data: table of neighbors to draw from; it is not modified.

    Note: the defaults are captured when this cell is run.
    """
    # Copy the filtered rows so the helper column below is added to an independent frame
    # rather than to a slice of `data` (which triggers pandas' SettingWithCopyWarning).
    miniview = data[(data.neighbor_rank < max_rank)].copy()
    # String band names make plotly treat the band as a discrete color category.
    miniview['glove_band_name'] = miniview.glove_band.astype(str)
    # The x axis plots the 'x' column, so that is the column to label 'Layer' (as key_view does).
    fig = px.scatter(miniview, x='x', y='neighbor_rank', color='glove_band_name', hover_name='pair',
         template='plotly_white',  # color_discrete_sequence=['red', 'yellow', 'limegreen', 'blue'] + ['black']*10, 
         category_orders=dict(glove_band_name=[str(i) for i in range(n_bands+1)]),
         labels=dict(neighbor_rank='', x='Layer', glove_band='Glove band', glove_band_name='Glove band'),
         hover_data=['glove_distance'], range_y=[0,max(max_rank,100)])
    return fig

In [None]:
band_view(10)

In [None]:
band_view(100)

In [None]:
band_view(n_neighbors)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 
# Count, per layer and relationship category (`key`), how many neighbors there are: once over all
# center tokens (rows with word == '') and once per center token (rows carrying `text` and `fraction`).
# NOTE(review): `view` is not defined anywhere in the visible notebook - confirm where it comes from
# (it is used like the `neighborhoods` table).
data = view
# Region size of a token = number of its 'Same word' neighbors in one layer.
# NOTE(review): `layer` is read here before this cell's loop assigns it, so it is whatever value an
# earlier cell left behind - confirm which layer is intended.
tok_region_sizes = {tok: len(data[(data.token==tok) & (data.layer==layer) & (data.key=='Same word')]) for tok in data.token.unique()}
center_tokens = data.token.unique()
# Rows are gathered as plain dicts and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on every call.
count_rows = []
for layer in layers:
    print('Layer', layer)
    # NOTE(review): presumably the entries of `layers` use the same naming as data.layer - confirm,
    # otherwise every count below is 0.
    in_layer = data.layer == layer
    for key in data.key.unique():
        in_layer_with_key = in_layer & (data.key == key)
        count_rows.append({'layer': layer, 'word': '', 'key': key,
                           'count': len(data[in_layer_with_key])})
        for tok in center_tokens:
            region_size = tok_region_sizes[tok]
            count = len(data[in_layer_with_key &
                             (data.token == tok) &
                             (data.neighbor_rank < region_size)])
            count_rows.append({'layer': layer,
                               'text': f'{tok.word}-{len(tok.doc)}',
                               'key': key,
                               'count': count,
                               # An empty region has no meaningful fraction; NaN avoids a ZeroDivisionError.
                               'fraction': count / region_size if region_size else float('nan')})
counts = pd.DataFrame(count_rows)

In [None]:
# fig = px.bar(counts[counts.word != ''], x='layer', y='count', color='key', height=900,
#        facet_row='text')
# `word != ''` drops the per-layer aggregate rows (created with word == ''), keeping the per-token rows.
fig = px.bar(counts[counts.word != ''], x='text', y='fraction', color='key', height=400,
       facet_col='layer', category_orders=dict(key=['Same word']))
# Keep only the text after the last '=' in each facet annotation.
fig.for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

In [None]:
# Bar chart of `fraction` per layer, colored by relationship category; aggregate rows (word == '') are excluded.
px.bar(counts[counts.word != ''], x='layer', y='fraction', 
       color='key', height=400, category_orders=dict(key=['Same word']),
       width=600)

In [None]:
import warnings
warnings.filterwarnings("ignore", category=UserWarning) 
# Count, per layer and glove band, how many neighbors there are: once over all center tokens
# (rows with word == '') and once per center token (rows carrying `text` and `fraction`).
# NOTE(review): `view` is not defined anywhere in the visible notebook - confirm where it comes from
# (it is used like the `neighborhoods` table).
data = view[(view.neighbor_rank < 10000)]
# Region size of a token = number of its 'Same word' neighbors in one layer.
# NOTE(review): `layer` is read here before this cell's loop assigns it, so it is whatever value an
# earlier cell left behind - confirm which layer is intended.
tok_region_sizes = {tok: len(data[(data.token==tok) & (data.layer==layer) & (data.key=='Same word')]) for tok in data.token.unique()}
center_tokens = data.token.unique()
# Rows are gathered as plain dicts and turned into a DataFrame once at the end:
# DataFrame.append was removed in pandas 2.0 and re-copied the whole table on every call.
count_rows = []
for layer in layers:
    print('Layer', layer)
    # NOTE(review): presumably the entries of `layers` use the same naming as data.layer - confirm,
    # otherwise every count below is 0.
    in_layer = data.layer == layer
    for glove_band in data.glove_band.unique():
        in_layer_and_band = in_layer & (data.glove_band == glove_band)
        count_rows.append({'layer': layer, 'word': '', 'glove_band': glove_band,
                           'count': len(data[in_layer_and_band])})
        for tok in center_tokens:
            region_size = tok_region_sizes[tok]
            # Masks are built from `data` itself, so they line up with the rows being filtered
            # (masks built from the unfiltered `view` had to be reindexed by pandas).
            points = data[in_layer_and_band & (data.token == tok)]
            n_in_region = len(points[points.neighbor_rank < region_size])
            count_rows.append({'layer': layer,
                               'text': f'{tok.word}-{len(tok.doc)}',
                               'glove_band': glove_band,
                               'count': len(points),
                               # An empty region has no meaningful fraction; NaN avoids a ZeroDivisionError.
                               'fraction': n_in_region / region_size if region_size else float('nan')})
counts = pd.DataFrame(count_rows)

In [None]:
# Grouped bar chart of `count` per layer, one bar per glove band; aggregate rows (word == '') are excluded.
px.bar(counts[(counts.word != '')], x='layer', y='count', 
       color='glove_band', height=400, barmode='group',
       width=600)

### Distribution of approaching glove bands
In the nearest 10 tokens, what is the distribution of glove bands?
What about in the nearest 100 tokens? And so on for larger neighborhoods.

In [None]:
# Histogram of non-zero glove distances among neighbors with rank < 5, one facet per layer, colored by glove band.
distance_band = view[(view.neighbor_rank < 5) & (view.glove_distance != 0)]
fig = px.histogram(distance_band, x='glove_distance', facet_col='layer', barmode='group', color='glove_band',
             category_orders=dict(layer=layers, glove_band=list(range(n_bands))),
             labels=dict(glove_band='Glove distance', glove_distance='')).for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig

In [None]:
# Histogram of non-zero glove distances among neighbors with rank < 10, one facet per layer, colored by glove band.
distance_band = view[(view.neighbor_rank < 10) & (view.glove_distance != 0)]
fig = px.histogram(distance_band, x='glove_distance', facet_col='layer', barmode='group', color='glove_band',
             category_orders=dict(layer=layers, glove_band=list(range(n_bands))),
             labels=dict(glove_band='Glove distance', glove_distance='')).for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig

In [None]:
# Histogram of non-zero glove distances among neighbors with rank < 100, one facet per layer, colored by glove band.
distance_band = view[(view.neighbor_rank < 100) & (view.glove_distance != 0)]
fig = px.histogram(distance_band, x='glove_distance', facet_col='layer', barmode='group', color='glove_band',
             category_orders=dict(layer=layers, glove_band=list(range(n_bands))),
             labels=dict(glove_band='Glove distance', glove_distance='')).for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))
fig

In [None]:
# Histogram of non-zero glove distances among neighbors with rank < 1000, one facet per layer, colored by glove band.
distance_band = view[(view.neighbor_rank < 1000) & (view.glove_distance != 0)]
px.histogram(distance_band, x='glove_distance', facet_col='layer', barmode='group', color='glove_band',
             category_orders=dict(layer=layers, glove_band=list(range(n_bands))),
             labels=dict(glove_band='Glove distance', glove_distance='')).for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

In [None]:
# Histogram of non-zero glove distances among neighbors with rank < 10000, one facet per layer, colored by glove band.
distance_band = view[(view.neighbor_rank < 10000) & (view.glove_distance != 0)]
px.histogram(distance_band, x='glove_distance', facet_col='layer', barmode='group', color='glove_band',
             category_orders=dict(layer=layers, glove_band=list(range(n_bands))),
             labels=dict(glove_band='Glove distance', glove_distance='')).for_each_annotation(lambda a: a.update(text=a.text.split("=")[-1]))

In [None]:
# Number of coarser groups the glove bands are merged into.
n_big_bands = 4
# Group index = band * n_big_bands / n_bands, truncated to an integer
# (so a band equal to n_bands maps to group n_big_bands).
neighborhoods['glove_group'] = (neighborhoods.glove_band*n_big_bands/n_bands).astype(int)

def distribution_of_bands_view(max_rank=n_neighbors, data=neighborhoods):
    """Histogram, per layer, of the glove groups found among the nearest neighbors.

    max_rank: only rows with neighbor_rank < max_rank are counted.
    data: table of neighbors to draw from; rows need a glove_group >= 0.

    Note: the defaults are captured when this cell is run.
    """
    within_rank = data.neighbor_rank < max_rank
    has_group = data.glove_group >= 0
    model_band = data[within_rank & has_group]
    group_order = list(range(n_big_bands))
    return px.histogram(
        model_band, x='layer', color='glove_group',
        category_orders={'layer': layers, 'glove_group': group_order},
        color_discrete_sequence=color_sequences.Plotly,
        labels={'glove_group': 'Glove band', 'layer': 'Layer'},
    )

In [None]:
distribution_of_bands_view(5)

In [None]:
distribution_of_bands_view(10)

In [None]:
distribution_of_bands_view(100)

In [None]:
distribution_of_bands_view(1000)

### What happens to each glove band

In [None]:
# Strip plot of the neighbors with rank < 10, one facet (two per row) and one color per glove group.
_closest = neighborhoods[(neighborhoods.neighbor_rank<10)]
_group_order = list(range(len(neighborhoods.glove_group.unique())))
fig = px.strip(
    _closest, x='layer', y='neighbor_rank',
    facet_col='glove_group', facet_col_wrap=2, color='glove_group',
    category_orders={'layer': layers, 'glove_group': _group_order},
    height=1000,
    labels={'neighbor_rank': 'Distance (rank)', 'glove_group': 'Glove band'},
)
# Replace the '=' in each facet annotation with a space.
fig.for_each_annotation(lambda a: a.update(text=a.text.replace('=',' ')))
# fig.update_xaxes(nticks=len(layers))