# Visualize comparison between type and context

In [None]:
print('Notebook is working.')
%load_ext autoreload
%autoreload 2
import os
import sys
sys.path.insert(0, os.path.abspath('../../..'))  # distances --> vis-wiki --> analysis-and-vis --> src
# This form of import is a good research practice because many directories may want to use the same utils,
# but note that this is a bad practice for publishing packages because directories should be modular, with all utils inside them.
# Consider changing before publishing.
# load
import pandas as pd
# vis
import plotly.express as px
from plotly import graph_objects as go
from utils.plotly_util import get_error_bands, combine_figs

## Select data

In [None]:
# Params
# Location of the distances CSV that the load cell reads with pd.read_csv.
# NOTE(review): this is an absolute, machine-specific path — adjust it when running elsewhere.
data_path = '/atlas/u/pkalluri/bert-vis/big-data/bert/bert-base-cased/distances/types_vs_contexts_5tokens_to_5neighbors_across_13layers.csv'
# Model name; it is upper-cased into the x-axis title of the figures below.
model_name = 'bert'
# Title handed to the plotly figures (empty string here).
title = ''

In [None]:
# Load
# Read the distances table; later cells use its 'key', 'layer' and 'distance' columns
# (and 'main token' / 'other token text' in the detailed plot).
distances = pd.read_csv(data_path)
# Distinct values found in the 'key' column.
keys = distances['key'].unique()
keys  # bare expression so the notebook displays the available keys

In [None]:
# Choose keys of interest
def of_interest(key):
    """Return True when `key` should be kept for plotting.

    Keys starting with 'Random types' or 'Partner' are always kept. Keys
    starting with 'Same type', 'Same successor' or 'Same bigram' are kept
    unless they end with 'random contexts'. Everything else is dropped.
    """
    if key.startswith(('Random types', 'Partner')):
        return True
    if key.endswith('random contexts'):
        return False
    return key.startswith(('Same type', 'Same successor', 'Same bigram'))


keys = [key for key in keys if of_interest(key)]
print(keys)

## Prepare data

In [None]:
def _quantile_by_layer(frame, layers, keys, q, column='normalized distance'):
    """Tabulate the q-th quantile of `column` for every (layer, key) pair.

    Args:
        frame: DataFrame with 'layer' and 'key' columns plus `column`.
        layers: layer values; one output row each, in this order.
        keys: key values; one output column each, in this order.
        q: quantile in [0, 1], forwarded to Series.quantile.
        column: name of the column to summarise.

    Returns:
        DataFrame whose first column is 'layer', followed by one column per
        key. A (layer, key) pair with no rows yields NaN.
    """
    table = {'layer': layers}
    for key in keys:
        of_key = frame[frame.key == key]  # filter by key once, not once per layer
        table[key] = [of_key.loc[of_key.layer == layer, column].quantile(q) for layer in layers]
    return pd.DataFrame(table)


layers = distances['layer'].unique().tolist()
# Normalize every distance by the median 'Random tokens' distance of its own layer,
# so a value of 1 corresponds to the median random-token distance in that layer.
# If a layer has no 'Random tokens' rows the median is NaN, and so are that layer's normalized distances.
random_rows = distances[distances.key == 'Random tokens']
for layer in layers:
    in_layer = distances.layer == layer
    median_rand = random_rows.loc[random_rows.layer == layer, 'distance'].median()
    distances.loc[in_layer, 'normalized distance'] = distances.loc[in_layer, 'distance'] / median_rand
# Per-layer median and quartiles of the normalized distance for each selected key.
median = _quantile_by_layer(distances, layers, keys, .5)
q1 = _quantile_by_layer(distances, layers, keys, .25)
q3 = _quantile_by_layer(distances, layers, keys, .75)

## Vis

In [None]:
# Params
include_bands = True  # Add quartile bands around the lines
include_lines = True
# Simple line plots
fig_size = dict(width=900, height=500)
# White horizontal line at y=1, i.e. at the per-layer median 'Random tokens' distance after normalization.
# NOTE(review): presumably white so it stays unobtrusive on the plot background — confirm.
upper_bound = px.line(x=layers, y=[1] * len(layers), color_discrete_sequence=['white'], **fig_size)
parts = [upper_bound]
if include_lines:
    # One median line per selected key; the legend title ("variable") is blanked out.
    lines = px.line(median, x='layer', y=keys, labels={"variable": ""}, title=title, **fig_size)
    lines.update_traces(line=dict(width=3))
    parts.append(lines)
if include_bands:
    # Bands built from the first and third quartile tables.
    bands = get_error_bands(keys, q1, q3, layers)
    parts.append(bands)
# Tweaks
fig = combine_figs(parts)
fig.update_yaxes(
    title='Distance',
    range=[0, 1.1],
    showticklabels=False,
    showgrid=False,
    nticks=2,
)
fig.update_xaxes(
    title=f'Layers in {model_name.upper()}',
    showticklabels=True,
    tickvals=layers,
)
fig.update_layout(title_x=0.1, title_y=.85)
fig

In [None]:
# Detailed plots
split = False
# With split=True there is one facet row per distinct 'main token'; otherwise a single row.
if split:
    n_rows = len(distances['main token'].unique())
    facet_rows = 'main token'
else:
    n_rows = 1
    facet_rows = None
# Rows for the selected keys, resampled (with replacement) to 8000 rows for plotting.
selected = distances[distances.key.isin(keys)]
sampled = selected.sample(8000, replace=True)
# Suppress these columns in the hover box; the hover name shows 'other token text' instead.
hidden_hover = dict.fromkeys(['layer', 'key', 'normalized distance'], False)
key_order = [
    'Same type, in naturally occurring contexts',
    'Same type, dropped into random contexts',
    'Random types, dropped into same context',
]
# Optional px.box arguments left disabled: stripmode='overlay', notched=False, points=False
fig2 = px.box(
    sampled,
    x='layer',
    y='normalized distance',
    color='key',
    title=title,
    hover_name='other token text',
    hover_data=hidden_hover,
    width=700,
    height=n_rows * 500,
    facet_row=facet_rows,
    category_orders=dict(key=key_order),
)
# White reference line at y=1 (the normalization level), merged into the box plot.
upper_bound = px.line(x=layers, y=[1] * len(layers), color_discrete_sequence=['white'])
fig2 = combine_figs([fig2, upper_bound])
fig2.update_yaxes(
    title='Distance',
    range=[0, 1.1],
    showticklabels=False,
    showgrid=False,
    nticks=2,
)
fig2.update_xaxes(
    title=f'Layers in {model_name.upper()}',
    showticklabels=True,
    tickvals=layers,
)
fig2.update_layout(title_x=0.1, title_y=.85)
fig2

In [None]:
# Combo
# Merge the box-plot figure (fig2) and the line/band figure (fig) with the project's
# combine_figs helper; as the cell's last expression, the result is displayed by the notebook.
# NOTE(review): how combine_figs merges layouts is defined in utils.plotly_util, not visible here.
combine_figs([fig2, fig])