In [8]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [9]:
# Parameters
# --- input files (joined onto data_dir when loaded) ---
data_dir = '../../bucket/wikipedia/1000docs_19513contexts_30maxtokens/'
contexts_filename = 'contexts.pickle'
acts_filename = 'activations.npz'
# --- nearest-neighbor models: K is part of the pickle's filename ---
n_nearest_neighbors = 10
KNN_models_filename = f'KNN_models_K={n_nearest_neighbors}.pickle'
# --- layers to visualize (used as keys into the activations archive) ---
# layers = ['arr_0','arr_3','arr_6', 'arr_9', 'arr_12']  # a sparser subset of layers
# layers = ['arr_0']  # good for debugging
layers = [f'arr_{layer_idx}' for layer_idx in range(13)]
# --- (reduction name, output dimension) pairs, applied one after another ---
reductions = [('KernelPCA', 2)]
# If True, running the vis will also generate an interactive html file and open it
view_vis_as_html = False

In [3]:
# Imports
# LOAD
import pickle
import numpy as np
import os
import sys
# Put the directory two levels above the working directory on sys.path so that the
# `src` package imported below can be resolved.
project_path = os.path.abspath('../..')
sys.path.insert(0, project_path)
from src.utils import acts_util
# TAG
import nltk
import re
import pandas as pd
from src.utils import context_util
# VIS
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Send Bokeh output to the notebook.
output_notebook()
# Optionally also direct Bokeh output to an HTML file.
# NOTE: view_vis_as_html is assigned in the Parameters cell, which must run before this one.
if view_vis_as_html:
    output_file('visualize-movement.html')
from bokeh.models import Label, LabelSet, Div, ColumnDataSource, Legend, LegendItem
from bokeh.models import HoverTool, CustomJS, PanTool, BoxZoomTool, WheelZoomTool, ResetTool, TapTool, OpenURL
from bokeh.models.glyphs import Circle
from bokeh.layouts import gridplot
from bokeh import events
from bokeh.palettes import Inferno, Category10, Category20, Category20c, Pastel1, Pastel2, Bokeh, Plasma
from src.utils import vis_util, html_util

# Loading contexts and acts

In [4]:
# Load contexts and layer_to_acts
# The contexts live in a pickle, the activations in an .npz archive; both sit in data_dir.
contexts_path = os.path.join(os.path.abspath(data_dir), contexts_filename)
with open(contexts_path, 'rb') as contexts_file:
    contexts = pickle.load(contexts_file)
acts_path = os.path.join(data_dir, acts_filename)
acts_npz = np.load(acts_path)

In [5]:
# Pull the selected layers' activation arrays out of the archive, keyed by layer name.
layer_to_acts = dict((layer, acts_npz[layer]) for layer in layers)

In [6]:
# NOTE(review): this whole cell is commented out and kept only for reference.
# It reads `acts`, which no cell above assigns (presumably `layer_to_acts` was
# intended) - confirm before re-enabling.
# # Reductions
# reduced_acts = acts.copy()
# for layer in layers:
#     print(layer)
#     for reduction, dim in reductions:
#         curr_acts = reduced_acts[layer]
#         reduced_acts[layer] = acts_util.reduce_activations(curr_acts, reduction, dim)

NameError: name 'KNN_models_filename' is not defined

# Dimensionality-reduce a doc together with its K nearest neighbors

# Single doc

In [None]:
# Choose a document, collect the ids of its contexts, and print its text.
doc_number = 101
doc_ids = context_util.get_doc_ids(contexts, doc_number)
# A context unpacks into two parts; the first is the doc, which we read off the document's first context.
first_context_id = doc_ids[0]
doc, _ = contexts[first_context_id]
print(context_util.doc_str(doc))

In [None]:
# K Nearest Neighbor models
# n_nearest_neighbors and KNN_models_filename are taken from the Parameters cell.
# They are deliberately not re-assigned here: a second hard-coded copy would silently
# override whatever K was chosen in that cell.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
with open(os.path.join(os.path.abspath(data_dir), KNN_models_filename), 'rb') as f:
    layer_to_KNN_model = pickle.load(f)

# Query every layer's model with the activations of the document's tokens.
layer_to_neighbors = {}  # for each tok in document, map it to its nearest neighbors' ids
for layer in layers:
    acts = layer_to_acts[layer]
    KNN_model = layer_to_KNN_model[layer]
    # kneighbors yields a (distances, ids) pair; only the ids are kept.
    _neighbors_distances, neighbors_ids = KNN_model.kneighbors(acts[doc_ids])
    layer_to_neighbors[layer] = neighbors_ids

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Fresh vis
# Builds one plot per layer showing the phrase's tokens (triangles) and their nearest
# neighbors (circles), after applying the `reductions` fit on that layer's phrase and
# neighbor activations. The plots are stacked in a column next to the layer names.
columns = []
# The leading None pairs with the column-header Div that starts plot_column below.
layer_name_column = [None] + [Div(text=layer, align=('center', 'center')) for layer in layers]
columns.append(layer_name_column)

# optionally focus on subset of doc
start_pos, end_pos = 2,-2
phrase = doc[start_pos:end_pos]
phrase_ids = doc_ids[start_pos:end_pos]
# set vis params
# NOTE(review): Category20[20] holds 20 colors and is indexed by tok_idx below without
# wrapping, so a phrase longer than 20 tokens would raise IndexError - confirm phrase lengths.
palette = Category20[20]
# create a column of plots
plot_column = []
plot_column.append(Div(text=' '.join([f'{reduction}{dim}' for reduction, dim in reductions]), align=('center', 'center'))) # column header
for layer in layers:
    # fit this layer's dimensionality reduction model
    phrase_neighbors = layer_to_neighbors[layer][start_pos:end_pos]
    # the reducers are fit on the phrase tokens plus all of their neighbors
    ids_to_fit = phrase_ids + flatten(phrase_neighbors)  # multiple options here
    acts = layer_to_acts[layer]
    acts_to_fit = acts[ids_to_fit]  # init
    fit_reducers = []
    # each reducer is fit on the output of the previous one, so they form a chain
    for reduction, dim in reductions:
        fit_reducer, acts_to_fit = acts_util.fit_reducer(acts_to_fit, reduction, dim)
        fit_reducers.append(fit_reducer)
    
    # reduce and prep the document's points
    phrase_contexts = [contexts[context_id] for context_id in phrase_ids]
    phrase_reduced_acts = acts[phrase_ids]  # init
    for reducer in fit_reducers:
        phrase_reduced_acts = reducer.transform(phrase_reduced_acts)
    # each 'label' starts as a one-element list; the neighbor loop below appends to it
    phrase_points = {
        'x': phrase_reduced_acts[:,0],
        'y': phrase_reduced_acts[:,1],
        'color': [palette[tok_idx] for tok_idx in range(len(phrase))],
        'line color': ['black'] * len(phrase),
        'line width': [1] * len(phrase),
        'label': [[f'[{pos}]'] for doc, pos in phrase_contexts],
        'hover label': [context_util.context_str(*context, marker=html_util.highlighter(color='yellow')) for context in phrase_contexts]
        }

    # reduce and prep the neighbors
    # processed_neighbors[i] is the context id of the point stored at index i of neighbor_points
    processed_neighbors = []
    neighbor_points = {'x':[], 'y':[], 'color':[], 'legend label':[], 'label':[], 'hover label':[]}
    for tok_idx in range(len(phrase)):
        tok_pos = tok_idx + start_pos  # position relative to entire doc
        tok_neighbors = phrase_neighbors[tok_idx][1:]  # skip zeroeth neighbor; that's the token itself
        tok_neighbors_contexts = [contexts[neighbor] for neighbor in tok_neighbors]
        tok_neighbors_reduced_acts = acts[tok_neighbors]  # init
        for reducer in fit_reducers:
            tok_neighbors_reduced_acts = reducer.transform(tok_neighbors_reduced_acts)
            
        # visualize different kinds of neighbors differently
        for neighbor_idx, neighbor in enumerate(tok_neighbors):
            # NOTE(review): the labels being extended below are lists, and `list += str`
            # appends one element per character, so a two-digit tok_pos adds two entries -
            # confirm this is the intended label content.
            if neighbor in phrase_ids:  # update existing phrase point
                phrase_point_idx = phrase_ids.index(neighbor)
                phrase_points['label'][phrase_point_idx] += f'{tok_pos}'
                phrase_points['line color'][phrase_point_idx] = 'aqua'
                phrase_points['line width'][phrase_point_idx] = 3
            elif neighbor in processed_neighbors:  # update existing neighbor point
                neighbor_point_idx = processed_neighbors.index(neighbor)
                neighbor_points['label'][neighbor_point_idx] += f'{tok_pos}'
                neighbor_points['color'][neighbor_point_idx] = 'aqua'
            else:  # new neighbor
                neighbor_context = contexts[neighbor]
                neighbor_reduced_acts = tok_neighbors_reduced_acts[neighbor_idx]
                neighbor_points['x'].append(neighbor_reduced_acts[0])
                neighbor_points['y'].append(neighbor_reduced_acts[1])
                neighbor_points['color'].append(palette[tok_idx])
                neighbor_points['legend label'].append([f'[{tok_pos}] {doc[tok_pos]}'])
                neighbor_points['label'].append([f'{tok_pos}'])
                neighbor_points['hover label'].append(
                    context_util.context_str(*neighbor_context, marker=html_util.highlighter(color='lightgrey')))
                processed_neighbors.append(neighbor)    
    # a label list that still has a single entry is replaced by an empty string
    neighbor_points['label'] = [label if len(label)>1 else '' for label in neighbor_points['label']]
    
    # plot 
    phrase_points_source = ColumnDataSource(phrase_points)
    neighbor_points_source = ColumnDataSource(neighbor_points)
    p = vis_util.empty_plot(width=400, height=250, darkmode=False)
    p.add_layout(Legend(), 'right')
    # neighbors are drawn as circles, the phrase's own tokens as larger triangles
    p.circle(x='x', y='y', color='color', size=10, legend_group='legend label', source=neighbor_points_source)
    p.add_layout(LabelSet(x='x', y='y', text='label', x_offset=2, y_offset=2, text_font_size='10pt', source=neighbor_points_source))
    p.triangle(x='x', y='y', color='color', line_color='line color', size=15, line_width='line width', source=phrase_points_source)
    p.add_layout(LabelSet(x='x', y='y', text='label', x_offset=2, y_offset=2, text_font_size='10pt', source=phrase_points_source))
    zoom_tool = WheelZoomTool()
    p.tools = [PanTool(), zoom_tool, BoxZoomTool(), ResetTool(), HoverTool(tooltips=vis_util.custom_bokeh_tooltip('hover label'))]
    p.toolbar.active_scroll = zoom_tool
    plot_column.append(p)
columns.append(plot_column)
# zip(*columns) turns the list of columns into rows for gridplot
show(gridplot(zip(*columns)))

# Use KNN as dimensionality reduction

In [None]:
# One doc
# Choose the document to analyse, collect the ids of its contexts, and print its text.
doc_number = 101
doc_ids = context_util.get_doc_ids(contexts, doc_number)
# A context unpacks into two parts; the first is the doc, read here from the document's first context.
doc, _ = contexts[doc_ids[0]]
print(context_util.doc_str(doc))

In [None]:
# Load the per-layer K Nearest Neighbor models.
# n_nearest_neighbors and KNN_models_filename are taken from the Parameters cell.
# They are deliberately not re-assigned here: a second hard-coded copy would silently
# override whatever K was chosen in that cell.
# NOTE: pickle.load can execute arbitrary code - only load model files you trust.
with open(os.path.join(os.path.abspath(data_dir), KNN_models_filename), 'rb') as f:
    layer_to_KNN_model = pickle.load(f)

In [None]:
# NOTE(review): this whole cell is commented out and kept only for reference.
# It would query each layer's KNN model with all of that layer's activations (`acts`),
# not just the rows of one document (`acts[doc_ids]`).
# layer_to_KNN = {}
# for layer in layers:
#     acts = layer_to_acts[layer]
#     KNN_model = layer_to_KNN_model[layer]
#     neighbors_distances, neighbors_ids = KNN_model.kneighbors(acts)
#     layer_to_KNN[layer] = neighbors_ids

In [None]:
# For every layer, record the nearest-neighbor ids of each token of the document.
layer_to_doc_KNN = {}
for layer in layers:
    acts = layer_to_acts[layer]
    KNN_model = layer_to_KNN_model[layer]
    doc_acts = acts[doc_ids]  # activations of the document's tokens only
    neighbors_distances, neighbors_ids = KNN_model.kneighbors(doc_acts)
    layer_to_doc_KNN[layer] = neighbors_ids

In [None]:
# This section uses its own reduction chain (overrides the value from the Parameters cell).
reductions = [('PCA',2)]
# Work on a shallow copy so layer_to_doc_KNN keeps the unreduced neighbor ids.
layer_to_doc_reduced_KNN = dict(layer_to_doc_KNN)
# do reductions
for layer in layers:
    print(layer)
    for reduction, dim in reductions:
        # each reduction consumes the result of the previous one
        layer_to_doc_reduced_KNN[layer] = acts_util.reduce_activations(
            layer_to_doc_reduced_KNN[layer], reduction, dim)

In [None]:
# Start a fresh grid whose first column holds the layer names.
# The leading None pairs with the header Div that starts each plot column.
layer_name_divs = [Div(text=layer, align=('center', 'center')) for layer in layers]
columns = [[None] + layer_name_divs]   # layer names

In [None]:
# For every layer, plot the document's tokens at their reduced KNN coordinates and
# connect consecutive tokens with line segments so the path through the document shows.
#
# Fixes relative to the previous version of this cell:
#   * `reduced_acts` and `custom_tooltip` were undefined names; the colors are now sized
#     from `points`, and the tooltip uses vis_util.custom_bokeh_tooltip the same way the
#     "Fresh vis" cell does.
#   * colors wrap around the palette, so documents with more tokens than palette entries
#     no longer produce a short color column or an IndexError.
#   * the segment loop stops at the second-to-last token (the last one has no successor).
#   * `render_mode` is no longer passed to LabelSet, matching the other LabelSets here.
# NOTE: this cell appends to `columns`; re-run the cell that creates `columns` first if
# you re-run this one, otherwise the plot column is added twice.
palette = Inferno[256][::20]
green_highlighter = lambda tok: html_util.highlight_html(tok, color='limegreen')
plot_column = [Div(text=' '.join([f'{reduction}{dim}' for reduction, dim in reductions]), align=('center', 'center'))]

# Per-token data that does not depend on the layer is built once, outside the loop.
doc_contexts = [contexts[context_idx] for context_idx in doc_ids]
doc_labels = [pos for _doc, pos in doc_contexts]
doc_hover_labels = [context_util.context_str(*context, marker=green_highlighter) for context in doc_contexts]

for layer in layers:
    p = vis_util.empty_plot(dim=300, darkmode=True)
    points = layer_to_doc_reduced_KNN[layer]
    # one color per token, wrapping around when the palette is shorter than the document
    point_colors = [palette[point_idx % len(palette)] for point_idx in range(len(points))]
    doc_source = ColumnDataSource(
        {
            'x': points[:,0],
            'y': points[:,1],
            'color': point_colors,
            'label': doc_labels,
            'hover label': doc_hover_labels,
        }
    )
    p.circle(x='x', y='y', color='color', size=5, source=doc_source)
    p.add_layout(LabelSet(x='x', y='y', text='label', x_offset=2, y_offset=2, source=doc_source, text_font_size='10pt'))

    # add lines connecting document tokens in order
    for point_start_idx in range(len(points) - 1):
        endpoints = points[point_start_idx: point_start_idx+2]
        xs, ys = zip(*endpoints)
        p.line(x=xs, y=ys, color=point_colors[point_start_idx], line_width=4)
    p.tools = [PanTool(), WheelZoomTool(), BoxZoomTool(), ResetTool(), HoverTool(tooltips=vis_util.custom_bokeh_tooltip('hover label'))]
    plot_column.append(p)
columns.append(plot_column)
show(gridplot(zip(*columns)))

In [None]:
# Travel relative to the FIRST layer: for every token of the document, plot the norm of
# the difference between its nearest-neighbor ids at layers[0] and at each layer.
# (The cell that follows this one plots the layer-to-layer variant instead.)
palette = list(Inferno[256][::20])
p = vis_util.empty_plot(darkmode=True, dim=200)
reference_layer = layers[0]  # fixed reference: every layer is compared against this one

for tok_idx in range(len(doc)):
    reference_neighbors = layer_to_doc_KNN[reference_layer][tok_idx]
    travelled_distance = [
        np.linalg.norm(reference_neighbors - layer_to_doc_KNN[layer][tok_idx])
        for layer in layers
    ]
    # wrap around the palette so documents with more tokens than colors do not raise IndexError
    line_color = palette[tok_idx % len(palette)]
    p.line(x=list(range(len(travelled_distance))), y=travelled_distance, color=line_color)
show(p)

In [None]:
# Travel between CONSECUTIVE layers: for every token of the document, plot the norm of
# the difference between its nearest-neighbor ids at each layer and at the layer before.
palette = list(Inferno[256][::20])
p = vis_util.empty_plot(darkmode=True, dim=200)
# (previous layer, layer) pairs; the first layer is paired with itself, so the first
# plotted value is always 0.
layer_pairs = list(zip(layers[:1] + layers[:-1], layers))

for tok_idx in range(len(doc)):
    travelled_distance = [
        np.linalg.norm(layer_to_doc_KNN[prev_layer][tok_idx] - layer_to_doc_KNN[layer][tok_idx])
        for prev_layer, layer in layer_pairs
    ]
    # wrap around the palette so documents with more tokens than colors do not raise IndexError
    line_color = palette[tok_idx % len(palette)]
    p.line(x=list(range(len(travelled_distance))), y=travelled_distance, color=line_color)
show(p)