In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
# Parameters
# Directory holding the pickled contexts, the activations archive and the pickled KNN models.
data_dir = '../../../big-data/wiki-large/standard/'
contexts_filename = 'contexts.pickle'  # unpickled into `contexts` by the loading cell
acts_filename = 'activations.npz'  # opened with np.load as `layer_to_acts`
# layers = ['arr_0','arr_3','arr_6', 'arr_9', 'arr_12']  # which layers to visualize
# NOTE: the loading cell further down reassigns `layers`, so the value chosen here is overridden.
layers = [f'arr_{i}' for i in range(13)]
# layers = ['arr_0']  # good for debugging
# (reducer name, output dimension) pairs; the 2D vis fits and applies them in this order.
reductions = [('KernelPCA',2)]
view_vis_as_html = False  # If True, running the vis will also generate an interactive html file and open it

In [None]:
# Imports
# LOAD
import math
import pickle
import numpy as np
import os
import sys
project_path = os.path.abspath('../../..')
sys.path.insert(0, project_path)
from src.utils import acts_util
# TAG
import nltk
import re
import pandas as pd
from src.utils import context_util
# VIS
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
# Configure bokeh output for the notebook right after importing it.
output_notebook()
if view_vis_as_html:
    # Opt-in (see the parameters cell): also target a standalone html file.
    output_file('visualize-movement.html')
from bokeh.models import Label, LabelSet, Div, ColumnDataSource, Legend, LegendItem, Range1d
from bokeh.models import HoverTool, CustomJS, PanTool, BoxZoomTool, WheelZoomTool, ResetTool, TapTool, OpenURL
from bokeh.models.glyphs import Circle
from bokeh.layouts import gridplot
from bokeh import events
from bokeh.palettes import Inferno, Category10, Category20, Category20c, Pastel1, Pastel2, Bokeh, Plasma
from src.utils import vis_util, html_util

# Loading contexts and acts

In [None]:
# Load contexts and layer_to_acts
# NOTE: pickle.load can execute arbitrary code; only point data_dir at files you trust.
with open(os.path.join(os.path.abspath(data_dir), contexts_filename), 'rb') as f:
    contexts = pickle.load(f)
layer_to_acts = np.load(os.path.join(data_dir, acts_filename))
# The archive's own keys are the layer names. (This used to read `acts.files`, but `acts`
# is not defined before this cell, so a fresh kernel failed with a NameError.)
layers = layer_to_acts.files # change to fewer layers if you want
layers = list(layers)[:2] # for debugging

In [None]:
# Choose the document to inspect and look up the context ids that belong to it.
doc_number = 101
doc_ids = context_util.get_doc_ids(contexts, doc_number)

# A context unpacks into (doc, second item); the doc is taken from the document's first context.
first_context = contexts[doc_ids[0]]
doc, _ = first_context
print(context_util.doc_str(doc))

# Visualize neighborhoods in 2D

In [None]:
n_nearest_neighbors = 10
KNN_models_filename = f'knn_models.pickle'
knn_models_path = os.path.join(os.path.abspath(data_dir), KNN_models_filename)

# for each tok in document, map it to its nearest neighbors' ids
layer_to_neighbors = {}
with open(knn_models_path, 'rb') as models_file:
    for layer in layers:
        acts = layer_to_acts[layer]
        # every iteration reads the next pickled object from the same open file
        KNN_model = pickle.load(models_file)
        neighbors_distances, neighbors_ids = KNN_model.kneighbors(acts[doc_ids])
        layer_to_neighbors[layer] = neighbors_ids

In [None]:
def flatten(l):
    """Concatenate the items of every sub-iterable of `l` into one flat list (one level deep)."""
    flat = []
    for sublist in l:
        flat.extend(sublist)
    return flat

In [None]:
# Fresh vis
# Builds one 2D scatter plot per layer: the phrase's tokens (triangles) together with their
# nearest neighbors (circles), projected with the reducers listed in `reductions`.
columns = []
layer_name_column = [None] + [Div(text=layer, align=('center', 'center')) for layer in layers]
columns.append(layer_name_column)

# optionally focus on subset of doc
start_pos, end_pos = 2,-2
phrase = doc[start_pos:end_pos]
phrase_ids = doc_ids[start_pos:end_pos]
# set vis params
palette = Category20[20]
# One color per phrase token. Cycle through the palette so that a phrase with more tokens
# than palette entries reuses colors instead of raising an IndexError.
tok_colors = [palette[tok_idx % len(palette)] for tok_idx in range(len(phrase))]
# create a column of plots
plot_column = []
plot_column.append(Div(text=' '.join([f'{reduction}{dim}' for reduction, dim in reductions]), align=('center', 'center'))) # column header
for layer in layers:
    # fit this layer's dimensionality reduction model
    phrase_neighbors = layer_to_neighbors[layer][start_pos:end_pos]
    ids_to_fit = phrase_ids + flatten(phrase_neighbors)  # multiple options here
    acts = layer_to_acts[layer]
    acts_to_fit = acts[ids_to_fit]  # init
    fit_reducers = []
    for reduction, dim in reductions:
        # each reducer is fit on the output of the previous one
        fit_reducer, acts_to_fit = acts_util.fit_reducer(acts_to_fit, reduction, dim)
        fit_reducers.append(fit_reducer)

    # reduce and prep the document's points
    phrase_contexts = [contexts[context_id] for context_id in phrase_ids]
    phrase_reduced_acts = acts[phrase_ids]  # init
    for reducer in fit_reducers:
        phrase_reduced_acts = reducer.transform(phrase_reduced_acts)
    phrase_points = {
        'x': phrase_reduced_acts[:,0],
        'y': phrase_reduced_acts[:,1],
        'color': list(tok_colors),
        'line color': ['black'] * len(phrase),
        'line width': [1] * len(phrase),
        # each label is a list: the point's own position, then one entry per token that has it as a neighbor
        'label': [[f'[{pos}]'] for _doc, pos in phrase_contexts],
        'hover label': [context_util.context_str(*context, marker=html_util.highlighter(color='yellow')) for context in phrase_contexts]
        }

    # reduce and prep the neighbors
    processed_neighbors = []
    neighbor_points = {'x':[], 'y':[], 'color':[], 'legend label':[], 'label':[], 'hover label':[]}
    for tok_idx in range(len(phrase)):
        tok_pos = tok_idx + start_pos  # position relative to entire doc
        tok_neighbors = phrase_neighbors[tok_idx][1:]  # skip zeroeth neighbor; that's the token itself
        tok_neighbors_contexts = [contexts[neighbor] for neighbor in tok_neighbors]
        tok_neighbors_reduced_acts = acts[tok_neighbors]  # init
        for reducer in fit_reducers:
            tok_neighbors_reduced_acts = reducer.transform(tok_neighbors_reduced_acts)

        # visualize different kinds of neighbors differently
        for neighbor_idx, neighbor in enumerate(tok_neighbors):
            if neighbor in phrase_ids:  # update existing phrase point
                phrase_point_idx = phrase_ids.index(neighbor)
                # append, not `+=`: `list += str` would add every character of the position
                # as its own entry (position 12 would become '1', '2')
                phrase_points['label'][phrase_point_idx].append(f'{tok_pos}')
                phrase_points['line color'][phrase_point_idx] = 'aqua'
                phrase_points['line width'][phrase_point_idx] = 3
            elif neighbor in processed_neighbors:  # update existing neighbor point
                neighbor_point_idx = processed_neighbors.index(neighbor)
                neighbor_points['label'][neighbor_point_idx].append(f'{tok_pos}')
                neighbor_points['color'][neighbor_point_idx] = 'aqua'
            else:  # new neighbor
                neighbor_context = contexts[neighbor]
                neighbor_reduced_acts = tok_neighbors_reduced_acts[neighbor_idx]
                neighbor_points['x'].append(neighbor_reduced_acts[0])
                neighbor_points['y'].append(neighbor_reduced_acts[1])
                neighbor_points['color'].append(tok_colors[tok_idx])
                neighbor_points['legend label'].append([f'[{tok_pos}] {doc[tok_pos]}'])
                neighbor_points['label'].append([f'{tok_pos}'])
                neighbor_points['hover label'].append(
                    context_util.context_str(*neighbor_context, marker=html_util.highlighter(color='lightgrey')))
                processed_neighbors.append(neighbor)
    # only label neighbors that are shared by more than one token
    neighbor_points['label'] = [label if len(label)>1 else '' for label in neighbor_points['label']]

    # plot
    phrase_points_source = ColumnDataSource(phrase_points)
    neighbor_points_source = ColumnDataSource(neighbor_points)
    p = vis_util.empty_plot(width=400, height=250, darkmode=False)
    p.add_layout(Legend(), 'right')
    p.circle(x='x', y='y', color='color', size=10, legend_group='legend label', source=neighbor_points_source)
    p.add_layout(LabelSet(x='x', y='y', text='label', x_offset=2, y_offset=2, text_font_size='10pt', source=neighbor_points_source))
    p.triangle(x='x', y='y', color='color', line_color='line color', size=15, line_width='line width', source=phrase_points_source)
    p.add_layout(LabelSet(x='x', y='y', text='label', x_offset=2, y_offset=2, text_font_size='10pt', source=phrase_points_source))
    zoom_tool = WheelZoomTool()
    p.tools = [PanTool(), zoom_tool, BoxZoomTool(), ResetTool(), vis_util.hover_tool('hover label')]
    p.toolbar.active_scroll = zoom_tool
    plot_column.append(p)
columns.append(plot_column)
show(gridplot(zip(*columns)))

# Prepare to visualize connections based on neighborhoods

In [None]:
n_nearest_neighbors = 1000
KNN_models_filename = f'KNN_models_K={n_nearest_neighbors}.pickle'

# for each tok in document, map it to its nearest neighbors' ids
layer_to_neighbors = {}
knn_models_path = os.path.join(os.path.abspath(data_dir), KNN_models_filename)
with open(knn_models_path, 'rb') as models_file:
    for layer in layers:
        acts = layer_to_acts[layer]
        # the same open file is read once per layer, yielding the next pickled model each time
        KNN_model = pickle.load(models_file)
        neighbors_distances, neighbors_ids = KNN_model.kneighbors(acts[doc_ids])
        layer_to_neighbors[layer] = neighbors_ids

In [None]:
# Choose the slice of the document that the cells below work on.
# With end_pos = -1 the slice stops before the document's last token.
start_pos, end_pos = 0,-1
phrase = doc[start_pos:end_pos]
phrase_ids = doc_ids[start_pos:end_pos]  # ids sliced the same way, so phrase_ids[i] pairs with phrase[i]

In [None]:
def fraction_overlapping(l1, l2):
    """Return the number of distinct items common to `l1` and `l2`, divided by `len(l1)`.

    The two collections must have the same length; this is checked with an assert.
    """
    size = len(l1)
    assert size == len(l2)
    shared = set(l1) & set(l2)
    return len(shared) / size

In [None]:
# based on fraction of neighbors that are overlapping
# Connection strength between two phrase tokens = fraction of their neighbor ids that coincide.
layer_to_tok_connections = {layer: np.ones((len(phrase), len(phrase)))  for layer in layers}  # init
for layer in layers:
    tok_connections = layer_to_tok_connections[layer]
    # Slice to the phrase (as the 2D vis does) so that row i belongs to phrase[i] even when start_pos != 0.
    neighbors = layer_to_neighbors[layer][start_pos:end_pos]
    # Build each token's neighbor set once instead of once per token pair.
    neighbor_sets = [set(tok_neighbors) for tok_neighbors in neighbors]
    for tok_i in range(len(phrase)):
        for tok_j in range(tok_i+1, len(phrase)):
            connection_strength = fraction_overlapping(neighbor_sets[tok_i], neighbor_sets[tok_j])
            # the measure is symmetric, so fill both halves of the matrix
            tok_connections[tok_i, tok_j] = connection_strength
            tok_connections[tok_j, tok_i] = connection_strength

In [None]:
# based on to what degree they are each other's nearest neighbors
# NOTE: this assigns layer_to_tok_connections again, replacing the overlap-based values of the cell above.
layer_to_tok_connections = {layer: np.ones((len(phrase), len(phrase)))  for layer in layers}  # init
for layer in layers:
    tok_connections = layer_to_tok_connections[layer]
    # Slice to the phrase (as the 2D vis does) so that row i belongs to phrase[i] even when start_pos != 0.
    neighbors = layer_to_neighbors[layer][start_pos:end_pos]
    for tok_i in range(len(phrase)):
        tok_i_neighbors = neighbors[tok_i]
        for tok_j in range(len(phrase)):
            if tok_i != tok_j:
                tok_j_id = phrase_ids[tok_j]
                neighbor_pos = np.where(tok_i_neighbors == tok_j_id)[0]
                if neighbor_pos.size > 0:
                    # Rank 0 is normally the query token itself; if another token shows up there,
                    # treat it as rank 1 rather than dividing by zero.
                    rank = max(int(neighbor_pos[0]), 1)
                    connection = 10 / rank
                else:
                    connection = 0  # tok_j is not among tok_i's retrieved neighbors
                tok_connections[tok_i, tok_j] = connection


In [None]:
# Based on neighborhood changing or not
# Movement of a token at a layer = 1 - fraction of its neighbors shared with the previous layer.
# The first layer is compared with itself, so its movement is 0 for every token.
layer_to_tok_movement = {}
prev_layer = layers[0]
for layer in layers:
    # Slice to the phrase (as the 2D vis does) so that row i belongs to phrase[i] even when start_pos != 0.
    prev_neighbors = layer_to_neighbors[prev_layer][start_pos:end_pos]
    curr_neighbors = layer_to_neighbors[layer][start_pos:end_pos]
    layer_to_tok_movement[layer] = [1-fraction_overlapping(prev_neighbors[tok_idx], curr_neighbors[tok_idx])
                                    for tok_idx in range(len(phrase))]
    prev_layer = layer

In [None]:
# One arc diagram per layer: tokens are dots along a line, every token pair is joined by a
# half-circle arc whose alpha comes from layer_to_tok_connections.
plots = []
for layer in layers:
    # prepare to draw toks
    tok_movement = layer_to_tok_movement[layer]
    toks_source = ColumnDataSource({'x': range(len(phrase)), 'y': [1.5]*len(phrase), 'label': phrase,
                                    # tokens whose neighborhood changed more are drawn fainter
                                    'alpha': [(1-tok_movement[tok_idx]) for tok_idx in range(len(phrase))],
                                    'hover label': phrase
                                   })
    # p.add_layout(LabelSet(x='x', y='y', y_offset='4', text='label', text_font_size='10pt', text_align='center', source=toks_source))

    # prepare to draw edges
    tok_connections = layer_to_tok_connections[layer]
    pairs = [(tok_i, tok_j) for tok_i in range(len(phrase)) for tok_j in range(tok_i+1, len(phrase))]
    edges_info = {}
    edges_info['x'] = [(tok_i+tok_j)/2 for tok_i,tok_j in pairs]  # arc center: midway between the two tokens
    edges_info['y'] = [1] * len(pairs)
    edges_info['r'] = [abs(tok_i-tok_j)/2 for tok_i,tok_j in pairs]  # arc radius: half the distance between them
    # NOTE(review): subtracting .1 can push alpha below 0 (and some connection measures exceed 1) - confirm how bokeh renders that
    edges_info['alpha'] = [(tok_connections[tok_i, tok_j]-.1) for tok_i, tok_j in pairs]
    edges_info['hover label'] = ['xyz' for tok_i, tok_j in pairs]

    p = vis_util.empty_plot(dim=250)
    p.arc(x='x', y='y', radius='r', width=2, start_angle=math.pi, end_angle=0, alpha='alpha', source=ColumnDataSource(edges_info))
    cover_source = ColumnDataSource({'x': range(len(phrase)), 'y': [1]*len(phrase)})
    p.line(x='x', y='y', color='white', line_width=10, source=cover_source)  # visually mask top few pixels of arcs
    tok_points = p.circle(x='x', y='y', color='red', alpha='alpha', size=5, source=toks_source)
    p.circle(x='x', y='y', color=None, size=5, line_color='red', source=toks_source)
    p.y_range = Range1d(-8,2)
    wheelzoomtool = WheelZoomTool()
    hover_tool = HoverTool(tooltips=vis_util.custom_bokeh_tooltip('hover label', border=False), renderers=[tok_points])
    p.tools = [PanTool(), wheelzoomtool, ResetTool(), hover_tool]
    # Activate this plot's own wheel-zoom tool. The previous `zoom_tool` was a leftover variable
    # from the 2D vis cell and referred to a tool that is not on this plot's toolbar.
    p.toolbar.active_scroll = wheelzoomtool
    plots.append(p)
    p.outline_line_color = None
layer_labels = [Div(text=layer, align='center') for layer in layers]
show(gridplot(zip(*[layer_labels, plots]), toolbar_options={'logo': None}))

In [None]:
# Same arc diagram as the cell above (one plot per layer), redrawn from the current
# layer_to_tok_connections and layer_to_tok_movement.
plots = []
for layer in layers:
    # prepare to draw toks
    tok_movement = layer_to_tok_movement[layer]
    toks_source = ColumnDataSource({'x': range(len(phrase)), 'y': [1.5]*len(phrase), 'label': phrase,
                                    # tokens whose neighborhood changed more are drawn fainter
                                    'alpha': [(1-tok_movement[tok_idx]) for tok_idx in range(len(phrase))],
                                    'hover label': phrase
                                   })
    # p.add_layout(LabelSet(x='x', y='y', y_offset='4', text='label', text_font_size='10pt', text_align='center', source=toks_source))

    # prepare to draw edges
    tok_connections = layer_to_tok_connections[layer]
    pairs = [(tok_i, tok_j) for tok_i in range(len(phrase)) for tok_j in range(tok_i+1, len(phrase))]
    edges_info = {}
    edges_info['x'] = [(tok_i+tok_j)/2 for tok_i,tok_j in pairs]  # arc center: midway between the two tokens
    edges_info['y'] = [1] * len(pairs)
    edges_info['r'] = [abs(tok_i-tok_j)/2 for tok_i,tok_j in pairs]  # arc radius: half the distance between them
    # NOTE(review): subtracting .1 can push alpha below 0 (and some connection measures exceed 1) - confirm how bokeh renders that
    edges_info['alpha'] = [(tok_connections[tok_i, tok_j]-.1) for tok_i, tok_j in pairs]
    edges_info['hover label'] = ['xyz' for tok_i, tok_j in pairs]

    p = vis_util.empty_plot(dim=250)
    p.arc(x='x', y='y', radius='r', width=2, start_angle=math.pi, end_angle=0, alpha='alpha', source=ColumnDataSource(edges_info))
    cover_source = ColumnDataSource({'x': range(len(phrase)), 'y': [1]*len(phrase)})
    p.line(x='x', y='y', color='white', line_width=10, source=cover_source)  # visually mask top few pixels of arcs
    tok_points = p.circle(x='x', y='y', color='red', alpha='alpha', size=5, source=toks_source)
    p.circle(x='x', y='y', color=None, size=5, line_color='red', source=toks_source)
    p.y_range = Range1d(-8,2)
    wheelzoomtool = WheelZoomTool()
    hover_tool = HoverTool(tooltips=vis_util.custom_bokeh_tooltip('hover label', border=False), renderers=[tok_points])
    p.tools = [PanTool(), wheelzoomtool, ResetTool(), hover_tool]
    # Activate this plot's own wheel-zoom tool. The previous `zoom_tool` was a leftover variable
    # from the 2D vis cell and referred to a tool that is not on this plot's toolbar.
    p.toolbar.active_scroll = wheelzoomtool
    plots.append(p)
    p.outline_line_color = None
layer_labels = [Div(text=layer, align='center') for layer in layers]
show(gridplot(zip(*[layer_labels, plots]), toolbar_options={'logo': None}))

In [None]:
# Grid of small arc diagrams: one row per layer, one column per phrase token. Each plot
# shows only the connections of its own token, which is marked with a green dot.
rows = [[Div(text=tok, align='center') for tok in phrase],]  # header row
for layer in layers:
    # Use this layer's movement values. Previously `tok_movement` was whatever an earlier
    # cell's loop had left behind, i.e. the last layer's values for every row.
    tok_movement = layer_to_tok_movement[layer]
    tok_connections = layer_to_tok_connections[layer]
    plot_row = []
    for tok_i in range(len(phrase)):
        # prep toks
        toks_source = ColumnDataSource({'x': range(len(phrase)), 'y': [1.5]*len(phrase), 'label': phrase,
                                        'alpha': [(1-tok_movement[tok_idx]) for tok_idx in range(len(phrase))],
                                        'hover label': phrase
                                       })
        # (A stray `p.circle(...)` call used to sit here. It ran before this iteration's `p`
        # existed, so it drew on a figure from an earlier iteration or cell, and its result
        # was overwritten below.)
        # p.add_layout(LabelSet(x='x', y='y', y_offset='4', text='label', text_font_size='10pt', text_align='center', source=toks_source))

        # prep edges
        pairs = [(tok_i, tok_j) for tok_j in range(len(phrase))]  # includes (tok_i, tok_i), which has radius 0
        edges_info = {}
        edges_info['x'] = [(tok_i+tok_j)/2 for tok_i,tok_j in pairs]
        edges_info['y'] = [1] * len(pairs)
        edges_info['r'] = [abs(tok_i-tok_j)/2 for tok_i,tok_j in pairs]
        edges_info['alpha'] = [(tok_connections[tok_i, tok_j]-.3) for tok_i, tok_j in pairs]
        edges_info['hover label'] = [f'{phrase[min(tok_i,tok_j)]}-{phrase[max(tok_i,tok_j)]}' for tok_i,tok_j in pairs]

        # plot
        p = vis_util.empty_plot(dim=100)
        p.arc(x='x', y='y', radius='r', width=2, start_angle=math.pi, end_angle=0, alpha='alpha', source=ColumnDataSource(edges_info))
        cover_source = ColumnDataSource({'x': range(len(phrase)), 'y': [1]*len(phrase)})
        p.line(x='x', y='y', color='white', line_width=10, source=cover_source)  # visually mask top few pixels of arcs
        tok_points = p.circle(x='x', y='y', color='red', alpha='alpha', size=5, source=toks_source)
        p.circle(x='x', y='y', color=None, size=5, line_color='red', source=toks_source)
        p.circle(x=tok_i, y=1.8, color='limegreen')  # marks the token this plot is about
        p.x_range = Range1d(0,len(phrase)-1)
        p.y_range = Range1d(-8,2)
        hover_tool = HoverTool(tooltips=vis_util.custom_bokeh_tooltip('hover label', border=False), renderers=[tok_points])
        p.tools = [PanTool(), WheelZoomTool(), ResetTool(), hover_tool]
        plot_row.append(p)
    rows.append(plot_row)

# NOTE(review): layer_names_col and columns are built but not passed to gridplot below - confirm whether
# the layer names were meant to be shown as a first column.
layer_names_col = [Div(text=layer, align='center') for layer in layers]
columns = [[Div()], *list(zip(*rows))]
show(gridplot(rows))