# separability Insight into OPT models
Tests whether coding ability can be removed from Meta's OPT models at different scales.
Current methods are:
- Look at the activation frequency of MLP mid layers
- Look at the 'crossover threshold' of attention heads

In [None]:
# Colab-only setup: `google.colab` is importable only when that module is
# installed; otherwise ModuleNotFoundError is raised and the downloads are skipped.
try: # if in google colab, download necessary python files
  import google.colab 
  # The `!` lines are IPython shell escapes, so this cell only runs in a notebook
  ! pip install -qq separability
  # Clone the opt-tools repo and move its src/*.py files into the working directory
  ! git clone https://github.com/pesvut/opt-tools.git && mv ./opt-tools/src/*.py .
except ModuleNotFoundError:
  # google.colab is not importable (presumably a local run): skip the downloads
  pass

In [None]:
import torch
import numpy as np
import pandas as pd
import einops
import matplotlib.pyplot as plt
import wandb
#import seaborn as sns

from separability import Model
from separability.data_classes import RunDataHistory
from separability.activations import prune_and_evaluate, evaluate_all, get_top_frac

In [None]:
# Initial model and test configuration.
# `model_size` and `token_limit` are recorded in the W&B config and later
# passed to Model(...); `pre_removals` is handed to
# opt.delete_ff_keys_from_files(...) right after the model is loaded.
model_size = "facebook/galactica-125m"
token_limit = 1000
pre_removals = []

# Removal parameters.
# NOTE: the "seperability" spelling is part of the runtime project name string;
# keep it byte-for-byte.
project = "seperability-pile-code"
focus = "pile"
cripple = "code"
datasets = [focus, cripple]

In [None]:
# Prepare data logging
# NOTE(review): wandb.init is called twice in this cell. This first call starts
# an offline run and updates its config with an empty dict; the second call
# below then requests a regular run for the same project. Presumably only one
# of the two is intended -- confirm which mode is wanted and drop the other.
wandb.init(project=project, entity="seperability", mode="offline")
c = wandb.config
c.update({
})
# Prepare data logging
wandb.init(project=project, entity="seperability")
c = wandb.config
# Hyperparameters for this run. They are recorded in the W&B config and read
# back as `c.<name>` by the cells below. Each key appears exactly once (the
# literal previously listed "do_attn_mean_offset" twice).
c.update({
    "model_size"  : model_size,
    "token_limit" : token_limit,
    "run_pre_test": False,
    "ff_frac"  : 0.03,
    "ff_eps"   : 1e-3,
    "attn_frac": 0.005,
    "attn_eps" : 1e-4,
    "cripple": cripple,
    "focus"  : focus,
    "attn_prune_type": "pre_out",
    "svd_attn": False,
    "svd_combine_biases": False,
    "do_attn_mean_offset": False,
    "attn_scoring": "abs",
    "ff_scoring": "freq",
    "attn_prune_heads": False,
    "delete_residual_biases": False,
})


# Create the run history for the two datasets, then load the model named in the
# W&B config in float16, passing the configured token_limit as `limit`
history = RunDataHistory(datasets)
opt = Model( c.model_size, limit=c.token_limit, dtype=torch.float16 )

# Pre-pruning of model: hand the `pre_removals` list (empty in this notebook's
# configuration) to delete_ff_keys_from_files
opt.delete_ff_keys_from_files(pre_removals)

In [None]:
# Print the model's `prune_heads` attribute itself (it is accessed, not called)
print(opt.model.prune_heads)


In [None]:
# Evaluate model before removal of any neurons.
# Only runs when `run_pre_test` is enabled in the W&B config (set to False above).
if c.run_pre_test:
    # NOTE(review): 1e5 is presumably the evaluation sample size -- confirm
    # against evaluate_all's signature
    history.add( evaluate_all( opt, 1e5, datasets ) )
    # Show the transposed history DataFrame
    print( history.df.T )

In [None]:

# Run one prune-and-evaluate step using the settings stored in the W&B config,
# then append the returned data to the run history.
data = prune_and_evaluate(
    opt,
    c.ff_frac,
    c.attn_frac,
    c.ff_eps,
    c.attn_eps,
    save=True,
    do_attn_mean_offset=c.do_attn_mean_offset,
    attn_scoring=c.attn_scoring,
    attn_prune_heads=c.attn_prune_heads,
    cripple=c.cripple,
    focus=c.focus,
)
history.add(data)

In [None]:
# Pull the raw scores stored on the most recent history entry
ff_scores   = history.history[-1].raw["ff_scores"]
# Attention scores are reshaped to (n_layers, d_model), i.e. one row per layer
attn_scores = history.history[-1].raw["attn_scores"].reshape((opt.n_layers, opt.d_model))

def indices_to_edges(indices):
    """Convert one value per column into step-line plot coordinates.

    Every entry ``indices[k]`` is emitted twice, paired with the positions
    ``k`` and ``k + 1``, so plotting the second return value against the
    first draws a flat segment across column ``k`` at height ``indices[k]``.

    Args:
        indices: 1-D sequence of numbers, one per column.

    Returns:
        Tuple ``(edges, edge_indices)`` of float64 arrays, each of length
        ``2 * len(indices)``: the duplicated values and their positions.
    """
    count = len(indices)
    # Each height appears twice: once at the left and once at the right edge
    heights = np.repeat(np.asarray(indices, dtype=np.float64), 2)
    # Positions 0, 1, 1, 2, 2, ..., count: repeat 0..count and drop both ends
    positions = np.repeat(np.arange(count + 1, dtype=np.float64), 2)[1:-1]
    return heights, positions

def plot_scores(scores, title):
    """Plot a heatmap of sorted scores with three threshold outlines.

    Each element of ``scores`` is sorted in descending order and drawn as one
    heatmap column (x axis "Layer", y axis "Neuron") of
    ``log10(score + 0.001)``. Three step lines are overlaid; in every column
    each marks how many entries are at or above a cutoff:

    * ``1`` (legend entry '"Neutral"'),
    * the threshold returned by ``get_top_frac(..., 0.05)`` ("Top 5%"),
    * the threshold returned by ``get_top_frac(..., 0.01)`` ("Top 1%").

    Args:
        scores: Iterable of equally sized 1-D score arrays, one per column.
        title: Title shown above the figure.
    """
    # Imported locally because the module-level seaborn import is commented
    # out, which left `sns` undefined when this function ran.
    import seaborn as sns

    # Shape (n_scores, n_columns): column j holds scores[j], largest first
    s = np.array([np.sort(arr)[::-1] for arr in scores]).T
    width = s.shape[0]

    def outline(cutoff):
        """Return step-line coordinates for the per-column count >= cutoff."""
        # Columns are descending, so reverse each one before searchsorted;
        # width minus the insertion point is the number of entries >= cutoff.
        counts = np.array(
            [width - np.searchsorted(col[::-1], cutoff) for col in s.T]
        ).T
        return indices_to_edges(counts)

    s_edges, s_edge_indices = outline(1)

    _, threshold = get_top_frac(torch.tensor(s, dtype=torch.float32), 0.05)
    c_edges, c_edge_indices = outline(threshold)

    _, threshold = get_top_frac(torch.tensor(s, dtype=torch.float32), 0.01)
    t_edges, t_edge_indices = outline(threshold)

    plt.figure(figsize=(10, 4))
    # The 0.001 offset keeps log10 finite where a score is exactly zero
    sns.heatmap(np.log10(s + 0.001), center=0, vmin=-1, cmap="coolwarm")
    plt.plot(t_edge_indices, t_edges, color="black", linewidth=1, alpha=1, label="Top 1%")
    plt.plot(c_edge_indices, c_edges, color="purple", alpha=0.5, label="Top 5%")
    plt.plot(s_edge_indices, s_edges, color="black", linestyle=":", alpha=0.5, label='"Neutral"')
    plt.legend()
    plt.title(title)
    plt.xlabel("Layer")
    plt.ylabel("Neuron")
    plt.show()
    
# Draw one figure for the feed-forward scores and one for the attention scores
plot_scores(ff_scores,   "Feed Forward Scores (Activation Frequency Ratio, Log10)")
plot_scores(attn_scores, "Attention Scores (Mean Abs Activation Ratio, Log10)")


In [None]:
# Repeat the prune-and-evaluate step 20 more times, recording every result.
# The arguments mirror the first prune_and_evaluate call earlier in this
# notebook, so each iteration uses the settings logged in the W&B config
# (attn_eps, do_attn_mean_offset and attn_prune_heads were previously omitted
# here, so those logged settings were not applied to these iterations).
for i in range(20):
    data = prune_and_evaluate( opt, c.ff_frac, c.attn_frac, c.ff_eps, c.attn_eps, save=True,
        do_attn_mean_offset=c.do_attn_mean_offset, attn_scoring=c.attn_scoring,
        attn_prune_heads=c.attn_prune_heads, cripple=c.cripple, focus=c.focus )
    history.add( data )

In [None]:
# Print the most recent entry in the run history
print(history.history[-1])

In [None]:
# Print the transposed history DataFrame
print(history.df.T)

In [None]:
# Print the transposed history DataFrame as CSV text
print(history.df.T.to_csv())