In [1]:
import numpy as np
import cv2
import time
import matplotlib.pyplot as plt
import matplotlib
from matplotlib.animation import FuncAnimation
import matplotlib.colors as mcolors
from matplotlib.colors import LogNorm
from IPython.display import HTML
import torch
from tqdm import tqdm 
import os
import pandas as pd
import umap
from tqdm import tqdm
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.decomposition import PCA
from sklearn import svm
import re
from transformers import LlamaTokenizer, LlamaForCausalLM, GenerationConfig
from sklearn.preprocessing import OneHotEncoder

In [2]:
# Module-level tokenizer; build_activations_array reads this global to encode and decode model outputs.
# NOTE(review): loading emits a class-mismatch warning (the checkpoint declares 'LLaMATokenizer') — confirm
# that the resulting tokenization matches the one used when the activations were generated.
tokenizer = LlamaTokenizer.from_pretrained('../../models/llama-7b-hf', local_files_only=True)

The tokenizer class you load from this checkpoint is not the same type as the class this function is called from. It may result in unexpected tokenization. 
The tokenizer class you load from this checkpoint is 'LLaMATokenizer'. 
The class this function is called from is 'LlamaTokenizer'.


In [3]:
def extract_number(filename):
    """Return the first run of digits found in ``filename`` as an int.

    Names that contain no digit at all yield ``float('inf')``.
    """
    found = re.search(r'\d+', filename)
    if found is None:
        return float('inf')
    return int(found.group())

def sort_files(activations_directory):
    """List the entries of ``activations_directory`` in ascending order of ``extract_number``."""
    entries = os.listdir(activations_directory)
    # list.sort is stable, exactly like sorted(): entries with equal keys keep their listing order.
    entries.sort(key=extract_number)
    return entries


In [43]:

def build_activations_array(activations_directory, activation_folder_list, token_output_focus = "first", first_n_files=None):
    """Collect the flattened hidden states of one output token per prompt.

    The data set has outputs of variable token length; ``token_output_focus``
    decides which output token is used for each prompt. Every ``.pt`` file is
    expected to be named ``Question_<id>_...`` and to contain a dict with the
    keys ``'hidden_states'`` and ``'output'``. Only questions whose
    ``Question_Type`` in ``Task.csv`` equals ``'character_qa_close'`` are used.

    The module-level ``tokenizer`` is used to encode and decode the response.

    Args:
        activations_directory: Directory containing ``Task.csv`` and the ``.pt`` files.
        activation_folder_list: File names inside ``activations_directory``.
            Entries without a ``.pt`` extension, or whose name does not start
            with ``Question_<id>_``, are ignored.
        token_output_focus: ``"first"`` selects hidden-state index 3.
            ``"last"`` selects the last index, or the second-to-last one when the
            last token does not decode to Yes/yes/No/no; the prompt is skipped
            when neither does.
        first_n_files: Process only the first ``first_n_files`` ``.pt`` files.
            ``None`` processes all of them.

    Returns:
        A tuple ``(all_prompt_activations, all_prompt_filenames, all_prompt_tokens)``
        of three parallel lists with one entry per retained prompt: the flat list
        of activation values of the selected token over all layers, the
        ``Class`` value of the question from ``Task.csv``, and the decoded text
        of the selected token.

    Raises:
        ValueError: If ``token_output_focus`` is neither ``"first"`` nor ``"last"``.
        KeyError: If a question id parsed from a file name is missing from ``Task.csv``.
        IndexError: If a saved output does not contain the ``"Response:"`` marker.
    """
    # Fail early: previously an unknown value left token_id unbound and only
    # failed after the first file had already been loaded.
    if token_output_focus not in ("first", "last"):
        raise ValueError(
            f"token_output_focus must be 'first' or 'last', got {token_output_focus!r}"
        )

    all_prompt_activations = []
    all_prompt_filenames = []
    all_prompt_tokens = []

    # os.path.join works whether or not the directory string ends with a separator.
    task_table = pd.read_csv(os.path.join(activations_directory, 'Task.csv'))

    # Map question IDs to their class label and to their question type.
    id_to_class = task_table.set_index('Question_ID')['Class'].to_dict()
    id_to_q_type = task_table.set_index('Question_ID')['Question_Type'].to_dict()

    pt_files = [f for f in activation_folder_list if os.path.splitext(f)[1] == '.pt']
    answer_tokens = ("Yes", "yes", "No", "no")

    # Slicing with None keeps every file, so no explicit default length is needed.
    for activation_file in tqdm(pt_files[:first_n_files], position=0, leave=True):
        # Extract the question ID from the file name; skip files that are not question dumps.
        id_match = re.search(r"^Question_(\d+)_", activation_file)
        if id_match is None:
            continue
        question_id = int(id_match.group(1))

        # Only closed character questions are analysed.
        if id_to_q_type[question_id] != 'character_qa_close':
            continue

        activation_path = os.path.join(activations_directory, str(activation_file))
        # NOTE(review): torch.load unpickles the file — only load activation dumps you trust.
        data = torch.load(activation_path, map_location=torch.device('cpu'))

        hidden_states = data['hidden_states']
        output_response = data['output'].split("Response:")[1]
        tokenized_output_response = tokenizer.encode(output_response)

        if token_output_focus == "first":
            # Disregard the first hidden state (which presumably covers all input
            # tokens) and the first two output tokens (['', '<0x0A>']).
            token_id = 3
        else:
            # NOTE(review): the same index is used for hidden_states and for the
            # re-tokenized response; this assumes both sequences are aligned — confirm.
            token_id = None
            # Try the last position first, then the one before it (to step over a full stop).
            for candidate_id in (len(hidden_states) - 1, len(hidden_states) - 2):
                if tokenizer.decode(tokenized_output_response[candidate_id]) in answer_tokens:
                    token_id = candidate_id
                    break
            if token_id is None:
                print("output neither yes or no")
                continue

        token_hidden_states = hidden_states[token_id]

        # Flat list of activation values, concatenated over layers, beams and tokens.
        activations = []
        for layers in token_hidden_states:
            for beams in layers:
                for token_activations in beams:
                    # detach() so tensors that still track gradients can be converted to NumPy.
                    activations.extend(token_activations.detach().numpy())

        # Append to the three lists together so they always stay parallel.
        all_prompt_filenames.append(id_to_class[question_id])
        all_prompt_activations.append(activations)
        all_prompt_tokens.append(tokenizer.decode(tokenized_output_response[token_id]))

    return all_prompt_activations, all_prompt_filenames, all_prompt_tokens

In [44]:
# NOTE(review): absolute, user-specific path — the notebook cannot be re-run on another machine without editing it.
activations_directory = "/home/gridsan/wzulfikar/activations/dataset_trex/"
# Order the directory entries by the first number that appears in each file name
activation_folder_list = sort_files(activations_directory)
# Parameters: analyse the last generated token, using only the first 800 .pt files
token_output_focus = "last"
first_n_files = 800

all_prompt_activations, all_prompt_filenames, all_prompt_tokens = build_activations_array(activations_directory, activation_folder_list, token_output_focus = token_output_focus, first_n_files = first_n_files)

100%|██████████| 800/800 [00:14<00:00, 53.89it/s]


In [38]:
# Distinct class labels among the retained prompts (set iteration order is arbitrary).
unique_strings = set(all_prompt_filenames)

for unique_string in unique_strings:
    print(unique_string)

Ada Lovelace
Vincent Van Gogh
Princess Leia
William Shakespeare
Albert Einstein
Sherlock Holmes
Marie Curie
Harry Potter
Lady Gaga
Isaac Newton
Barney the Dinosaur
Socrates
Cleopatra
Hermione Granger


In [39]:
# Transpose so that each row is one activation value (neuron) and each column is one prompt.
clustering_data = np.array(all_prompt_activations).T
# One-hot encode the class label of every prompt (one row per prompt, one column per class).
target = np.array(all_prompt_filenames)[:, np.newaxis]
target = OneHotEncoder().fit_transform(target)

print(clustering_data.shape)
print(target.shape)


(209920, 379)
(379, 14)


In [46]:
# Group the per-prompt activation vectors by their class label.
classes = set(all_prompt_filenames)
all_prompt_activations_with_class = dict((k, []) for k in classes)
for j, k in enumerate(all_prompt_filenames):
    # Prompt j contributes its activation vector (as a NumPy array) to the bucket of its class k.
    activation = all_prompt_activations[j]
    all_prompt_activations_with_class[k].append(np.array(activation))

print(all_prompt_activations_with_class.keys())

dict_keys(['Ada Lovelace', 'Vincent Van Gogh', 'Princess Leia', 'William Shakespeare', 'Albert Einstein', 'Sherlock Holmes', 'Marie Curie', 'Harry Potter', 'Lady Gaga', 'Isaac Newton', 'Barney the Dinosaur', 'Socrates', 'Cleopatra', 'Hermione Granger'])


## UMAP

In [67]:
from cuml.manifold.umap import UMAP as cumlUMAP
import time
# Time the GPU UMAP fit.
start = time.time()

# Embed every row of clustering_data into 2 dimensions.
cuml_umap = cumlUMAP(n_components=2, n_neighbors=16, init="spectral")
embedding = cuml_umap.fit_transform(clustering_data)
print("Time taken", time.time()-start)

Time taken 13.812620401382446


In [68]:
# Sanity check: one 2-D point per row of clustering_data.
print(embedding.shape)

(209920, 2)


## K Means

In [None]:
# K-means over the raw activation matrix, shape [n_neurons, features] (one feature per prompt).
# NOTE(review): this cell rebinds `embedding` to clustering_data, so after it runs `embedding`
# no longer holds the 2-D UMAP output computed in the UMAP section.
import cudf
import cuml

start = time.time()

embedding = clustering_data

n_neurons = embedding.shape[0]
features = embedding.shape[1]
print(n_neurons, features)

# Convert the data to a cuDF DataFrame with one column per feature
df = cudf.DataFrame(embedding, columns=[f'feature_{i+1}' for i in range(features)])

# Initialize the KMeans model (16 clusters; later cells hard-code the same number)
kmeans = cuml.KMeans(n_clusters=16)

print("Fitting")
# Fit the model
kmeans.fit(df)

# Store each row's cluster id in a new 'cluster' column (added after fitting, so it is not a feature)
df['cluster'] = kmeans.predict(df)

print("Time taken ", time.time() - start)

209920 379
Fitting
Time taken  0.3917968273162842


In [69]:
# Cluster ids together with the number of rows k-means assigned to each of them.
np.unique(kmeans.labels_.to_numpy(), return_counts=True)

(array([ 0,  1,  2,  3,  4,  5,  6,  7,  8,  9, 10, 11, 12, 13, 14, 15],
       dtype=int32),
 array([    33,     33, 119558,     33,     33,      7,     66,     79,
            33,    280,    261,      7,     21,     50,  44537,  44889]))

In [53]:
# # Assuming embedding is your data and its shape is [n_neurons, features]
# # n_neurons = embedding.shape[0]
# # features = embedding.shape[1]
# import cudf
# import cuml

# n_neurons = embedding.shape[0]
# features = 2

# # Convert your data to cuDF DataFrame, because cuML works with cuDF DataFrame
# df = cudf.DataFrame(embedding, columns=[f'feature_{i+1}' for i in range(features)])

# # Initialize the KMeans model
# kmeans = cuml.KMeans(n_clusters=16)

# print("Fitting")
# # Fit the model
# kmeans.fit(df)

# # Get cluster predictions
# df['cluster'] = kmeans.predict(df)

# # Plotting
# plt.figure(figsize=(10, 8))
# for i in range(16):
#     cluster_data = df[df['cluster'] == i].to_pandas()
#     plt.scatter(cluster_data['feature_1'], cluster_data['feature_2'], label=f'Cluster {i+1}')

# plt.title('KMeans Clustering with cuML')
# plt.xlabel('Feature 1')
# plt.ylabel('Feature 2')
# plt.legend()
# plt.show()


In [66]:
import numpy as np
import pandas as pd
import cudf
import cuml
from collections import defaultdict
import matplotlib.pyplot as plt

# For every class and every cluster: the mean activation of the cluster's neurons,
# averaged over the trials of that class. (Despite the variable names, the stored
# value is a mean activation, not a correlation.)
correlation_results = defaultdict(list)
n_clusters = 16  # must match the n_clusters used when fitting k-means

# Row indices (neurons) belonging to each cluster. They do not depend on the class,
# so they are looked up once instead of once per class.
cluster_neuron_ids = {
    cluster_id: df.loc[df['cluster'] == cluster_id].index.to_numpy()
    for cluster_id in range(n_clusters)
}

for class_name, trials in all_prompt_activations_with_class.items():
    for cluster_id, cluster_neurons in cluster_neuron_ids.items():
        num_neurons = len(cluster_neurons)

        correlation_sum = 0
        for trial in trials:
            # Mean activation of this cluster's neurons within one trial.
            correlation_sum += np.mean(trial[cluster_neurons])

        average_correlation = correlation_sum / len(trials)
        correlation_results[class_name].append((cluster_id, num_neurons, average_correlation))

# Now let's get top 5 clusters for each class
for class_name, correlations in correlation_results.items():
    # In-place sort by mean activation, descending; correlation_results stays sorted afterwards.
    correlations.sort(key=lambda x: x[2], reverse=True)
    top_5_clusters = correlations[:5]

    print(f"For class {class_name}, the top 5 clusters are:")
    for cluster_id, num_neurons, correlation in top_5_clusters:
        print(f"Cluster ID: {cluster_id}, Number of neurons: {num_neurons}, Mean activation: {correlation}")
    print()


For class Ada Lovelace, the top 5 clusters are:
Cluster ID: 2, Number of neurons: 33, Mean activation: 7026.3781174879805
Cluster ID: 5, Number of neurons: 33, Mean activation: 3171.427283653846
Cluster ID: 4, Number of neurons: 33, Mean activation: 1210.3545673076924
Cluster ID: 9, Number of neurons: 139, Mean activation: 76.16211964533879
Cluster ID: 15, Number of neurons: 1555, Mean activation: 22.863487830528847

For class Vincent Van Gogh, the top 5 clusters are:
Cluster ID: 2, Number of neurons: 33, Mean activation: 4658.08642578125
Cluster ID: 5, Number of neurons: 33, Mean activation: 2101.1958512931033
Cluster ID: 4, Number of neurons: 33, Mean activation: 800.3592403017242
Cluster ID: 9, Number of neurons: 139, Mean activation: 50.432722420528016
Cluster ID: 15, Number of neurons: 1555, Mean activation: 15.289070918642242

For class Princess Leia, the top 5 clusters are:
Cluster ID: 0, Number of neurons: 4, Mean activation: 107.77678571428571
Cluster ID: 12, Number of neurons

In [72]:
# Build one record per trial: its class label plus the mean activation of every cluster.
data = []
n_clusters = 16

# Row indices (neurons) of each cluster, looked up once rather than once per trial.
cluster_neuron_ids = {
    cluster_id: df.loc[df['cluster'] == cluster_id].index.to_numpy()
    for cluster_id in range(n_clusters)  # was range(n_cluster): an undefined name
}

for class_name, trials in all_prompt_activations_with_class.items():
    print(class_name)
    for trial in trials:
        trial_dict = {'class_id': class_name}
        for cluster_id, cluster_neurons in cluster_neuron_ids.items():
            # Mean activation of this cluster's neurons within the trial.
            trial_dict['cluster_' + str(cluster_id)] = np.mean(trial[cluster_neurons])

        # Append the record to the list
        data.append(trial_dict)

# Convert the list of records to a DataFrame in one step
mean_cluster_activations_per_trial = pd.DataFrame(data)


Ada Lovelace
Vincent Van Gogh
Princess Leia
William Shakespeare
Albert Einstein
Sherlock Holmes
Marie Curie
Harry Potter
Lady Gaga
Isaac Newton
Barney the Dinosaur
Socrates
Cleopatra
Hermione Granger


In [74]:
# Preview rows 54-64 of the per-trial table (positional slice).
print(mean_cluster_activations_per_trial[54:65])

            class_id  cluster_0  cluster_1  cluster_2  cluster_3  cluster_4  \
54  Vincent Van Gogh  -7.746094  -4.570312   0.025162  -1.652344  -7.472656   
55     Princess Leia  -9.835938  -2.484375   0.027832  -2.628906  -6.992188   
56     Princess Leia  -8.218750  -0.486572   0.030640  -2.257812  -6.382812   
57     Princess Leia  -9.570312  -1.730469   0.022629  -2.804688  -6.949219   
58     Princess Leia  -8.632812  -1.383789   0.020721  -2.425781  -5.832031   
59     Princess Leia  -8.265625  -1.383789   0.025436  -1.871094  -6.785156   
60     Princess Leia  -8.648438  -1.253906   0.031097  -0.976074  -5.886719   
61     Princess Leia -10.593750  -2.443359   0.026123  -2.814453  -6.660156   
62     Princess Leia  -7.894531  -1.133789   0.034790  -0.731445  -5.609375   
63     Princess Leia  -7.937500  -0.992188   0.029724  -1.014648  -6.179688   
64     Princess Leia  -8.429688  -1.458984   0.031372  -0.639648  -6.355469   

    cluster_5  cluster_6  cluster_7  cluster_8  clu

In [None]:
import statsmodels.api as sm

# Get unique class IDs
class_ids = mean_cluster_activations_per_trial['class_id'].unique()

# Predictors: the per-cluster mean activations of ALL trials, plus an intercept.
# They are identical for every class, so they are prepared once.
X = mean_cluster_activations_per_trial.drop('class_id', axis=1).astype(float)
X = sm.add_constant(X)

# For each class ID, run a one-vs-rest regression
for class_id in class_ids:
    # Numeric target: 1.0 for trials of this class, 0.0 for all others.
    # (The string label column itself cannot be used as an OLS target, and it is
    # constant once the rows are restricted to a single class.)
    y = (mean_cluster_activations_per_trial['class_id'] == class_id).astype(float)

    # Perform OLS regression
    model = sm.OLS(y, X).fit()

    # Display the summary statistics of the regression model
    print(f"Class ID: {class_id}")
    print(model.summary())
    print("\n-----------------------------\n")

In [84]:
from cuml.linear_model import LogisticRegression
import cupy as cp

# Get unique class IDs
class_ids = mean_cluster_activations_per_trial['class_id'].unique()

# Feature matrix: per-cluster mean activations of every trial. It is the same for
# every class, so it is moved to the GPU once instead of once per iteration.
X = cp.asarray(mean_cluster_activations_per_trial.drop('class_id', axis=1)).astype(cp.float32)

# For each class ID, run a one-vs-rest logistic regression
for class_id in class_ids:
    # Binary target: 1 for the class of interest, 0 for all others
    y = mean_cluster_activations_per_trial['class_id'] == class_id
    y = cp.asarray(y.astype(int))

    # Perform Logistic Regression
    # NOTE(review): the recorded runs emit "L-BFGS line search failed" warnings;
    # the features are not standardized — consider scaling them before fitting.
    model = LogisticRegression()
    model.fit(X, y)

    # Display the coefficients of the logistic regression model
    print(f"Class ID: {class_id}")
    print(f"Intercept: {model.intercept_}")
    print(f"Coefficients: {model.coef_}")
    # Indices of the five largest coefficients, in descending order
    print(f"Top 5 clusters: {cp.argsort(model.coef_)[0, ::-1][:5]}")
    print("\n-----------------------------\n")


[W] [18:23:52.275339] L-BFGS line search failed (code 4); stopping at the last valid step
Class ID: Ada Lovelace
Intercept: [-6.4875245e-08]
Coefficients: [[-6.00275598e-05  1.92644977e-04 -1.97161176e-09 -1.33744747e-04
  -2.27602886e-05  4.78152378e-06  1.11343925e-05  2.96447115e-06
   1.61020780e-05 -1.08026006e-06  8.65113350e-07  4.80426525e-06
   5.18768275e-06  4.08482992e-06  1.72263000e-07 -1.73606125e-07]]
Top 5 clusters: [ 1  8  6 12 11]

-----------------------------

[W] [18:23:54.052978] L-BFGS line search failed (code 4); stopping at the last valid step
Class ID: Vincent Van Gogh
Intercept: [-5.580048e-08]
Coefficients: [[-6.0078582e-05  1.9257356e-04 -1.6866875e-09 -1.3373610e-04
  -2.2813798e-05  5.3574108e-06  1.1137217e-05  2.9502542e-06
   1.6205873e-05 -1.0924251e-06  8.6834223e-07  5.7107059e-06
   5.3478343e-06  4.1862841e-06  1.7063915e-07 -1.7221967e-07]]
Top 5 clusters: [ 1  8  6 11  5]

-----------------------------

Class ID: Princess Leia
Intercept: [-23.3