In [1]:
%reset
%matplotlib inline
%config InlineBackend.figure_format = 'retina'  # makes figs nicer!

import functools
import os
import torch
import transformers

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

from numpy import linalg
from sklearn.decomposition import PCA
from tqdm.notebook import tqdm
from transformers import AutoTokenizer


sns.set(style='whitegrid',font_scale=1.2)

Once deleted, variables cannot be recovered. Proceed (y/[n])? 
Nothing done.


In [18]:

# Count model parameters
def count_parameters(model):
    """credit: https://stackoverflow.com/questions/49201236/check-the-total-number-of-parameters-in-a-pytorch-model"""
    
    total_params = 0
    for name, parameter in model.named_parameters():
        
        # if the param is not trainable, skip it
        if not parameter.requires_grad:
            continue
        
        # otherwise, count it towards your number of params
        params = parameter.numel()
        total_params += params
    print(f"Total Trainable Params: {total_params}")
    
    return total_params

from sklearn.decomposition import PCA





## TODO: make this a class

## Step 2
def pca_normalization(points):
    """points: (m samples x n dimensions)"""
    
    pca = PCA(n_components=len(np.transpose(points)))
    points = pca.fit_transform(points)

    return np.transpose(points)

## Step 3
def get_diag_of_cov(points):
    """points: (n dims x m samples)"""
    
    n = np.shape(points)[0]
    cov = np.cov(points)
    cov_diag = cov[np.diag_indices(n)]

    return cov_diag

## Step 4
def normalize_diagonal(cov_diag):

    n = len(cov_diag)
    cov_diag_normalized = (cov_diag*np.sqrt(n))/np.linalg.norm(cov_diag)

    return cov_diag_normalized

## Step 5
def get_isotropy_defect(cov_diag_normalized):

    n = len(cov_diag_normalized)
    iso_diag = np.eye(n)[np.diag_indices(n)]
    l2_norm = np.linalg.norm(cov_diag_normalized - iso_diag)
    normalization_constant = np.sqrt(2*(n-np.sqrt(n)))
    isotropy_defect = l2_norm/normalization_constant

    return isotropy_defect

## Interlude
def get_kdims(isotropy_defect, points): 
    
    n = np.shape(points)[0]
    k = ((n-(isotropy_defect**2)*(n-np.sqrt(n)))**2) / n
    
    return k

def get_fraction_dims(k, points):
    
    n = np.shape(points)[0]
    phi = k/n
    
    return phi

## Step 6
def get_IsoScore(isotropy_defect, points):

    n = np.shape(points)[0]
    the_score = ((n-(isotropy_defect**2)*(n-np.sqrt(n)))**2-n)/(n*(n-1))

    return the_score



### Load the readability dataset

In [12]:
df_all = pd.read_csv("../data/processed/modified/gpt-4-1106-preview_modified.csv")
df_all.head(3)

Unnamed: 0,Original,GPT_Response,Goal
0,"Mr. Scott's dog Smart was so trained, that he ...",Mr. Scott had trained his dog Smart so well th...,easier
1,"Mr. Scott's dog Smart was so trained, that he ...","Mr. Scott's canine, denominated Smart, exhibit...",harder
2,Impeachment is a process in which an official ...,Impeachment is when an official is charged wit...,easier


In [13]:
## Separate the dataset

sub_easy = df_all[df_all["Goal"]=="easier"]
sub_hard = df_all[df_all["Goal"]=="harder"]

### Filter for Goal Difficulty, Embed the excerpts, Compute the IsoScore for each

In [20]:
MODELS = ["FacebookAI/roberta-base"#,
          #"openai-community/gpt2",
          #"allenai/OLMoE-1B-7B-0924"
         ]

# Decide which device to allocate models to
DEVICE = torch.device("mps" if torch.backends.mps.is_available() else "cpu")



##### results for just the original excerpts

### testing code

In [27]:
df = sub_easy #grabbed just one of the subset dataframes (the originals are the same for both)
EXCERPT_TYPE = "Original" #column name from df to grab excerpt from

mpath = MODELS[0]

# Load model & tokenizer from HuggingFace 
model = transformers.AutoModel.from_pretrained(mpath,output_hidden_states=True)
model.to(DEVICE) #allocate model to desired device

tokenizer = transformers.AutoTokenizer.from_pretrained(mpath)

# Get some model details to save later
mname = mpath.split("/")[-1]
n_layers = model.config.num_hidden_layers
n_params = count_parameters(model)

# Iterate through data passages
results = []

row = df.iloc[0]

excerpt = row[EXCERPT_TYPE]
        
# Tokenize excerpt
inputs = tokenizer(excerpt, return_tensors="pt").to(DEVICE)

# Run model
with torch.no_grad():
    output = model(**inputs)
    hidden_states = output.hidden_states

# Iterate through model layers
isotropy = []
kdims = []

layer = 0

# Grab layer-specific embeddings
# shape (num tokens x num embed dims)
layer_embed = hidden_states[layer][0]

layer_embed.shape

Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-base and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Total Trainable Params: 124645632


torch.Size([236, 768])

In [28]:
layer_embed.cpu().detach().shape

torch.Size([236, 768])

In [29]:
### ISSUE: can't pca-normalize up to n embedding dimensions when 
# num samples < embedding dimensions

pca_layer_embed = pca_normalization(layer_embed.cpu().detach())
cov_diag = get_diag_of_cov(pca_layer_embed)
cov_diag_normalized = normalize_diagonal(cov_diag)
isotropy_defect = get_isotropy_defect(cov_diag_normalized)
k = get_kdims(isotropy_defect, pca_layer_embed)
phi = get_fraction_dims(k, pca_layer_embed)
isoscore = get_IsoScore(isotropy_defect, pca_layer_embed)

ValueError: n_components=768 must be between 0 and min(n_samples, n_features)=236 with svd_solver='full'

In [None]:
### Designing function - IN PROGRESS

df = sub_easy #grabbed just one of the subset dataframes (the originals are the same for both)

EXCERPT_TYPE = "Original" #column name from df to grab excerpt from

for mpath in tqdm(MODELS, colour="cornflowerblue"):
    
    print(mpath)
    
    # Load model & tokenizer from HuggingFace 
    model = transformers.AutoModel.from_pretrained(mpath,output_hidden_states=True)
    model.to(DEVICE) #allocate model to desired device
    
    tokenizer = transformers.AutoTokenizer.from_pretrained(mpath)
    
    # Get some model details to save later
    mname = mpath.split("/")[-1]
    n_layers = model.config.num_hidden_layers
    n_params = count_parameters(model)
    
    # Iterate through data passages
    results = []
    for ix,row in tqdm(df.iterrows(),total=df.shape[0],colour="hotpink"): 
        
        excerpt = row[EXCERPT_TYPE]
        
        # Tokenize excerpt
        inputs = tokenizer(excerpt, return_tensors="pt").to(DEVICE)
        
        # Run model
        with torch.no_grad():
            output = model(**inputs)
            hidden_states = output.hidden_states
            
        # Iterate through model layers
        isotropy = []
        kdims = []
        
        for layer in range(n_layers+1): 
            
            # Grab layer-specific embeddings
            layer_embed = hidden_states[layer][0]
            
            # Compute k dimensions uniformly used
            
            
            # Compute the IsoScore (Rudman et al. 2022)

In [12]:
### Example code

# random_array_1 = np.random.normal(size=100)
# random_array_2 = np.random.normal(size=100)
# random_array_3 = np.random.normal(size=100)

# # Computing the IsoScore for points sampled from a line (dim=1) in R^3
# point_cloud_line = np.array([random_array_1, np.zeros(100), np.zeros(100)])

# pca_points = pca_normalization(np.transpose(point_cloud_line))

# cov_diag = get_diag_of_cov(pca_points)


# cov_diag_normalized = normalize_diagonal(cov_diag)


# isotropy_defect = get_isotropy_defect(cov_diag_normalized)

# k = get_kdims(isotropy_defect, pca_points)

# phi = get_fraction_dims(k, pca_points)

# the_score = get_IsoScore(isotropy_defect,np.transpose(point_cloud_line))


# # the_score = IsoScore.IsoScore(np.transpose(point_cloud_line))
# print(f"IsoScore for 100 points sampled from this line in R^3 is {the_score}.")
# print(f"k dimensions used uniformly for 100 points from line is {round(k)}.")
# print(f"phi fraction dimensions used uniformly for 100 points from line is {phi}.")