In [27]:
# Copyright 2025 Luke Moffett
# Licensed under the Apache License, Version 2.0

import pandas as pd
import matplotlib.pyplot as plt
import os
import tqdm
import torch

from clz_or_cls import analysis

device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device
tqdm.tqdm.pandas()

plt.rcParams['figure.dpi'] = 100
%matplotlib inline

In [28]:
from pathlib import Path
# Root directory of the abstract-recovery results. CORC_RESULTS_DIR must be set;
# `os.environ[...]` raises KeyError when it is missing.
# NOTE(review): the saved outputs in this notebook show paths under
# 'results/abstracts/...' (plural) while this line appends 'abstract' -- confirm
# the directory name matches what is on disk.
results_basepath = Path(os.environ['CORC_RESULTS_DIR']) / 'abstract'

def get_results_csvs(results_dir=None):
    """
    Index the result CSVs stored under ``results_dir``.

    The directory is walked as ``<results_dir>/<model>/<size>/<class>.csv``.

    Parameters
    ----------
    results_dir : str or pathlib.Path, optional
        Root directory of one experiment (for example the word-level or the
        sentence-level runs). When omitted, the module-level
        ``results_basepath`` is used; it is looked up at call time so that
        re-running the configuration cell is picked up.

    Returns
    -------
    dict
        Nested mapping ``{model: {size: {class: Path}}}``. ``model`` and
        ``size`` are the full directory names; ``class`` is the CSV file name
        without its extension.

    Notes
    -----
    Entries are visited in sorted order so the mapping is deterministic.
    Anything that does not fit the layout (a stray file where a directory is
    expected, or a non-CSV file next to the CSVs) is reported with a
    "skipping" message instead of raising.
    """
    if results_dir is None:
        results_dir = results_basepath
    results_dir = Path(results_dir)

    results = {}
    for model_dir in sorted(results_dir.iterdir()):
        if not model_dir.is_dir():
            print("skipping", model_dir.name)
            continue
        # Use `.name`, not `.stem`: a directory such as "1.5B" has stem "1".
        sizes = results.setdefault(model_dir.name, {})
        for size_dir in sorted(model_dir.iterdir()):
            if not size_dir.is_dir():
                print("skipping", size_dir.name)
                continue
            classes = sizes.setdefault(size_dir.name, {})
            for entry in sorted(size_dir.iterdir()):
                if entry.suffix == ".csv":
                    classes[entry.stem] = entry
                else:
                    print("skipping", entry.stem)
    return results

# Index the sentence-level result CSVs and display the nested
# {model: {size: {class: path}}} mapping returned by get_results_csvs.
sentence_result_csvs = get_results_csvs(results_dir=results_basepath/'abstract-recovery-sentence')
sentence_result_csvs

{'llama2': {'13B': {'typo': PosixPath('results/abstracts/abstract-recovery-sentence/llama2/13B/typo.csv'),
   'phonetic': PosixPath('results/abstracts/abstract-recovery-sentence/llama2/13B/phonetic.csv'),
   'visual': PosixPath('results/abstracts/abstract-recovery-sentence/llama2/13B/visual.csv')},
  '7B': {'typo': PosixPath('results/abstracts/abstract-recovery-sentence/llama2/7B/typo.csv'),
   'phonetic': PosixPath('results/abstracts/abstract-recovery-sentence/llama2/7B/phonetic.csv'),
   'visual': PosixPath('results/abstracts/abstract-recovery-sentence/llama2/7B/visual.csv')}},
 'mistral': {'7B': {'phonetic': PosixPath('results/abstracts/abstract-recovery-sentence/mistral/7B/phonetic.csv'),
   'typo': PosixPath('results/abstracts/abstract-recovery-sentence/mistral/7B/typo.csv'),
   'visual': PosixPath('results/abstracts/abstract-recovery-sentence/mistral/7B/visual.csv')}}}

In [29]:
# Same index for the word-level experiment (no sentence context in the prompt,
# judging by the directory name -- TODO confirm).
word_result_csvs = get_results_csvs(results_dir=results_basepath/'abstract-recovery-word')
word_result_csvs

{'llama2': {'13B': {'phonetic': PosixPath('results/abstracts/abstract-recovery-word/llama2/13B/phonetic.csv'),
   'typo': PosixPath('results/abstracts/abstract-recovery-word/llama2/13B/typo.csv'),
   'visual': PosixPath('results/abstracts/abstract-recovery-word/llama2/13B/visual.csv')},
  '7B': {'phonetic': PosixPath('results/abstracts/abstract-recovery-word/llama2/7B/phonetic.csv'),
   'typo': PosixPath('results/abstracts/abstract-recovery-word/llama2/7B/typo.csv'),
   'visual': PosixPath('results/abstracts/abstract-recovery-word/llama2/7B/visual.csv')}},
 'mistral': {'7B': {'typo': PosixPath('results/abstracts/abstract-recovery-word/mistral/7B/typo.csv'),
   'phonetic': PosixPath('results/abstracts/abstract-recovery-word/mistral/7B/phonetic.csv'),
   'visual': PosixPath('results/abstracts/abstract-recovery-word/mistral/7B/visual.csv')}}}

In [30]:
# Load the sentence-level CSVs through the project helper (its implementation is
# not shown in this notebook), then count rows per model/size/class as a sanity
# check on what was loaded.
sentence_df = analysis.load_and_concatenate_csvs(sentence_result_csvs, levels=['model', 'size', 'class'])
sentence_df.groupby(['model', 'size', 'class']).size()

model    size  class   
llama2   13B   phonetic    5000
               typo        5000
               visual      5000
         7B    phonetic    5000
               typo        5000
               visual      5000
mistral  7B    phonetic    5000
               typo        5000
               visual      5000
dtype: int64

In [31]:
# Load the word-level CSVs the same way and count rows per model/size/class.
# NOTE(review): the saved output shows 4,980 rows for llama2/13B/typo and 4,995
# for mistral/7B/visual versus 5,000 for every other group -- confirm the
# missing rows are expected, since later cells pair rows across experiments.
word_df = analysis.load_and_concatenate_csvs(word_result_csvs, levels=['model', 'size', 'class'])
word_df.groupby(['model', 'size', 'class']).size()

model    size  class   
llama2   13B   phonetic    5000
               typo        4980
               visual      5000
         7B    phonetic    5000
               typo        5000
               visual      5000
mistral  7B    phonetic    5000
               typo        5000
               visual      4995
dtype: int64

In [33]:
# Expose the sentence-level columns under the names the word-level frame uses.
sentence_df['perturbed'] = sentence_df['word']
sentence_df['predicted'] = sentence_df['recovered']

# The clean (unperturbed) word is taken from a word-level run.
# just pick a particular example: mistral / 7B / phonetic.
# Assumes `idx` identifies the same source word in every run -- TODO confirm.
clean_by_idx = (
    word_df.loc[
        (word_df['model'] == 'mistral')
        & (word_df['class'] == 'phonetic')
        & (word_df['size'] == '7B'),
        ['idx', 'clean'],
    ]
    .drop_duplicates('idx')
    .set_index('idx')['clean']
)

# Look the clean word up by `idx` instead of assigning the column of a merged
# frame: the merge result gets a fresh 0..n-1 index, so that assignment only
# lined up when sentence_df's own index happened to agree with it, and the cell
# raised KeyError when re-run (the merge then produced clean_s / clean_w).
# Rows whose idx has no reference word get NaN, as with the left merge.
sentence_df['clean'] = sentence_df['idx'].map(clean_by_idx)
sentence_df

Unnamed: 0,idx,word,recovered,batch,model,size,class,perturbed,predicted,clean
0,0,gaes,gaps,0,llama2,13B,typo,gaes,gaps,games
1,1,ebnmdidges,edges,0,llama2,13B,typo,ebnmdidges,edges,embeddings
2,2,drneicesag,decaying,0,llama2,13B,typo,drneicesag,decaying,decreasing
3,3,diaynmc,dynamic,0,llama2,13B,typo,diaynmc,dynamic,dynamic
4,4,%tr$adit<ional,traditional,0,llama2,13B,typo,%tr$adit<ional,traditional,traditional
...,...,...,...,...,...,...,...,...,...,...
4995,4995,o☊iginal,original,999,mistral,7B,visual,o☊iginal,original,original
4996,4996,ḹӘnguƋge,Knowledge-aware ḹƋge Model Attribut,999,mistral,7B,visual,ḹӘnguƋge,Knowledge-aware ḹƋge Model Attribut,language
4997,4997,diamОnd,diamond,999,mistral,7B,visual,diamОnd,diamond,diamond
4998,4998,hourՏ,hour,999,mistral,7B,visual,hourՏ,hour,hours


In [34]:
from nltk.stem import PorterStemmer
import nltk
nltk.download('punkt')

# One Porter stemmer instance, reused for both experiment frames.
porter = PorterStemmer()

# Add stemmed versions of the reference and the predicted word to each frame.
# Missing values are replaced by '' before stemming.
for __df in (sentence_df, word_df):
    __df['clean_stem'] = __df['clean'].fillna('').map(porter.stem)
    __df['predicted_stem'] = __df['predicted'].fillna('').map(porter.stem)

[nltk_data] Downloading package punkt to
[nltk_data]     /home/users/lam135/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [35]:
# Stack both experiments into one frame indexed by
# (experiment, model, size, class, idx); remaining NaNs become ''.
exp_df = (
    pd.concat([sentence_df, word_df], keys=['sentence', 'word'])
    .reset_index()
    .rename(columns={'level_0': 'experiment'})
    .set_index(['experiment', 'model', 'size', 'class', 'idx'])
    .drop(columns=['level_1', 'Unnamed: 0'])
    .fillna('')
)
exp_df[['clean', 'perturbed', 'predicted']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,clean,perturbed,predicted
experiment,model,size,class,idx,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
sentence,llama2,13B,typo,0,games,gaes,gaps
sentence,llama2,13B,typo,1,embeddings,ebnmdidges,edges
sentence,llama2,13B,typo,2,decreasing,drneicesag,decaying
sentence,llama2,13B,typo,3,dynamic,diaynmc,dynamic
sentence,llama2,13B,typo,4,traditional,%tr$adit<ional,traditional
...,...,...,...,...,...,...,...
word,mistral,7B,visual,4995,original,o☊iginal,original
word,mistral,7B,visual,4996,language,ḹӘnguƋge,english
word,mistral,7B,visual,4997,diamond,diamОnd,diamond
word,mistral,7B,visual,4998,hours,hourՏ,hour


In [36]:
# A prediction counts as correct when it equals the clean word after
# lower-casing and trimming surrounding whitespace on both sides.
exp_df['match'] = (
    exp_df['clean'].str.lower().str.strip()
    .eq(exp_df['predicted'].str.lower().str.strip())
)

# Accuracy in percent per experiment/model/size/class.
accuracy_exp_model = (
    exp_df.groupby(level=['experiment', 'model', 'size', 'class'])['match']
    .mean()
    .round(3)
    .mul(100)
)

# One row per model/size/class, one column per experiment, plus the gain in
# percentage points from seeing the sentence.
# NOTE: a later cell defines a function that is also called `acc_table`; this
# DataFrame is no longer reachable under that name once that cell has run.
acc_table = accuracy_exp_model.reset_index().pivot(
    index=['model', 'size', 'class'],
    columns='experiment',
    values='match',
)
acc_table['context_boost'] = acc_table['sentence'].sub(acc_table['word'])
acc_table

Unnamed: 0_level_0,Unnamed: 1_level_0,experiment,sentence,word,context_boost
model,size,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
llama2,13B,phonetic,76.7,60.5,16.2
llama2,13B,typo,73.7,53.1,20.6
llama2,13B,visual,73.8,57.2,16.6
llama2,7B,phonetic,65.2,54.8,10.4
llama2,7B,typo,63.3,50.0,13.3
llama2,7B,visual,62.4,47.7,14.7
mistral,7B,phonetic,74.4,61.3,13.1
mistral,7B,typo,62.6,51.3,11.3
mistral,7B,visual,62.3,48.3,14.0


In [37]:
# Same exact-match accuracy (percent), pooled over the perturbation classes.
accuracy_exp_model_avg = (
    exp_df.groupby(level=['experiment', 'model', 'size'])['match']
    .mean()
    .round(3)
    .mul(100)
)
acc_avg_table = accuracy_exp_model_avg.reset_index().pivot(
    index=['model', 'size'],
    columns='experiment',
    values='match',
)
acc_avg_table['context_boost'] = acc_avg_table['sentence'].sub(acc_avg_table['word'])
acc_avg_table

Unnamed: 0_level_0,experiment,sentence,word,context_boost
model,size,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
llama2,13B,74.7,56.9,17.8
llama2,7B,63.6,50.9,12.7
mistral,7B,66.4,53.7,12.7


In [38]:
# Unweighted column means over the model/size/class rows of the exact-match table.
# NOTE: needs `acc_table` to still be the DataFrame; a later cell rebinds the
# name to a function, so this cell fails if re-run after that one.
acc_table.mean()

experiment
sentence         68.266667
word             53.800000
context_boost    14.466667
dtype: float64

In [39]:
# Lenient match: compare the Porter stems of the clean and the predicted word
# (again lower-cased and trimmed).
exp_df['match_stem'] = (
    exp_df['clean_stem'].str.lower().str.strip()
    .eq(exp_df['predicted_stem'].str.lower().str.strip())
)

# Stem-level accuracy in percent per experiment/model/size/class.
# NOTE: this rebinds `accuracy_exp_model`, previously the exact-match accuracy.
accuracy_exp_model = (
    exp_df.groupby(level=['experiment', 'model', 'size', 'class'])['match_stem']
    .mean()
    .round(3)
    .mul(100)
)
acc_stem_table = accuracy_exp_model.reset_index().pivot(
    index=['model', 'size', 'class'],
    columns='experiment',
    values='match_stem',
)
acc_stem_table['context_boost'] = acc_stem_table['sentence'].sub(acc_stem_table['word'])
acc_stem_table

Unnamed: 0_level_0,Unnamed: 1_level_0,experiment,sentence,word,context_boost
model,size,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
llama2,13B,phonetic,82.3,69.2,13.1
llama2,13B,typo,76.6,58.4,18.2
llama2,13B,visual,77.7,62.9,14.8
llama2,7B,phonetic,70.8,64.1,6.7
llama2,7B,typo,65.9,55.5,10.4
llama2,7B,visual,66.8,53.9,12.9
mistral,7B,phonetic,80.0,71.2,8.8
mistral,7B,typo,65.7,57.6,8.1
mistral,7B,visual,67.6,55.1,12.5


In [40]:
# Unweighted column means over the model/size/class rows of the stem-match table.
acc_stem_table.mean()

experiment
sentence         72.600000
word             60.877778
context_boost    11.722222
dtype: float64

In [41]:
# Mean exact-match gain from sentence context, in percentage points.
# NOTE: same `acc_table` caveat as above -- the name must still refer to the
# DataFrame, not the function defined in a later cell.
acc_table['context_boost'].mean()

14.466666666666669

In [42]:
# NOTE(review): not referenced by any cell visible in this part of the notebook;
# document what this threshold means (a fraction, presumably) or remove it if it
# is unused.
TARGET_ACCURACY = .655

In [43]:
from nltk.metrics.distance  import edit_distance
# Edit distance between the clean word and the prediction, one value per row.
# Iterating over the two columns avoids building a Series for every row, which
# is what DataFrame.apply(axis=1) does.
# NOTE(review): distances use the raw strings, whereas `match` compares
# lower-cased, stripped strings -- confirm the difference is intended.
exp_df['edit'] = [
    edit_distance(clean, predicted)
    for clean, predicted in zip(exp_df['clean'], exp_df['predicted'])
]

# Mean edit distance per experiment/model/size/class (lower is better), so a
# negative context_boost means sentence context helped.
edit_exp_model = (
    exp_df.groupby(level=['experiment', 'model', 'size', 'class'])['edit']
    .mean()
    .round(2)
)
# NOTE: a later cell defines a function that is also called `edit_table`.
edit_table = edit_exp_model.reset_index().pivot(
    index=['model', 'size', 'class'],
    columns='experiment',
    values='edit',
)
edit_table['context_boost'] = edit_table['sentence'] - edit_table['word']
edit_table

Unnamed: 0_level_0,Unnamed: 1_level_0,experiment,sentence,word,context_boost
model,size,class,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
llama2,13B,phonetic,0.87,1.49,-0.62
llama2,13B,typo,1.23,1.96,-0.73
llama2,13B,visual,1.16,1.56,-0.4
llama2,7B,phonetic,1.44,1.73,-0.29
llama2,7B,typo,1.89,2.16,-0.27
llama2,7B,visual,1.82,2.07,-0.25
mistral,7B,phonetic,0.83,1.35,-0.52
mistral,7B,typo,1.55,2.12,-0.57
mistral,7B,visual,1.44,2.14,-0.7


In [44]:
# Pair each sentence-level row with the word-level row for the same
# model/size/class/idx; overlapping columns get the suffixes _s (sentence) and
# _w (word).
merged_df = sentence_df.merge(
    word_df,
    on=['model', 'size', 'class', 'idx'],
    suffixes=('_s', '_w'),
)

# Exact and stem-level match flags for both sides, added in the order
# match_w, match_s, match_stem_w, match_stem_s.
merged_df = merged_df.assign(**{
    f'match{variant}_{side}': (
        merged_df[f'clean{variant}_{side}'].str.lower().str.strip()
        == merged_df[f'predicted{variant}_{side}'].str.lower().str.strip()
    )
    for variant in ('', '_stem')
    for side in ('w', 's')
})

In [45]:
# Split the paired rows by whether the word-level prediction matched the clean word.
word_recovered_df = merged_df.loc[merged_df['match_w']]
word_failed_df = merged_df.loc[~merged_df['match_w']]

In [46]:
def acc_table(df, match_col='match'):
    """
    Accuracy (percent) and row count per model/size, one column per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns 'model', 'size', 'class' and ``match_col``.
    match_col : str
        Name of the boolean column whose mean is reported as accuracy.

    Returns
    -------
    pandas.DataFrame
        Indexed by (model, size). Columns are a two-level index: 'acc' holds
        the mean of ``match_col`` rounded to 3 decimals and multiplied by 100,
        'n' holds the group size formatted with thousands separators (a
        string); the second level is the class.

    Notes
    -----
    An earlier cell binds the name ``acc_table`` to a DataFrame. Defining this
    function rebinds it, so cells that use that DataFrame fail if re-run
    afterwards.
    """
    grouped = df.groupby(['model', 'size', 'class'])[match_col]

    def _one_column_per_class(series):
        # (model, size, class) -> value  becomes  (model, size) x class
        return series.reset_index().pivot(
            index=['model', 'size'], columns='class', values=match_col
        )

    accuracy_pct = grouped.mean().round(3) * 100
    group_sizes = grouped.size().apply(lambda count: f'{count:,}')

    return pd.concat(
        [_one_column_per_class(accuracy_pct), _one_column_per_class(group_sizes)],
        keys=('acc', 'n'),
        axis=1,
    )

def comp_table(recovered_df, failed_df, match_col='match'):
    """
    Stack the accuracy tables of two row subsets for side-by-side comparison.

    Parameters
    ----------
    recovered_df, failed_df : pandas.DataFrame
        Row subsets passed to ``acc_table``; their rows are labelled 'success'
        and 'fail' respectively in the 'word_result' index level.
    match_col : str
        Boolean column forwarded to ``acc_table``.

    Returns
    -------
    pandas.DataFrame
        Indexed by (model, size, word_result), with the two column levels of
        ``acc_table`` swapped (class on top) and both axes sorted.
    """
    per_outcome = [
        acc_table(recovered_df, match_col=match_col),
        acc_table(failed_df, match_col=match_col),
    ]
    stacked = pd.concat(per_outcome, keys=('success', 'fail'))

    # The concat key comes back as an unnamed index level ('level_0' after
    # reset_index); rename it and make it part of the row index again.
    labelled = stacked.reset_index().rename({'level_0': 'word_result'}, axis=1)
    reindexed = labelled.set_index(['model', 'size', 'word_result'])

    # Class above acc/n, then sort rows and columns.
    return reindexed.swaplevel(0, 1, axis=1).sort_index().sort_index(axis=1)

comp_table(word_recovered_df, word_failed_df, match_col='match_s')

Unnamed: 0_level_0,Unnamed: 1_level_0,class,phonetic,phonetic,typo,typo,visual,visual
Unnamed: 0_level_1,Unnamed: 1_level_1,Unnamed: 2_level_1,acc,n,acc,n,acc,n
model,size,word_result,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2
llama2,13B,fail,52.9,1976,51.8,2337,49.6,2142
llama2,13B,success,92.2,3024,93.1,2643,91.9,2858
llama2,7B,fail,44.5,2258,42.0,2500,42.6,2614
llama2,7B,success,82.2,2742,84.5,2500,84.1,2386
mistral,7B,fail,46.7,1933,34.5,2434,36.0,2583
mistral,7B,success,91.9,3067,89.4,2566,90.4,2412


In [47]:
# Stem-level sentence accuracy, split by whether the word-level prediction matched.
acc_table_stem_recovered = acc_table(word_recovered_df, match_col='match_stem_s')
acc_table_stem_failed = acc_table(word_failed_df, match_col='match_stem_s')

(
    pd.concat(
        [acc_table_stem_recovered, acc_table_stem_failed],
        keys=('word_recovered', 'word_failed'),
        axis=1,
    )
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
    .round(2)
)

Unnamed: 0_level_0,Unnamed: 1_level_0,acc,acc,acc,acc,acc,acc,n,n,n,n,n,n
Unnamed: 0_level_1,Unnamed: 1_level_1,word_failed,word_failed,word_failed,word_recovered,word_recovered,word_recovered,word_failed,word_failed,word_failed,word_recovered,word_recovered,word_recovered
Unnamed: 0_level_2,class,phonetic,typo,visual,phonetic,typo,visual,phonetic,typo,visual,phonetic,typo,visual
model,size,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3,Unnamed: 12_level_3,Unnamed: 13_level_3
llama2,13B,63.3,57.0,57.0,94.7,94.0,93.3,1976,2337,2142,3024,2643,2858
llama2,7B,53.9,46.2,49.7,84.6,85.6,85.5,2258,2500,2614,2742,2500,2386
mistral,7B,57.7,39.7,44.5,94.0,90.3,92.3,1933,2434,2583,3067,2566,2412


In [48]:
def edit_table(df, clean_col='clean_s', predicted_col='predicted_s'):
    """
    Mean edit distance per model/size, one column per class.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain 'model', 'size', 'class', ``clean_col`` and
        ``predicted_col``. It is not modified.
    clean_col, predicted_col : str
        Columns holding the reference word and the prediction. The defaults
        are the sentence-side columns of the merged frame.

    Returns
    -------
    pandas.DataFrame
        Indexed by (model, size) with one column per class; values are the
        unrounded mean of ``edit_distance(clean, predicted)``.

    Notes
    -----
    An earlier cell binds the name ``edit_table`` to a DataFrame; defining this
    function rebinds it.
    """
    # Iterate over the two columns directly: cheaper than DataFrame.apply(axis=1),
    # which builds a Series for every row.
    distances = [
        edit_distance(clean, predicted)
        for clean, predicted in zip(df[clean_col], df[predicted_col])
    ]
    # Only the grouping keys are needed next to the distances, so there is no
    # reason to copy the whole frame.
    scored = df[['model', 'size', 'class']].assign(edit=distances)
    edit_df = scored.groupby(['model', 'size', 'class'])['edit'].mean()
    return edit_df.reset_index().pivot(index=['model', 'size'], columns='class', values='edit')

# Sentence-level edit distance, split by whether the word-level prediction matched.
edit_table_recovered = edit_table(word_recovered_df)
edit_table_failed = edit_table(word_failed_df)

(
    pd.concat(
        [edit_table_recovered, edit_table_failed],
        keys=('word_recovered', 'word_failed'),
        axis=1,
    )
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
    .round(2)
)

Unnamed: 0_level_0,class,phonetic,phonetic,typo,typo,visual,visual
Unnamed: 0_level_1,Unnamed: 1_level_1,word_failed,word_recovered,word_failed,word_recovered,word_failed,word_recovered
model,size,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2
llama2,13B,1.8,0.26,2.27,0.32,2.27,0.32
llama2,7B,2.47,0.6,3.18,0.59,2.91,0.63
mistral,7B,1.73,0.26,2.79,0.37,2.49,0.32
