In [30]:
# Copyright 2025 Luke Moffett
# Licensed under the Apache License, Version 2.0

import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import tqdm
import torch

from IPython.display import display
from clz_or_cls import analysis, hot

tqdm.tqdm.pandas()
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
device

plt.rcParams['figure.dpi'] = 100
%matplotlib inline

#### Load Original Dataset

In [None]:
hot_df = hot.hot_metadata_df()
hot_df.head(3)

In [None]:
csvs = analysis.get_hot_classification_csvs()
csvs

In [33]:
result_df = analysis.load_and_concatenate_csvs(csvs, levels=['model', 'size', 'xshot', 'ratio', 'train', 'test', 'class'])

result_df['ID'] = result_df['ID'].astype(int)
result_df['ratio'] = result_df['ratio'].astype(float)

result_df.head()

Unnamed: 0,idx,ID,score,explanation,batch,model,size,xshot,ratio,train,test,class
0,0,0,0.2,\nThe main factors in determining whether or n...,0,llama2,7B,0-shot,0.5,0shot,phonetic,offensive
1,1,1,0.6,\nThe main factors in determining whether or n...,0,llama2,7B,0-shot,0.5,0shot,phonetic,offensive
2,2,2,0.6,\nThe main factors in determining whether or n...,0,llama2,7B,0-shot,0.5,0shot,phonetic,offensive
3,3,3,0.2,\nThe main factors in determining whether or n...,0,llama2,7B,0-shot,0.5,0shot,phonetic,offensive
4,4,4,0.6,\nThe main factors in determining whether or n...,0,llama2,7B,0-shot,0.5,0shot,phonetic,offensive


In [34]:
orig_cols = result_df.columns
idx_cols = ['model', 'size', 'xshot', 'train', 'test', 'ID', 'idx', 'ratio']

keep_cols = [col for col in orig_cols if col not in idx_cols]

all_class_df = result_df[result_df['class'] == 'hateful']
all_class_df = all_class_df.set_index(idx_cols)
all_class_df.columns = pd.MultiIndex.from_tuples(
   [('hateful', col) for col in keep_cols]
)

for clazz in ['offensive', 'toxic']:
    class_results = result_df[result_df['class'] == clazz]
    class_results = class_results.set_index(idx_cols)
    class_results.columns = pd.MultiIndex.from_tuples([(clazz, col) for col in keep_cols])
    all_class_df = all_class_df.merge(class_results, left_index=True, right_index=True, how='outer')

all_class_df.columns = ["_".join(a) for a in all_class_df.columns.to_flat_index()]
all_class_df.index = all_class_df.index.droplevel('xshot').droplevel('train').droplevel('idx')
all_class_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,hateful_score,hateful_explanation,hateful_batch,hateful_class,offensive_score,offensive_explanation,offensive_batch,offensive_class,toxic_score,toxic_explanation,toxic_batch,toxic_class
model,size,test,ID,ratio,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
llama2,7B,phonetic,0,0.5,0.2,\nThe main factors in determining whether or n...,0,hateful,0.2,\nThe main factors in determining whether or n...,0,offensive,0.5,\nThe main factors in determining whether or n...,0,toxic
llama2,7B,phonetic,1,0.5,0.6,\nThe main factors in determining whether or n...,0,hateful,0.6,\nThe main factors in determining whether or n...,0,offensive,0.8,\nThe main factors in determining whether or n...,0,toxic
llama2,7B,phonetic,2,0.5,0.6,\nThe main factors in determining whether or n...,0,hateful,0.6,\nThe main factors in determining whether or n...,0,offensive,0.6,\nThe main factors in determining whether or n...,0,toxic
llama2,7B,phonetic,3,0.5,0.2,\nThe main factors in determining whether or n...,0,hateful,0.2,\nThe main factors in determining whether or n...,0,offensive,0.2,\nThe main factors in determining whether or n...,0,toxic
llama2,7B,phonetic,4,0.5,0.5,\nThe main factors in determining whether or n...,0,hateful,0.6,\nThe main factors in determining whether or n...,0,offensive,0.8,\nThe main factors in determining whether or n...,0,toxic
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mistral,7B-shielded,visual,3476,0.5,0.1,\nThe main factors in determining whether or n...,141,hateful,0.1,\nThe main factors in determining whether or n...,141,offensive,0.1,\nThe main factors in determining whether or n...,141,toxic
mistral,7B-shielded,visual,3477,0.5,0.3,\nThe main factors in determining whether or n...,141,hateful,0.7,\nThe main factors in determining whether or n...,141,offensive,0.3,\nThe main factors in determining whether or n...,141,toxic
mistral,7B-shielded,visual,3478,0.5,0.1,\nThe main factors in determining whether or n...,141,hateful,0.1,\nThe main factors in determining whether or n...,141,offensive,0.1,\nThe main factors in determining whether or n...,141,toxic
mistral,7B-shielded,visual,3479,0.5,0.1,\nThe main factors in determining whether or n...,141,hateful,0.1,\nThe main factors in determining whether or n...,141,offensive,0.1,\nThe main factors in determining whether or n...,141,toxic


In [None]:
labeled_df = hot_df.merge(all_class_df.reset_index(), on='ID')
labeled_df = labeled_df.set_index(['model', 'size', 'test', 'ratio', 'ID']).drop('Unnamed: 0_x', axis=1)
labeled_df.head(3)

In [36]:
from sklearn.metrics import accuracy_score
def find_optimal_threshold(true_labels, predicted_scores):
    best_threshold = 0.0
    best_accuracy = 0.0

    for threshold in np.linspace(0, 1, 101):
        # Convert probabilities to binary predictions based on the current threshold
        predicted_labels = (predicted_scores >= threshold).astype(int)

        # Calculate accuracy
        accuracy = accuracy_score(true_labels, predicted_labels)

        # Check if this threshold gives a better accuracy
        if accuracy > best_accuracy:
            best_accuracy = accuracy
            best_threshold = threshold

    return best_threshold, best_accuracy

thresholds = {}
for model_name in labeled_df.index.get_level_values('model').unique():
    model = labeled_df.loc[model_name, :, 'clean']
    thresholds[model_name] = {}
    for clz in ['hateful', 'offensive', 'toxic']:
        optimal_threshold, optimal_accuracy = find_optimal_threshold(model[f'{clz}_hard'], model[f'{clz}_score'])
        print(f"{clz} Optimal Threshold: {optimal_threshold}, Accuracy: {optimal_accuracy}")
        thresholds[model_name][clz] = optimal_threshold

thresholds

hateful Optimal Threshold: 0.91, Accuracy: 0.8045936395759717
offensive Optimal Threshold: 0.41000000000000003, Accuracy: 0.7724381625441696
toxic Optimal Threshold: 0.7000000000000001, Accuracy: 0.8522968197879859
hateful Optimal Threshold: 0.81, Accuracy: 0.8134275618374558
offensive Optimal Threshold: 0.7000000000000001, Accuracy: 0.757243816254417
toxic Optimal Threshold: 0.91, Accuracy: 0.8212014134275618


{'llama2': {'hateful': 0.91,
  'offensive': 0.41000000000000003,
  'toxic': 0.7000000000000001},
 'mistral': {'hateful': 0.81, 'offensive': 0.7000000000000001, 'toxic': 0.91}}

In [None]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, confusion_matrix, auc, roc_auc_score

def metric_package(model, size, test, ratio, column):
    df = labeled_df.loc[model, size, test, ratio]
    labels_all = df[f'{column}_hard']
    predictions_all = df[f'{column}_thresh']
    predictions_soft = df[f'{column}_score']

    if 'shielded' in size:
        clean_size = size.split('-')[0]
    else:
        clean_size = size

    def safe_auc(true, score):
        try:
            return round(roc_auc_score(true, score), 2)
        except ValueError:
            return np.nan

    def safe_count(field, value, norm=False):
        try:
            return field.value_counts(normalize=norm)[value]
        except KeyError:
            return 0
        
    if test != 'clean':
        clean_df = labeled_df.loc[model, clean_size, 'clean', 0.0]
        pairwise_delta = (df[f'{column}_score'].astype(float) - clean_df[f'{column}_score'].astype(float)).abs().mean()
    else:
        pairwise_delta = np.nan

    return (column,
            'Yes',
            safe_count(labels_all, True),\
            safe_count(labels_all, True, norm=True),\
            safe_count(predictions_all, True),\
            safe_count(predictions_all, True, norm=True),\
            predictions_soft.mean(),\
            predictions_soft.median(),\
            pairwise_delta,\
            round(precision_score(labels_all, predictions_all), 2),\
            round(recall_score(labels_all, predictions_all), 2),\
            round(f1_score(labels_all, predictions_all), 2),\
            round(accuracy_score(labels_all, predictions_all), 2),\
            safe_auc(labels_all, predictions_soft),\
            confusion_matrix(predictions_all, labels_all)),\
            (column,
            'No',
            safe_count(labels_all, False),\
            safe_count(labels_all, False, norm=True),\
            safe_count(predictions_all, False),\
            safe_count(predictions_all, False, norm=True),\
            predictions_soft.mean(),\
            predictions_soft.median(),\
            pairwise_delta,\
            round(precision_score(~labels_all, ~predictions_all), 2),\
            round(recall_score(~labels_all, ~predictions_all), 2),\
            round(f1_score(~labels_all, ~predictions_all), 2),\
            round(accuracy_score(labels_all, predictions_all), 2),\
            safe_auc(labels_all, predictions_soft),\
            confusion_matrix(predictions_all, labels_all))

from itertools import chain
def metric_table(model, size, test, ratio, thresholds={'hateful': .7, 'offensive': .7, 'toxic':.8}):

    labeled_df['hateful_thresh'] = labeled_df['hateful_score'] >= thresholds['hateful']
    labeled_df['offensive_thresh'] = labeled_df['offensive_score'] >= thresholds['offensive']
    labeled_df['toxic_thresh'] = labeled_df['toxic_score'] >= thresholds['toxic']

    categories = ['hateful', 'offensive', 'toxic']
    metrics = list(chain(*[metric_package(model, size, test, ratio, hot) for hot in categories]))
    return pd.DataFrame(metrics, columns=['Category', 'Class', 'Support', 'Support %', 'Prediction', 'Prediction %', 'Mean Score', 'Med Score', '∆ Score', 'Precision', 'Recall', 'F1-Score', 'Accuracy', 'AUC', 'Confusion']).set_index(['Category', 'Class'])

replication_metrics = metric_table('mistral', '7B', 'visual', .125, thresholds=thresholds['mistral'])
replication_metrics

In [38]:
replication_metrics[['Support', 'Precision', 'Recall', 'F1-Score', 'Accuracy']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Support,Precision,Recall,F1-Score,Accuracy
Category,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hateful,Yes,553,0.5,0.24,0.32,0.8
hateful,No,2277,0.84,0.94,0.89,0.8
offensive,Yes,878,0.45,0.69,0.54,0.64
offensive,No,1952,0.82,0.62,0.7,0.64
toxic,Yes,507,0.0,0.0,0.0,0.82
toxic,No,2323,0.82,1.0,0.9,0.82


In [39]:
hot_table_source_paper_df = pd.DataFrame({
    'Category': ['hateful', 'hateful', 'offensive', 'offensive', 'toxic', 'toxic'],
    'Class': ['yes', 'no', 'yes', 'no', 'yes', 'no'],
    'Support': [404, 3077, 862, 2616, 801, 2675],
    'Precision': [0.45, 0.92, 0.67, 0.86, 0.47, 0.94],
    'Recall': [0.34, 0.79, 0.55, 0.91, 0.86, 0.71],
    'F1-score': [0.39, 0.93, 0.61, 0.89, 0.61, 0.81],
    'Accuracy': [0.87, 0.87, 0.82, 0.82, 0.75, 0.75]
}).set_index(['Category', 'Class'])
hot_table_source_paper_df

Unnamed: 0_level_0,Unnamed: 1_level_0,Support,Precision,Recall,F1-score,Accuracy
Category,Class,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
hateful,yes,404,0.45,0.34,0.39,0.87
hateful,no,3077,0.92,0.79,0.93,0.87
offensive,yes,862,0.67,0.55,0.61,0.82
offensive,no,2616,0.86,0.91,0.89,0.82
toxic,yes,801,0.47,0.86,0.61,0.75
toxic,no,2675,0.94,0.71,0.81,0.75


# Results

In [40]:
labeled_df.reset_index().groupby(['model', 'size', 'test', 'ratio']).count()[['ID']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,ID
model,size,test,ratio,Unnamed: 4_level_1
llama2,13B,clean,0.0,2830
llama2,13B,phonetic,0.125,2830
llama2,13B,phonetic,0.25,2830
llama2,13B,phonetic,0.5,2830
llama2,13B,typo,0.125,2830
llama2,13B,typo,0.25,2830
llama2,13B,typo,0.5,2830
llama2,13B,visual,0.125,2830
llama2,13B,visual,0.25,2830
llama2,13B,visual,0.5,2830


In [None]:
# Assuming df is your dataframe
# Group by 'Dataset', 'Prompting', and 'Ratio' and iterate over the groups
keys = []
metric_dfs = []
for (model, size, test, ratio), group in labeled_df.groupby(level=['model', 'size', 'test', 'ratio']):
    keys.append((model, size, test, float(ratio)))
    metrics = metric_table(model, size, test, ratio, thresholds=thresholds[model])
    metric_dfs.append(metrics)

total_metric_table = pd.concat(metric_dfs, keys=keys, names=['model', 'size', 'test', 'ratio'])
total_metric_table

In [42]:
total_metric_table.loc['mistral', :, :, 0.0]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Support,Support %,Prediction,Prediction %,Mean Score,Med Score,∆ Score,Precision,Recall,F1-Score,Accuracy,AUC,Confusion
size,test,Category,Class,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
7B,clean,hateful,Yes,553,0.195406,133,0.046996,0.361519,0.3,,0.59,0.14,0.23,0.81,0.79,"[[2223, 474], [54, 79]]"
7B,clean,hateful,No,2277,0.804594,2697,0.953004,0.361519,0.3,,0.82,0.98,0.89,0.81,0.79,"[[2223, 474], [54, 79]]"
7B,clean,offensive,Yes,878,0.310247,709,0.25053,0.44417,0.4,,0.63,0.51,0.57,0.76,0.81,"[[1693, 428], [259, 450]]"
7B,clean,offensive,No,1952,0.689753,2121,0.74947,0.44417,0.4,,0.8,0.87,0.83,0.76,0.81,"[[1693, 428], [259, 450]]"
7B,clean,toxic,Yes,507,0.179152,1,0.000353,0.509435,0.6,,1.0,0.0,0.0,0.82,0.81,"[[2323, 506], [0, 1]]"
7B,clean,toxic,No,2323,0.820848,2829,0.999647,0.509435,0.6,,0.82,1.0,0.9,0.82,0.81,"[[2323, 506], [0, 1]]"


In [43]:
total_metric_table.loc[:, :, 'visual', 0.5]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Support,Support %,Prediction,Prediction %,Mean Score,Med Score,∆ Score,Precision,Recall,F1-Score,Accuracy,AUC,Confusion
model,size,Category,Class,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1
llama2,13B,hateful,Yes,553,0.195406,0,0.0,0.363746,0.2,0.120106,0.0,0.0,0.0,0.8,0.71,"[[2277, 553], [0, 0]]"
llama2,13B,hateful,No,2277,0.804594,2830,1.0,0.363746,0.2,0.120106,0.8,1.0,0.89,0.8,0.71,"[[2277, 553], [0, 0]]"
llama2,13B,offensive,Yes,878,0.310247,276,0.097527,0.340989,0.4,0.086714,0.8,0.25,0.38,0.75,0.64,"[[1896, 658], [56, 220]]"
llama2,13B,offensive,No,1952,0.689753,2554,0.902473,0.340989,0.4,0.086714,0.74,0.97,0.84,0.75,0.64,"[[1896, 658], [56, 220]]"
llama2,13B,toxic,Yes,507,0.179152,996,0.351943,0.556572,0.6,0.201272,0.33,0.65,0.44,0.7,0.7,"[[1659, 175], [664, 332]]"
llama2,13B,toxic,No,2323,0.820848,1834,0.648057,0.556572,0.6,0.201272,0.9,0.71,0.8,0.7,0.7,"[[1659, 175], [664, 332]]"
llama2,7B,hateful,Yes,553,0.195406,0,0.0,0.323428,0.2,0.09258,0.0,0.0,0.0,0.8,0.69,"[[2277, 553], [0, 0]]"
llama2,7B,hateful,No,2277,0.804594,2830,1.0,0.323428,0.2,0.09258,0.8,1.0,0.89,0.8,0.69,"[[2277, 553], [0, 0]]"
llama2,7B,offensive,Yes,878,0.310247,1378,0.486926,0.359293,0.2,0.096113,0.51,0.8,0.62,0.7,0.74,"[[1274, 178], [678, 700]]"
llama2,7B,offensive,No,1952,0.689753,1452,0.513074,0.359293,0.2,0.096113,0.88,0.65,0.75,0.7,0.74,"[[1274, 178], [678, 700]]"


In [19]:
# Uncomment to Update
# total_metric_table.to_csv('analysis/outputs/hot-llm-total-metrics.csv')

# Cache

In [21]:
total_metric_table = pd.read_csv('analysis/outputs/hot-llm-total-metrics.csv').set_index(['model', 'size', 'test', 'ratio', 'Category', 'Class'])
total_metric_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Support,Support %,Prediction,Prediction %,Mean Score,Med Score,∆ Score,Precision,Recall,F1-Score,Accuracy,AUC,Confusion
model,size,test,ratio,Category,Class,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
llama2,13B,clean,0.0,hateful,Yes,553,0.195406,0,0.000000,0.313110,0.2,,0.00,0.00,0.00,0.80,0.70,[[2277 553]\n [ 0 0]]
llama2,13B,clean,0.0,hateful,No,2277,0.804594,2830,1.000000,0.313110,0.2,,0.80,1.00,0.89,0.80,0.70,[[2277 553]\n [ 0 0]]
llama2,13B,clean,0.0,offensive,Yes,878,0.310247,375,0.132509,0.337385,0.4,,0.79,0.34,0.47,0.77,0.73,[[1874 581]\n [ 78 297]]
llama2,13B,clean,0.0,offensive,No,1952,0.689753,2455,0.867491,0.337385,0.4,,0.76,0.96,0.85,0.77,0.73,[[1874 581]\n [ 78 297]]
llama2,13B,clean,0.0,toxic,Yes,507,0.179152,434,0.153357,0.419187,0.4,,0.63,0.54,0.58,0.86,0.86,[[2162 234]\n [ 161 273]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mistral,7B-shielded,visual,0.5,hateful,No,2257,0.803203,2662,0.947331,0.418754,0.4,0.141922,0.83,0.97,0.89,0.81,0.77,[[2197 465]\n [ 60 88]]
mistral,7B-shielded,visual,0.5,offensive,Yes,878,0.312456,839,0.298577,0.507295,0.6,0.135231,0.56,0.53,0.54,0.72,0.77,[[1560 411]\n [ 372 467]]
mistral,7B-shielded,visual,0.5,offensive,No,1932,0.687544,1971,0.701423,0.507295,0.6,0.135231,0.79,0.81,0.80,0.72,0.77,[[1560 411]\n [ 372 467]]
mistral,7B-shielded,visual,0.5,toxic,Yes,507,0.180427,2,0.000712,0.561139,0.7,0.128754,1.00,0.00,0.01,0.82,0.77,[[2303 505]\n [ 0 2]]


In [22]:
total_metric_table.loc[:,:,'clean', :].groupby(['model', 'size']).mean()['AUC']

  total_metric_table.loc[:,:,'clean', :].groupby(['model', 'size']).mean()['AUC']


model    size
llama2   13B     0.763333
         7B      0.766667
mistral  7B      0.803333
Name: AUC, dtype: float64

In [23]:
total_metric_table

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Support,Support %,Prediction,Prediction %,Mean Score,Med Score,∆ Score,Precision,Recall,F1-Score,Accuracy,AUC,Confusion
model,size,test,ratio,Category,Class,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
llama2,13B,clean,0.0,hateful,Yes,553,0.195406,0,0.000000,0.313110,0.2,,0.00,0.00,0.00,0.80,0.70,[[2277 553]\n [ 0 0]]
llama2,13B,clean,0.0,hateful,No,2277,0.804594,2830,1.000000,0.313110,0.2,,0.80,1.00,0.89,0.80,0.70,[[2277 553]\n [ 0 0]]
llama2,13B,clean,0.0,offensive,Yes,878,0.310247,375,0.132509,0.337385,0.4,,0.79,0.34,0.47,0.77,0.73,[[1874 581]\n [ 78 297]]
llama2,13B,clean,0.0,offensive,No,1952,0.689753,2455,0.867491,0.337385,0.4,,0.76,0.96,0.85,0.77,0.73,[[1874 581]\n [ 78 297]]
llama2,13B,clean,0.0,toxic,Yes,507,0.179152,434,0.153357,0.419187,0.4,,0.63,0.54,0.58,0.86,0.86,[[2162 234]\n [ 161 273]]
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
mistral,7B-shielded,visual,0.5,hateful,No,2257,0.803203,2662,0.947331,0.418754,0.4,0.141922,0.83,0.97,0.89,0.81,0.77,[[2197 465]\n [ 60 88]]
mistral,7B-shielded,visual,0.5,offensive,Yes,878,0.312456,839,0.298577,0.507295,0.6,0.135231,0.56,0.53,0.54,0.72,0.77,[[1560 411]\n [ 372 467]]
mistral,7B-shielded,visual,0.5,offensive,No,1932,0.687544,1971,0.701423,0.507295,0.6,0.135231,0.79,0.81,0.80,0.72,0.77,[[1560 411]\n [ 372 467]]
mistral,7B-shielded,visual,0.5,toxic,Yes,507,0.180427,2,0.000712,0.561139,0.7,0.128754,1.00,0.00,0.01,0.82,0.77,[[2303 505]\n [ 0 2]]


In [24]:
auc_overall = total_metric_table[total_metric_table.index.get_level_values('test') != 'clean'].pivot_table(['AUC'], index=['model', 'size', 'test'], columns=['ratio']).round(2)

In [25]:
total_metric_table[total_metric_table.index.get_level_values('test') == 'visual'].pivot_table(['AUC'], index=['model', 'size', 'test'], columns=['ratio']).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AUC,AUC,AUC
Unnamed: 0_level_1,Unnamed: 1_level_1,ratio,0.125,0.250,0.500
model,size,test,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
llama2,13B,visual,0.73,0.72,0.68
llama2,7B,visual,0.75,0.74,0.73
mistral,7B,visual,0.73,0.69,0.65
mistral,7B-shielded,visual,,,0.77


In [26]:
auc_clean = total_metric_table[total_metric_table.index.get_level_values('test') == 'clean'].pivot_table(['AUC'], index=['model', 'size', 'test'], columns=['ratio']).round(2)

auc_clean.droplevel('test').merge(auc_overall, left_index=True, right_index=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,Unnamed: 1_level_1,ratio,0.000,0.125,0.250,0.500
model,size,test,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
llama2,13B,phonetic,0.76,0.74,0.74,0.71
llama2,13B,typo,0.76,0.73,0.72,0.69
llama2,13B,visual,0.76,0.73,0.72,0.68
llama2,7B,phonetic,0.77,0.76,0.74,0.73
llama2,7B,typo,0.77,0.75,0.73,0.71
llama2,7B,visual,0.77,0.75,0.74,0.73
mistral,7B,phonetic,0.8,0.75,0.74,0.7
mistral,7B,typo,0.8,0.74,0.71,0.67
mistral,7B,visual,0.8,0.73,0.69,0.65


In [None]:
for line in auc_clean.droplevel('test').merge(auc_overall, left_index=True, right_index=True).style.to_latex().split('\n'):
    print(line)

In [28]:
auc_clean = total_metric_table[total_metric_table.index.get_level_values('test') == 'clean'].pivot_table(['AUC'], index=['model', 'size', 'test'], columns=['ratio']).round(2)

auc_clean.droplevel('test').merge(auc_overall, left_index=True, right_index=True)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AUC,AUC,AUC,AUC
Unnamed: 0_level_1,Unnamed: 1_level_1,ratio,0.000,0.125,0.250,0.500
model,size,test,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
llama2,13B,phonetic,0.76,0.74,0.74,0.71
llama2,13B,typo,0.76,0.73,0.72,0.69
llama2,13B,visual,0.76,0.73,0.72,0.68
llama2,7B,phonetic,0.77,0.76,0.74,0.73
llama2,7B,typo,0.77,0.75,0.73,0.71
llama2,7B,visual,0.77,0.75,0.74,0.73
mistral,7B,phonetic,0.8,0.75,0.74,0.7
mistral,7B,typo,0.8,0.74,0.71,0.67
mistral,7B,visual,0.8,0.73,0.69,0.65


In [29]:
total_metric_table[total_metric_table.index.get_level_values('test') != 'clean'].pivot_table(['Mean Score'], index=['model', 'size', 'test'], columns=['ratio']).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean Score,Mean Score,Mean Score
Unnamed: 0_level_1,Unnamed: 1_level_1,ratio,0.125,0.250,0.500
model,size,test,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
llama2,13B,phonetic,0.37,0.39,0.41
llama2,13B,typo,0.38,0.4,0.43
llama2,13B,visual,0.39,0.4,0.42
llama2,7B,phonetic,0.36,0.37,0.4
llama2,7B,typo,0.36,0.37,0.39
llama2,7B,visual,0.36,0.36,0.37
mistral,7B,phonetic,0.52,0.57,0.63
mistral,7B,typo,0.56,0.61,0.66
mistral,7B,visual,0.6,0.65,0.69
mistral,7B-shielded,visual,,,0.5


In [30]:
total_metric_table[total_metric_table.index.get_level_values('test') == 'clean'].pivot_table(['Mean Score'], index=['model', 'size', 'test'], columns=['ratio']).round(2)

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean Score
Unnamed: 0_level_1,Unnamed: 1_level_1,ratio,0.0
model,size,test,Unnamed: 3_level_2
llama2,13B,clean,0.36
llama2,7B,clean,0.33
mistral,7B,clean,0.44


In [31]:
new_df = pd.DataFrame()
clean_table = total_metric_table[total_metric_table.index.get_level_values('test') == 'clean'].pivot_table(['Mean Score'], index=['model', 'size', 'test'], columns=['ratio'])

for model, size in ([('mistral', '7B'), ('llama2', '13B'), ('llama2', '7B')]):
    test_table = total_metric_table[(total_metric_table.index.get_level_values('test') != 'clean') & (total_metric_table.index.get_level_values('model') == model) &  (total_metric_table.index.get_level_values('size') == size)].pivot_table(['Mean Score'], index=['model', 'size', 'test'], columns=['ratio'])
    for _, ratio in test_table.columns:
        test_table[('Delta', ratio)] = test_table[('Mean Score', ratio)] - clean_table.loc[model, size, 'clean'][('Mean Score', 0.0)]
    new_df = pd.concat([new_df, test_table])
new_df.reset_index().merge(clean_table.reset_index(), on=('model', 'size'), suffixes=('', '__')).drop('test__', axis=1).set_index(['model', 'size', 'test'])[[('Mean Score', 0.0)] + [('Delta', ratio)  for ratio in [0.125, .25, .5]]].round(3)

  new_df.reset_index().merge(clean_table.reset_index(), on=('model', 'size'), suffixes=('', '__')).drop('test__', axis=1).set_index(['model', 'size', 'test'])[[('Mean Score', 0.0)] + [('Delta', ratio)  for ratio in [0.125, .25, .5]]].round(3)
  new_df.reset_index().merge(clean_table.reset_index(), on=('model', 'size'), suffixes=('', '__')).drop('test__', axis=1).set_index(['model', 'size', 'test'])[[('Mean Score', 0.0)] + [('Delta', ratio)  for ratio in [0.125, .25, .5]]].round(3)


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Mean Score,Delta,Delta,Delta
Unnamed: 0_level_1,Unnamed: 1_level_1,ratio,0.0,0.125,0.25,0.5
model,size,test,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2
mistral,7B,phonetic,0.438,0.086,0.132,0.194
mistral,7B,typo,0.438,0.117,0.168,0.225
mistral,7B,visual,0.438,0.164,0.214,0.254
llama2,13B,phonetic,0.357,0.017,0.033,0.056
llama2,13B,typo,0.357,0.027,0.048,0.077
llama2,13B,visual,0.357,0.032,0.047,0.064
llama2,7B,phonetic,0.332,0.026,0.042,0.069
llama2,7B,typo,0.332,0.028,0.036,0.054
llama2,7B,visual,0.332,0.026,0.033,0.037


In [43]:
total_metric_table.loc['llama2'].reset_index().groupby(['size', 'test', 'Category', 'ratio']).count()[['Class']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Class
size,test,Category,ratio,Unnamed: 4_level_1
13B,clean,hateful,0.0,2
13B,clean,offensive,0.0,2
13B,clean,toxic,0.0,2
13B,phonetic,hateful,0.125,2
13B,phonetic,hateful,0.25,2
13B,phonetic,hateful,0.5,2
13B,phonetic,offensive,0.125,2
13B,phonetic,offensive,0.25,2
13B,phonetic,offensive,0.5,2
13B,phonetic,toxic,0.125,2


In [33]:
total_metric_table.loc['llama2', :, 'clean'][['AUC']]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,AUC
size,ratio,Category,Class,Unnamed: 4_level_1
13B,0.0,hateful,Yes,0.7
13B,0.0,hateful,No,0.7
13B,0.0,offensive,Yes,0.73
13B,0.0,offensive,No,0.73
13B,0.0,toxic,Yes,0.86
13B,0.0,toxic,No,0.86
7B,0.0,hateful,Yes,0.69
7B,0.0,hateful,No,0.69
7B,0.0,offensive,Yes,0.78
7B,0.0,offensive,No,0.78


In [34]:
display(total_metric_table.loc['mistral', :, 'clean', 0.0][['AUC']])
display(total_metric_table.loc['mistral', :, 'visual', 0.5][['AUC']])

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AUC
size,Category,Class,Unnamed: 3_level_1
7B,hateful,Yes,0.79
7B,hateful,No,0.79
7B,offensive,Yes,0.81
7B,offensive,No,0.81
7B,toxic,Yes,0.81
7B,toxic,No,0.81


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,AUC
size,Category,Class,Unnamed: 3_level_1
7B,hateful,Yes,0.68
7B,hateful,No,0.68
7B,offensive,Yes,0.61
7B,offensive,No,0.61
7B,toxic,Yes,0.66
7B,toxic,No,0.66
7B-shielded,hateful,Yes,0.77
7B-shielded,hateful,No,0.77
7B-shielded,offensive,Yes,0.77
7B-shielded,offensive,No,0.77


# Statistical Difference Tests

In [35]:
from scipy.stats import ttest_rel

# Example data: Replace these with your actual datasets
rows = []
for (model, size) in [('mistral', '7B'), ('llama2', '7B'), ('llama2', '13B')]:
    for ratio in labeled_df.index.get_level_values('ratio').unique():
        if ratio == 0.0:
            continue
        for attack in ['phonetic', 'typo']:
            for clazz in ['hateful', 'offensive', 'toxic']:
                try:
                    clean_score = labeled_df.loc[model, size, 'clean', 0.0][f'{clazz}_score']
                    baseline_score = labeled_df.loc[model, size, attack, ratio][f'{clazz}_score']

                    score_base = (baseline_score - clean_score).abs()

                    visual_score = labeled_df.loc[model, size, 'visual', ratio][f'{clazz}_score']

                    score_visual = (visual_score - clean_score).abs()

                except:
                    continue

                # # Perform the paired t-test
                stat, p = ttest_rel(score_base, score_visual, alternative='less')
                delta = (score_base - score_visual).mean()

                sig_0_5 = p < 0.05
                sig_bon_ratio = p < (0.05/18)
                sig_bon = p < (0.05/54)
                
                rows.append((model, size, ratio, clazz, attack, stat, p, delta,  sig_0_5, sig_bon_ratio, sig_bon))

sig_tests =pd.DataFrame(rows, columns = ['model', 'size', 'ratio', 'clazz', 'baseline', 't-stat', 'p-value', 'mean', 'sig@.05', 'sig_bon_ratio', 'sig_bon_all'])
display(sig_tests.groupby(['model', 'size', 'ratio', 'sig_bon_ratio']).count()[['clazz']])

  clean_score = labeled_df.loc[model, size, 'clean', 0.0][f'{clazz}_score']
  baseline_score = labeled_df.loc[model, size, attack, ratio][f'{clazz}_score']
  visual_score = labeled_df.loc[model, size, 'visual', ratio][f'{clazz}_score']


Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,clazz
model,size,ratio,sig_bon_ratio,Unnamed: 4_level_1
llama2,13B,0.125,False,1
llama2,13B,0.125,True,5
llama2,13B,0.25,False,3
llama2,13B,0.25,True,3
llama2,13B,0.5,False,4
llama2,13B,0.5,True,2
llama2,7B,0.125,False,4
llama2,7B,0.125,True,2
llama2,7B,0.25,False,6
llama2,7B,0.5,False,6


In [36]:
(0.05/18), (0.05/54)

(0.002777777777777778, 0.000925925925925926)

In [37]:
sig_tests.set_index(['model', 'size', 'ratio', 'baseline']).loc['llama2', '13B'][['clazz', 'sig_bon_ratio']]

  sig_tests.set_index(['model', 'size', 'ratio', 'baseline']).loc['llama2', '13B'][['clazz', 'sig_bon_ratio']]


Unnamed: 0_level_0,Unnamed: 1_level_0,clazz,sig_bon_ratio
ratio,baseline,Unnamed: 2_level_1,Unnamed: 3_level_1
0.5,phonetic,hateful,False
0.5,phonetic,offensive,True
0.5,phonetic,toxic,True
0.5,typo,hateful,False
0.5,typo,offensive,False
0.5,typo,toxic,False
0.125,phonetic,hateful,True
0.125,phonetic,offensive,True
0.125,phonetic,toxic,True
0.125,typo,hateful,True


In [38]:
sig_tests.set_index(['model', 'size', 'ratio', 'baseline']).loc['llama2', '13B'][['clazz', 'sig_bon_ratio']]

  sig_tests.set_index(['model', 'size', 'ratio', 'baseline']).loc['llama2', '13B'][['clazz', 'sig_bon_ratio']]


Unnamed: 0_level_0,Unnamed: 1_level_0,clazz,sig_bon_ratio
ratio,baseline,Unnamed: 2_level_1,Unnamed: 3_level_1
0.5,phonetic,hateful,False
0.5,phonetic,offensive,True
0.5,phonetic,toxic,True
0.5,typo,hateful,False
0.5,typo,offensive,False
0.5,typo,toxic,False
0.125,phonetic,hateful,True
0.125,phonetic,offensive,True
0.125,phonetic,toxic,True
0.125,typo,hateful,True


In [39]:
sig_tests.groupby(['model', 'size', 'sig_bon_ratio']).count()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,ratio,clazz,baseline,t-stat,p-value,mean,sig@.05,sig_bon_all
model,size,sig_bon_ratio,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
llama2,13B,False,8,8,8,8,8,8,8,8
llama2,13B,True,10,10,10,10,10,10,10,10
llama2,7B,False,16,16,16,16,16,16,16,16
llama2,7B,True,2,2,2,2,2,2,2,2
mistral,7B,True,18,18,18,18,18,18,18,18


In [40]:
import math
round_to_3 = lambda x: x if x == 0 or np.isnan(x) else round(x, -int(math.floor(math.log10(abs(x)))) + (4 - 1))

sig_tests_for_print = sig_tests.set_index(['model', 'size', 'ratio', 'baseline', 'clazz']).rename({'t-stat':'T-stat', 'p-value': 'P-value', 'sig@.05': 'Sig. at 0.05', 'mean': 'Diff-in-Diff', 'sig_bon_ratio': 'Sig. BC on Model', 'sig_bon_all':'Sig. BC on All'}, axis=1)

cols = ['Diff-in-Diff', 'T-stat', 'P-value']

sig_tests
for col in cols:
    sig_tests_for_print[col] = sig_tests_for_print[col].apply(round_to_3)

print_cols = cols + ['Sig. at 0.05', 'Sig. BC on Model', 'Sig. BC on All']
sig_tests_for_print[print_cols]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Diff-in-Diff,T-stat,P-value,Sig. at 0.05,Sig. BC on Model,Sig. BC on All
model,size,ratio,baseline,clazz,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
mistral,7B,0.5,phonetic,hateful,-0.04788,-11.04,4.727e-28,True,True,True
mistral,7B,0.5,phonetic,offensive,-0.07102,-15.77,4.7260000000000005e-54,True,True,True
mistral,7B,0.5,phonetic,toxic,-0.05431,-12.93,1.6789999999999999e-37,True,True,True
mistral,7B,0.5,typo,hateful,-0.02385,-5.679,7.476e-09,True,True,True
mistral,7B,0.5,typo,offensive,-0.02141,-5.149,1.396e-07,True,True,True
mistral,7B,0.5,typo,toxic,-0.02102,-5.439,2.904e-08,True,True,True
mistral,7B,0.125,phonetic,hateful,-0.04866,-12.01,9.354e-33,True,True,True
mistral,7B,0.125,phonetic,offensive,-0.08806,-17.96,1.0300000000000001e-68,True,True,True
mistral,7B,0.125,phonetic,toxic,-0.05989,-13.21,5.5670000000000005e-39,True,True,True
mistral,7B,0.125,typo,hateful,-0.02855,-7.17,4.769e-13,True,True,True


In [None]:
sig_tests_for_print

for line in sig_tests_for_print[print_cols].to_latex().split('\n'):
    print(line)