In [72]:
%load_ext autoreload
%autoreload 2

import torch
import pandas as pd
import pickle
import numpy as np
from src.utils.file_utils import load_pickle_from_gpu
from src.utils.results_utils import *

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [73]:
# Data splits (identified by seed) to evaluate, and the logit threshold that is
# passed to get_results further down.
seeds = ["0","2","4","6","7"]
threshold_logit = -0.63

# One prompt table per seed, keyed by the seed string.
dfs = {
    seed: pd.read_csv(f"data/circuit_identification_data/final_toxicity_prompts_{seed}.csv")
    for seed in seeds
}

In [74]:
scales = ["baseline","0.1","0.2","0.3","0.4","0.5"]

# One nested store per quantity returned by get_results: store[seed][scale].
var_rank = {seed: {} for seed in seeds}
var_rank_abs = {seed: {} for seed in seeds}
std = {seed: {} for seed in seeds}
acc = {seed: {} for seed in seeds}
mean_majority = {seed: {} for seed in seeds}
mean_b_ = None
std_b_ = None

for seed in seeds:
    # Load the baseline run for this seed *before* any scaled run. The scaled
    # runs receive the baseline's mean_b_/std_b_, so computing them up front
    # removes the hidden requirement that "baseline" be the first entry of
    # `scales` (otherwise None or the previous seed's values would be used).
    # NOTE(review): the baseline file name carries "scaleby0.1"; presumably the
    # scale is irrelevant when no edges are ablated -- confirm.
    baseline_file = f"work/bias_abl/results_abl_no_edges_bias.json_scaleby0.1_split{seed}.pkl"
    var_rank_, var_rank_abs_, std_, acc_, mean_majority_, mean_b_, std_b_ = get_results(baseline_file, dfs[seed], threshold_logit)
    baseline_results = [var_rank_, var_rank_abs_, std_, acc_, mean_majority_]

    for scale in scales:
        if scale == "baseline":
            # Already loaded above; reuse instead of reading the file again.
            results = baseline_results
        else:
            file_name = f"work/bias_abl/results_abl_ablated_bias-3_EAP-IG-KL_step7000_6908edges_toxicity-samp_EAP-IG_step19000_18922edges.json_scaleby{scale}_split{seed}.pkl"

            # The last two return values are ignored here; only the baseline
            # call above supplies mean_b_/std_b_.
            var_rank_, var_rank_abs_, std_, acc_, mean_majority_, _, _ = get_results(file_name, dfs[seed], threshold_logit, mean_b_, std_b_)
            results = [var_rank_, var_rank_abs_, std_, acc_, mean_majority_]

        for res, res_dict in zip(results, [var_rank, var_rank_abs, std, acc, mean_majority]):
            res_dict[seed][scale] = res

In [75]:
# group the results

# One row per (seed, scale) with the three scalar metrics.
data = [
    [seed, scale, std[seed][scale], acc[seed][scale], mean_majority[seed][scale]]
    for seed in seeds
    for scale in scales
]

df = pd.DataFrame(data, columns = ['seed', 'scale', 'std', 'acc', 'mean_majority'])

# Add columns with the relative change from the same seed's baseline row,
# (value - baseline) / baseline. The result is a fraction, not multiplied by 100.
# Computed column-wise so the new columns are numeric (float) rather than
# object dtype, which is what initialising them with None produced.
change_metrics = ['std', 'acc', 'mean_majority']
baseline_by_seed = df.loc[df['scale'] == 'baseline'].set_index('seed')[change_metrics]

for metric in change_metrics:
    # Broadcast each seed's baseline value onto all rows of that seed.
    baseline = df['seed'].map(baseline_by_seed[metric])
    df[f'{metric}_change'] = (df[metric] - baseline) / baseline


In [76]:
# Preview the first rows of the per-seed, per-scale results table.
df.head()

Unnamed: 0,seed,scale,std,acc,mean_majority,std_change,acc_change,mean_majority_change
0,0,baseline,0.206837,0.793403,34.520833,0.0,0.0,0.0
1,0,0.1,0.202605,0.804977,34.3125,-0.020465,0.014588,-0.006035
2,0,0.2,0.19165,0.815972,34.541667,-0.073425,0.028446,0.000604
3,0,0.3,0.179449,0.820602,34.416667,-0.132417,0.034282,-0.003018
4,0,0.4,0.172096,0.786458,33.604167,-0.167965,-0.008753,-0.026554


In [77]:
# transform df to format where rows are seeds and columns are scales
metrics = df.columns[2:]

df_pivot = df.pivot(index='seed', columns='scale')

data = []

for scale in scales:
    d = [scale]
    for metric in metrics:
        d.append(df_pivot[metric][scale].mean())
        d.append(df_pivot[metric][scale].std())
    data.append(d)
                 
cols = ['scale']
for metric in metrics:
    cols.append(f"{metric}_mean")
    cols.append(f"{metric}_std")
    
df_scale = pd.DataFrame(data, columns = cols)

In [78]:
# Show, per scale, the across-seed mean and std of the relative changes in `std` and `acc`.
df_scale[['scale', 'std_change_mean', 'std_change_std', 'acc_change_mean', 'acc_change_std']]

Unnamed: 0,scale,std_change_mean,std_change_std,acc_change_mean,acc_change_std
0,baseline,0.0,0.0,0.0,0.0
1,0.1,-0.014902,0.005546,0.024803,0.01291
2,0.2,-0.066227,0.010558,0.037487,0.014754
3,0.3,-0.126126,0.016718,0.033992,0.020695
4,0.4,-0.158241,0.022085,0.002172,0.038805
5,0.5,-0.152003,0.026618,-0.064827,0.087505


In [79]:
# Persist the per-scale summary table as CSV (index column omitted).
df_scale.to_csv('work/saved/bias/debiasing_results.csv', index=False)

In [86]:
# rankings for 0.3 vs baseline
rank_b = compute_region_stats([var_rank[str(i)]['baseline'] for i in seeds])
rank_03 = compute_region_stats([var_rank[str(i)]['0.3'] for i in seeds]) 
# calculated difference between 0.3 and baseline across seeds
rank_diff =  compute_region_stats([ (var_rank[str(i)]['0.3']-var_rank[str(i)]['baseline'])/var_rank[str(i)]['baseline']\
                                for i in seeds])

In [87]:
# Attach the baseline and relative-difference statistics to the 0.3 table.
#
# Assigning a column taken from another DataFrame aligns on index labels, not
# on row position, so sorting by 'region' alone does not guarantee that values
# end up next to the right region. Look the values up explicitly by region
# instead (assumes one row per region in each frame -- TODO confirm).
rank_b = rank_b.sort_values('region')
rank_03 = rank_03.sort_values('region')
rank_diff = rank_diff.sort_values('region')

baseline_by_region = rank_b.set_index('region')
diff_by_region = rank_diff.set_index('region')

rank_03['mean_score_baseline'] = rank_03['region'].map(baseline_by_region['mean_score'])
rank_03['mean_score_diff'] = rank_03['region'].map(diff_by_region['mean_score'])
rank_03['std_score_baseline'] = rank_03['region'].map(baseline_by_region['std_var'])
rank_03['std_score_diff'] = rank_03['region'].map(diff_by_region['std_var'])

# Present regions from the highest to the lowest baseline mean score.
rank_03 = rank_03.sort_values('mean_score_baseline', ascending=False)

In [88]:
# Display the per-region table: 0.3 statistics alongside the baseline and relative-difference columns.
rank_03

Unnamed: 0,region,mean_score,std_var,mean_score_baseline,mean_score_diff,std_score_baseline,std_score_diff
10,Southern Europe,0.092209,0.016317,0.113712,-0.187519,0.021627,0.022317
0,Central America,0.057845,0.009175,0.071304,-0.181599,0.015392,0.054979
6,Oceania,0.033943,0.015428,0.063566,-0.478935,0.006588,0.189409
5,Northern Europe,0.041493,0.009865,0.055175,-0.25661,0.008192,0.089317
11,Western Europe,0.007562,0.004557,0.053556,-0.866767,0.008977,0.066297
4,North America,0.032828,0.006452,0.052628,-0.372835,0.005173,0.133916
7,South America,0.020796,0.013527,0.034528,-0.437523,0.012242,0.250599
2,Eastern Europe,0.025678,0.019342,-0.00165,-2.578545,0.026333,2.168275
1,East Asia,-0.047379,0.015444,-0.05595,-0.162325,0.008651,0.188115
3,Middle East,-0.044824,0.026365,-0.060837,-0.2985,0.018222,0.330261


In [89]:
# Persist the numeric per-region table as CSV (index column omitted).
rank_03.to_csv('work/saved/bias/debiasing_rankings.csv', index=False)

In [90]:
#cleaner version for latex

# Round only the numeric columns to 3 decimals. Calling .round() on every
# column would also hit the string `region` column, which relies on
# version-dependent pandas handling of non-numeric data.
numeric_cols = rank_03.select_dtypes(include='number').columns
rank_03[numeric_cols] = rank_03[numeric_cols].round(3)

# Express the relative differences as percentages with one decimal.
rank_03['std_score_diff'] = (rank_03['std_score_diff'] * 100).round(1)
rank_03['mean_score_diff'] = (rank_03['mean_score_diff'] * 100).round(1)

#all to string
# NOTE: astype(str) drops trailing zeros (e.g. "0.01" rather than "0.010").
# NOTE: from here on rank_03 holds strings, so this cell cannot be re-run
# without first re-running the cells that build the numeric rank_03.
rank_03 = rank_03.astype(str)

# Combine each mean with its spread into a single "mean (±std)" cell.
rank_03['mean_score_diff'] = rank_03['mean_score_diff'] + rank_03['std_score_diff'].apply(lambda x: f' (±{x} %)')
rank_03['mean_score'] = rank_03['mean_score'] + rank_03['std_var'].apply(lambda x: f' (±{x})')
rank_03['mean_score_baseline'] = rank_03['mean_score_baseline'] + rank_03['std_score_baseline'].apply(lambda x: f' (±{x})')

rank_03[['region','mean_score_baseline', 'mean_score','mean_score_diff']].to_csv('work/saved/bias/debiasing_rankings_formated.csv', index=False)

In [91]:
# Display the formatted columns that were written to the LaTeX-oriented CSV.
rank_03[['region','mean_score_baseline', 'mean_score','mean_score_diff']]

Unnamed: 0,region,mean_score_baseline,mean_score,mean_score_diff
10,Southern Europe,0.114 (±0.022),0.092 (±0.016),-18.8 (±2.2 %)
0,Central America,0.071 (±0.015),0.058 (±0.009),-18.2 (±5.5 %)
6,Oceania,0.064 (±0.007),0.034 (±0.015),-47.9 (±18.9 %)
5,Northern Europe,0.055 (±0.008),0.041 (±0.01),-25.7 (±8.9 %)
11,Western Europe,0.054 (±0.009),0.008 (±0.005),-86.7 (±6.6 %)
4,North America,0.053 (±0.005),0.033 (±0.006),-37.3 (±13.4 %)
7,South America,0.035 (±0.012),0.021 (±0.014),-43.8 (±25.1 %)
2,Eastern Europe,-0.002 (±0.026),0.026 (±0.019),-257.9 (±216.8 %)
1,East Asia,-0.056 (±0.009),-0.047 (±0.015),-16.2 (±18.8 %)
3,Middle East,-0.061 (±0.018),-0.045 (±0.026),-29.8 (±33.0 %)
