In [1]:
import pandas
from sklearn.metrics import cohen_kappa_score
from  nltk.metrics import agreement
import numpy as np
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score

In [2]:
df1 = pandas.read_csv('../data/inequality_dataset_llama3_1-70b-Inequality-NoDefinition.csv', index_col = 0)
df2 = pandas.read_csv('../data/inequality_dataset_llama3-70b-Inequality-NoDefinition.csv', index_col = 0)
df3 = pandas.read_csv('../data/inequality_dataset_gemma2-27b-Inequality-NoDefinition.csv', index_col = 0)
df4 = pandas.read_csv('../data/inequality_dataset_GPT_Inequality_NoDefinition.csv', index_col=0)

In [4]:
df = pandas.DataFrame()
df['llama31'] = df1['gen_code']
df['llama3'] = df2['gen_code']
df['gemma'] = df3['gen_code']
df['gpt'] = df4['gen_code']

labels = [int(code_i in (1, 2)) for code_i in df1.code]

df['code'] = labels

df.head()

Unnamed: 0,llama31,llama3,gemma,gpt,code
0,0,0,0,0,0
1,0,0,1,1,1
2,1,1,1,1,1
3,0,0,0,0,0
4,0,0,1,0,0


# Metrics without GPT

In [11]:
data = []
for idx, row in df.iterrows():
    data.append(("a1", idx, row["llama31"]))
    data.append(("a2", idx, row["llama3"]))
    data.append(("a3", idx, row["gemma"]))
data[0]

('a1', 0, 0)

In [12]:
atask = agreement.AnnotationTask(data=data)

In [13]:
print("Fleiss's Kappa:", atask.multi_kappa())

Fleiss's Kappa: 0.6653988328512997


## Metrics for ensemble method

In [14]:
ensemble = []

for idx, row in df[['llama31', 'llama3', 'gemma', 'gpt']].iterrows():
    #print(list(row))
    ensemble.append((max(set(list(row)), key=list(row).count)))

df['ensemble'] = ensemble

In [15]:
y_true = df.code
y_pred = df.ensemble

# calculate the precision and recall scores
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, pos_label=0)
cm = confusion_matrix(y_true, y_pred)


conf_matrix = confusion_matrix(y_true, y_pred)
# Extract true positives for each class
true_positives = np.diag(conf_matrix)

# Calculate precision for each class
precision_per_class = precision_score(y_true, y_pred, average=None)

# Calculate proportion of true positives for each class
total_true_positives = np.sum(true_positives)
proportions = true_positives / total_true_positives

weighted_avg_precision = np.sum(precision_per_class * proportions)

    
print("\nWeighted average metrics:\n")

precision_per_class = precision_score(y_true, y_pred, average=None)
print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
weighted_avg_precision = np.sum(precision_per_class * proportions)
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
recall_per_class = recall_score(y_true, y_pred, average=None)
print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
weighted_avg_recall = np.sum(recall_per_class * proportions)
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
# Calculate F1 score for each class
f1_per_class = f1_score(y_true, y_pred, average=None)
print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
# Calculate weighted average F1 score
weighted_avg_f1 = np.sum(f1_per_class * proportions)
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")


Weighted average metrics:

Precision for each class: ['0.65', '0.82']
Weighted Average Precision 0.74
Recall for each class: ['0.82', '0.65']
Weighted Average Recall: 0.74
F1 Score for each class: ['0.73', '0.73']
Weighted Average F1 Score: 0.73


In [16]:
df_agree = df[(df['llama31'] == df['llama3']) & (df['llama31'] == df['gemma']) & (df['llama3'] == df['gemma'])]
y_true = df_agree.code.astype(int)
y_pred = df_agree.gemma.astype(int)

# precision/recall of the 0 categories
## flipping the labels -> flipping precision/ recall -> high recall
# calculate the precision and recall scores
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, pos_label=0)
cm = confusion_matrix(y_true, y_pred)


conf_matrix = confusion_matrix(y_true, y_pred)
# Extract true positives for each class
true_positives = np.diag(conf_matrix)

# Calculate precision for each class
#precision_per_class = precision_score(y_true, y_pred, average=None)
weighted_avg_precision = precision_score(y_true, y_pred, average='weighted')



# Calculate proportion of true positives for each class
total_true_positives = np.sum(true_positives)
proportions = true_positives / total_true_positives

weighted_avg_precision = np.sum(precision_per_class * proportions)

    
print("\nWeighted average metrics:\n")

precision_per_class = precision_score(y_true, y_pred, average=None)
print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
weighted_avg_precision = np.sum(precision_per_class * proportions)
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
recall_per_class = recall_score(y_true, y_pred, average=None)
print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
weighted_avg_recall = np.sum(recall_per_class * proportions)
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
# Calculate F1 score for each class
f1_per_class = f1_score(y_true, y_pred, average=None)
print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
# Calculate weighted average F1 score
weighted_avg_f1 = np.sum(f1_per_class * proportions)
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")

print(f"N = {len(df_agree)}")
print(f"%agreement = {len(df_agree) / len(df)}")


Weighted average metrics:

Precision for each class: ['0.72', '0.84']
Weighted Average Precision 0.78
Recall for each class: ['0.82', '0.75']
Weighted Average Recall: 0.78
F1 Score for each class: ['0.77', '0.79']
Weighted Average F1 Score: 0.78
N = 587
%agreement = 0.7468193384223919


# Metrics with GPT

Note these are not reported in the paper.

In [5]:
data = []
for idx, row in df.iterrows():
    data.append(("a1", idx, row["llama31"]))
    data.append(("a2", idx, row["llama3"]))
    data.append(("a3", idx, row["gemma"]))
    data.append(("a4", idx, row["gpt"]))
data[0]

('a1', 0, 0)

In [6]:
atask = agreement.AnnotationTask(data=data)

In [7]:
print("Fleiss's Kappa:", atask.multi_kappa())

Fleiss's Kappa: 0.6813220332065631


## Metrics for ensemble method

In [8]:
ensemble = []

for idx, row in df[['llama31', 'llama3', 'gemma', 'gpt']].iterrows():
    #print(list(row))
    ensemble.append((max(set(list(row)), key=list(row).count)))

df['ensemble'] = ensemble

In [9]:
y_true = df.code
y_pred = df.ensemble

# calculate the precision and recall scores
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, pos_label=0)
cm = confusion_matrix(y_true, y_pred)


conf_matrix = confusion_matrix(y_true, y_pred)
# Extract true positives for each class
true_positives = np.diag(conf_matrix)

# Calculate precision for each class
precision_per_class = precision_score(y_true, y_pred, average=None)

# Calculate proportion of true positives for each class
total_true_positives = np.sum(true_positives)
proportions = true_positives / total_true_positives

weighted_avg_precision = np.sum(precision_per_class * proportions)

    
print("\nWeighted average metrics:\n")

precision_per_class = precision_score(y_true, y_pred, average=None)
print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
weighted_avg_precision = np.sum(precision_per_class * proportions)
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
recall_per_class = recall_score(y_true, y_pred, average=None)
print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
weighted_avg_recall = np.sum(recall_per_class * proportions)
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
# Calculate F1 score for each class
f1_per_class = f1_score(y_true, y_pred, average=None)
print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
# Calculate weighted average F1 score
weighted_avg_f1 = np.sum(f1_per_class * proportions)
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")


Weighted average metrics:

Precision for each class: ['0.65', '0.82']
Weighted Average Precision 0.74
Recall for each class: ['0.82', '0.65']
Weighted Average Recall: 0.74
F1 Score for each class: ['0.73', '0.73']
Weighted Average F1 Score: 0.73


In [10]:
df_agree = df[(df['llama31'] == df['llama3']) & (df['llama31'] == df['gemma']) & (df['llama3'] == df['gemma'])]
y_true = df_agree.code.astype(int)
y_pred = df_agree.gemma.astype(int)

# precision/recall of the 0 categories
## flipping the labels -> flipping precision/ recall -> high recall
# calculate the precision and recall scores
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
accuracy = accuracy_score(y_true, y_pred)
f1 = f1_score(y_true, y_pred, pos_label=0)
cm = confusion_matrix(y_true, y_pred)


conf_matrix = confusion_matrix(y_true, y_pred)
# Extract true positives for each class
true_positives = np.diag(conf_matrix)

# Calculate precision for each class
#precision_per_class = precision_score(y_true, y_pred, average=None)
weighted_avg_precision = precision_score(y_true, y_pred, average='weighted')



# Calculate proportion of true positives for each class
total_true_positives = np.sum(true_positives)
proportions = true_positives / total_true_positives

weighted_avg_precision = np.sum(precision_per_class * proportions)

    
print("\nWeighted average metrics:\n")

precision_per_class = precision_score(y_true, y_pred, average=None)
print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
weighted_avg_precision = np.sum(precision_per_class * proportions)
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
recall_per_class = recall_score(y_true, y_pred, average=None)
print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
weighted_avg_recall = np.sum(recall_per_class * proportions)
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
# Calculate F1 score for each class
f1_per_class = f1_score(y_true, y_pred, average=None)
print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
# Calculate weighted average F1 score
weighted_avg_f1 = np.sum(f1_per_class * proportions)
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")

print(f"N = {len(df_agree)}")
print(f"%agreement = {len(df_agree) / len(df)}")


Weighted average metrics:

Precision for each class: ['0.72', '0.84']
Weighted Average Precision 0.78
Recall for each class: ['0.82', '0.75']
Weighted Average Recall: 0.78
F1 Score for each class: ['0.77', '0.79']
Weighted Average F1 Score: 0.78
N = 587
%agreement = 0.7468193384223919
