In [19]:
import pandas
from sklearn.metrics import cohen_kappa_score
from  nltk.metrics import agreement
import numpy as np
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score

In [20]:
# Load the three generated-code CSVs (one per prompt variant, judging by the
# file names: no definition, researcher definition, Llama 3.1 definition).
# The first CSV column is used as the row index in every frame.
df1, df2, df3 = (
    pandas.read_csv(csv_path, index_col=0)
    for csv_path in (
        '../data/inequality_dataset_llama3_1-70b-Inequality-NoDefinition.csv',
        '../data/inequality_dataset_llama3_1-70b-Inequality-ResearcherDefinition.csv',
        '../data/inequality_dataset_llama3_1-70b-Inequality-Llama3_1Definition.csv',
    )
)

In [21]:
# Collect the generated code of each prompt variant side by side.
# Columns are added one after another to an initially empty frame, so the
# first column (from df1) fixes the index and the others are aligned to it.
df = pandas.DataFrame().assign(
    nodef=df1['gen_code'],
    redef=df2['gen_code'],
    llmdef=df3['gen_code'],
)

# Binarize the reference code from df1: values 1 and 2 become 1, anything
# else becomes 0.
labels = [1 if value in (1, 2) else 0 for value in df1['code']]
df['code'] = labels

In [22]:
# Flatten the frame into (coder, item, label) triples, the format passed to
# nltk's AnnotationTask below. Each prompt variant acts as one coder:
# a1 = nodef, a2 = redef, a3 = llmdef.
data = []
for idx, row in df.iterrows():
    data.extend(
        (coder, idx, row[column])
        for coder, column in (("a1", "nodef"), ("a2", "redef"), ("a3", "llmdef"))
    )

# Show the first triple as a quick sanity check.
data[0]

('a1', 0, 0)

In [23]:
# Build the inter-annotator agreement task from the (coder, item, label) triples.
atask = agreement.AnnotationTask(data)

In [24]:
# Chance-corrected agreement across the three coders.
# NOTE(review): NLTK documents multi_kappa() as the Davies & Fleiss (1982)
# multi-coder kappa — confirm this is the statistic the label below refers to.
kappa_value = atask.multi_kappa()
print("Fleiss's Kappa:", kappa_value)

Fleiss's Kappa: 0.7826510573854957


## Metrics for ensemble method

In [25]:
# Majority vote over the three prompt variants, one row at a time.
vote_columns = ['nodef', 'redef', 'llmdef']
ensemble = []

for idx, row in df[vote_columns].iterrows():
    votes = list(row)
    # Pick the value that occurs most often among the three votes.
    # With only two distinct labels a strict majority always exists; if all
    # three votes differed, the winner would depend on set iteration order.
    ensemble.append(max(set(votes), key=votes.count))

df['ensemble'] = ensemble

In [26]:
# Evaluate the majority-vote ensemble against the binarized reference code.
y_true, y_pred = df['code'], df['ensemble']

# Scalar scores that treat label 0 as the positive class, plus overall accuracy.
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
f1 = f1_score(y_true, y_pred, pos_label=0)
accuracy = accuracy_score(y_true, y_pred)

# Confusion matrix, kept under both names used in this notebook.
cm = confusion_matrix(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

# Class weights: each class's share of the correctly classified rows
# (the diagonal of the confusion matrix).
# NOTE(review): this is not the same as sklearn's average='weighted', which
# weights classes by their support — confirm this weighting is intended.
true_positives = np.diag(conf_matrix)
total_true_positives = true_positives.sum()
proportions = true_positives / total_true_positives

# Per-class scores (one entry per label, in sorted label order).
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

# Collapse the per-class scores with the true-positive weights.
weighted_avg_precision = (precision_per_class * proportions).sum()
weighted_avg_recall = (recall_per_class * proportions).sum()
weighted_avg_f1 = (f1_per_class * proportions).sum()

print("\nWeighted average metrics:\n")

print(f"Precision for each class: {[format(p, '.2f') for p in precision_per_class]}")
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
print(f"Recall for each class: {[format(r, '.2f') for r in recall_per_class]}")
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
print(f"F1 Score for each class: {[format(f, '.2f') for f in f1_per_class]}")
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")


Weighted average metrics:

Precision for each class: ['0.65', '0.80']
Weighted Average Precision 0.73
Recall for each class: ['0.79', '0.67']
Weighted Average Recall: 0.73
F1 Score for each class: ['0.71', '0.73']
Weighted Average F1 Score: 0.72


In [27]:
# Restrict the evaluation to rows on which all three prompt variants agree.
unanimous = (df['nodef'] == df['redef']) & (df['nodef'] == df['llmdef'])
df_agree = df[unanimous]

# All three predictions are identical on these rows, so 'nodef' stands in for
# the shared prediction.
y_true = df_agree['code'].astype(int)
y_pred = df_agree['nodef'].astype(int)

# Scalar scores are computed for the 0 category (pos_label=0); flipping the
# positive label swaps which class precision and recall refer to.
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
f1 = f1_score(y_true, y_pred, pos_label=0)
accuracy = accuracy_score(y_true, y_pred)

# Confusion matrix, kept under both names used in this notebook.
cm = confusion_matrix(y_true, y_pred)
conf_matrix = confusion_matrix(y_true, y_pred)

# Class weights: each class's share of the correctly classified rows
# (the diagonal of the confusion matrix).
# NOTE(review): this is not the same as sklearn's average='weighted', which
# weights classes by their support — confirm this weighting is intended.
true_positives = np.diag(conf_matrix)
total_true_positives = true_positives.sum()
proportions = true_positives / total_true_positives

# Per-class scores (one entry per label, in sorted label order).
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

# Collapse the per-class scores with the true-positive weights.
weighted_avg_precision = (precision_per_class * proportions).sum()
weighted_avg_recall = (recall_per_class * proportions).sum()
weighted_avg_f1 = (f1_per_class * proportions).sum()

print("\nWeighted average metrics:\n")

print(f"Precision for each class: {[format(p, '.2f') for p in precision_per_class]}")
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
print(f"Recall for each class: {[format(r, '.2f') for r in recall_per_class]}")
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
print(f"F1 Score for each class: {[format(f, '.2f') for f in f1_per_class]}")
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")

# Size of the unanimous subset and its share of all rows (printed as a
# fraction between 0 and 1, not as a percentage).
print(f"N = {len(df_agree)}")
print(f"%agreement = {len(df_agree) / len(df)}")


Weighted average metrics:

Precision for each class: ['0.69', '0.84']
Weighted Average Precision 0.77
Recall for each class: ['0.83', '0.70']
Weighted Average Recall: 0.76
F1 Score for each class: ['0.75', '0.76']
Weighted Average F1 Score: 0.76
N = 658
%agreement = 0.8371501272264631
