In [1]:
import pandas
from sklearn.metrics import cohen_kappa_score
from  nltk.metrics import agreement
import numpy as np
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score

In [3]:
# Load the three prompt-variant result files in a fixed order:
#   df1 -> "NoDefinition", df2 -> "ResearcherDefinition", df3 -> "GPTDefinition".
df1, df2, df3 = map(
    pandas.read_csv,
    (
        '../data/inequality_dataset_GPT_NoDefinition.csv',
        '../data/inequality_dataset_GPT_ResearcherDefinition.csv',
        '../data/inequality_dataset_GPT_GPTDefinition.csv',
    ),
)

In [9]:
# Put the `gen_code` column of each prompt-variant file side by side.
# Columns are assigned one at a time onto an empty frame, so the frame's index
# comes from df1['gen_code'] and the later columns are aligned to that index.
df = pandas.DataFrame()
df['nodef'] = df1['gen_code']
df['redef'] = df2['gen_code']
df['llmdef'] = df3['gen_code']

# Binarise the reference code: 5 -> 0, every other value -> 1.
# NOTE(review): presumably 5 is the "not relevant" category -- confirm against the codebook.
# The comparison is vectorised rather than looking up df1['code'][i] row by row;
# that per-row lookup is label-based and only works when the index is exactly 0..n-1.
# `.tolist()` keeps `labels` a plain list of Python ints, assigned positionally below.
labels = df1['code'].ne(5).astype(int).tolist()

df['code'] = labels

df.head()

Unnamed: 0,nodef,redef,llmdef,code
0,1,1,1,0
1,1,1,1,1
2,1,1,1,1
3,0,0,0,0
4,0,0,0,0


In [10]:
# Flatten the frame into (coder, item, label) triples: for every row, one triple
# per prompt variant, always in the order a1 (nodef), a2 (redef), a3 (llmdef).
coder_columns = (("a1", "nodef"), ("a2", "redef"), ("a3", "llmdef"))
data = [
    (coder, idx, row[column])
    for idx, row in df.iterrows()
    for coder, column in coder_columns
]
data[0]

('a1', 0, 1)

In [11]:
# Wrap the (coder, item, label) triples in an NLTK AnnotationTask so that
# inter-annotator agreement statistics can be computed from it.
atask = agreement.AnnotationTask(data=data)

In [12]:
# Agreement between the three prompt variants, treated as three coders.
# NOTE(review): NLTK documents `multi_kappa` as the Davies & Fleiss (1982)
# multi-coder kappa; the statistic from Fleiss (1971) is exposed as
# `multi_pi`/`pi` -- confirm which one the printed label is meant to describe.
multi_kappa_value = atask.multi_kappa()
print("Fleiss's Kappa:", multi_kappa_value)

Fleiss's Kappa: 0.6830947579948012


## Metrics for ensemble method

In [13]:
# Majority vote across the three prompt variants, one row at a time:
# the ensemble label is the value that occurs most often among the three votes.
vote_columns = df[['nodef', 'redef', 'llmdef']]

ensemble = []
for _, votes in vote_columns.iterrows():
    ballot = list(votes)
    ensemble.append(max(set(ballot), key=ballot.count))

df['ensemble'] = ensemble

In [14]:
# Score the majority-vote ensemble against the binarised reference codes.
y_true = df['code']
y_pred = df['ensemble']

# Binary scores with class 0 as the positive label, plus overall accuracy.
# Flipping the positive label to 0 swaps which class precision/recall describe.
# These values are kept in the namespace but are not printed by this cell.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
f1 = f1_score(y_true, y_pred, pos_label=0)

# Confusion matrix; its diagonal holds the correctly predicted count per class.
conf_matrix = confusion_matrix(y_true, y_pred)
cm = conf_matrix.copy()
true_positives = np.diag(conf_matrix)

# Weight of each class = its share of all correct predictions.
# NOTE(review): this is not the weighting used by sklearn's average='weighted',
# which weights by support (number of true instances per class) -- confirm that
# weighting by true-positive share is intended.
total_true_positives = true_positives.sum()
proportions = true_positives / total_true_positives

# Per-class scores (average=None returns one value per class).
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

# Collapse each per-class vector with the true-positive-share weights.
weighted_avg_precision = (precision_per_class * proportions).sum()
weighted_avg_recall = (recall_per_class * proportions).sum()
weighted_avg_f1 = (f1_per_class * proportions).sum()

# Report.
print("\nWeighted average metrics:\n")
print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")


Weighted average metrics:

Precision for each class: ['0.82', '0.85']
Weighted Average Precision 0.84
Recall for each class: ['0.72', '0.91']
Weighted Average Recall: 0.85
F1 Score for each class: ['0.77', '0.88']
Weighted Average F1 Score: 0.84


In [15]:
# Keep only the rows on which all three prompt variants gave the same label,
# then score that shared label (taken from `nodef`) against the reference code.
unanimous = df['nodef'].eq(df['redef']) & df['nodef'].eq(df['llmdef'])
df_agree = df[unanimous]
y_true = df_agree['code'].astype(int)
y_pred = df_agree['nodef'].astype(int)

# Binary scores with class 0 as the positive label, plus overall accuracy.
# Flipping the positive label to 0 swaps which class precision/recall describe.
# These values are kept in the namespace but are not printed by this cell.
accuracy = accuracy_score(y_true, y_pred)
precision = precision_score(y_true, y_pred, pos_label=0)
recall = recall_score(y_true, y_pred, pos_label=0)
f1 = f1_score(y_true, y_pred, pos_label=0)

# Confusion matrix; its diagonal holds the correctly predicted count per class.
conf_matrix = confusion_matrix(y_true, y_pred)
cm = conf_matrix.copy()
true_positives = np.diag(conf_matrix)

# Weight of each class = its share of all correct predictions.
# NOTE(review): this is not the weighting used by sklearn's average='weighted',
# which weights by support (number of true instances per class) -- confirm that
# weighting by true-positive share is intended.
total_true_positives = true_positives.sum()
proportions = true_positives / total_true_positives

# Per-class scores (average=None returns one value per class).
precision_per_class = precision_score(y_true, y_pred, average=None)
recall_per_class = recall_score(y_true, y_pred, average=None)
f1_per_class = f1_score(y_true, y_pred, average=None)

# Collapse each per-class vector with the true-positive-share weights.
weighted_avg_precision = (precision_per_class * proportions).sum()
weighted_avg_recall = (recall_per_class * proportions).sum()
weighted_avg_f1 = (f1_per_class * proportions).sum()

# Report.
print("\nWeighted average metrics:\n")
print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
print(f"Weighted Average Precision {weighted_avg_precision:.2f}")
print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")
print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")

# Size of the unanimous subset and its share of all rows.
# The second value is a fraction between 0 and 1, not a percentage.
print(f"N = {len(df_agree)}")
print(f"%agreement = {len(df_agree) / len(df)}")


Weighted average metrics:

Precision for each class: ['0.93', '0.88']
Weighted Average Precision 0.89
Recall for each class: ['0.72', '0.97']
Weighted Average Recall: 0.91
F1 Score for each class: ['0.81', '0.92']
Weighted Average F1 Score: 0.89
N = 997
%agreement = 0.7956903431763767
