## Calculate Accuracy Metrics

In [38]:
import pandas
import numpy as np
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, f1_score

In [60]:
def calc_metrics(data):
    """Print per-class and weighted precision, recall and F1 for ``gen_code``.

    Ground truth is derived from ``data['code']``: a row is labelled 1 when
    its code is not 5 and 0 otherwise (presumably code 5 marks the "not
    relevant" category -- confirm against the codebook). Predictions are
    ``data['gen_code']`` cast to ``int``.

    The "weighted average" printed here is NOT scikit-learn's
    ``average='weighted'`` (which weights by class support). Each per-class
    score is weighted by that class's share of the correctly classified rows,
    i.e. its share of the confusion-matrix diagonal.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``code`` and ``gen_code``.

    Returns
    -------
    None
        Results are printed. As a side effect a ``label`` column holding the
        derived ground truth is added to (or overwritten on) ``data``.
    """
    # Vectorised, positional labelling. The previous ``data['code'][i]`` for
    # i in range(len(data)) was a label-based lookup and failed or mislabelled
    # rows whenever the index read from the CSV was not exactly 0..n-1.
    data['label'] = (data['code'] != 5).astype(int).to_numpy()

    y_true = data['label']
    y_pred = data['gen_code'].astype(int)

    # Diagonal of the confusion matrix = correctly classified rows per class.
    correct_per_class = np.diag(confusion_matrix(y_true, y_pred))
    # Yields NaN weights if nothing at all was classified correctly.
    proportions = correct_per_class / np.sum(correct_per_class)

    print("\nWeighted average metrics:\n")

    precision_per_class = precision_score(y_true, y_pred, average=None)
    print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
    weighted_avg_precision = np.sum(precision_per_class * proportions)
    print(f"Weighted Average Precision {weighted_avg_precision:.2f}")

    recall_per_class = recall_score(y_true, y_pred, average=None)
    print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
    weighted_avg_recall = np.sum(recall_per_class * proportions)
    print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")

    f1_per_class = f1_score(y_true, y_pred, average=None)
    print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
    weighted_avg_f1 = np.sum(f1_per_class * proportions)
    print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")


In [58]:
def calc_metrics_inequality(data):
    """Print per-class and weighted precision, recall and F1 for ``gen_code``.

    Ground truth is derived from ``data['code']``: a row is labelled 1 when
    its code is 1 or 2 and 0 otherwise (presumably codes 1 and 2 are the
    "inequality" categories -- confirm against the codebook). Predictions are
    ``data['gen_code']`` cast to ``int``.

    The "weighted average" printed here is NOT scikit-learn's
    ``average='weighted'`` (which weights by class support). Each per-class
    score is weighted by that class's share of the correctly classified rows,
    i.e. its share of the confusion-matrix diagonal.

    Parameters
    ----------
    data : pandas.DataFrame
        Must contain the columns ``code`` and ``gen_code``.

    Returns
    -------
    None
        Results are printed. As a side effect a ``label`` column holding the
        derived ground truth is added to (or overwritten on) ``data``.
    """
    # Vectorised membership test; equivalent to ``int(code in (1, 2))`` per row.
    data['label'] = data['code'].isin((1, 2)).astype(int).to_numpy()

    y_true = data['label']
    y_pred = data['gen_code'].astype(int)

    # The binary (pos_label=0) precision/recall/F1, accuracy and a second
    # confusion matrix used to be computed here but were never reported;
    # they have been dropped, along with the duplicated precision pass.

    # Diagonal of the confusion matrix = correctly classified rows per class.
    correct_per_class = np.diag(confusion_matrix(y_true, y_pred))
    # Yields NaN weights if nothing at all was classified correctly.
    proportions = correct_per_class / np.sum(correct_per_class)

    print("\nWeighted average metrics:\n")

    precision_per_class = precision_score(y_true, y_pred, average=None)
    print(f"Precision for each class: {[f'{r:.2f}' for r in precision_per_class]}")
    weighted_avg_precision = np.sum(precision_per_class * proportions)
    print(f"Weighted Average Precision {weighted_avg_precision:.2f}")

    recall_per_class = recall_score(y_true, y_pred, average=None)
    print(f"Recall for each class: {[f'{r:.2f}' for r in recall_per_class]}")
    weighted_avg_recall = np.sum(recall_per_class * proportions)
    print(f"Weighted Average Recall: {weighted_avg_recall:.2f}")

    f1_per_class = f1_score(y_true, y_pred, average=None)
    print(f"F1 Score for each class: {[f'{f:.2f}' for f in f1_per_class]}")
    weighted_avg_f1 = np.sum(f1_per_class * proportions)
    print(f"Weighted Average F1 Score: {weighted_avg_f1:.2f}")

In [61]:
# Relevance task, zero-shot prompts: one (header, CSV path) pair per
# model/definition combination, reported in this order.
print("Relevant: Zero Shot")
print("---------------")
print("---------------")

relevance_zero_shot_runs = [
    ("\nLlama3(70b): No definition:\n",
     '../data/inequality_dataset_llama3-70b-NoDefinition.csv'),
    ("\nLlama3(70b): Researcher definition:\n",
     '../data/inequality_dataset_llama3-70b-ResearcherDefinition.csv'),
    ("\nLlama3(70b): Llama definition:\n",
     '../data/inequality_dataset_llama3-70b-LlamaDefinition.csv'),
    ("\nLlama3.1(70b): No definition:\n",
     '../data/inequality_dataset_llama3_1-70b-NoDefinition.csv'),
    ("\nLlama3.1(70b): Researcher definition:\n",
     '../data/inequality_dataset_llama3_1-70b-ResearcherDefinition.csv'),
    ("\nLlama3.1(70b): Llama3.1-generated definition:\n",
     '../data/inequality_dataset_llama3_1-70b-LlamaDefinition.csv'),
    # NOTE(review): this path has "dataset-gemma2" (hyphen) while every other
    # file uses "dataset_" -- left unchanged; confirm the name on disk.
    ("\nGemma2(27b): No definition:\n",
     '../data/inequality_dataset-gemma2-27b-NoDefinition.csv'),
    ("\nGemma2(27b): Researcher-generated definition:\n",
     '../data/inequality_dataset_gemma2-27b-ResearcherDefinition.csv'),
    ("\nGemma2(27b): Gemma-generated definition:\n",
     '../data/inequality_dataset_gemma2-27b-GemmaDefinition.csv'),
    ("\nGPT: No definition:\n",
     '../data/inequality_dataset_GPT_NoDefinition.csv'),
    ("\nGPT: Researcher-generated definition:\n",
     '../data/inequality_dataset_GPT_ResearcherDefinition.csv'),
    # Fixed copy-paste bug: this run previously re-read the
    # ResearcherDefinition file, so the "GPT-generated" numbers duplicated the
    # run above. File name inferred from the Inequality counterpart
    # (inequality_dataset_GPT_Inequality_GPTDefinition.csv) -- TODO confirm.
    ("\nGPT: GPT-generated definition:\n",
     '../data/inequality_dataset_GPT_GPTDefinition.csv'),
]

for run_header, csv_path in relevance_zero_shot_runs:
    print(run_header)
    calc_metrics(pandas.read_csv(csv_path, index_col = 0))
    print("---------------")

Relevant: Zero Shot
---------------
---------------

Llama3(70b): No definition:


Weighted average metrics:

Precision for each class: ['0.63', '0.81']
Weighted Average Precision 0.74
Recall for each class: ['0.70', '0.75']
Weighted Average Recall: 0.73
F1 Score for each class: ['0.66', '0.78']
Weighted Average F1 Score: 0.74
---------------

Llama3(70b): Researcher definition:


Weighted average metrics:

Precision for each class: ['0.60', '0.87']
Weighted Average Precision 0.76
Recall for each class: ['0.84', '0.66']
Weighted Average Recall: 0.74
F1 Score for each class: ['0.70', '0.75']
Weighted Average F1 Score: 0.73
---------------

Llama3(70b): Llama definition:


Weighted average metrics:

Precision for each class: ['0.72', '0.88']
Weighted Average Precision 0.82
Recall for each class: ['0.82', '0.81']
Weighted Average Recall: 0.82
F1 Score for each class: ['0.77', '0.85']
Weighted Average F1 Score: 0.82
---------------

Llama3.1(70b): No definition:


Weighted average metrics:

In [64]:
# Inequality task, zero-shot prompts: one (header, CSV path) pair per
# model/definition combination, reported in this order.
print("Inequality: Zero Shot")
print("---------------")
print("---------------")

inequality_zero_shot_runs = [
    # This first run used to be reported without any header, leaving its
    # metrics unlabelled in the output; the header is now printed like the rest.
    ("\nLlama3(70b): No definition:\n",
     '../data/inequality_dataset_llama3-70b-Inequality-NoDefinition.csv'),
    ("\nLlama3(70b): Researcher definition:\n",
     '../data/inequality_dataset_llama3-70b-Inequality-ResearcherDefinition.csv'),
    ("\nLlama3(70b): Llama3-generated definition:\n",
     '../data/inequality_dataset_llama3-70b-Inequality-LlamaDefinition.csv'),
    ("\nLlama3.1(70b): No definition:\n",
     '../data/inequality_dataset_llama3_1-70b-Inequality-NoDefinition.csv'),
    ("\nLlama3.1(70b): Researcher definition:\n",
     '../data/inequality_dataset_llama3_1-70b-Inequality-ResearcherDefinition.csv'),
    ("\nLlama3.1(70b): Llama3.1-generated definition:\n",
     '../data/inequality_dataset_llama3_1-70b-Inequality-Llama3_1Definition.csv'),
    ("\nGemma2(27b): No definition:\n",
     '../data/inequality_dataset_gemma2-27b-Inequality-NoDefinition.csv'),
    ("\nGemma2(27b): Researcher-generated definition:\n",
     '../data/inequality_dataset_gemma2-27b-Inequality-ResearcherDefinition.csv'),
    ("\nGemma2(27b): Gemma-generated definition:\n",
     '../data/inequality_dataset_gemma2-27b-Inequality-GemmaDefinition.csv'),
    ("\nGPT: No definition:\n",
     '../data/inequality_dataset_GPT_Inequality_NoDefinition.csv'),
    ("\nGPT: Researcher-generated definition:\n",
     '../data/inequality_dataset_GPT_Inequality_ResearcherDefinition.csv'),
    ("\nGPT: GPT-generated definition:\n",
     '../data/inequality_dataset_GPT_Inequality_GPTDefinition.csv'),
]

for run_header, csv_path in inequality_zero_shot_runs:
    print(run_header)
    calc_metrics_inequality(pandas.read_csv(csv_path, index_col = 0))
    print("---------------")

Inequality: Zero Shot
---------------
---------------

Weighted average metrics:

Precision for each class: ['0.62', '0.82']
Weighted Average Precision 0.71
Recall for each class: ['0.84', '0.58']
Weighted Average Recall: 0.72
F1 Score for each class: ['0.71', '0.68']
Weighted Average F1 Score: 0.70
---------------

Llama3(70b): Researcher definition:


Weighted average metrics:

Precision for each class: ['0.64', '0.86']
Weighted Average Precision 0.74
Recall for each class: ['0.87', '0.61']
Weighted Average Recall: 0.75
F1 Score for each class: ['0.74', '0.71']
Weighted Average F1 Score: 0.73
---------------

Llama3(70b): Llama3-generated definition:


Weighted average metrics:

Precision for each class: ['0.66', '0.88']
Weighted Average Precision 0.76
Recall for each class: ['0.89', '0.64']
Weighted Average Recall: 0.77
F1 Score for each class: ['0.76', '0.74']
Weighted Average F1 Score: 0.75
---------------

Llama3.1(70b): No definition:


Weighted average metrics:

Precision for e

In [48]:
# Inequality task, few-shot prompts: one (header, CSV path) pair per
# model/definition combination, reported in this order.
print("Inequality: Few Shot")
print("---------------")
print("---------------")

inequality_few_shot_runs = [
    ("\nLlama3(70b): No definition:\n",
     '../data/inequality_dataset_llama3-70b-Inequality-FewShot-NoDefinition.csv'),
    ("\nLlama3(70b): Researcher definition:\n",
     '../data/inequality_dataset_llama3-70b-Inequality-FewShot-ResearcherDefinition.csv'),
    ("\nLlama3.1(70b): No definition:\n",
     '../data/inequality_dataset_llama3_1-70b-Inequality-FewShot-NoDefinition.csv'),
    ("\nLlama3.1(70b): Researcher definition:\n",
     '../data/inequality_dataset_llama3_1-70b-Inequality-FewShot-ResearcherDefinition.csv'),
    ("\nGemma2(27b): No definition:\n",
     '../data/inequality_dataset_gemma2-27b-Inequality-FewShot-NoDefinition.csv'),
    ("\nGemma2(27b): Researcher-generated definition:\n",
     '../data/inequality_dataset_gemma2-27b-Inequality-FewShot-ResearcherDefinition.csv'),
    ("\nGPT: No definition:\n",
     '../data/inequality_dataset_GPT_Inequality_FewShot_NoDefinition.csv'),
    ("\nGPT: Researcher-generated definition:\n",
     '../data/inequality_dataset_GPT_Inequality_FewShot_ResearcherDefinition.csv'),
]

# Disabled runs (model-generated definitions). The commented-out calls that
# used to sit between the runs above still pointed at zero-shot files:
#   Llama3(70b), Llama3-generated:     inequality_dataset_llama3-70b-Inequality-LlamaDefinition.csv
#   Llama3.1(70b), Llama3.1-generated: inequality_dataset_llama3_1-70b-Inequality-Llama3_1Definition.csv
#   Gemma2(27b), Gemma-generated:      inequality_dataset_gemma2-27b-Inequality-GemmaDefinition.csv
#   "GPT): Gemma-generated":           inequality_dataset_gemma2-27b-GemmaDefinition.csv (via calc_metrics)

for run_header, csv_path in inequality_few_shot_runs:
    print(run_header)
    few_shot_frame = pandas.read_csv(csv_path, index_col = 0)
    calc_metrics_inequality(few_shot_frame)
    print("---------------")

Inequality: Few Shot
---------------
---------------

Llama3(70b): No definition:


Weighted average metrics:

Precision for each class: ['0.70', '0.74']
Weighted Average Precision 0.72
Recall for each class: ['0.66', '0.78']
Weighted Average Recall: 0.73
F1 Score for each class: ['0.68', '0.76']
Weighted Average F1 Score: 0.73
---------------

Llama3(70b): Researcher definition:


Weighted average metrics:

Precision for each class: ['0.70', '0.68']
Weighted Average Precision 0.69
Recall for each class: ['0.52', '0.82']
Weighted Average Recall: 0.72
F1 Score for each class: ['0.60', '0.74']
Weighted Average F1 Score: 0.69
---------------

Llama3.1(70b): No definition:


Weighted average metrics:

Precision for each class: ['0.61', '0.88']
Weighted Average Precision 0.73
Recall for each class: ['0.90', '0.54']
Weighted Average Recall: 0.75
F1 Score for each class: ['0.73', '0.67']
Weighted Average F1 Score: 0.70
---------------

Llama3.1(70b): Researcher definition:


Weighted average 

# TODO

- Add Gemma fine-tuning

- Add BERT fine-tuning


