In [9]:
import pandas as pd
from metrics import compute_metrics_prompting
from data_processing.data_processor import get_label_info

# Standard Prompting Analysis

In [10]:
# Read the saved standard-prompting run and preview its first rows.
# NOTE(review): the preview shows an 'Unnamed: 0' column, which suggests the CSV
# was written together with its index; index_col=0 would absorb it -- confirm
# before changing, since the displayed output would differ.
standard_results_csv = 'data/standard_prompting_results.csv'
df = pd.read_csv(standard_results_csv)
df.head()

Unnamed: 0,time,epochs,messages,response,label
0,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant,['partially relevant']
1,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant,['irrelevant']
2,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Relevant,['partially relevant']
3,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant,['irrelevant']
4,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Irrelevant,['relevant']


In [22]:
# Discard every row that has a missing value in any column. convert_to_ids
# below calls .lower() on each response/label value, which a NaN does not support.
df = df.dropna(axis=0, how='any')

In [12]:
# get_label_info is called with binary=False (presumably the full, non-binary
# label set -- confirm in data_processing.data_processor). Of the three values
# it returns, only the middle one is kept: labels2ids, which convert_to_ids
# below indexes by label name to obtain a numeric id.
_, labels2ids, _ = get_label_info(binary=False)

def convert_to_ids(text):
    """Translate free text into a label id via case-insensitive substring search.

    Args:
        text: String to scan for a label name (e.g. a model response or a
            stored label value).

    Returns:
        The ``labels2ids`` entry for the first label name found in ``text``,
        checked in the order 'partially irrelevant', 'partially relevant',
        'irrelevant'. If none of these occurs, the entry for 'relevant' is
        returned -- including for text that mentions no label at all.
    """
    lowered = text.lower()
    # Order matters: 'irrelevant' is a substring of 'partially irrelevant', and
    # 'relevant' is a substring of every other label, so the more specific
    # names must be tested first and 'relevant' is left as the fallback.
    for label_name in ('partially irrelevant', 'partially relevant', 'irrelevant'):
        if label_name in lowered:
            return labels2ids[label_name]
    return labels2ids['relevant']

# Derive the numeric id columns from the raw text columns
# (response -> response_no, label -> label_no), then preview the result.
for id_col, text_col in (('response_no', 'response'), ('label_no', 'label')):
    df[id_col] = df[text_col].map(convert_to_ids)
df.head()

Unnamed: 0,time,epochs,messages,response,label,response_no,label_no
0,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant,['partially relevant'],2,2
1,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant,['irrelevant'],2,0
2,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Relevant,['partially relevant'],3,2
3,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant,['irrelevant'],2,0
4,20240201-215401,50,"[{'role': 'system', 'content': 'You will be pr...",Irrelevant,['relevant'],0,3


In [13]:
# Score the ids parsed from the responses against the ids parsed from the labels.
# NOTE(review): response ids are passed first and label ids second; confirm this
# matches the parameter order of compute_metrics_prompting, since precision and
# recall would be swapped otherwise.
standard_response_ids = list(df['response_no'])
standard_label_ids = list(df['label_no'])
compute_metrics_prompting(standard_response_ids, standard_label_ids)

{'accuracy': 0.34,
 'f1_macro': 0.26459229835595277,
 'f1_micro': 0.34,
 'f1_weighted': 0.2835882158225753,
 'recall_macro': 0.3062091503267974,
 'recall_micro': 0.34,
 'recall_weighted': 0.34,
 'precision_macro': 0.34305555555555556,
 'precision_micro': 0.34,
 'precision_weighted': 0.3758888888888889}

# Chain of Thought Analysis
#### Load data and calculate metrics

In [14]:
# Read the saved chain-of-thought run and preview its first rows.
# This rebinds df, replacing the standard-prompting frame loaded earlier.
cot_results_csv = 'data/cot_results.csv'
df = pd.read_csv(cot_results_csv)
df.head()

Unnamed: 0,time,epochs,number_of_examples,messages,response,label
0,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",The description is partially relevant. The sou...,partially irrelevant
1,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant. Although the code performs...,irrelevant
2,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",The description is partially relevant. The sou...,irrelevant
3,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",Partially irrelevant. The source code is gener...,partially irrelevant
4,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",partially relevant,relevant


In [23]:
# Discard every row that has a missing value in any column. convert_to_ids
# below calls .lower() on each response/label value, which a NaN does not support.
df = df.dropna(axis=0, how='any')

In [16]:
# Same call as in the Standard Prompting section: get_label_info is invoked with
# binary=False and only the middle of its three return values is kept.
# labels2ids is indexed by label name in convert_to_ids below.
_, labels2ids, _ = get_label_info(binary=False)

def convert_to_ids(text):
    """Translate free text into a label id via case-insensitive substring search.

    This re-definition shadows the ``convert_to_ids`` declared earlier in the
    notebook; it applies the same matching rules.

    Args:
        text: String to scan for a label name (e.g. a chain-of-thought
            response or a stored label value).

    Returns:
        The ``labels2ids`` entry for the first label name found in ``text``,
        checked in the order 'partially irrelevant', 'partially relevant',
        'irrelevant'. If none of these occurs, the entry for 'relevant' is
        returned -- including for text that mentions no label at all.
    """
    lowered = text.lower()
    # Order matters: 'irrelevant' is a substring of 'partially irrelevant', and
    # 'relevant' is a substring of every other label, so the more specific
    # names must be tested first and 'relevant' is left as the fallback.
    for label_name in ('partially irrelevant', 'partially relevant', 'irrelevant'):
        if label_name in lowered:
            return labels2ids[label_name]
    return labels2ids['relevant']

# Derive the numeric id columns from the raw text columns
# (response -> response_no, label -> label_no), then preview the result.
for id_col, text_col in (('response_no', 'response'), ('label_no', 'label')):
    df[id_col] = df[text_col].map(convert_to_ids)
df.head()

Unnamed: 0,time,epochs,number_of_examples,messages,response,label,response_no,label_no
0,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",The description is partially relevant. The sou...,partially irrelevant,2,1
1,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",Partially relevant. Although the code performs...,irrelevant,2,0
2,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",The description is partially relevant. The sou...,irrelevant,2,0
3,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",Partially irrelevant. The source code is gener...,partially irrelevant,1,1
4,20240201-215511,50,3,"[{'role': 'system', 'content': 'You will be pr...",partially relevant,relevant,2,3


In [17]:
# One sub-frame per number_of_examples value analysed below (3, 5, 10, 20).
df_e3, df_e5, df_e10, df_e20 = (
    df[df['number_of_examples'] == example_count]
    for example_count in (3, 5, 10, 20)
)

In [18]:
# Metrics restricted to the rows with number_of_examples == 3.
print("RESULTS: Number of Examples: 3")
e3_response_ids = list(df_e3['response_no'])
e3_label_ids = list(df_e3['label_no'])
compute_metrics_prompting(e3_response_ids, e3_label_ids)

RESULTS: Number of Examples: 3


{'accuracy': 0.3,
 'f1_macro': 0.27226939726939725,
 'f1_micro': 0.3,
 'f1_weighted': 0.2610989010989011,
 'recall_macro': 0.3141339869281046,
 'recall_micro': 0.3,
 'recall_weighted': 0.3,
 'precision_macro': 0.35086805555555556,
 'precision_micro': 0.3,
 'precision_weighted': 0.3468333333333334}

In [19]:
# Metrics restricted to the rows with number_of_examples == 5.
print("RESULTS: Number of Examples: 5")
e5_response_ids = list(df_e5['response_no'])
e5_label_ids = list(df_e5['label_no'])
compute_metrics_prompting(e5_response_ids, e5_label_ids)

RESULTS: Number of Examples: 5


{'accuracy': 0.2,
 'f1_macro': 0.1994174625753573,
 'f1_micro': 0.20000000000000004,
 'f1_weighted': 0.15475445370182211,
 'recall_macro': 0.3388888888888889,
 'recall_micro': 0.2,
 'recall_weighted': 0.2,
 'precision_macro': 0.39488636363636365,
 'precision_micro': 0.2,
 'precision_weighted': 0.46340909090909094}

In [20]:
# Metrics restricted to the rows with number_of_examples == 10.
print("RESULTS: Number of Examples: 10")
e10_response_ids = list(df_e10['response_no'])
e10_label_ids = list(df_e10['label_no'])
compute_metrics_prompting(e10_response_ids, e10_label_ids)

RESULTS: Number of Examples: 10


{'accuracy': 0.3,
 'f1_macro': 0.2606035437430786,
 'f1_micro': 0.3,
 'f1_weighted': 0.2750963455149502,
 'recall_macro': 0.278125,
 'recall_micro': 0.3,
 'recall_weighted': 0.3,
 'precision_macro': 0.41666666666666663,
 'precision_micro': 0.3,
 'precision_weighted': 0.42}

In [21]:
# Metrics restricted to the rows with number_of_examples == 20.
print("RESULTS: Number of Examples: 20")
e20_response_ids = list(df_e20['response_no'])
e20_label_ids = list(df_e20['label_no'])
compute_metrics_prompting(e20_response_ids, e20_label_ids)

RESULTS: Number of Examples: 20


{'accuracy': 0.2692307692307692,
 'f1_macro': 0.22976190476190472,
 'f1_micro': 0.2692307692307692,
 'f1_weighted': 0.23644688644688644,
 'recall_macro': 0.25595238095238093,
 'recall_micro': 0.2692307692307692,
 'recall_weighted': 0.2692307692307692,
 'precision_macro': 0.3758741258741259,
 'precision_micro': 0.2692307692307692,
 'precision_weighted': 0.36632598171059705}