## ChatGPT: Clean up and evaluate on test set

In [None]:
import pandas as pd
import re
import json
import pandas as pd
import numpy as np
import re
import pickle
import os
from datetime import datetime
from tqdm import tqdm

## Load test set 

In [None]:
# Test split: the inputs (strings, given the .lower() call below) and the
# targets later passed to classification_report as y_true.
X_test = np.load("data/train_test/X_test.npy", allow_pickle=True).tolist()
y_test = np.load("data/train_test/y_test.npy", allow_pickle=True).tolist()
# Lower-cased copy of every input; it becomes the 'q_match' join key.
X_test_c = [query.lower() for query in X_test]
test_df = pd.DataFrame(
    {
        "X_test": X_test,
        "y_test": y_test,
        "q_match": X_test_c,
    }
)
test_df.shape
labels = pd.read_csv("data/annotations.csv")
# 'q_match' is the only column the two frames have in common, so it is named
# explicitly as the key.
# NOTE(review): a left join repeats a test row for every matching row in
# `labels` — presumably 'q_match' is unique there; confirm.
merged = test_df.merge(labels[['q_match', 'short_date']], how='left', on='q_match')
merged.shape

## Load GPT labels

In [None]:
# Load the ChatGPT labels for the "long definition, no date" variant,
# then show the frame's shape and first rows.
l = pd.read_csv("data/chatgpt_labels/long_nodate.csv")
print(l.shape)
l.head()

In [None]:
# Load the ChatGPT labels for the "short definition, no date" variant,
# then show the frame's shape and first rows.
s = pd.read_csv("data/chatgpt_labels/short_nodate.csv")
print(s.shape)
s.head()

In [None]:
# Load the ChatGPT labels for the "short definition, date" variant,
# then show the frame's shape and first rows.
sd = pd.read_csv("data/chatgpt_labels/short_date.csv")
print(sd.shape)
sd.head()

In [None]:
# Load the ChatGPT labels for the "long definition, date" variant,
# then show the frame's shape and first rows.
ld = pd.read_csv("data/chatgpt_labels/long_date.csv")
print(ld.shape)
ld.head()

In [None]:
#l[['finish_reason', 'finish_reason2']].value_counts(dropna=False)
#s[['finish_reason', 'finish_reason2']].value_counts(dropna=False)
#ld[['finish_reason', 'finish_reason2']].value_counts(dropna=False)
#sd[['finish_reason', 'finish_reason2']].value_counts(dropna=False)

### Extract binary labels

In [None]:
def answer_cleaning(x):
    """Normalise a raw answer string for prefix-based label extraction.

    The text is lower-cased, every character that is neither a word
    character nor whitespace is removed (underscores are removed as well),
    and surrounding whitespace is trimmed.

    Args:
        x: Raw answer text. Non-string values -- e.g. the NaN that
            ``pd.read_csv`` yields for an empty cell -- are treated as an
            empty answer instead of raising ``AttributeError``.

    Returns:
        The cleaned string, or ``''`` when ``x`` is not a string.
    """
    if not isinstance(x, str):
        return ''
    x = re.sub(r'[^\w\s]|_', '', x.lower())
    # Trim only after the punctuation is gone: an answer such as "- Yes."
    # would otherwise keep the space that followed the dash and no longer
    # start with "yes".
    return x.strip()

def get_labels_3cat(x):
    """Map a cleaned answer to one of 'yes', 'no' or 'uncertain'.

    The verdict is the first whole word of the answer, after an optional
    leading "the answer is". Matching whole words rather than raw prefixes
    keeps answers such as "not sure", "nothing ..." or "yesterday ..." from
    being read as a definite 'no' / 'yes'; they fall through to 'uncertain',
    in line with how "the answer is not ..." is treated.

    Args:
        x: Answer text that is already lower-cased and free of punctuation
            (the output of ``answer_cleaning``).

    Returns:
        'yes' or 'no' when the answer opens with that word, otherwise
        'uncertain' (this includes the empty string).
    """
    words = x.split()
    # Skip the "the answer is" lead-in so the verdict word comes first.
    if words[:3] == ['the', 'answer', 'is']:
        words = words[3:]
    verdict = words[0] if words else ''
    if verdict in ('yes', 'no'):
        return verdict
    return 'uncertain'

def get_labels(x):
    """Collapse a three-way label to binary: 1 for 'yes', 0 for anything else."""
    return 1 if x == 'yes' else 0

In [None]:
# short, no date
# Clean every `message2` value and map it to 'yes' / 'no' / 'uncertain'.
s['gpt_label_3cat'] = s['message2'].apply(answer_cleaning).apply(get_labels_3cat)
# Category counts, then the same distribution as percentages.
s['gpt_label_3cat'].value_counts(dropna=False)
s['gpt_label_3cat'].value_counts(dropna=False,normalize=True)*100

In [None]:
# Binarise the short/no-date labels: 'yes' -> 1, anything else -> 0.
s['gpt_label'] = s['gpt_label_3cat'].apply(get_labels)
s['gpt_label'].value_counts(dropna=False)

In [None]:
# long, no date
# Clean every `message2` value and map it to 'yes' / 'no' / 'uncertain'.
l['gpt_label_3cat'] = l['message2'].apply(answer_cleaning).apply(get_labels_3cat)
#l['gpt_label_3cat'].value_counts(dropna=False)
# Category distribution as percentages.
l['gpt_label_3cat'].value_counts(dropna=False,normalize=True)*100

In [None]:
# Binarise the long/no-date labels: 'yes' -> 1, anything else -> 0.
l['gpt_label'] = l['gpt_label_3cat'].apply(get_labels)
l['gpt_label'].value_counts(dropna=False)

In [None]:
#l[l['gpt_label_3cat']=='uncertain'].message.tolist()
#l[l['gpt_label_3cat']=='yes'].message.tolist()

In [None]:
# long, date
# Clean every `message2` value and map it to 'yes' / 'no' / 'uncertain'.
ld['gpt_label_3cat'] = ld['message2'].apply(answer_cleaning).apply(get_labels_3cat)
#ld['gpt_label_3cat'].value_counts(dropna=False)
# Category distribution as percentages.
ld['gpt_label_3cat'].value_counts(dropna=False, normalize=True)*100

In [None]:
# Binarise the long/date labels: 'yes' -> 1, anything else -> 0.
ld['gpt_label'] = ld['gpt_label_3cat'].apply(get_labels)
ld['gpt_label'].value_counts(dropna=False)

In [None]:
#ld[ld['gpt_label_3cat']=='uncertain'].message.tolist()
#ld[ld['gpt_label_3cat']=='yes'].message.tolist()

In [None]:
# short, date
# Clean every `message2` value and map it to 'yes' / 'no' / 'uncertain'.
sd['gpt_label_3cat'] = sd['message2'].apply(answer_cleaning).apply(get_labels_3cat)
#sd['gpt_label_3cat'].value_counts(dropna=False)
# Category distribution as percentages.
sd['gpt_label_3cat'].value_counts(dropna=False, normalize=True)*100

In [None]:
# Binarise the short/date labels: 'yes' -> 1, anything else -> 0.
sd['gpt_label'] = sd['gpt_label_3cat'].apply(get_labels)
sd['gpt_label'].value_counts(dropna=False)

## combine dataframes

In [None]:
# Label frames and the column prefix each one gets in the combined table.
# The two lists are paired by position (cols[i] names dfs[i]), so their
# order must stay in sync.
dfs = [l, s, ld, sd]
cols = ["gpt_long", "gpt_short", "gpt_long_date", "gpt_short_date"]

In [None]:
# merge with y_test
# Attach every variant's binary label, 3-category label and both message
# columns to the test table, under variant-specific column names.
for df, name in zip(dfs, cols):
    name_3cat = f"{name}_3cat"
    message = f"{name}_message"
    message2 = f"{name}_message2"
    # The key column 'sq' is renamed to 'X_test' as well, so the join runs on
    # one shared key. Joining with left_on='X_test' / right_on='sq' copied
    # 'sq' into `merged` on every pass: the second pass produced
    # 'sq_x'/'sq_y', and the fourth pass collided with those names
    # (duplicate columns on older pandas, MergeError on pandas >= 2.0).
    df = df.rename(
        columns={
            "sq": "X_test",
            "gpt_label_3cat": name_3cat,
            "gpt_label": name,
            "message": message,
            "message2": message2,
        }
    )
    #print(df.columns)
    # NOTE(review): a left join repeats a test row for every matching row in
    # `df` — presumably each query appears once per label file; confirm.
    merged = pd.merge(merged, df[['X_test', name, name_3cat, message, message2]], how='left', on='X_test')



In [None]:
# ChatGPT's knowledge cutoff is September 2021; the last day of that month
# is used as the boundary date.
cutoff_date = pd.to_datetime('2021-09-30')

In [None]:
# Parse 'short_date' and flag every row as 'before' (on or before the cutoff
# date) or 'after'.
merged['short_date'] = pd.to_datetime(merged['short_date'])
# NOTE(review): a missing date (NaT) fails the `<=` comparison, so such rows
# are flagged 'after' and np.where never yields NaN — confirm this is intended.
merged['cutoff'] = np.where(merged['short_date']<= cutoff_date, 'before', 'after')
merged.cutoff.value_counts(dropna=False)

In [None]:
# Percentage of each 3-category label (long definition, no date) within each cutoff group.
merged.groupby('cutoff')['gpt_long_3cat'].value_counts(dropna=False, normalize=True)*100

In [None]:
# Percentage of each 3-category label (short definition, no date) within each cutoff group.
merged.groupby('cutoff')['gpt_short_3cat'].value_counts(dropna=False, normalize=True)*100

In [None]:
# Boolean mask selecting the rows flagged 'after' the cutoff.
condition = merged['cutoff']=='after'

In [None]:
# For rows after the cutoff, overwrite the long-definition "date" columns
# with the matching no-date columns. `.values` yields a plain array, so the
# copy is positional: ld_cols[i] receives l_cols[i] and both lists must keep
# the same order.
ld_cols = ['gpt_long_date', 'gpt_long_date_3cat','gpt_long_date_message', 'gpt_long_date_message2']
l_cols = ['gpt_long', 'gpt_long_3cat', 'gpt_long_message', 'gpt_long_message2']
merged.loc[condition, ld_cols] = merged.loc[condition, l_cols].values

In [None]:
# For rows after the cutoff, overwrite the short-definition "date" columns
# with the matching no-date columns. `.values` yields a plain array, so the
# copy is positional: sd_cols[i] receives s_cols[i] and both lists must keep
# the same order.
sd_cols = ['gpt_short_date', 'gpt_short_date_3cat', 'gpt_short_date_message', 'gpt_short_date_message2']
s_cols = ['gpt_short', 'gpt_short_3cat', 'gpt_short_message','gpt_short_message2']
merged.loc[condition, sd_cols] = merged.loc[condition, s_cols].values

### Evaluation of performance

In [None]:
from sklearn.metrics import classification_report

In [None]:
# Full classification report for each binary label column listed in `cols`,
# scored against 'y_test'.
for e in cols:
    print(e)
    print(classification_report(merged['y_test'], merged[e]))