# Evaluate answers
In this notebook, we collect obtained answers and evaluate their quality based on provided human responces.

In [31]:
import os
import re
import pandas as pd
import numpy as np
from scipy.stats import spearmanr

In [32]:
PH2ANS_DIR = '../data/raw/Testing/Phase2Answers' # test questions and answers provided by humans
GPTANS_DIR = '../data/interim' # answers obtained from GPT-4
CAT_PATH = '../data/raw/subcategories-list.txt' # metadata for storing relation between categories and subcategories
REV_PATH = '../data/raw/reversals.txt' # metadata for storing information which pairs are reversals

In [33]:
def read_txt_info(path):
    with open(path, 'r') as f:
        info = f.read()
    return info


def read_hum_answers(path):
    df = pd.read_csv(path, sep='\t')
    df = df.rename({'# pair1': 'pair1'}, axis='columns')
    return df


def read_gpt_answers(path):
    df = pd.read_csv(path, sep=', ',
                     names=['pair1', 'pair2',
                            'pair3', 'pair4',
                            'least_illustrative',
                            'most_illustrative'],
                     engine='python')
    return df


def read_cat_info(path):
    df = pd.read_csv(path, header=None,
                     names=['num', 'letter', 'cat', 'subcat'])
    df['file_idx'] = df['num'].astype(str) + \
        df['letter'].apply(lambda x: x.replace(' ', ''))
    return df


def read_reversals_info(path):
    with open(path, 'r') as f:
        reversals = f.read()

    reversals = reversals.split('original:pair --> reversed:pair')[-1]

    idxs = re.findall('[0-9]+[a-z]', reversals)
    pairs = re.findall('--> [A-Z, a-z]+:[A-Z, a-z]+', reversals)
    pairs = list(map(lambda x: x.split('--> ')[-1], pairs))

    reversal_df = pd.DataFrame({'idxs': idxs, 'pair': pairs})
    return reversal_df

In [34]:
def pairqnum2least(question_data):
    df = question_data['least_illustrative'].value_counts()
    return df


def pairqnum2most(question_data):
    return question_data['most_illustrative'].value_counts()

In [35]:
def extract_voices(pairs, pair, kind='least'):

    global most_data, least_data

    if kind == 'least':
        ref_data = least_data
    else:
        ref_data = most_data

    multiidx = tuple(np.append(pairs, pair))

    if multiidx in ref_data.index:
        votes = ref_data.loc[multiidx]
    else:
        votes = 0

    return votes


def maxdiff(answer):

    least_ans = answer['least_illustrative']
    most_ans = answer['most_illustrative']
    pairs = answer[['pair1', 'pair2', 'pair3', 'pair4']].values.ravel()

    votes_guess_least = extract_voices(pairs, least_ans, kind='least')
    votes_guess_most = extract_voices(pairs, most_ans, kind='most')

    num_votes_least = []
    num_votes_most = []

    for pair in pairs:

        num_votes_least.append(extract_voices(pairs, pair, kind='least'))
        num_votes_most.append(extract_voices(pairs, pair, kind='most'))

    max_votes_least = max(num_votes_least)
    max_votes_most = max(num_votes_most)

    maxdiff_df = answer.copy()
    maxdiff_df['num_least_right'] = int(votes_guess_least == max_votes_least)
    maxdiff_df['num_least_wrong'] = int(votes_guess_least != max_votes_least)
    maxdiff_df['num_most_right'] = int(votes_guess_most == max_votes_most)
    maxdiff_df['num_most_wrong'] = int(votes_guess_most != max_votes_most)

    return maxdiff_df


def calc_maxdiff_acc(rel_data):
    acc_all = 100 * (rel_data.num_most_right.sum() +
                     rel_data.num_least_right.sum()) \
                        / (2 * len(rel_data))
    return acc_all

In [36]:
def calc_rating_scores(questions):

    '''
    calculates golden standard ratings
    '''

    raveled_pairs = questions[['pair1', 'pair2',
                               'pair3', 'pair4']].values.ravel()
    all_pairs = np.unique(raveled_pairs)
    rating_scores = {}

    for pair in all_pairs:
        rating_scores[pair] = calc_one_rating(pair, raveled_pairs,
                                              questions)

    return rating_scores


def calc_one_rating(pair, raveled_pairs, questions):

    pair_num_least = (questions.least_illustrative == pair).sum()
    pair_num_most = (questions.most_illustrative == pair).sum()
    pair_num = (raveled_pairs == pair).sum()
    pct_least = 100 * pair_num_least / pair_num
    pct_most = 100 * pair_num_most / pair_num
    score = pct_most - pct_least
    return score


def calc_spearman(gpt_data_subcat, hum_data):
    name_subcat = gpt_data_subcat.user_selected_relation.unique()[0]
    hum_data_subcat = hum_data[hum_data['user_selected_relation'] ==
                               name_subcat]
    pair2gold = calc_rating_scores(hum_data_subcat)
    pair2test = calc_rating_scores(gpt_data_subcat)

    cor = spearmanr(list(pair2gold.values()),
                    list(pair2test.values()))
    rho = cor.correlation
    pvalue = cor.pvalue

    return pd.DataFrame([{'rho': rho, 'pvalue': pvalue}])

In [37]:
def find_reversals(question):

    global reversal_df

    question['with_reversals'] = 0
    rel_name = question['user_selected_relation']
    pairs = question[['pair1', 'pair2', 'pair3', 'pair4']]
    reversals = reversal_df[reversal_df.user_selected_relation ==
                            rel_name].pair

    if reversals.isin(pairs).sum():
        question['with_reversals'] = 1

    return question


def remove_reversals(data):
    data_no_rev = data.apply(lambda x: find_reversals(x), axis=1)
    data_no_rev = data_no_rev[data_no_rev['with_reversals'] == 0].\
        reset_index(drop=True)
    data_no_rev.drop('with_reversals', axis=1, inplace=True)
    return data_no_rev

## Evaluate MaxDiff accuracy

In [38]:
lst_files = os.listdir(GPTANS_DIR)

In [39]:
lst_hum_dfs = []
lst_gpt_dfs = []
rel_mapping = {}

for file_name in lst_files:

    idx = re.findall('[0-9]+[a-z]', file_name)[0]
    ans_name = f"Phase2Answers-{idx}.txt"

    # reading answers provived by humans
    # on the questions related to
    # current subcategory
    ans_path = os.path.join(PH2ANS_DIR, ans_name)
    hum_df = read_hum_answers(ans_path)
    hum_df['file_idx'] = idx
    lst_hum_dfs.append(hum_df)

    # reading answers provived by GPT
    # on the questions related to
    # current subcategory
    gpt_ans_path = os.path.join(GPTANS_DIR, ans_name)
    gpt_df = read_gpt_answers(gpt_ans_path)
    # add name of user selected relation
    rel_name = hum_df['user_selected_relation'].unique()[0]
    gpt_df['user_selected_relation'] = rel_name
    gpt_df['file_idx'] = idx
    lst_gpt_dfs.append(gpt_df)

    # create mapping between file idx and relation name
    rel_mapping[idx] = rel_name


hum_data = pd.concat(lst_hum_dfs).reset_index(drop=True)
gpt_data = pd.concat(lst_gpt_dfs).reset_index(drop=True)

In [40]:
gpt_data.head()

Unnamed: 0,pair1,pair2,pair3,pair4,least_illustrative,most_illustrative,user_selected_relation,file_idx
0,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
1,ahead:behind,in:out,tip:root,interior:exterior,tip:root,ahead:behind,X is the opposite direction from Y,4d
2,around:through,backward:forward,north:south,interior:exterior,around:through,north:south,X is the opposite direction from Y,4d
3,high:low,above:below,backward:forward,stop:go,stop:go,above:below,X is the opposite direction from Y,4d
4,top:bottom,boring:interesting,sad:happy,interior:exterior,boring:interesting,top:bottom,X is the opposite direction from Y,4d


In [41]:
hum_data.head()

Unnamed: 0,pair1,pair2,pair3,pair4,least_illustrative,most_illustrative,user_selected_relation,file_idx
0,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
1,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
2,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
3,high:low,around:through,under:over,root:tip,around:through,high:low,X is the opposite direction from Y,4d
4,ahead:behind,in:out,tip:root,interior:exterior,tip:root,ahead:behind,X is the opposite direction from Y,4d


In [42]:
# for each question and each pair compute
# how many times the pair was selected as least and most illustrative example
# for the question
# according to human answers

least_data = hum_data.groupby(['pair1', 'pair2', 'pair3', 'pair4']).apply(lambda x: pairqnum2least(x))
most_data = hum_data.groupby(['pair1', 'pair2', 'pair3', 'pair4']).apply(lambda x: pairqnum2most(x))

In [43]:
# calculate intermediate variables for computing maxdiff accuracy

maxdiff_data = gpt_data.apply(lambda x: maxdiff(x), axis=1)

In [44]:
# calculate maxdiff accuracy

# first calculate it across all subcatigories
maxdiff_acc_subcat = maxdiff_data.groupby('user_selected_relation').apply(lambda x: calc_maxdiff_acc(x))
# then average them
maxdiff_acc_avg = maxdiff_acc_subcat.mean()
maxdiff_acc_avg

49.271504888528646

## Evaluate Spearman Rank Correllation Coefficient

In [45]:
# categories info
cat_df = read_cat_info(CAT_PATH)

# reversal info
reversal_df = read_reversals_info(REV_PATH)
reversal_df['user_selected_relation'] = reversal_df['idxs'].map(rel_mapping)
reversal_df = reversal_df.dropna()
reversal_df = reversal_df.drop('idxs', axis=1)

### Across subcategories

In [46]:
# calculate Spearman Coefficient across all subcategories

spearman_df = gpt_data.groupby('user_selected_relation').apply(lambda x: calc_spearman(x, hum_data))
spearman_df

Unnamed: 0_level_0,Unnamed: 1_level_0,rho,pvalue
user_selected_relation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
X and Y are contrary / opposite to each other,0,0.357606,0.03493601
X is an expression that indicates Y,0,0.738815,5.234404e-08
X is made of / is comprised of Y,0,0.683619,4.36056e-07
X is the opposite direction from Y,0,0.796049,1.443225e-07
a Y is a part of an X,0,0.444778,0.002486345
a Y is one item in a collection/group of X,0,0.667393,1.383984e-06
an Y receives an X,0,0.583677,0.000119787
someone/something who is X cannot be Y or be in the state of Y,0,0.127958,0.4022181


In [47]:
spearman_coef = spearman_df.rho.mean()
spearman_coef

0.5499869250341645

### Across categories

In [48]:
spearman_df_cat = cat_df.merge(gpt_data, on='file_idx', how='left')
spearman_df_cat = spearman_df_cat[['cat', 'user_selected_relation']].\
    dropna().drop_duplicates()
spearman_df_cat = spearman_df_cat.merge(spearman_df.reset_index(),
                      on='user_selected_relation', how='left')
spearman_df_cat = spearman_df_cat.groupby('cat').apply(lambda x: x['rho'].mean())
spearman_df_cat = spearman_df_cat.round(3)

In [49]:
spearman_df_cat

cat
 CASE RELATIONS     0.584
 CLASS-INCLUSION    0.667
 CONTRAST           0.577
 NON-ATTRIBUTE      0.128
 PART-WHOLE         0.564
 REFERENCE          0.739
dtype: float64

### Removing reversals

In [50]:
hum_data_no_rev = remove_reversals(hum_data)
gpt_data_no_rev = remove_reversals(gpt_data)

In [51]:
spearman_no_rev_df = gpt_data_no_rev.groupby('user_selected_relation').\
    apply(lambda x: calc_spearman(x, hum_data_no_rev))
spearman_no_rev_df

Unnamed: 0_level_0,Unnamed: 1_level_0,rho,pvalue
user_selected_relation,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
X and Y are contrary / opposite to each other,0,0.353619,0.055237
X is an expression that indicates Y,0,0.636437,4e-05
X is made of / is comprised of Y,0,0.678638,3e-06
X is the opposite direction from Y,0,0.732525,3.1e-05
a Y is a part of an X,0,0.485565,0.001728
a Y is one item in a collection/group of X,0,0.625186,3.5e-05
an Y receives an X,0,0.628347,9e-05
someone/something who is X cannot be Y or be in the state of Y,0,0.171957,0.288694


In [52]:
spearman_no_rev_coef = spearman_no_rev_df.rho.mean()
spearman_no_rev_coef

0.5390341494360579

We can observemthat adding reversals to dataset doesn’t decrease score of GPT-4. 

### RMSE error

Since GPT-4 identifies reversals, we can evaluate its ability to correctly order them by measurring RMSE error.

In [53]:
pair2gold = calc_rating_scores(hum_data)
pair2test = calc_rating_scores(gpt_data)

pair2gold_no_rev = calc_rating_scores(hum_data_no_rev)
pair2test_no_rev = calc_rating_scores(gpt_data_no_rev)

In [54]:
range_rev_df = reversal_df.copy()
# recover original pair from reversal
range_rev_df['original_pair'] = range_rev_df['pair'].\
    apply(lambda x: f"{x.split(':')[1]}:{x.split(':')[0]}")
range_rev_df['test_score'] = range_rev_df['original_pair'].apply(lambda x: pair2test[x])
range_rev_df['gold_score'] = range_rev_df['pair'].apply(lambda x: pair2gold[x])
range_rev_df['SE'] = (range_rev_df['test_score'] - range_rev_df['gold_score']) ** 2
rmse_score = np.sqrt(range_rev_df['SE'].mean())

In [55]:
rmse_score

59.157206209846436