In [54]:
import json


filepath_gpt4_turbo = 'results/wiki20m_rand_100_gpt-4-1106-preview_detailed.json'
filepath_gt = 'results/wiki20m_rand_100_groundtruth_detailed.json'
filepath_llama2 = 'results/wiki20m_rand_100_llama-2-70b_detailed.json'
filepath_openchat = 'results/wiki20m_rand_100_openchat_detailed.json'


def _load_json(path):
    """Return the parsed content of the JSON file at ``path``.

    The file is decoded explicitly as UTF-8 rather than with the platform's
    default encoding, because the sample sentences used as keys contain
    non-ASCII characters.
    """
    with open(path, encoding='utf-8') as f:
        return json.load(f)


# One result object per system, loaded from its own detailed-results file.
gpt4_turbo = _load_json(filepath_gpt4_turbo)
gt = _load_json(filepath_gt)
llama2 = _load_json(filepath_llama2)
openchat = _load_json(filepath_openchat)

In [55]:
# Give every sample key an integer id: its position in the GPT-4-Turbo results.
sample_to_id = {sample: position for position, sample in enumerate(gpt4_turbo)}

sample_to_id

{'The official announcement was made during a press conference at East River Park in Manhattan by New York City Mayor Michael Bloomberg and National Lacrosse League Commissioner Jim Jennings .': 0,
 'MySQL Cluster is implemented through the NDB or NDBCLUSTER storage engine for MySQL ( " NDB " stands for Network Database ) .': 1,
 'Shanty Run joins Quinn Run upstream of its mouth .': 2,
 'Following the passage of a Genoa low the rivers Rhine , Moselle , Main , Danube , Weser , Werra , Unstrut , Elbe , Vltava and their tributaries inundated large areas .': 3,
 'Its architect was Guðjón Samúelsson , who also designed Hallgrímskirkja , a Reykjavik landmark , and Akureyrarkirkja in Akureyri , North Iceland .': 4,
 'The April 1972 election of Jerry DeGrieck and Nancy Wechsler to the Ann Arbor city council on the Human Rights Party ticket would signal changes for LGBT rights in the state .': 5,
 'In July 2006 it performed Leonard Bernstein \'s comic operetta " Candide "   to much acclaim ) , 

In [56]:
import pandas as pd

metrics_names = ['Completeness', 'Factualness', 'Granularity', 'Topical', 'Uniqueness']
# Maps each metric's display name to the abbreviated key used inside the loaded results.
metrics_names_to_abbr = {
    'Completeness': 'CS',
    'Factualness': 'FS',
    'Granularity': 'GS',
    'Topical': 'TS',
    'Uniqueness': 'US'
}

# Column order shared by every per-metric table: the sample id, then one score column per system.
gre_columns = ["sample_id", "Groundtruth", "LLaMA2-70b", "OpenChat", "GPT-4-Turbo"]

# metric display name -> DataFrame with one row per sample.
gre_results_dict = {}

for metric_name in metrics_names:
    metric = metrics_names_to_abbr[metric_name]

    # Collect all rows first and build the frame once; appending with pd.concat
    # inside the loop copies the whole frame on every iteration.
    records = [
        {
            "sample_id": sample_id,
            "Groundtruth": gt[key][metric],
            "LLaMA2-70b": llama2[key][metric],
            "OpenChat": openchat[key][metric],
            "GPT-4-Turbo": gpt4_turbo[key][metric],
        }
        for key, sample_id in sample_to_id.items()
    ]

    # Passing `columns` keeps the column layout even when there are no samples.
    gre_results_dict[metric_name] = pd.DataFrame(records, columns=gre_columns)


    

In [57]:
# Display the per-sample Completeness table (sample id plus one score column per system).
gre_results_dict['Completeness']

Unnamed: 0,sample_id,Groundtruth,LLaMA2-70b,OpenChat,GPT-4-Turbo
0,0,1.0,0.5,0.0,0.5
1,1,1.0,0.0,0.0,0.0
2,2,1.0,1.0,1.0,1.0
3,3,1.0,0.0,0.0,0.5
4,4,1.0,1.0,1.0,1.0
...,...,...,...,...,...
95,95,1.0,0.0,0.0,0.0
96,96,0.0,0.0,1.0,0.0
97,97,1.0,0.0,0.5,0.5
98,98,1.0,0.0,0.0,0.0


In [58]:
import os
# Function to create pairwise comparisons between models
def create_pairwise_comparisons(df, threshold=0.2):
    """Build one win/tie record for every unordered pair of models on every sample.

    Parameters
    ----------
    df : pandas.DataFrame
        The first column holds the sample id. Every remaining column holds one
        model's score for that sample; the column name is used as the model name.
    threshold : float, default 0.2
        Minimum score margin required to declare a winner. A pair whose scores
        differ by less than this in both directions is recorded as a tie.

    Returns
    -------
    pandas.DataFrame
        Columns ``sample_id``, ``model_A_name``, ``model_B_name`` and ``win``,
        where ``win`` is ``'model_A_win'``, ``'model_B_win'`` or ``'tie'``.
        Rows follow the input row order; within a sample, pairs follow the
        column order (model A is always the column further to the left).
    """
    output_columns = ["sample_id", "model_A_name", "model_B_name", "win"]

    # Every column after the first is treated as a model.
    model_columns = list(df.columns[1:])
    n_models = len(model_columns)

    records = []

    # itertuples keeps each column's own dtype. iterrows would squeeze the row
    # into a single dtype and turn an integer sample id into a float.
    for values in df.itertuples(index=False, name=None):
        sample_id = values[0]
        scores = values[1:]

        # Compare each model with every model to its right.
        for i in range(n_models):
            for j in range(i + 1, n_models):
                score_a = scores[i]
                score_b = scores[j]

                if score_a - score_b >= threshold:
                    win = 'model_A_win'
                elif score_b - score_a >= threshold:
                    win = 'model_B_win'
                else:
                    win = 'tie'

                records.append({
                    "sample_id": sample_id,
                    "model_A_name": model_columns[i],
                    "model_B_name": model_columns[j],
                    "win": win,
                })

    # Build the frame once; `columns` keeps the layout when there are no records.
    return pd.DataFrame(records, columns=output_columns)


In [59]:
# Directory that receives one pairwise-comparison CSV per metric.
gre_output_dir = 'results/gre'

# to_csv does not create missing directories, so make sure the target exists
# before writing (a no-op when it is already there).
os.makedirs(gre_output_dir, exist_ok=True)

for metric_name in metrics_names:
    df = gre_results_dict[metric_name]
    pairwise_comparisons_df = create_pairwise_comparisons(df)

    # One file per metric, e.g. "Completeness_GREScores.csv".
    csv_filename = f'{metric_name}_GREScores.csv'

    pairwise_comparisons_df.to_csv(os.path.join(gre_output_dir, csv_filename), index=False)