In [52]:
import pandas as pd
import os
import json

In [53]:
def fetch_results(dataset_name, model_name, dataset_split):
    """Collect per-language contamination results for one dataset/model/split.

    Scans ``generated_quiz_answers/{dataset_name}/{model_name}/{dataset_split}``
    and, for every language sub-directory found there, reads
    ``quiz_answers.csv`` and ``contamination.json`` (keys ``"score"`` and
    ``"contamination"``).

    Args:
        dataset_name: Dataset directory name; copied into the ``Dataset`` column.
        model_name: Model directory name. A trailing ``_rerun`` is removed from
            the value reported in the ``Model`` column; the directory lookup
            still uses the name exactly as given.
        dataset_split: Split directory name; copied into the ``Split`` column.

    Returns:
        pandas.DataFrame with one row per language directory and the columns
        ``Language``, ``Score``, ``Contamination``, ``Total_points`` (number of
        rows in ``quiz_answers.csv``), ``Model``, ``Split`` and ``Dataset``.
        The frame is empty, but still has these columns, when no language
        directory exists.

    Raises:
        FileNotFoundError: If the split directory, or one of the two expected
            files inside a language directory, does not exist.
        KeyError: If ``contamination.json`` lacks ``"score"`` or
            ``"contamination"``.
    """
    columns = [
        "Language",
        "Score",
        "Contamination",
        "Total_points",
        "Model",
        "Split",
        "Dataset",
    ]
    rerun_suffix = "_rerun"
    directory = f"generated_quiz_answers/{dataset_name}/{model_name}/{dataset_split}"
    # Only the reported label is normalised. Stripping the suffix (instead of
    # cutting at the first underscore) keeps model names that themselves
    # contain underscores intact.
    if model_name.endswith(rerun_suffix):
        model_label = model_name[: -len(rerun_suffix)]
    else:
        model_label = model_name

    results = []
    for language in os.listdir(directory):
        language_directory = os.path.join(directory, language)
        if not os.path.isdir(language_directory):
            # Stray files next to the language folders are not results.
            continue

        df_ans = pd.read_csv(os.path.join(language_directory, "quiz_answers.csv"))
        json_file = os.path.join(language_directory, "contamination.json")
        with open(json_file, encoding="utf-8") as f:
            data = json.load(f)

        results.append(
            {
                "Language": language,
                "Score": data["score"],
                "Contamination": data["contamination"],
                "Total_points": len(df_ans),
                "Model": model_label,
                "Split": dataset_split,
                "Dataset": dataset_name,
            }
        )

    # Passing the columns explicitly keeps the schema stable for empty results.
    return pd.DataFrame(results, columns=columns)

In [54]:
# Compile the results of every dataset/model pair found under
# generated_quiz_answers into a single frame `df`.
dataset_names = os.listdir("generated_quiz_answers")
models = ["dev-moonshot", "palm-32k"]
frames = []
for dataset_name in dataset_names:
    for model_name in models:
        # Prefer the "<model>_rerun" directory when one exists for this dataset.
        if os.path.exists(f"generated_quiz_answers/{dataset_name}/{model_name}_rerun"):
            model_name = f"{model_name}_rerun"
        # Use the test split when present, otherwise fall back to validation.
        if os.path.exists(f"generated_quiz_answers/{dataset_name}/{model_name}/test"):
            split = "test"
        else:
            split = "validation"
        try:
            frames.append(fetch_results(dataset_name, model_name, split))
        except Exception as exc:
            # Best effort: report the failing pair together with its cause and
            # carry on. KeyboardInterrupt is deliberately not swallowed.
            print(f"Error in {dataset_name} {model_name}: {exc!r}")

# Concatenate once instead of re-copying the accumulated frame per iteration.
# Each frame keeps its own index, as before.
df = pd.concat(frames) if frames else pd.DataFrame()

In [55]:
# Persist the compiled results (index included) as an Excel workbook.
df.to_excel(excel_writer="contamination_results_compiled.xlsx")

In [56]:
# Preview the first five rows of the compiled results.
df.head(n=5)

Unnamed: 0,Language,Score,Contamination,Total_points,Model,Split,Dataset
0,es,0.743902,0.658537,82,dev-moonshot,test,paws-x
1,fr,0.78125,0.708333,96,dev-moonshot,test,paws-x
2,zh,0.735849,0.647799,53,dev-moonshot,test,paws-x
3,de,0.829787,0.77305,94,dev-moonshot,test,paws-x
4,ko,0.583333,0.444444,60,dev-moonshot,test,paws-x


In [57]:
# Report dev-moonshot under its display name. The result is assigned back to
# the column: an inplace replace on the selected column is a chained
# operation that is not guaranteed to update `df` itself.
df["Model"] = df["Model"].replace({"dev-moonshot": "GPT-4"})
# Keep only what the per-dataset tables need. errors="ignore" makes the cell
# safe to re-run after the columns have already been removed.
df = df.drop(columns=["Score", "Total_points", "Split"], errors="ignore")
# Round contamination to two decimals for reporting.
df["Contamination"] = df["Contamination"].apply(lambda x: round(x, 2))

In [59]:
# Build one Model x Language contamination table per dataset, keep it in
# `df_dict`, and export each table to results/contamination_results_<dataset>.xlsx.
os.makedirs("results", exist_ok=True)  # the export below needs the folder to exist
datasets = df["Dataset"].unique()
df_dict = {}
for dataset in datasets:
    dataset_df = df[df["Dataset"] == dataset].drop(columns=["Dataset"]).reset_index(drop=True)
    # Rows: models, columns: languages, cells: contamination.
    df_dict[dataset] = dataset_df.pivot(
        index="Model", columns="Language", values="Contamination"
    ).reset_index()
    df_dict[dataset].to_excel(f"results/contamination_results_{dataset}.xlsx")


Bad pipe message: %s [b't\x02ln#k\xd3]\r\x9a\xdd\x08\xb3m_\x87\x03\xcd N\xe8\x06v\xbd}\xe1\xf5']
Bad pipe message: %s [b'\xbc\xa2\xd5\x92)\xaa\xd2\x9fO\xbd\xf0"\xe7K\x86\xe5 R\x00\x00|\xc0,\xc00\x00\xa3\x00\x9f\xcc\xa9\xcc\xa8\xcc\xaa\xc0', b"\xad\xc0\xa3\xc0\x9f\xc0]\xc0a\xc0W\xc0S\xc0+\xc0/\x00\xa2\x00\x9e\xc0\xae\xc0\xac\xc0\xa2\xc0\x9e\xc0\\\xc0`\xc0V\xc0R\xc0$\xc0(\x00k\x00j\xc0#\xc0'\x00g\x00@\xc0\n\xc0\x14\x009\x008\xc0\t\xc0\x13\x003\x002\x00\x9d\xc0\xa1\xc0\x9d\xc0Q\x00\x9c\xc0\xa0\xc0\x9c\xc0P\x00=\x00<\x005\x00/\x00\x9a\x00\x99\xc0\x07\xc0\x11\x00\x96\x00\x05\x00\xff\x01\x00\x00j\x00\x00\x00\x0e\x00\x0c\x00\x00\t127.0.0.1\x00\x0b\x00\x04\x03\x00\x01\x02\x00\n\x00\x0c\x00\n\x00\x1d\x00\x17\x00\x1e\x00\x19\x00\x18\x00#\x00\x00\x00\x16\x00\x00\x00\x17\x00\x00\x00\r\x000\x00.\x04\x03\x05"]
Bad pipe message: %s [b'\x03\x08']
Bad pipe message: %s [b'\x08\x08\t\x08\n\x08']
Bad pipe message: %s [b'\x04\x08\x05\x08\x06\x04\x01\x05\x01\x06']
Bad pipe message: %s [b'', b'\x03\x03']
Bad

In [14]:
# udpos test-split results for the dev-moonshot model.
results_udpos_gpt = fetch_results(
    dataset_name="udpos",
    model_name="dev-moonshot",
    dataset_split="test",
)

In [15]:
# udpos test-split results for the palm-32k model.
results_udpos_palm = fetch_results(
    dataset_name="udpos",
    model_name="palm-32k",
    dataset_split="test",
)

In [16]:
# Display the palm-32k udpos results frame.
results_udpos_palm

Unnamed: 0,Language,Score_palm-32k,Contamination_palm-32k,Total_points_palm-32k
0,fr,0.34,0.12,109
1,zh,0.15,-0.133333,110
2,el,0.16,-0.12,108
3,bg,0.1,-0.2,110
4,fi,0.15,-0.133333,110
5,pl,0.09,-0.213333,110
6,it,0.23,-0.026667,106
7,et,0.1,-0.2,109
8,ro,0.15,-0.133333,109
9,hu,0.13,-0.16,106


In [14]:
# Display the compiled results frame.
df

Unnamed: 0,Language,Score,Contamination,Total_points,Model,Split,Dataset
0,es,0.743902,0.658537,82,dev-moonshot,test,paws-x
1,fr,0.781250,0.708333,96,dev-moonshot,test,paws-x
2,zh,0.735849,0.647799,53,dev-moonshot,test,paws-x
3,de,0.829787,0.773050,94,dev-moonshot,test,paws-x
4,ko,0.583333,0.444444,60,dev-moonshot,test,paws-x
...,...,...,...,...,...,...,...
4,ja,0.790000,0.720000,141,palm-32k,validation,tydiqa
5,id,0.370000,0.160000,129,palm-32k,validation,tydiqa
6,th,0.380000,0.173333,144,palm-32k,validation,tydiqa
7,en,0.730000,0.640000,139,palm-32k,validation,tydiqa


In [17]:

# Join the two per-model frames on language. Explicit suffixes label any
# column present in both frames by its model instead of pandas' default
# "_x"/"_y"; columns that do not overlap keep their names unchanged.
results_udpos_combined = results_udpos_gpt.merge(
    results_udpos_palm,
    on="Language",
    suffixes=("_dev-moonshot", "_palm-32k"),
)

In [18]:
# Display the merged dev-moonshot / palm-32k udpos frame.
results_udpos_combined

Unnamed: 0,Language,Score_dev-moonshot,Contamination_dev-moonshot,Total_points_dev-moonshot,Score_palm-32k,Contamination_palm-32k,Total_points_palm-32k
0,fr,0.32,0.093333,110,0.34,0.12,109
1,zh,0.3,0.066667,110,0.15,-0.133333,110
2,el,0.38,0.173333,110,0.16,-0.12,108
3,bg,0.29,0.053333,110,0.1,-0.2,110
4,fi,0.28,0.04,110,0.15,-0.133333,110
5,pl,0.24,-0.013333,110,0.09,-0.213333,110
6,it,0.27,0.026667,109,0.23,-0.026667,106
7,et,0.37,0.16,109,0.1,-0.2,109
8,ro,0.06,-0.253333,110,0.15,-0.133333,109
9,hu,0.4,0.2,110,0.13,-0.16,106


In [None]:
# NOTE(review): looks like an unfinished expression; no `results_` name is
# defined in the visible cells, so complete or remove this cell before running.
results_