In [1]:
import pandas as pd
import os


# 1. Data loading

In [2]:
# Load the reference questions and normalise their column names.
# (Chained rename instead of `inplace=True`, so re-running the cell is idempotent.)
question_df = pd.read_csv("data/raw/questions.csv").rename(
    columns={
        "id": "question_id",
        "Answer": "expected_answer",
        "Category": "category",
        "Question": "question_text",
        "Sheet Name": "sheet_name",
    }
)

answer_folder = "data/processed"

# Every file in `answer_folder` is read as one model's answers, whose columns
# carry a "_<model name>" suffix (e.g. "answer_<model>", "time_taken_<model>").
# The per-model frames are collected in a list and concatenated once, rather
# than re-copying a growing DataFrame on every iteration.
model_frames = []
# sorted(): os.listdir returns entries in arbitrary order, which would make the
# row order of `answers_df` differ between machines/runs.
for file in sorted(os.listdir(answer_folder)):
    file_path = os.path.join(answer_folder, file)
    # Skip hidden files and sub-folders (e.g. ".ipynb_checkpoints"), which
    # would otherwise make read_csv fail.
    if file.startswith(".") or not os.path.isfile(file_path):
        continue
    answer_df = pd.read_csv(file_path)
    answer_columns = [cname for cname in answer_df.columns if cname.startswith("answer_")]
    if not answer_columns:
        raise ValueError(f"{file_path}: no 'answer_<model>' column found")
    # Split on the FIRST underscore only, so model names that themselves
    # contain "_" are kept intact.
    model_name = answer_columns[0].split("_", 1)[1]
    suffix = "_" + model_name
    # Strip the model suffix only where it terminates the column name.
    answer_df.columns = [
        cname[: -len(suffix)] if cname.endswith(suffix) else cname
        for cname in answer_df.columns
    ]
    answer_df["model"] = model_name
    model_frames.append(answer_df)

if not model_frames:
    raise FileNotFoundError(f"no answer files found in {answer_folder!r}")

answers_df = pd.concat(model_frames, ignore_index=True)
# Left join: keep every model answer and attach the expected answer and the
# question text by question_id.
answers_df = answers_df.merge(
    question_df[["question_id", "expected_answer", "question_text"]],
    on="question_id",
    how="left",
)
# Cast both answer columns to str, as before.
answers_df['answer'] = answers_df['answer'].astype(str)
answers_df['expected_answer'] = answers_df['expected_answer'].astype(str)

In [3]:
# Preview the combined frame: one row per (model, question), with the expected
# answer and question text attached by the merge in the loading cell.
answers_df

Unnamed: 0,answer,time_taken,question_id,model,expected_answer,question_text
0,In which state is Gettysburg and the Liberty B...,0 days 00:00:00.360079,4414,HuggingFaceTB/SmolLM2-135M-Instruct,Pennsylvania,In which state are Gettysburg and the Liberty ...
1,The last book of the Bible is:\n\n- The Book o...,0 days 00:00:00.624916,15342,HuggingFaceTB/SmolLM2-135M-Instruct,Revelations,What is the last book of the Bible
2,Richard Branson\n\nRichard Branson is a Britis...,0 days 00:00:00.585978,7626,HuggingFaceTB/SmolLM2-135M-Instruct,Virgin,Name The Company Originally Started By (And La...
3,The world busiest port handling the most tonna...,0 days 00:00:00.601287,39903,HuggingFaceTB/SmolLM2-135M-Instruct,Rotterdam,What Is The Worlds Busiest Seaport Handling Th...
4,The first album Roger Waters released after le...,0 days 00:00:00.518937,7295,HuggingFaceTB/SmolLM2-135M-Instruct,The Pros and Cons of Hitch Hiking,What was the first album Roger Waters released...
...,...,...,...,...,...,...
1395,"New York City, United States",0 days 00:00:00.474274,20198,Qwen/Qwen2.5-1.5B-Instruct,Highgate,What cemetery is Karl Marx buried in
1396,Baja赛车,0 days 00:00:00.474274,34430,Qwen/Qwen2.5-1.5B-Instruct,Big wheel,Plastic vehicle equipped with spin_out brake.
1397,The Price is Right,0 days 00:00:00.474274,31735,Qwen/Qwen2.5-1.5B-Instruct,Press Your Luck,"What 80's game show featured the ""Whammy""?"
1398,Slide rule,0 days 00:00:00.474274,15014,Qwen/Qwen2.5-1.5B-Instruct,Slide rule,With what invention is the name James Oughtred...


# 2. Evaluation

In [4]:
from src.types import Question
from src.evaluation.index import evaluation_function

def _grade_row(row):
    """Pass one row's expected answer and model answer to `evaluation_function`."""
    return evaluation_function(row['expected_answer'], row['answer'])


# Grade every (model, question) row and store the verdict next to the answer.
answers_df['is_correct'] = answers_df.apply(_grade_row, axis=1)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Mean of the `is_correct` flag per model, laid out as one column per model.
answers_df.pivot_table(
    values="is_correct",
    columns="model",
    aggfunc="mean",
)

model,HuggingFaceTB/SmolLM2-135M-Instruct,HuggingFaceTB/SmolLM2-360M-Instruct,Qwen/Qwen2.5-1.5B-Instruct,google/flan-t5-base,google/flan-t5-small,google/gemma-2-2b-it
is_correct,0.07,0.135,0.205,0.045,0.04,0.295


In [8]:
# Show every graded row that belongs to the gemma-2-2b-it model.
is_gemma = answers_df['model'] == "google/gemma-2-2b-it"
answers_df.loc[is_gemma]

Unnamed: 0,answer,time_taken,question_id,model,expected_answer,question_text,is_correct
800,Pennsylvania \n,0 days 00:00:00.694689,4414,google/gemma-2-2b-it,Pennsylvania,In which state are Gettysburg and the Liberty ...,True
801,Revelation \n,0 days 00:00:00.694689,15342,google/gemma-2-2b-it,Revelations,What is the last book of the Bible,False
802,Virgin Records \n,0 days 00:00:00.694689,7626,google/gemma-2-2b-it,Virgin,Name The Company Originally Started By (And La...,True
803,Singapore \n,0 days 00:00:00.694689,39903,google/gemma-2-2b-it,Rotterdam,What Is The Worlds Busiest Seaport Handling Th...,False
804,"""The Pros and Cons Of Hitchhiking"" \n",0 days 00:00:00.694689,7295,google/gemma-2-2b-it,The Pros and Cons of Hitch Hiking,What was the first album Roger Waters released...,False
...,...,...,...,...,...,...,...
995,London Cemetery \n,0 days 00:00:00.694689,20198,google/gemma-2-2b-it,Highgate,What cemetery is Karl Marx buried in,False
996,Remote control car \n,0 days 00:00:00.694689,34430,google/gemma-2-2b-it,Big wheel,Plastic vehicle equipped with spin_out brake.,False
997,Double Dare \n,0 days 00:00:00.694689,31735,google/gemma-2-2b-it,Press Your Luck,"What 80's game show featured the ""Whammy""?",False
998,Typewriter \n,0 days 00:00:00.694689,15014,google/gemma-2-2b-it,Slide rule,With what invention is the name James Oughtred...,False


In [6]:
# Pull out a single row to inspect: gemma-2-2b-it's answer to question 7295.
# Both conditions are combined into ONE mask over `answers_df`. The previous
# chained `.loc[...].loc[...]` applied a mask built from the unfiltered frame
# to the already-filtered frame, which only worked through implicit index
# alignment.
failing_eval = answers_df.loc[
    (answers_df['model'] == "google/gemma-2-2b-it")
    & (answers_df['question_id'] == 7295)
]
# Show the model answer next to the expected answer as plain lists, so
# whitespace and quoting differences are visible.
failing_eval['answer'].tolist(), failing_eval['expected_answer'].tolist()

(['"The Pros and Cons Of Hitchhiking" \n'],
 ['The Pros and Cons of Hitch Hiking'])

In [19]:
" xk s x".strip()

'xk s x'

In [18]:
# Side-by-side comparison: one row per question, one column per model,
# keeping the first answer found for each (question, model) pair.
answers_df.pivot_table(
    values="answer",
    index="question_id",
    columns="model",
    aggfunc="first",
)

model,HuggingFaceTB/SmolLM2-135M-Instruct,HuggingFaceTB/SmolLM2-360M-Instruct,Qwen/Qwen2.5-1.5B-Instruct,google/flan-t5-base,google/flan-t5-small,google/gemma-2-2b-it
question_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
148,Ralph Freeman designed the Sydney Opera House,Ralph Freeman designed the Sydney Opera House.,Sydney Opera House,sydney harbour,Carlton,Sydney Opera House \n
502,Dame Margot Fonteyn is famous for her role as ...,Dame Margot Fonteyn is famous for her role in ...,Dame Margot Fonteyn was famous for her dance p...,Dame Margott Fonteyn,1945,Dame Margot Fonteyn \n
843,"The answer to this question is ""Galaxy""",Our galaxy is commonly known as the Milky Way.,Milky Way,saturn,galaxy,Milky Way \n
1176,"The answer to the question ""Its all Greek to m...","""Oedipus Rex""","""Antigone""",romeo and juliet,"""The Greeks""",The Merchant of Venice \n
2594,"The answer to this question is: ""1978""",The crew that sank in the 1978 boat race was t...,The Nereus,samuel scott,a sank crew,University of Wales\n
...,...,...,...,...,...,...
45750,4/8 = 0.454545...,4/8 = 0.5,0.5,0.8,0.5,0.5 \n
45926,"The term used in hockey is ""hockey,"" not ""hock...","The term ""hockey"" is not used in hockey.",Cycling,hockey,Hockey,Faceoff \n
46952,"The answer to the question ""Who was the Favour...","Impressionists, such as Claude Monet and Pierr...",Claude Monet,henry viii,a sexy sexy sexy,Edgar Degas \n
47268,Australia: Wagga Wagga\n\nAustralia is a count...,"Wagga Wagga is a town in New South Wales, Aust...","""Wagga Wagga"" means ""Wattle town.""",a syllable,Australia,"""Wagga Wagga"" means ""Where the water flows"" \n"


In [6]:
# `answer_df` is the loop variable left over from the loading cell, i.e. the
# frame of the last answer file that was read.
# NOTE(review): the saved output below still shows model-suffixed column names
# and no "model" column, so it appears to predate the renaming step in the
# loader - re-run after a kernel restart to refresh it.
answer_df.head()

Unnamed: 0,answer_HuggingFaceTB/SmolLM2-135M-Instruct,time_taken_HuggingFaceTB/SmolLM2-135M-Instruct,question_id
0,In which state is Gettysburg and the Liberty B...,0 days 00:00:00.360079,4414
1,The last book of the Bible is:\n\n- The Book o...,0 days 00:00:00.624916,15342
2,Richard Branson\n\nRichard Branson is a Britis...,0 days 00:00:00.585978,7626
3,The world busiest port handling the most tonna...,0 days 00:00:00.601287,39903
4,The first album Roger Waters released after le...,0 days 00:00:00.518937,7295
