# MCQA

In [23]:
import pandas as pd

def get_metrics_by_direction(dataframe: pd.DataFrame):
    for direction in dataframe["Direction"].unique():
        print(direction)
        subset = dataframe[dataframe["Direction"] == direction].copy()
        for model in subset["model"].unique():
            if model != "gemma2-9b-it":
                subset_2 = subset[subset["model"] == model].copy()
                correct = subset_2["ground_truth"] == subset_2["extracted_answer"]
                print(f"{model}, {round(sum(correct) * 100 / len(correct), 2)}")
        print("\n")

In [24]:
# Kaz
kaz = pd.read_parquet(r"C:\Users\nikol\OneDrive\Desktop\project_repos\CMLEval\data\results\MCQA\KazMMLU_gpt_groq.parquet.gzip")
get_metrics_by_direction(kaz)

en_question
gpt-4.1-mini-2025-04-14, 92.62
gpt-4.1-2025-04-14, 96.91
o4-mini-2025-04-16, 93.83
llama3-8b-8192, 62.78
llama3-70b-8192, 77.87
qwen-qwq-32b, 82.33
deepseek-r1-distill-llama-70b, 89.71
mistral-saba-24b, 82.16


en_answer
gpt-4.1-mini-2025-04-14, 92.45
gpt-4.1-2025-04-14, 95.54
o4-mini-2025-04-16, 91.08
llama3-8b-8192, 65.18
llama3-70b-8192, 75.3
qwen-qwq-32b, 82.16
deepseek-r1-distill-llama-70b, 87.31
mistral-saba-24b, 80.1




# SFQA

In [51]:
import pickle

path = r"C:\Users\nikol\OneDrive\Desktop\project_repos\CMLEval\data\testing_results\GenQA\BLEnD-groq.checkpoint.pickle"
with open(path, "rb") as file:
    data = pickle.load(file)
data

{'llama3-8b-8192': [{'prompt': 'Look at the question, think about it, and write a short answer to it.\nWrite out your thought process, and in the end, put the correct short answer in square brackets: [steak and fries], [كرة القدم], etc.\nMake sure the short answer is in Arabic only.\n\nQuestion: What is a common snack for preschool kids in Algeria?\nAnswer:',
   'output': 'A question that requires cultural knowledge!\n\nLet me think...\n\nAlgeria is a North African country with a rich cultural heritage. Preschool kids, or young children, typically love snacks that are easy to eat, fun, and nourishing. In Algeria, a common snack for preschool kids might be something familiar and comforting...\n\nAh, got it!\n\nThe common snack for preschool kids in Algeria is likely to be " Khoubz bil zait" (خبز بالزيト), which is a popular Algerian snack consisting of a crispy bread filled with butter or olive oil.\n\n[خبز بالزيト]',
   'ground_truth': array(['شطائر الخبز بالمربى', 'خبز بالمربى والزبدة',

In [12]:
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/LaBSE')

def semantic_similarities(pred: str, targets: list[str]) -> list[tuple[str, float]]:
    q_emb = model.encode([pred])
    t_embs = model.encode(targets)
    scores = cosine_similarity(q_emb, t_embs)[0]
    for t, s in zip(targets, scores):
        print(f"{t}: {round(s, 2)}")
    return scores[np.argmax(scores)] * 100

In [70]:
import re

n = 9
text = data["llama3-70b-8192"][n]["output"]
pattern = r"\[.+\]"
text = re.search(pattern, text)[0]

ground_truth = data["llama3-70b-8192"][n]["ground_truth"]
print(text)
print("\n")
print(ground_truth)

[الحديقة]


['صابلات' 'متنزه الصابلات' 'حديقة التجارب' 'الحديقة العمومية' 'المنتزه'
 'البحر' 'مراكز التسوق']


In [71]:
semantic_similarities(text, ground_truth)

صابلات: 0.15000000596046448
متنزه الصابلات: 0.38999998569488525
حديقة التجارب: 0.5
الحديقة العمومية: 0.6399999856948853
المنتزه: 0.3400000035762787
البحر: 0.3100000023841858
مراكز التسوق: 0.25


np.float32(63.90846)