<a href="https://colab.research.google.com/github/mariah0134/Data-Science-Project/blob/main/Notebooks/Phase-3/models_and_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Baseline Objective Metrics

In [None]:
import pandas as pd
import re

# Input files for the baseline evaluation. Both are read from the current
# working directory.
gt_file = "test_set_ground_truth (1).csv"   # ground truth
model_file = "model_a_results (1).csv"      # baseline results

gt = pd.read_csv(gt_file)
model = pd.read_csv(model_file)

# Show the column names of both frames so the merge keys below can be checked.
print("Ground Truth columns:", gt.columns)
print("Model columns:", model.columns)



# ===== 2) Normalise the question text and merge =====
def normalize_question(s):
    """Return *s* with whitespace normalised, for use as a join key.

    Leading/trailing whitespace is dropped and every internal run of
    whitespace becomes a single space. Anything that is not a ``str``
    yields the empty string.
    """
    return " ".join(s.split()) if isinstance(s, str) else ""

# Build a whitespace-normalised join key on both frames.
gt["question_norm"] = gt["Question"].apply(normalize_question)
model["question_norm"] = model["question"].apply(normalize_question)

# Inner join: only questions whose normalised text occurs in both files are kept.
# NOTE(review): a question_norm value repeated on either side would multiply
# rows in the result — confirm the keys are unique.
merged = pd.merge(
    model,
    gt[["question_norm", "Question", "Answer", "Source", "Topic"]],
    on="question_norm",
    how="inner",
)

# Prints the number of questions left after the merge.
print("عدد الأسئلة بعد الدمج:", len(merged))


# ===== 3) Tokenisation and overlap helpers =====
def simple_tokenize_ar(s):
    """Split Arabic (or mixed-script) text into a list of word tokens.

    Args:
        s: The text to tokenise. Any non-``str`` value (``None``, ``NaN``,
            numbers) is treated as missing.

    Returns:
        A list of tokens in order of appearance; ``[]`` for missing or
        empty input.

    Normalisation applied before splitting on whitespace:
      * tatweel (U+0640) and Arabic diacritics are deleted, so an elongated
        or vowelled spelling yields the same token as the plain word;
      * every remaining character that is neither a word character nor
        whitespace becomes a separator. ``\\w`` already matches Arabic
        letters and digits, so Arabic punctuation such as "،", "؟" and "؛"
        is dropped along with Latin punctuation.
    """
    if not isinstance(s, str):
        return []
    # Delete (do not replace with a space) marks that live inside a word,
    # otherwise a single word would be split into fragments.
    s = re.sub(
        r"[\u0640\u0610-\u061A\u064B-\u065F\u0670"
        r"\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]",
        "",
        s,
    )
    # No explicit Arabic range here: whitelisting the whole block would keep
    # Arabic punctuation attached to the neighbouring word.
    s = re.sub(r"[^\w\s]", " ", s)
    return s.split()

def overlap_scores(gold_answer, pred_text):
    """Lexical overlap between the reference answer and a retrieved chunk.

    Both texts are tokenised with ``simple_tokenize_ar`` and compared as
    sets of unique tokens.

    Returns:
        A 3-tuple of floats:
          * Jaccard index: |gold ∩ pred| / |gold ∪ pred|
          * share of the chunk's unique tokens that also occur in the answer
          * share of the answer's unique tokens that also occur in the chunk
        ``(0.0, 0.0, 0.0)`` when either argument is not a ``str`` or
        produces no tokens.
    """
    no_overlap = (0.0, 0.0, 0.0)
    if not (isinstance(gold_answer, str) and isinstance(pred_text, str)):
        return no_overlap

    gold_vocab = set(simple_tokenize_ar(gold_answer))
    pred_vocab = set(simple_tokenize_ar(pred_text))
    if not (gold_vocab and pred_vocab):
        return no_overlap

    shared = len(gold_vocab & pred_vocab)
    combined = len(gold_vocab | pred_vocab)
    return shared / combined, shared / len(pred_vocab), shared / len(gold_vocab)


# ===== 4) Overlap metrics for the top-1 / top-2 / top-3 retrieved chunks =====
# overlap_scores(gold, chunk) returns, in this order:
#   jaccard, |gold ∩ chunk| / |chunk|, |gold ∩ chunk| / |gold|
# The lists below are named after what they hold so the column assignment
# cannot be mixed up: "chunk_in_ans" is the share of the chunk found in the
# answer, "ans_in_chunk" the share of the answer found in the chunk.
j1_list, c1_list, c1g_list = [], [], []
j2_list, c2_list, c2g_list = [], [], []
j3_list, c3_list, c3g_list = [], [], []

for _, row in merged.iterrows():
    gold = row["Answer"]

    # cK  = chunk_in_ans (second value), cKg = ans_in_chunk (third value)
    j1, c1, c1g = overlap_scores(gold, row["top_retrieved_chunk"])
    j2, c2, c2g = overlap_scores(gold, row["top_2_retrieved_chunk"])
    j3, c3, c3g = overlap_scores(gold, row["top_3_retrieved_chunk"])

    j1_list.append(j1); c1_list.append(c1); c1g_list.append(c1g)
    j2_list.append(j2); c2_list.append(c2); c2g_list.append(c2g)
    j3_list.append(j3); c3_list.append(c3); c3g_list.append(c3g)

# Fix: the two coverage columns used to be swapped. "AnsInChunk" must hold
# the share of the gold answer recovered by the chunk (cKg); it drives
# Best_Coverage_AnsInChunk and the automatic correctness grade further down.
merged["Jaccard_top1"] = j1_list
merged["Coverage_top1_AnsInChunk"] = c1g_list
merged["Coverage_top1_ChunkInAns"] = c1_list

merged["Jaccard_top2"] = j2_list
merged["Coverage_top2_AnsInChunk"] = c2g_list
merged["Coverage_top2_ChunkInAns"] = c2_list

merged["Jaccard_top3"] = j3_list
merged["Coverage_top3_AnsInChunk"] = c3g_list
merged["Coverage_top3_ChunkInAns"] = c3_list


# ===== 5) Best score among the three retrieved chunks (baseline@k) =====
# Row-wise maximum over the top-1 / top-2 / top-3 columns.
merged["Best_Jaccard"] = merged[["Jaccard_top1", "Jaccard_top2", "Jaccard_top3"]].max(axis=1)
merged["Best_Coverage_AnsInChunk"] = merged[
    ["Coverage_top1_AnsInChunk", "Coverage_top2_AnsInChunk", "Coverage_top3_AnsInChunk"]
].max(axis=1)


# ===== 6) (Optional) rough automatic correctness grade for the baseline =====
def auto_correctness(cov):
    """Map a coverage score to a coarse correctness grade.

    Returns 2 when ``cov >= 0.60``, 1 when ``0.30 <= cov < 0.60`` and 0
    otherwise (including NaN, which fails every comparison).
    """
    for grade, floor in ((2, 0.60), (1, 0.30)):
        if cov >= floor:
            return grade
    return 0

# Grade every row from its best-of-three answer coverage.
merged["Baseline_Correctness_auto"] = merged["Best_Coverage_AnsInChunk"].apply(auto_correctness)


# ===== 7) Quick summary of the metrics =====
# Each figure is the mean of the named column over all merged rows.
print("\n========== Baseline Metrics (Using Answer vs Retrieved Chunk) ==========")
print(f"Average Jaccard (Top-1): {merged['Jaccard_top1'].mean():.3f}")
print(f"Average Jaccard (Best of Top-3): {merged['Best_Jaccard'].mean():.3f}")
print(f"Average Coverage Answer in Chunk (Top-1): {merged['Coverage_top1_AnsInChunk'].mean():.3f}")
print(f"Average Coverage Answer in Chunk (Best of Top-3): {merged['Best_Coverage_AnsInChunk'].mean():.3f}")
print(f"Average Baseline_Correctness_auto: {merged['Baseline_Correctness_auto'].mean():.3f}")


# ===== 8) Save a file that is ready for comparison =====
# utf-8-sig writes a BOM at the start of the file.
out_path = "baseline_evaluation_metrics.csv"
merged.to_csv(out_path, index=False, encoding="utf-8-sig")
print("\nتم حفظ ملف التقييم الكامل للبيس لاين في:", out_path)


Ground Truth columns: Index(['Question', 'Answer', 'Source', 'Topic'], dtype='object')
Model columns: Index(['question', 'top_retrieved_chunk', 'top_2_retrieved_chunk',
       'top_3_retrieved_chunk'],
      dtype='object')
عدد الأسئلة بعد الدمج: 65

Average Jaccard (Top-1): 0.049
Average Jaccard (Best of Top-3): 0.060
Average Coverage Answer in Chunk (Top-1): 0.060
Average Coverage Answer in Chunk (Best of Top-3): 0.077
Average Baseline_Correctness_auto: 0.000

تم حفظ ملف التقييم الكامل للبيس لاين في: baseline_evaluation_metrics.csv


In [None]:
# Change the names here if your files are named differently.
boe_filename = "boe_cleaned (5).csv"
zero_filename = "zero-shot(geminiQ-A).xlsx"

# Read the files.
boe = pd.read_csv(boe_filename)
zero = pd.read_excel(zero_filename)

# Print the column names of both frames to confirm them.
print("أعمدة BOE:", boe.columns)
print("أعمدة Gemini:", zero.columns)

أعمدة BOE: Index(['Section', 'Chapter', 'Article', 'Text', 'Status', 'Source'], dtype='object')
أعمدة Gemini: Index(['المادة القانونية (م)', 'السؤال', 'الإجابة'], dtype='object')


# Zero-shot

In [None]:

def normalize_article_label(s):
    """Reduce an article heading to a bare label usable as a merge key.

    Steps, in order:
      1. trim surrounding whitespace;
      2. remove every occurrence of the word "المادة";
      3. trim leading/trailing spaces, colons, hyphens, Arabic question
         marks and full stops;
      4. collapse internal whitespace runs to a single space.

    Returns ``None`` for any non-``str`` input.
    """
    if isinstance(s, str):
        without_word = s.strip().replace("المادة", "")
        trimmed = without_word.strip(" :-؟:.")
        return " ".join(trimmed.split())
    return None

# Normalised article label on both sides; used as the merge key.
boe["Article_label"] = boe["Article"].apply(normalize_article_label)


zero["Article_label"] = zero["المادة القانونية (م)"].apply(normalize_article_label)


# Left join: every generated Q&A row is kept; rows whose label has no match
# in BOE get NaN in "Article" and "Text".
# NOTE(review): pandas matches null keys to each other, and a label repeated
# in BOE would duplicate Q&A rows — confirm neither occurs in the data.
merged = pd.merge(
    zero,
    boe[["Article", "Text", "Article_label"]],
    on="Article_label",
    how="left",
    suffixes=("", "_boe")
)

# Copy the reference (gold) columns under explicit names.
merged["Gold_Article"] = merged["Article"]
merged["Gold_Text"] = merged["Text"]

# Prints the row count after the merge and how many rows have no Gold_Article.
print("عدد الصفوف بعد الدمج:", len(merged))
print("عدد الصفوف اللي ما لها Gold_Article:", merged["Gold_Article"].isna().sum())


عدد الصفوف بعد الدمج: 267
عدد الصفوف اللي ما لها Gold_Article: 33


In [None]:
import re

# ===== Tokenisation and overlap helpers =====

def simple_tokenize_ar(s):
    """Split Arabic (or mixed-script) text into a list of word tokens.

    Args:
        s: The text to tokenise. Any non-``str`` value (``None``, ``NaN``,
            numbers) is treated as missing.

    Returns:
        A list of tokens in order of appearance; ``[]`` for missing or
        empty input.

    Normalisation applied before splitting on whitespace:
      * tatweel (U+0640) and Arabic diacritics are deleted, so an elongated
        or vowelled spelling yields the same token as the plain word;
      * every remaining character that is neither a word character nor
        whitespace becomes a separator. ``\\w`` already matches Arabic
        letters and digits, so Arabic punctuation such as "،", "؟" and "؛"
        is dropped along with Latin punctuation.
    """
    if not isinstance(s, str):
        return []
    # Delete (do not replace with a space) marks that live inside a word,
    # otherwise a single word would be split into fragments.
    s = re.sub(
        r"[\u0640\u0610-\u061A\u064B-\u065F\u0670"
        r"\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]",
        "",
        s,
    )
    # No explicit Arabic range here: whitelisting the whole block would keep
    # Arabic punctuation attached to the neighbouring word.
    s = re.sub(r"[^\w\s]", " ", s)
    return s.split()

def overlap_scores(article_text, answer_text):
    """Lexical overlap between an article's text and a generated answer.

    Both texts are tokenised with ``simple_tokenize_ar`` and compared as
    sets of unique tokens.

    Returns:
        A 3-tuple of floats:
          * Jaccard index of the two token sets
          * coverage of the answer in the article: |∩| / |answer tokens|
          * coverage of the article in the answer: |∩| / |article tokens|
        ``(0.0, 0.0, 0.0)`` when either argument is not a ``str`` or
        produces no tokens.
    """
    zeros = (0.0, 0.0, 0.0)
    if not (isinstance(article_text, str) and isinstance(answer_text, str)):
        return zeros

    article_vocab = set(simple_tokenize_ar(article_text))
    answer_vocab = set(simple_tokenize_ar(answer_text))
    if not article_vocab or not answer_vocab:
        return zeros

    n_common = len(article_vocab & answer_vocab)
    n_either = len(article_vocab | answer_vocab)

    return (
        n_common / n_either,
        n_common / len(answer_vocab),
        n_common / len(article_vocab),
    )

In [None]:

# Score every generated answer against the text of its matched article.
_overlaps = [
    overlap_scores(row.get("Gold_Text"), row.get("الإجابة"))
    for _, row in merged.iterrows()
]
jaccards = [triple[0] for triple in _overlaps]
cov_answer = [triple[1] for triple in _overlaps]
cov_article = [triple[2] for triple in _overlaps]

merged["Overlap_Jaccard"] = jaccards
merged["Coverage_Answer_in_Article"] = cov_answer
merged["Coverage_Article_in_Answer"] = cov_article


def _token_count(text):
    """Number of tokens ``simple_tokenize_ar`` finds in *text*."""
    return len(simple_tokenize_ar(text))


# Text lengths, in tokens.
merged["Answer_Length_Tokens"] = merged["الإجابة"].apply(_token_count)
merged["Article_Length_Tokens"] = merged["Gold_Text"].apply(_token_count)

print("تم حساب المقاييس.")


تم حساب المقاييس.


In [None]:

# Add the annotation columns, initialised to empty strings.
# NOTE(review): presumably filled in by hand after export — confirm.
merged["Manual_Correctness"] = ""
merged["Manual_Grammar"] = ""
merged["Manual_Style"] = ""
merged["Relevance"] = ""
merged["Notes"] = ""

# Words that turn a spelled-out article number into a larger one, e.g.
# "الثامنة" -> "الثامنة عشرة" / "الثامنة والعشرون" / "الثامنة بعد المائة".
_NUMBER_CONTINUATIONS = ("عشر", "و", "بعد")

# (article-number phrase, reviewer note) for references that always score 0.
_HARD_RULES = (
    ("الثامنة", "إحالة خاطئة: المادة الصحيحة للّغة هي التاسعة."),
    ("مائة وواحد", "إحالة خاطئة: فترات الراحة في المادة 102."),
    ("مائة وسبعة", "المادة 107 ملغاة، لا يُستند إليها."),
)


def _names_article(label, phrase):
    """Return True if *label* contains *phrase* as a complete article number."""
    start = label.find(phrase)
    while start != -1:
        tail = label[start + len(phrase):].lstrip()
        # A continuation word means the phrase is only the first part of a
        # bigger number (18, 28, 108, 117, ...), which is a different article.
        if not tail.startswith(_NUMBER_CONTINUATIONS):
            return True
        start = label.find(phrase, start + 1)
    return False


def apply_hard_rules(row):
    """Apply the fixed per-article overrides to one merged row.

    Args:
        row: A mapping (pandas row or dict) with an ``"Article_label"`` entry.

    Returns:
        ``(manual_correctness, notes)``. ``manual_correctness`` is ``0`` when
        at least one rule fires and ``""`` otherwise; ``notes`` joins the
        notes of all fired rules with ``" | "`` (``""`` when none fire).
        A non-``str`` label fires no rule.

    Rules are matched on the whole article number rather than as a plain
    substring, so the rule for article 8 no longer fires for 18, 28 or 108,
    and the rule for 107 no longer fires for 117 or 127.
    """
    art_lbl = row["Article_label"]
    if not isinstance(art_lbl, str):
        return "", ""

    notes = [note for phrase, note in _HARD_RULES if _names_article(art_lbl, phrase)]
    manual_correctness = 0 if notes else ""
    return manual_correctness, " | ".join(notes)

# Run the hard rules on every merged row and store both outputs.
_rule_results = [apply_hard_rules(row) for _, row in merged.iterrows()]
mc_list = [result[0] for result in _rule_results]
notes_list = [result[1] for result in _rule_results]

merged["Manual_Correctness"] = mc_list
merged["Notes"] = notes_list

print("تم تطبيق القواعد الخاصة للمقالات (8/9، 101/102، 107).")


تم تطبيق القواعد الخاصة للمقالات (8/9، 101/102، 107).


In [None]:

# Write the full zero-shot evaluation table to CSV (utf-8-sig adds a BOM).
out_filename = "zero_shot_evaluation_full.csv"
merged.to_csv(out_filename, index=False, encoding="utf-8-sig")

print("تم حفظ ملف التقييم في:", out_filename)

# Colab-only: hand the saved file to the browser as a download.
from google.colab import files
files.download(out_filename)


تم حفظ ملف التقييم في: zero_shot_evaluation_full.csv


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

# One-shot

In [None]:
import pandas as pd
import re
from google.colab import files


# Input files: the BOE articles and the one-shot generated Q&A sheet.
boe_filename = "boe_cleaned (5).csv"
one_filename = "one-shot(gemini-Q-A).xlsx"


boe = pd.read_csv(boe_filename)
one = pd.read_excel(one_filename)

# ===== 2) Normalise the article label and merge =====
def normalize_article_label(s):
    """Turn an article heading into a bare label used as the merge key.

    The word "المادة" is removed, surrounding spaces / colons / hyphens /
    Arabic question marks / full stops are trimmed, and whitespace runs are
    collapsed to one space. Non-``str`` input yields ``None``.
    """
    if not isinstance(s, str):
        return None
    label = s.strip().replace("المادة", "").strip(" :-؟:.")
    return " ".join(label.split())

# Normalised article label on both sides; used as the merge key.
boe["Article_label"] = boe["Article"].apply(normalize_article_label)
one["Article_label"] = one["المادة القانونية (م)"].apply(normalize_article_label)

# Left join: every generated Q&A row is kept; rows whose label has no match
# in BOE get NaN in "Article" and "Text".
# NOTE(review): a label repeated in BOE would duplicate Q&A rows — confirm
# the labels are unique.
merged = pd.merge(
    one,
    boe[["Article", "Text", "Article_label"]],
    on="Article_label",
    how="left",
)

# Copy the reference (gold) columns under explicit names.
merged["Gold_Article"] = merged["Article"]
merged["Gold_Text"] = merged["Text"]


def simple_tokenize_ar(s):
    """Split Arabic (or mixed-script) text into a list of word tokens.

    Args:
        s: The text to tokenise. Any non-``str`` value (``None``, ``NaN``,
            numbers) is treated as missing.

    Returns:
        A list of tokens in order of appearance; ``[]`` for missing or
        empty input.

    Normalisation applied before splitting on whitespace:
      * tatweel (U+0640) and Arabic diacritics are deleted, so an elongated
        or vowelled spelling yields the same token as the plain word;
      * every remaining character that is neither a word character nor
        whitespace becomes a separator. ``\\w`` already matches Arabic
        letters and digits, so Arabic punctuation such as "،", "؟" and "؛"
        is dropped along with Latin punctuation.
    """
    if not isinstance(s, str):
        return []
    # Delete (do not replace with a space) marks that live inside a word,
    # otherwise a single word would be split into fragments.
    s = re.sub(
        r"[\u0640\u0610-\u061A\u064B-\u065F\u0670"
        r"\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]",
        "",
        s,
    )
    # No explicit Arabic range here: whitelisting the whole block would keep
    # Arabic punctuation attached to the neighbouring word.
    s = re.sub(r"[^\w\s]", " ", s)
    return s.split()

def overlap_scores(article_text, answer_text):
    """Lexical overlap between an article's text and a generated answer.

    Texts are tokenised with ``simple_tokenize_ar`` and compared as sets of
    unique tokens.

    Returns:
        ``(jaccard, answer_coverage, article_coverage)`` where
        ``answer_coverage`` is |∩| / |answer tokens| and
        ``article_coverage`` is |∩| / |article tokens|.
        ``(0.0, 0.0, 0.0)`` when either argument is not a ``str`` or
        produces no tokens.
    """
    both_text = isinstance(article_text, str) and isinstance(answer_text, str)
    if not both_text:
        return 0.0, 0.0, 0.0

    article_vocab = set(simple_tokenize_ar(article_text))
    answer_vocab = set(simple_tokenize_ar(answer_text))
    if not (article_vocab and answer_vocab):
        return 0.0, 0.0, 0.0

    common = len(article_vocab.intersection(answer_vocab))
    either = len(article_vocab.union(answer_vocab))
    return common / either, common / len(answer_vocab), common / len(article_vocab)

# Score every generated answer against the text of its matched article.
_overlaps = [
    overlap_scores(row["Gold_Text"], row["الإجابة"])
    for _, row in merged.iterrows()
]
j_list = [triple[0] for triple in _overlaps]
cov_ans_list = [triple[1] for triple in _overlaps]
cov_art_list = [triple[2] for triple in _overlaps]

merged["Overlap_Jaccard"] = j_list
merged["Coverage_Answer_in_Article"] = cov_ans_list
merged["Coverage_Article_in_Answer"] = cov_art_list


def _token_count(text):
    """Number of tokens ``simple_tokenize_ar`` finds in *text*."""
    return len(simple_tokenize_ar(text))


# Text lengths, in tokens.
merged["Answer_Length_Tokens"] = merged["الإجابة"].apply(_token_count)
merged["Article_Length_Tokens"] = merged["Gold_Text"].apply(_token_count)


# Summary statistics over all merged rows.
# NOTE(review): rows with no matching article have NaN Gold_Text, for which
# overlap_scores returns 0.0, so they are counted as zeros in these means —
# confirm that is intended.
n = len(merged)
avg_jaccard = merged["Overlap_Jaccard"].mean()
avg_cov_ans = merged["Coverage_Answer_in_Article"].mean()

print("\n========== One-Shot Objective Metrics ==========")
print("Number of One-Shot Q&A pairs:", n)
print("-" * 50)
print(f"Average Overlap Jaccard            : {avg_jaccard:.3f}")
print(f"Average Coverage Answer in Article : {avg_cov_ans:.3f}")

# Columns written to the output file: the three original sheet columns, the
# gold article and text, and the computed metrics.
cols_to_keep = [
    "المادة القانونية (م)",
    "السؤال",
    "الإجابة",
    "Gold_Article",
    "Gold_Text",
    "Overlap_Jaccard",
    "Coverage_Answer_in_Article",
    "Coverage_Article_in_Answer",
    "Answer_Length_Tokens",
    "Article_Length_Tokens",
]

final_df = merged[cols_to_keep]

# utf-8-sig writes a BOM at the start of the file.
out_path = "one_shot_metrics_clean.csv"
final_df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("\nتم حفظ الملف النظيف بدون أي Manual أو Relevance باسم:", out_path)



Number of One-Shot Q&A pairs: 207
--------------------------------------------------
Average Overlap Jaccard            : 0.148
Average Coverage Answer in Article : 0.322

تم حفظ الملف النظيف بدون أي Manual أو Relevance باسم: one_shot_metrics_clean.csv


# Three-shot

In [None]:
import pandas as pd
import re
from google.colab import files


# Input files: the BOE articles and the three-shot generated Q&A sheet.
boe_filename = "boe_cleaned (5).csv"
three_filename = "three-shot(gemini-Q-A).xlsx"


boe = pd.read_csv(boe_filename)
three = pd.read_excel(three_filename)

# Print the column names of both frames.
print("أعمدة BOE:", boe.columns)
print("أعمدة Three-Shot:", three.columns)



# ===== 2) Normalise the article label and merge =====
def normalize_article_label(s):
    """Turn an article heading into a bare label used as the merge key.

    Removes the word "المادة", trims surrounding spaces / colons / hyphens /
    Arabic question marks / full stops, and collapses whitespace runs to a
    single space. Returns ``None`` for non-``str`` input.
    """
    if isinstance(s, str):
        core = s.strip().replace("المادة", "")
        pieces = core.strip(" :-؟:.").split()
        return " ".join(pieces)
    return None

# Normalised article label on both sides; used as the merge key.
boe["Article_label"] = boe["Article"].apply(normalize_article_label)
three["Article_label"] = three["المادة القانونية (م)"].apply(normalize_article_label)

# Left join: every generated Q&A row is kept; rows whose label has no match
# in BOE get NaN in "Article" and "Text".
# NOTE(review): a label repeated in BOE would duplicate Q&A rows — confirm
# the labels are unique.
merged = pd.merge(
    three,
    boe[["Article", "Text", "Article_label"]],
    on="Article_label",
    how="left",
)

# Copy the reference (gold) columns under explicit names.
merged["Gold_Article"] = merged["Article"]
merged["Gold_Text"] = merged["Text"]

# Prints the number of Q&A pairs after the merge and how many rows have no
# matching article.
print("عدد أزواج السؤال/الجواب بعد الدمج:", len(merged))
print("عدد الصفوف بدون مادة مطابقة:", merged["Gold_Article"].isna().sum())


# ===== 3) Tokenisation and overlap helpers =====
def simple_tokenize_ar(s):
    """Split Arabic (or mixed-script) text into a list of word tokens.

    Args:
        s: The text to tokenise. Any non-``str`` value (``None``, ``NaN``,
            numbers) is treated as missing.

    Returns:
        A list of tokens in order of appearance; ``[]`` for missing or
        empty input.

    Normalisation applied before splitting on whitespace:
      * tatweel (U+0640) and Arabic diacritics are deleted, so an elongated
        or vowelled spelling yields the same token as the plain word;
      * every remaining character that is neither a word character nor
        whitespace becomes a separator. ``\\w`` already matches Arabic
        letters and digits, so Arabic punctuation such as "،", "؟" and "؛"
        is dropped along with Latin punctuation.
    """
    if not isinstance(s, str):
        return []
    # Delete (do not replace with a space) marks that live inside a word,
    # otherwise a single word would be split into fragments.
    s = re.sub(
        r"[\u0640\u0610-\u061A\u064B-\u065F\u0670"
        r"\u06D6-\u06DC\u06DF-\u06E4\u06E7\u06E8\u06EA-\u06ED]",
        "",
        s,
    )
    # No explicit Arabic range here: whitelisting the whole block would keep
    # Arabic punctuation attached to the neighbouring word.
    s = re.sub(r"[^\w\s]", " ", s)
    return s.split()

def overlap_scores(article_text, answer_text):
    """Lexical overlap between an article's text and a generated answer.

    Texts are tokenised with ``simple_tokenize_ar`` and compared as sets of
    unique tokens.

    Returns:
        ``(jaccard, answer_coverage, article_coverage)`` where
        ``answer_coverage`` is |∩| / |answer tokens| and
        ``article_coverage`` is |∩| / |article tokens|.
        ``(0.0, 0.0, 0.0)`` when either argument is not a ``str`` or
        produces no tokens.
    """
    nothing = (0.0, 0.0, 0.0)
    for text in (article_text, answer_text):
        if not isinstance(text, str):
            return nothing

    article_vocab = set(simple_tokenize_ar(article_text))
    answer_vocab = set(simple_tokenize_ar(answer_text))
    if not article_vocab or not answer_vocab:
        return nothing

    overlap_size = len(article_vocab & answer_vocab)
    union_size = len(article_vocab | answer_vocab)
    return (
        overlap_size / union_size,
        overlap_size / len(answer_vocab),
        overlap_size / len(article_vocab),
    )

# Score every generated answer against the text of its matched article.
_overlaps = [
    overlap_scores(row["Gold_Text"], row["الإجابة"])
    for _, row in merged.iterrows()
]
j_list = [triple[0] for triple in _overlaps]
cov_ans_list = [triple[1] for triple in _overlaps]
cov_art_list = [triple[2] for triple in _overlaps]

merged["Overlap_Jaccard"] = j_list
merged["Coverage_Answer_in_Article"] = cov_ans_list
merged["Coverage_Article_in_Answer"] = cov_art_list


def _token_count(text):
    """Number of tokens ``simple_tokenize_ar`` finds in *text*."""
    return len(simple_tokenize_ar(text))


# Text lengths, in tokens.
merged["Answer_Length_Tokens"] = merged["الإجابة"].apply(_token_count)
merged["Article_Length_Tokens"] = merged["Gold_Text"].apply(_token_count)


# ===== 4) (Optional) compute the averages, for printing only =====
# NOTE(review): rows with no matching article have NaN Gold_Text, for which
# overlap_scores returns 0.0, so they are counted as zeros in these means —
# confirm that is intended.
n = len(merged)
avg_jaccard = merged["Overlap_Jaccard"].mean()
avg_cov_ans = merged["Coverage_Answer_in_Article"].mean()

print("\n========== Three-Shot Objective Metrics ==========")
print("Number of Three-Shot Q&A pairs:", n)
print("-" * 50)
print(f"Average Overlap Jaccard            : {avg_jaccard:.3f}")
print(f"Average Coverage Answer in Article : {avg_cov_ans:.3f}")



# Columns written to the output file: the three original sheet columns, the
# gold article and text, and the computed metrics.
cols_to_keep = [
    "المادة القانونية (م)",
    "السؤال",
    "الإجابة",
    "Gold_Article",
    "Gold_Text",
    "Overlap_Jaccard",
    "Coverage_Answer_in_Article",
    "Coverage_Article_in_Answer",
    "Answer_Length_Tokens",
    "Article_Length_Tokens",
]

final_df = merged[cols_to_keep]

# utf-8-sig writes a BOM at the start of the file.
out_path = "three_shot_metrics_clean.csv"
final_df.to_csv(out_path, index=False, encoding="utf-8-sig")
print("\nتم حفظ الملف النظيف بدون أي Manual باسم:", out_path)


أعمدة BOE: Index(['Section', 'Chapter', 'Article', 'Text', 'Status', 'Source'], dtype='object')
أعمدة Three-Shot: Index(['المادة القانونية (م)', 'السؤال', 'الإجابة'], dtype='object')
عدد أزواج السؤال/الجواب بعد الدمج: 426
عدد الصفوف بدون مادة مطابقة: 147

Number of Three-Shot Q&A pairs: 426
--------------------------------------------------
Average Overlap Jaccard            : 0.132
Average Coverage Answer in Article : 0.390

تم حفظ الملف النظيف بدون أي Manual باسم: three_shot_metrics_clean.csv
