In [2]:
##IMPORTS
import pandas as pd
import polars as pl
from collections import Counter
from transformers import pipeline
from googletrans import Translator
import string
import nltk
from nltk.corpus import stopwords
import re

In [4]:
#DOWNLOAD DATASET

splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])

In [3]:
arabic_records_train = df_train[df_train["lang"] == "ar"]
print(arabic_records_train.head())


korean_records_train = df_train[df_train["lang"] == "ko"]
print(korean_records_train.head())


telugu_records_train = df_train[df_train["lang"] == "te"]
print(telugu_records_train.head())

                                                question  \
11213         متى تدخلت روسيا في  الحرب الأهلية السورية؟   
11214         متى حصلت هنغاريا على استقلالها من النمسا ؟   
11215  متى تحالفت فرنسا و بريطانيا العظمى ضد ألمانيا ...   
11216   كم عدد ضحايا أول إعتداء إسرائيلي على مدينة غزة ؟   
11217       هل سلسلة هاري بوتر مخالفة لقوانين المسيحية ؟   

                                                 context lang  answerable  \
11213  The Russian military intervention in the Syria...   ar        True   
11214  By 1918, the economic situation had deteriorat...   ar        True   
11215  France and Britain declared war on Germany whe...   ar        True   
11216  The 2014 Israel–Gaza conflict also known as Op...   ar        True   
11217  Religious debates over the "Harry Potter" seri...   ar       False   

       answer_start                        answer answer_inlang  
11213            67                September 2015          None  
11214           454                  October

In [21]:
#TRANSLATORS           #GPU use if avaiable (device = 0)


#Model 'nllb-200-distilled-600M'
arabic_translator = pipeline(
    task="translation", 
    model="facebook/nllb-200-distilled-600M",
    src_lang="arb_Arab",    
    tgt_lang="eng_Latn",   
    device=0               
)

korean_translator = pipeline("translation",
                  model="facebook/nllb-200-distilled-600M",
                    src_lang="kor_Hang", 
                    tgt_lang="eng_Latn", 
                    device=0)

telugu_translator = pipeline("translation",
                  model="facebook/nllb-200-distilled-600M", 
                  src_lang="tel_Telu", 
                  tgt_lang="eng_Latn",
                  device=0)

#googletrans
translator = Translator()

Device set to use cpu
Device set to use cpu
Device set to use cpu


In [23]:
##WEEK 36

## Word level tokenisation using  re '\W+'
#[w for w in re.split(r'\W+', train_questions) if w] this splits according to 
#  spaces, punctuation (. , ? !), symbols, etc. And removes empty strings from the list.

##!!!!!!!!!!!!!!!!QUESTION: ARE SUPPPOSED TO CONSIDER PONCTUATION AS WORDS????????

#Arabic ("Ar")
print("--------Arabic Records---------")
arabic_records_train = df_train[df_train["lang"] == "ar"]
arabic_records_val = df_val[df_val["lang"] == "ar"]
print("Train Size:", len(arabic_records_train))
print("Val Size:", len(arabic_records_val))


train_questions = " ".join(arabic_records_train["question"].astype(str))
val_questions   = " ".join(arabic_records_val["question"].astype(str))


train_words = [w for w in re.split(r'\W+', train_questions) if w]
val_words   = [w for w in re.split(r'\W+', val_questions) if w]


#Word counts
total_words_train = len(train_words)
total_words_val   = len(val_words)

print ("Total words in Train:", total_words_train)
print ("Total words in Val:", total_words_val)



#5 most common words in the training set and translations
count = Counter(train_words)
top5 = count.most_common(5)


print("Top 5 most common words in Arabic training set:", top5)

for word, freq in top5:
        translated_word = arabic_translator(word)[0]['translation_text']

        print(f"{word}: {freq} - Model Translation: {translated_word} - Google Translate: {translator.translate(word, src='ar', dest='en').text}")





#Korean ("Ko")

print("---------Korean Records----------")
korean_records_train = df_train[df_train["lang"] == "ko"]
korean_records_val = df_val[df_val["lang"] == "ko"]
print("Train Size:", len(korean_records_train))
print("Val Size:", len(korean_records_val))



train_questions = " ".join(korean_records_train["question"].astype(str))
val_questions   = " ".join(korean_records_val["question"].astype(str))


train_words = [w for w in re.split(r'\W+', train_questions) if w]
val_words   = [w for w in re.split(r'\W+', val_questions) if w]

#Word counts
total_words_train = len(train_words)
total_words_val   = len(val_words)

print ("Total words in Train:", total_words_train)
print ("Total words in Val:", total_words_val)



#5 most common words in the training set and translations
count = Counter(train_words)
top5 = count.most_common(5)


print("Top 5 most common words in Korean training set:", top5)

for word, freq in top5:
        translated_word = korean_translator(word)[0]['translation_text']

        print(f"{word}: {freq} - Model Translation: {translated_word} - Google Translate: {translator.translate(word, src='ko', dest='en').text}")





#Telugu ("Te")
print("--------Telugu Records-------------")
telugu_records_train = df_train[df_train["lang"] == "te"]
telugu_records_val = df_val[df_val["lang"] == "te"]
print("Train Size:", len(telugu_records_train))
print("Val Size:", len(telugu_records_val))



train_questions = " ".join(telugu_records_train["question"].astype(str))
val_questions   = " ".join(telugu_records_val["question"].astype(str))


train_questions = train_questions.replace('?', '')
val_questions   = val_questions.replace('?', '')

train_words = [w for w in re.split(r'\s', train_questions) if w]
val_words   = [w for w in re.split(r'\s', val_questions) if w]

#Worc counts
total_words_train = len(train_words)
total_words_val   = len(val_words)

print ("Total words in Train:", total_words_train)
print ("Total words in Val:", total_words_val)



#5 most common words in the training set and translations
count = Counter(train_words)
top5 = count.most_common(5)


print("Top 5 most common words in Telegu training set:", top5)

for word, freq in top5:
        translated_word = telugu_translator(word)[0]['translation_text']

        print(f"{word}: {freq} - Model Translation: {translated_word} - Google Translate: {translator.translate(word, src='te', dest='en').text}")

--------Arabic Records---------
Train Size: 2558
Val Size: 415
Total words in Train: 16320
Total words in Val: 2668
Top 5 most common words in Arabic training set: [('في', 593), ('من', 587), ('متى', 536), ('ما', 443), ('هو', 350)]
في: 593 - Model Translation: In . - Google Translate: in
من: 587 - Model Translation: Who ? - Google Translate: from
متى: 536 - Model Translation: When ? - Google Translate: when
ما: 443 - Model Translation: What ? - Google Translate: what
هو: 350 - Model Translation: It 's him . - Google Translate: he
---------Korean Records----------
Train Size: 2422
Val Size: 356
Total words in Train: 11863
Total words in Val: 1737
Top 5 most common words in Korean training set: [('가장', 527), ('무엇인가', 497), ('언제', 336), ('몇', 234), ('어디인가', 228)]
가장: 527 - Model Translation: It's the most - Google Translate: most
무엇인가: 497 - Model Translation: It's something. - Google Translate: Something
언제: 336 - Model Translation: When? - Google Translate: when
몇: 234 - Model Translatio

We conclude that the 5 most common words for each language are "stop words" mentioned in the lectures. 

In [19]:


#Rule base classifier
#Stop words 
nltk.download('stopwords')
stop_words = set(stopwords.words('english')) | set(string.punctuation)






def translate_question_to_en(question, src_lang):
    try:
        if src_lang == "en" or not question:
            return question 
        translated = translator.translate(question, src=src_lang, dest="en")
        return translated.text
    except Exception:
        return question






def tokenize(text):
    tokens = re.split(r'\W+', str(text)) 

    return [
        t.lower() 
        for t in tokens 
        if  t.lower() not in stop_words
    ]


#Overlap Score (Substring)
#Computes overlap ratio (matched question tokens to the context /total question tokens)
#Also retunrs the number of matched tokens. A match is counted if the token is equal or if one contains the other as a substring in either direction
def overlap_score_question(question, context,src_lang):
    translated_question = translate_question_to_en(question, src_lang)
    q_toks = tokenize(translated_question)
    c_toks = tokenize(context)
    if not q_toks:
        return 0.0, 0
    
    matched = set()
    for q in q_toks:
        for c in c_toks:
            if q == c or q in c or c in q:
                matched.add(q)
                break
    matches = len(matched)
    ratio = matches / len(q_toks)
    return ratio, matches


def tune_parameters(train_df):
   
    data = [(overlap_score_question(r.question, r.context, r.lang), bool(r.answerable))
           for r in train_df.itertuples(index=False)] #storring ratio , number of matches and answerable or not for each question
    best = (0, 0, 0.0)  #initializing accuracy, min match count, threshold for the overlap ratio 
    for k in (1, 2, 3, 4, 5,6,7,8,9,10):
        for threshold in (0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9):
            correct = 0
            for (ratio, match_count), y in data:
                pred = (match_count >= k) and (ratio >= threshold)
                correct += int(pred == y)
            acc = correct / len(data) if data else 0
            if acc > best[0]:
                best = (acc, k, threshold)
    return {"min_match_count": best[1], "min_ratio_threshold": best[2], "best_train_acc": best[0]}


def eval(df, min_matches, ratio_threshold):
    correct = 0
    for r in df.itertuples(index=False):
        ratio, m = overlap_score_question(r.question, r.context, r.lang)
        pred = (m >= min_matches) and (ratio >= ratio_threshold)
        correct += int(pred == bool(r.answerable))
    return correct / len(df) if len(df) else 0.0



def run_rule_classifier(df_train, df_val):
  
    results = {}

    for code, name in [("ar","Arabic"), ("ko","Korean"), ("te","Telugu")]:
        train = df_train[df_train.lang == code]
        val = df_val[df_val.lang == code]
        params = tune_parameters(train)
        val_acc = eval(val, params["min_match_count"], params["min_ratio_threshold"])
        results[name] = {
            "train_acc": round(params["best_train_acc"], 3),
            "val_acc": round(val_acc, 3),
            "min_matches_count": params["min_match_count"],
            "min_ratio_threshold": params["min_ratio_threshold"]
        }
    return pd.DataFrame(results).T


results = run_rule_classifier(df_train, df_val)
print(results)


[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Asus\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


        train_acc  val_acc  min_matches_count  min_ratio_threshold
Arabic      0.900    0.870                1.0                  0.3
Korean      0.972    0.941                1.0                  0.3
Telugu      0.979    0.914                1.0                  0.3


In [None]:
#WEEK 37




