NLP Project for KU Course

In [196]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
import math
nltk.download('punkt')
nltk.download('punkt_tab')


## Load the dataset
splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreasmelbye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/andreasmelbye/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Week 36

In [197]:
## Remove unwanted characters from the questions

def cleanDf(df):
    pattern = re.compile(r"[?؟,;\/\\\[\]#():]")
    df['question'] = df['question'].apply(lambda x: pattern.sub("", x))
    df['context'] = df['context'].apply(lambda x: pattern.sub("", x))
    return df

In [198]:
langForStat = ['ar','ko','te']

numQuestions = []
totalWordCount = []
distinctWordCount = []
distinctCharCount = []

df_train_clean = cleanDf(df_train)
df_val_clean = cleanDf(df_val)

for lang in langForStat:
    numQuestions_train = df_train_clean[df_train_clean['lang'] == lang].shape[0]
    numQuestions_val = df_val_clean[df_val_clean['lang'] == lang].shape[0]
    numQuestions.append((lang, numQuestions_train, numQuestions_val))
    print(f"Language: {lang}, Train Questions: {numQuestions_train}, Validation Questions: {numQuestions_val}")

    # Compute word and character statistics
    df_train_lang = df_train_clean[df_train_clean['lang'] == lang].copy()
    df_val_lang = df_val_clean[df_val_clean['lang'] == lang]
    df_train_lang['wordcount'] = df_train_lang['question'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)


    maxId = df_train_lang['wordcount'].idxmax()
    longest_question = df_train_lang.loc[maxId, "question"]
    max_words = df_train_lang.loc[maxId, "wordcount"]

    # print(f"Language: {lang}")
    # print(f"  Longest train question (index {maxId}): {longest_question}")
    # print(f"  Word count: {max_words}")

    totalWordCount_train = df_train_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount_val = df_val_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount.append((lang, totalWordCount_train, totalWordCount_val))
    print(f"Language: {lang}, Train Total Words: {totalWordCount_train}, Validation Total Words: {totalWordCount_val}")



Language: ar, Train Questions: 2558, Validation Questions: 415
Language: ar, Train Total Words: 16202, Validation Total Words: 2621
Language: ko, Train Questions: 2422, Validation Questions: 356
Language: ko, Train Total Words: 11840, Validation Total Words: 1729
Language: te, Train Questions: 1355, Validation Questions: 384
Language: te, Train Total Words: 7668, Validation Total Words: 2299


In [199]:
def wordCount(df, lang):
    allWords = []
    df = df[df['lang'] == lang].copy()
    
    # df = df[df['answerable'] == False]
    df['question'] = df['question'].astype(str)

    for q in df['question']:
        allWords.extend(q.split()) 
    
    wordDict = dict(Counter(allWords))
    wordDict = dict(sorted(wordDict.items(), key=lambda item: item[1], reverse=True))

    return wordDict
            

for lang in langForStat:
    wordDict = wordCount(df_train_clean, lang)
    distinctWordCount.append((lang, len(wordDict)))
    print(f"Language: {lang}, Distinct Words: {len(wordDict)}")

    allChars = []
    for word in wordDict.keys():
        allChars.extend(list(word))
    
    charDict = dict(Counter(allChars))
    charDict = dict(sorted(charDict.items(), key=lambda item: item[1], reverse=True))
    distinctCharCount.append((lang, len(charDict)))
    print(f"Language: {lang}, Distinct Characters: {len(charDict)}")
    calculatedTotalWords = sum(wordDict.values())
    print(f"Language: {lang}, Calculated Total Words from Distinct Words: {calculatedTotalWords}")

    top5Words = list(wordDict.items())[:10]
    print(f"Language: {lang}, Top 5 Words: {top5Words}")

Language: ar, Distinct Words: 5427
Language: ar, Distinct Characters: 106
Language: ar, Calculated Total Words from Distinct Words: 16202
Language: ar, Top 5 Words: [('في', 592), ('من', 584), ('متى', 535), ('ما', 441), ('هو', 349), ('هل', 329), ('هي', 268), ('كم', 256), ('عدد', 161), ('أول', 157)]
Language: ko, Distinct Words: 4394
Language: ko, Distinct Characters: 819
Language: ko, Calculated Total Words from Distinct Words: 11840
Language: ko, Top 5 Words: [('가장', 527), ('무엇인가', 497), ('언제', 336), ('몇', 234), ('어디인가', 228), ('큰', 194), ('누구인가', 186), ('세상에서', 142), ('누구인가요', 105), ('무엇인가요', 95)]
Language: te, Distinct Words: 2420
Language: te, Distinct Characters: 91
Language: te, Calculated Total Words from Distinct Words: 7668
Language: te, Top 5 Words: [('ఎవరు', 274), ('ఏది', 192), ('ఎన్ని', 165), ('ఎప్పుడు', 154), ('ఏ', 142), ('ఎంత', 116), ('చిత్ర', 97), ('ఎక్కడ', 96), ('మొదటి', 86), ('ఉంది', 83)]


In [200]:
def arabicClassifier(question, context):
    goodWords = ['متى','ما','هو','هي','كم','عدد','أول','في']
    badWords = ['هل', 'يمكن']
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
        return True
    else:
        return True
        # return np.random.choice([True, False])

def koreanClassifier(question, context):
    goodWords = ['가장', '무엇인가', '언제', '몇']
    badWords = ['수 '] # '시차는', '중력과'
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
        return True
    else:
        return True
        # return np.random.choice([True, False])
    
def teluguClassifier(question, context):
    goodWords = []
    badWords = ['విస్తీర్ణం', 'జనాభా', 'ఆఫ్రికాలో']
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
            return True
    # if any(word in question for word in badWords):
    #     return False
    else:
        return True
        # return np.random.choice([True, False])

### --- Arabic ---
arabicDf = df_val_clean[df_val_clean['lang'] == 'ar'].copy()
arabicDf['prediction'] = arabicDf.apply(lambda row: arabicClassifier(row['question'], row['context']), axis=1)
accuracy = (arabicDf['answerable'] == arabicDf['prediction']).mean()
print(f"Arabic Classifier Accuracy (validation): {accuracy * 100:.2f}%")
print(f"True distribution in validation set: {arabicDf['answerable'].value_counts(normalize=True).to_dict()}")

### --- Korean ---
koreanDf = df_val_clean[df_val_clean['lang'] == 'ko'].copy()
koreanDf['prediction'] = koreanDf.apply(lambda row: koreanClassifier(row['question'], row['context']), axis=1)
accuracy = (koreanDf['answerable'] == koreanDf['prediction']).mean()
print(f"Korean Classifier Accuracy (validation): {accuracy * 100:.2f}%")
print(f"True distribution in validation set: {koreanDf['answerable'].value_counts(normalize=True).to_dict()}")

### --- Telugu ---
teluguDf = df_val_clean[df_val_clean['lang'] == 'te'].copy()
teluguDf['prediction'] = teluguDf.apply(lambda row: teluguClassifier(row['question'], row['context']), axis=1)
accuracy = (teluguDf['answerable'] == teluguDf['prediction']).mean()
print(f"Telugu Classifier Accuracy (validation): {accuracy * 100:.2f}%")
print(f"True distribution in validation set: {teluguDf['answerable'].value_counts(normalize=True).to_dict()}")

Arabic Classifier Accuracy (validation): 96.87%
True distribution in validation set: {True: 0.8746987951807229, False: 0.12530120481927712}
Korean Classifier Accuracy (validation): 94.94%
True distribution in validation set: {True: 0.9466292134831461, False: 0.05337078651685393}
Telugu Classifier Accuracy (validation): 79.17%
True distribution in validation set: {True: 0.7578125, False: 0.2421875}


In [201]:
def overlap_ratio(question, context):
    q_words = set(question.split())
    c_words = set(context.split())
    if not q_words: return 0
    return len(q_words & c_words) / len(q_words)


import re
def contains_digit(text):
    return bool(re.search(r"\d+", text))


def arabicClassifier(question, context):
    goodWords = ['متى','ما','هو','هي','كم','عدد','أول','في']
    badWords = ['هل', 'يمكن']
    
    # Rule 1: keyword spotting
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
        return True
    
    # # Rule 2: overlap
    # if overlap_ratio(question, context) < 0.1:
    #     return False
    
    # # Rule 3: digits
    # if contains_digit(question) and not contains_digit(context):
    #     return False
    
    # Rule 4: context length
    if len(context.split()) < 25:
        return False
    
    return True

# Arabic
arabicDf = df_val_clean[df_val_clean['lang'] == 'ar'].copy()
arabicDf['prediction'] = arabicDf.apply(lambda row: arabicClassifier(row['question'], row['context']), axis=1)
arabic_acc = (arabicDf['answerable'] == arabicDf['prediction']).mean()
print(f"Arabic Classifier Accuracy (validation): {arabic_acc*100:.2f}%")




Arabic Classifier Accuracy (validation): 96.87%


In [202]:
df_val_clean

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,ఒరెగాన్ రాష్ట్రంలోని అతిపెద్ద నగరం ఏది,Portland is the largest city in the U.S. state...,te,True,0,Portland,
1,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,"The word cholera is from ""kholera"" from χολή ""...",te,True,99,Indian subcontinent,
2,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,Since it became widespread in the 19th century...,te,True,451,England,
3,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I occurred from 1914 to 1918. In ter...,te,True,26,1914,
4,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I often abbreviated as WWI or WW1 al...,te,True,155,28 July 1914,
...,...,...,...,...,...,...,...
3006,2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషు...,Reyyalagadda is a village belonging to Gangara...,te,True,378,37,37
3007,2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ...,Boothumillipadu is a village in Gannavaram man...,te,True,308,433,433
3008,2011 జనాభా లెక్కల ప్రకారం మల్లవేముల గ్రామ జనాభ...,Mallavemula is a village belonging to Chagalam...,te,False,-1,1131,1131
3009,2011 నాటికి రష్యా దేశ ప్రధాన మంత్రి ఎవరు,Andria Urushadze born April 25 1968 is a Geor...,te,False,-1,Vladimir Putin,వ్లాదిమిర్ పుతిన్


## Week 37

In [203]:
arabicDf_train = df_train_clean[df_train_clean['lang'] == 'ar'].copy()
teluguDf_train = df_train_clean[df_train_clean['lang'] == 'te'].copy()
koreanDf_train = df_train_clean[df_train_clean['lang'] == 'ko'].copy()

arabicDf_val = df_val_clean[df_val_clean['lang'] == 'ar'].copy()
teluguDf_val = df_val_clean[df_val_clean['lang'] == 'te'].copy()
koreanDf_val = df_val_clean[df_val_clean['lang'] == 'ko'].copy()

In [204]:
def compute_probability(question, unigram_fd, bigram_fd):
    tokens = nltk.word_tokenize(question)
    bigrams = list(ngrams(tokens, 2))
    prob = 1.0
    for bigram in bigrams:
        bigram_count = bigram_fd[bigram]
        unigram_count = unigram_fd[(bigram[0],)]
        if unigram_count > 0 and bigram_count > 0:
            prob *= bigram_count / unigram_count
        else:
            prob *= 1e-6
    return prob

def compute_logprob(question, unigram_fd, bigram_fd):
    tokens = nltk.word_tokenize(question)
    bigrams_list = list(ngrams(tokens, 2))
    log_prob = 0.0
    for bigram in bigrams_list:
        bigram_count = bigram_fd[bigram]
        unigram_count = unigram_fd[(bigram[0],)]
        if unigram_count > 0 and bigram_count > 0:
            prob = bigram_count / unigram_count
        else:
            prob = 1e-6  # smoothing for unseen bigrams
        log_prob += math.log(prob)
    return log_prob, len(tokens)  # return log_prob and number of tokens


https://dev.to/amananandrai/language-model-implementation-bigram-model-22ij

### Arabic

In [205]:
from nltk import FreqDist, ConditionalFreqDist

# Make a list of all bigrams in arabicDf questions
allBigrams_ar = []
allUnigrams_ar = []
for q in arabicDf_train['question']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_ar.extend(bigrams)
    allUnigrams_ar.extend(unigrams)

unigram_fd_ar = FreqDist(allUnigrams_ar)
bigram_fd_ar = FreqDist(allBigrams_ar)
print(unigram_fd_ar.most_common(10))
print(bigram_fd_ar.most_common(10))
cfdist_ar = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_ar)
print(cfdist_ar['ما'].most_common(10))
    


[(('في',), 593), (('من',), 587), (('متى',), 536), (('ما',), 443), (('هو',), 349), (('هل',), 329), (('هي',), 268), (('كم',), 256), (('عدد',), 161), (('أول',), 157)]
[(('ما', 'هي'), 232), (('من', 'هو'), 219), (('كم', 'عدد'), 114), (('ما', 'هو'), 113), (('متى', 'تم'), 103), (('متى', 'تأسست'), 54), (('في', 'أي'), 53), (('في', 'العالم'), 52), (('أي', 'عام'), 42), (('هل', 'يمكن'), 35)]
[(('ما', 'هي'), 232), (('ما', 'هو'), 113), (('ما', 'أول'), 9), (('ما', 'اسم'), 9), (('ما', 'هى'), 6), (('ما', 'نسبة'), 5), (('ما', 'أكبر'), 5), (('ما', 'عدد'), 5), (('ما', 'جنسية'), 5), (('ما', 'سبب'), 4)]


In [206]:
arabicDf_val['question_prob'] = arabicDf_val['question'].apply(lambda q: compute_probability(q, unigram_fd_ar, bigram_fd_ar))

print(arabicDf_val[['question', 'question_prob']].sort_values(by='question_prob').head(10))

arabicDf_val

                                               question  question_prob
1860  في أي عام تم انطلاق المكوك الفضائي ديب إمباكت ...   6.370950e-79
2037  ما هو الاكتشاف الذي نتج عنه اختراع القنبلة الذ...   3.401053e-75
1927  متى تولى الحُسَيْنُ بْنُ طَلالٍ بْنُ عَبْدِ ال...   1.119403e-68
2177  متى تولى الحُسَيْنُ بْنُ طَلالٍ بْنُ عَبْدِ ال...   1.119403e-68
2131  هل امتلك اللاعب الأرجنتيني ميسي الجنسية الاسبا...   1.000000e-60
1894  هل امتلك اللاعب الأرجنتيني ميسي الجنسية الاسبا...   1.000000e-60
1427  من هو المستكشف الذي عثرعلى جزيرة القيامة في ال...   3.730835e-55
1879  بم تمثل الجناح السياسي لجيش التحرير الوطني الج...   1.000000e-54
1450  بم تمثل الجناح السياسي لجيش التحرير الوطني الج...   1.000000e-54
1942  من هو الممثل الذي قام تمثيل دور التاجر الجشع ف...   3.568684e-53


Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,question_prob
1411,ما هي أولى جامعات فنلندا,"The Royal Academy of Åbo or ""Åbo Kungliga Aka...",ar,True,4,Royal Academy of Åbo,,5.237020e-19
1412,ما عدد الدول المطلة على بحر البلطيق,The Baltic Sea is a marginal sea of the Atlant...,ar,True,68,"Finland, Sweden, Denmark, Estonia, Latvia, Lit...",,7.711397e-28
1413,اين عاش نيوتن,From age 12 to age 17 Newton resided with Will...,ar,True,74,Grantham,,4.444444e-08
1417,هل زار ابن بطوطة اليمن,"After the ""hajj"" in either 1328 or 1330 he mad...",ar,False,-1,no,,1.000000e-24
1422,من هو الرئيس الأول للجمهورية اليمنية,The first President of unified Yemen was Ali A...,ar,True,41,Ali Abdullah Saleh,,7.741086e-10
...,...,...,...,...,...,...,...,...
2222,هل تعد الحشرات من ذوات الدم البارد,Insect thermoregulation is the process whereby...,ar,False,-1,no,,9.118541e-33
2227,من هو أشهر الفنانين اللبنانين في عام 2009,Ayman Baalbaki who is represented by Saleh Bar...,ar,True,0,Ayman Baalbaki,,1.622439e-29
2228,من هي أشهر مؤلفت القصص البوليسية في بريطانية,Sherlock Holmes is British detective fiction w...,ar,True,56,Sir Arthur Conan Doyle,,4.599659e-38
2231,من هو الممثل الفائز بجائزة الاوسكار لدور رئيسي...,Spotlight won two awards including Best Pictur...,ar,True,202,Leonardo DiCaprio,,1.243612e-43


In [207]:
# Calculate total log probability and total tokens over validation set
total_log_prob_ar = 0.0
total_tokens_ar  = 0
for q in arabicDf_val['question']:
    logp, n = compute_logprob(q, unigram_fd_ar, bigram_fd_ar)
    total_log_prob_ar += logp
    total_tokens_ar += n

# Perplexity
perplexity_ar = math.exp(-total_log_prob_ar / total_tokens_ar)
print("Validation Perplexity for Arabic:", perplexity_ar)

Validation Perplexity for Arabic: 3475.076581820621


### Korean

In [208]:
# Make a list of all bigrams in arabicDf questions
allBigrams_ko = []
allUnigrams_ko = []
for q in koreanDf_train['question']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_ko.extend(bigrams)
    allUnigrams_ko.extend(unigrams)

unigram_fd_ko = FreqDist(allUnigrams_ko)
bigram_fd_ko = FreqDist(allBigrams_ko)
print(unigram_fd_ko.most_common(10))
print(bigram_fd_ko.most_common(10))
cfdist_ko = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_ko)



[(('가장',), 527), (('무엇인가',), 497), (('언제',), 336), (('몇',), 234), (('어디인가',), 228), (('큰',), 194), (('누구인가',), 186), (('세상에서',), 142), (('누구인가요',), 105), (('무엇인가요',), 95)]
[(('가장', '큰'), 172), (('세상에서', '가장'), 138), (('가장', '많은'), 66), (('나라는', '어디인가'), 64), (('몇', '년도에'), 63), (('사람은', '누구인가'), 55), (('가장', '높은'), 48), (('몇', '개의'), 44), (('지도자는', '누구인가'), 37), (('얼마나', '되나요'), 34)]


In [209]:
koreanDf_val['question_prob'] = koreanDf_val['question'].apply(lambda q: compute_probability(q, unigram_fd_ko, bigram_fd_ko))

print(koreanDf_val[['question', 'question_prob']].sort_values(by='question_prob').head(10))

koreanDf_val

                                               question  question_prob
1131  임시정부는 종전의 정부가 무너진 후 무정부 상태를 해소하기 위해 임시로 구성된 정부...   1.000000e-72
663                  한국 전쟁 당시 중국이 북한을 위해 파병한 군인은 총 몇명인가   5.263158e-50
455                  한국 전쟁 당시 중국이 북한을 위해 파병한 군인은 총 몇명인가   5.263158e-50
765                 지자기 폭풍이 지구에 일어났을때 지구에선 어떤 현상이 일어나는가   1.000000e-42
451                      논증의 오류로 인해 무지에의 호소 오류를 범할수 있는가   1.000000e-42
1370                    파티마의 성모가 처음 나타났을 때 어떤 현상이 일어났는가   1.000000e-42
655                  제1대 휘트워스 준남작 조지프 휘트워스 경의 소속은 어디인가요   1.000000e-42
868                2019년 6월 기준 세상에서 가장 사양이 높은 컴퓨터는 무엇인가   9.255533e-38
1335                     인체가 자외선에 많이 노출된다면 어떤 변화가 일어나는가   1.000000e-36
1118                     인체가 자외선에 많이 노출된다면 어떤 변화가 일어나는가   1.000000e-36


Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,question_prob
356,북유럽의 노르딕 국가는 몇개인가요,At the beginning of the 20th century almost 12...,ko,True,393,five,,2.777778e-02
357,1887년 케이스 웨스턴 리저브 대학의 이름은 무엇인가,Case Western Reserve University was created in...,ko,True,58,Western Reserve University (formerly Western R...,,6.923077e-31
358,옴진리교는 어느 나라에서 시작된 종교인가,These letters are believed to have derived fro...,ko,True,51,Egypt,,2.203390e-19
359,댈러스의 면적은 얼마나 되나요,Dallas is the county seat of Dallas County. Po...,ko,True,232,999.3 km2,,4.888889e-07
360,오픈스택의 프로그래밍 언어는 무엇인가요,It is written in Python and uses many external...,ko,True,17,Python,,5.000000e-07
...,...,...,...,...,...,...,...,...
1393,사형제도가 유효한 나라는 몇 개국인가,Since World War II there has been a trend towa...,ko,True,80,58,,1.783724e-14
1396,체르노빌 원전 사고로 사망한 인원은 몇 명인가,In 1994 thirty-one deaths were directly attrib...,ko,True,281,64,,4.273504e-32
1397,베냉 공화국의 최대 도시는 어디인가요,Benin officially the Republic of Benin an...,ko,True,448,Cotonou,,5.482456e-14
1398,고조선은 언제 세워졌나요,Gojoseon was first mentioned in Chinese record...,ko,True,61,7th century BC,,2.976190e-09


In [210]:
# Calculate total log probability and total tokens over validation set
total_log_prob_ko = 0.0
total_tokens_ko = 0
for q in koreanDf_val['question']:
    logp, n = compute_logprob(q, unigram_fd_ko, bigram_fd_ko)
    total_log_prob_ko += logp
    total_tokens_ko += n

# Perplexity
perplexity_ko = math.exp(-total_log_prob_ko / total_tokens_ko)
print("Validation Perplexity for Korean:", perplexity_ko)

Validation Perplexity for Korean: 1061.2832744231378


### Telugu

In [211]:
# Make a list of all bigrams in arabicDf questions
allBigrams_telugu = []
allUnigrams_telugu = []
for q in teluguDf_train['question']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_telugu.extend(bigrams)
    allUnigrams_telugu.extend(unigrams)

unigram_fd_te = FreqDist(allUnigrams_telugu)
bigram_fd_te = FreqDist(allBigrams_telugu)
print(unigram_fd_te.most_common(10))
print(bigram_fd_te.most_common(10))
cfdist_telugu = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_telugu)

[(('ఎవరు',), 274), (('ఏది',), 192), (('ఎన్ని',), 165), (('ఎప్పుడు',), 154), (('ఏ',), 142), (('ఎంత',), 116), (('చిత్ర',), 97), (('ఎక్కడ',), 96), (('మొదటి',), 86), (('ఉంది',), 83)]
[(('ఎక్కడ', 'ఉంది'), 51), (('దర్శకుడు', 'ఎవరు'), 48), (('చిత్ర', 'దర్శకుడు'), 42), (('ఏ', 'సంవత్సరంలో'), 33), (('ఆంధ్ర', 'ప్రదేశ్'), 30), (('ముఖ్యమంత్రి', 'ఎవరు'), 29), (('చిత్రం', 'ఎప్పుడు'), 27), (('సంఖ్య', 'ఎంత'), 24), (('పేరు', 'ఏమిటి'), 23), (('నిర్మాత', 'ఎవరు'), 23)]


In [212]:
teluguDf_val['question_prob'] = teluguDf_val['question'].apply(lambda q: compute_probability(q, unigram_fd_te, bigram_fd_te))

print(teluguDf_val[['question', 'question_prob']].sort_values(by='question_prob').head(10))

teluguDf_val

                                               question  question_prob
72    ప్రపంచంలో 70% శాతం నీటిలో మానవుడు నిత్యవసరాలకు...   8.000000e-55
24     ప్రాచీన గ్రీకు ఏ కాలం నుండి ఏ కాలం వరకు నడిచింది   1.000000e-48
168   చైనాలోని ఒక నగరమైన బీజింగ్ ను  గతంలో ఏ నగరంగా ...   1.000000e-48
150   శని గ్రహం సూర్యుని చుట్టూ తిరిగి రావడానికి ఎన్...   3.030303e-44
45    మొదటి ప్రపంచ యుద్ధం మొదటగా ఏఏ దేశాల మధ్య మొదలు...   3.333333e-43
44    మొదటి ప్రపంచ యుద్ధం మొదటగా ఏఏ దేశాల మధ్య మొదలు...   3.333333e-43
75    భారతదేశంలో రాష్ట్రపతిగా పోటీ చేయడానికి కావాల్స...   1.000000e-42
3001  2001 జనాభా లెక్కల ప్రకారం తిమ్మయ్యపల్లె గ్రామం...   1.000000e-42
9     భారతదేశంలో ఓటు హక్కు పొందడానికి ఉండవలసిన కనీస ...   1.000000e-42
258        జపాన్ లో  ఓటు వేయడానికి కనీస వయసు ఎంత ఉండాలి   1.000000e-42


Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,question_prob
0,ఒరెగాన్ రాష్ట్రంలోని అతిపెద్ద నగరం ఏది,Portland is the largest city in the U.S. state...,te,True,0,Portland,,5.080645e-02
1,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,"The word cholera is from ""kholera"" from χολή ""...",te,True,99,Indian subcontinent,,1.629612e-15
2,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,Since it became widespread in the 19th century...,te,True,451,England,,1.629612e-15
3,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I occurred from 1914 to 1918. In ter...,te,True,26,1914,,4.329004e-15
4,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I often abbreviated as WWI or WW1 al...,te,True,155,28 July 1914,,4.329004e-15
...,...,...,...,...,...,...,...,...
3006,2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషు...,Reyyalagadda is a village belonging to Gangara...,te,True,378,37,37,3.361345e-26
3007,2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ...,Boothumillipadu is a village in Gannavaram man...,te,True,308,433,433,6.250000e-31
3008,2011 జనాభా లెక్కల ప్రకారం మల్లవేముల గ్రామ జనాభ...,Mallavemula is a village belonging to Chagalam...,te,False,-1,1131,1131,1.000000e-36
3009,2011 నాటికి రష్యా దేశ ప్రధాన మంత్రి ఎవరు,Andria Urushadze born April 25 1968 is a Geor...,te,False,-1,Vladimir Putin,వ్లాదిమిర్ పుతిన్,7.130125e-14


In [213]:
# Calculate total log probability and total tokens over validation set
total_log_prob_te = 0.0
total_tokens_te = 0
for q in teluguDf_val['question']:
    logp, n = compute_logprob(q, unigram_fd_te, bigram_fd_te)
    total_log_prob_te += logp
    total_tokens_te += n

# Perplexity
perplexity_te = math.exp(-total_log_prob_te / total_tokens_te)
print("Validation Perplexity for Telugu:", perplexity_te)

Validation Perplexity for Telugu: 540.9787802403338


### English

In [214]:
# Make a list of all bigrams in arabicDf questions
allBigrams_en = []
allUnigrams_en = []
for q in df_train_clean['context']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_en.extend(bigrams)
    allUnigrams_en.extend(unigrams)

unigram_fd_en = FreqDist(allUnigrams_en)
bigram_fd_en = FreqDist(allBigrams_en)
print(unigram_fd_en.most_common(10))
print(bigram_fd_en.most_common(10))
cfdist_en = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_en)

[(('the',), 102629), (('.',), 69142), (('of',), 62312), (('and',), 48800), (('in',), 40805), (('a',), 27707), (('to',), 27572), (('is',), 20181), (('was',), 17824), (('The',), 17672)]
[(('of', 'the'), 18179), (('.', 'The'), 11665), (('in', 'the'), 11115), (('to', 'the'), 5020), (('and', 'the'), 4699), (('.', 'It'), 4016), (('.', 'In'), 3596), (('is', 'a'), 3495), (('is', 'the'), 3437), (('by', 'the'), 3214)]


In [215]:

df_val_clean['context_prob'] = df_val_clean['context'].apply(lambda q: compute_probability(q, unigram_fd_en, bigram_fd_en))

print(df_val_clean[['context', 'context_prob']].sort_values(by='context_prob').head(10))

df_val_clean

                                                context  context_prob
3010  Guntur district is 11391 sq. km. Spread over a...           0.0
992   During the early 1570s King John III of Sweden...           0.0
991   Absolute zero is the lowest limit of the therm...           0.0
2214  The first recorded instance of a colonist iden...           0.0
986   Lesbian gay bisexual and transgender LGBT peop...           0.0
985   The term graphic design was coined by William ...           0.0
2216  Addiction is a brain disorder characterized by...           0.0
2221  During the Indian Rebellion of 1857 Delhi fell...           0.0
2222  Insect thermoregulation is the process whereby...           0.0
981   The Ancient Ones is the oldest school of Tibet...           0.0


Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang,context_prob
0,ఒరెగాన్ రాష్ట్రంలోని అతిపెద్ద నగరం ఏది,Portland is the largest city in the U.S. state...,te,True,0,Portland,,4.853396e-183
1,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,"The word cholera is from ""kholera"" from χολή ""...",te,True,99,Indian subcontinent,,0.000000e+00
2,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,Since it became widespread in the 19th century...,te,True,451,England,,0.000000e+00
3,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I occurred from 1914 to 1918. In ter...,te,True,26,1914,,0.000000e+00
4,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I often abbreviated as WWI or WW1 al...,te,True,155,28 July 1914,,1.756688e-214
...,...,...,...,...,...,...,...,...
3006,2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషు...,Reyyalagadda is a village belonging to Gangara...,te,True,378,37,37,0.000000e+00
3007,2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ...,Boothumillipadu is a village in Gannavaram man...,te,True,308,433,433,0.000000e+00
3008,2011 జనాభా లెక్కల ప్రకారం మల్లవేముల గ్రామ జనాభ...,Mallavemula is a village belonging to Chagalam...,te,False,-1,1131,1131,0.000000e+00
3009,2011 నాటికి రష్యా దేశ ప్రధాన మంత్రి ఎవరు,Andria Urushadze born April 25 1968 is a Geor...,te,False,-1,Vladimir Putin,వ్లాదిమిర్ పుతిన్,0.000000e+00


In [216]:
# Calculate total log probability and total tokens over validation set
total_log_prob_en = 0.0
total_tokens_en = 0
for q in df_val_clean['context']:
    logp, n = compute_logprob(q, unigram_fd_en, bigram_fd_en)
    total_log_prob_en += logp
    total_tokens_en += n

# Perplexity
perplexity_en = math.exp(-total_log_prob_en / total_tokens_en)
print("Validation Perplexity for English:", perplexity_en)

Validation Perplexity for English: 1088.9448235837644


In [217]:
# --- TRAINING: trigram + bigram + unigram ---
allTrigrams_en = []
allBigrams_en = []
allUnigrams_en = []

for q in df_train_clean['context']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_en.extend(tokens)
    allBigrams_en.extend(list(ngrams(tokens, 2)))
    allTrigrams_en.extend(list(ngrams(tokens, 3)))

unigram_fd_en = FreqDist(allUnigrams_en)
bigram_fd_en = FreqDist(allBigrams_en)
trigram_fd_en = FreqDist(allTrigrams_en)

print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")
print(f"Most common bigrams: {bigram_fd_en.most_common(10)}")
print(f"Most common trigrams: {trigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under trigram model ---
def question_logprob(sentence, bigram_fd, trigram_fd, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    trigrams = list(ngrams(tokens, 3))
    log_prob = 0.0
    for w1, w2, w3 in trigrams:
        trigram_count = trigram_fd[(w1, w2, w3)]
        bigram_count = bigram_fd[(w1, w2)]
        if bigram_count > 0 and trigram_count > 0:
            prob = trigram_count / bigram_count
        else:
            prob = smoothing  # unseen trigram → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_en = 0.0
total_tokens_en = 0

for q in df_val_clean['context']:
    logp, n = question_logprob(q, bigram_fd_en, trigram_fd_en)
    total_log_prob_en += logp
    total_tokens_en += n

perplexity_en = math.exp(-total_log_prob_en / total_tokens_en)
print("Validation Perplexity for English:", perplexity_en)

Most common unigrams: [('the', 102629), ('.', 69142), ('of', 62312), ('and', 48800), ('in', 40805), ('a', 27707), ('to', 27572), ('is', 20181), ('was', 17824), ('The', 17672)]
Most common bigrams: [(('of', 'the'), 18179), (('.', 'The'), 11665), (('in', 'the'), 11115), (('to', 'the'), 5020), (('and', 'the'), 4699), (('.', 'It'), 4016), (('.', 'In'), 3596), (('is', 'a'), 3495), (('is', 'the'), 3437), (('by', 'the'), 3214)]
Most common trigrams: [(('.', 'It', 'is'), 1659), (('one', 'of', 'the'), 1305), (('the', 'United', 'States'), 1134), (('.', 'It', 'was'), 931), (('as', 'well', 'as'), 695), (('.', 'He', 'was'), 671), (('in', 'the', 'world'), 657), (('.', 'In', 'the'), 633), (('the', 'world', "'s"), 561), (('part', 'of', 'the'), 553)]
Validation Perplexity for English: 8453.467175831895


In [218]:
def build_counts(corpus):
    allUnigrams = []
    allBigrams = []
    allTrigrams = []

    for text in corpus:
        tokens = nltk.word_tokenize(text)
        allUnigrams.extend(tokens)
        allBigrams.extend(list(ngrams(tokens, 2)))
        allTrigrams.extend(list(ngrams(tokens, 3)))

    unigram_fd = FreqDist(allUnigrams)
    bigram_fd = FreqDist(allBigrams)
    trigram_fd = FreqDist(allTrigrams)

    return unigram_fd, bigram_fd, trigram_fd


# build counts
unigram_fd, bigram_fd, trigram_fd = build_counts(df_train_clean['context'])
V = len(unigram_fd)

def conditional_prob_unigram(w, unigram_fd):
    return unigram_fd[w] / sum(unigram_fd.values())

def conditional_prob_bigram(w2, w1, bigram_fd, unigram_fd, k=0.0):
    # Add-k for bigram if you like, or simple MLE
    bi = bigram_fd[(w1,w2)]
    uni = unigram_fd[w1]
    if uni>0:
        return bi/uni
    return 1.0/V

def conditional_prob_trigram(w3, w1, w2, trigram_fd, bigram_fd, V, k=0.0):
    tri = trigram_fd[(w1,w2,w3)]
    bi  = bigram_fd[(w1,w2)]
    if bi>0:
        return (tri + k) / (bi + k*V)
    return 1.0/V

def sentence_logprob_interpolated(sentence, unigram_fd, bigram_fd, trigram_fd,
                                  V, lambdas=(0.1,0.3,0.6), k=0.0):
    # lambdas: (lambda_uni, lambda_bi, lambda_tri) must sum to 1
    lam1, lam2, lam3 = lambdas
    toks = nltk.word_tokenize(sentence)
    trigs = list(ngrams(toks, 3))
    logp = 0.0
    for w1,w2,w3 in trigs:
        p_uni = conditional_prob_unigram(w3, unigram_fd)
        p_bi  = conditional_prob_bigram(w3, w2, bigram_fd, unigram_fd, k)
        p_tri = conditional_prob_trigram(w3, w1, w2, trigram_fd, bigram_fd, V, k)
        p = lam1*p_uni + lam2*p_bi + lam3*p_tri
        logp += math.log(p)
    return logp, len(toks)

# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in df_val_clean['context']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
pp = math.exp(-total_log/total_tokens)
print("Interpolated trigram perplexity:", pp)

Interpolated trigram perplexity: 720.8373580260715


In [219]:
# --- TRAINING: unigram model ---
allUnigrams_en = []

for q in df_train_clean['context']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_en.extend(tokens)

unigram_fd_en = FreqDist(allUnigrams_en)
total_tokens_train = sum(unigram_fd_en.values())
V = len(unigram_fd_en)  # vocabulary size

print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under unigram model ---
def question_logprob_unigram(sentence, unigram_fd, total_tokens, V, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    log_prob = 0.0
    for w in tokens:
        count = unigram_fd[w]
        if count > 0:
            prob = count / total_tokens
        else:
            prob = smoothing  # unseen word → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_en = 0.0
total_tokens_en = 0

for q in df_val_clean['context']:
    logp, n = question_logprob_unigram(q, unigram_fd_en, total_tokens_train, V)
    total_log_prob_en += logp
    total_tokens_en += n

perplexity_en = math.exp(-total_log_prob_en / total_tokens_en)
print("Validation Perplexity (Unigram) for English:", perplexity_en)

Most common unigrams: [('the', 102629), ('.', 69142), ('of', 62312), ('and', 48800), ('in', 40805), ('a', 27707), ('to', 27572), ('is', 20181), ('was', 17824), ('The', 17672)]
Validation Perplexity (Unigram) for English: 1991.2012701142155


In [None]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in arabicDf_val['context']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
pp = math.exp(-total_log/total_tokens)
print("Interpolated trigram perplexity in Arabic:", pp)

In [None]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in teluguDf_val['context']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
pp = math.exp(-total_log/total_tokens)
print("Interpolated trigram perplexity in Telugu:", pp)

Interpolated trigram perplexity in Telugu: 505.08113446541773


In [None]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in koreanDf_val['context']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
pp = math.exp(-total_log/total_tokens)
print("Interpolated trigram perplexity in Korean:", pp)

Interpolated trigram perplexity in Korean: 476.5108696681436


In [None]:
# --- TRAINING: unigram model ---
allUnigrams_ar = []

for q in arabicDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_ar.extend(tokens)

unigram_fd_ar = FreqDist(allUnigrams_ar)
total_tokens_train = sum(unigram_fd_ar.values())
V = len(unigram_fd_ar)  # vocabulary size

print(f"Most common unigrams Arabic: {unigram_fd_ar.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under unigram model ---
def question_logprob_unigram(sentence, unigram_fd, total_tokens, V, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    log_prob = 0.0
    for w in tokens:
        count = unigram_fd[w]
        if count > 0:
            prob = count / total_tokens
        else:
            prob = smoothing  # unseen word → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_ar = 0.0
total_tokens_ar = 0

for q in arabicDf_val['context']:
    logp, n = question_logprob_unigram(q, unigram_fd_ar, total_tokens_train, V)
    total_log_prob_ar += logp
    total_tokens_ar += n

perplexity_ar = math.exp(-total_log_prob_ar / total_tokens_ar)
print("Validation Perplexity (Unigram) for Arabic:", perplexity_ar)

Most common unigrams Arabic: [('في', 593), ('من', 587), ('متى', 536), ('ما', 443), ('هو', 349), ('هل', 329), ('هي', 268), ('كم', 256), ('عدد', 161), ('أول', 157)]
Validation Perplexity (Unigram) for Arabic: 433310.1100322282


In [None]:
# --- TRAINING: trigram + bigram + unigram ---
allTrigrams_ar = []
allBigrams_ar = []
allUnigrams_ar = []

for q in arabicDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_ar.extend(tokens)
    allBigrams_ar.extend(list(ngrams(tokens, 2)))
    allTrigrams_ar.extend(list(ngrams(tokens, 3)))

unigram_fd_ar = FreqDist(allUnigrams_ar)
bigram_fd_ar = FreqDist(allBigrams_ar)
trigram_fd_ar = FreqDist(allTrigrams_ar)

print(f"Most common unigrams: {unigram_fd_ar.most_common(10)}")
print(f"Most common bigrams: {bigram_fd_ar.most_common(10)}")
print(f"Most common trigrams: {trigram_fd_ar.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under trigram model ---
def question_logprob(sentence, bigram_fd, trigram_fd, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    trigrams = list(ngrams(tokens, 3))
    log_prob = 0.0
    for w1, w2, w3 in trigrams:
        trigram_count = trigram_fd[(w1, w2, w3)]
        bigram_count = bigram_fd[(w1, w2)]
        if bigram_count > 0 and trigram_count > 0:
            prob = trigram_count / bigram_count
        else:
            prob = smoothing  # unseen trigram → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_ar = 0.0
total_tokens_ar = 0

for q in arabicDf_val['question']:
    logp, n = question_logprob(q, bigram_fd_ar, trigram_fd_ar)
    total_log_prob_ar += logp
    total_tokens_ar += n

perplexity_ar = math.exp(-total_log_prob_ar / total_tokens_ar)
print("Validation Perplexity for Arabic:", perplexity_ar)

Most common unigrams: [('في', 593), ('من', 587), ('متى', 536), ('ما', 443), ('هو', 349), ('هل', 329), ('هي', 268), ('كم', 256), ('عدد', 161), ('أول', 157)]
Most common bigrams: [(('ما', 'هي'), 232), (('من', 'هو'), 219), (('كم', 'عدد'), 114), (('ما', 'هو'), 113), (('متى', 'تم'), 103), (('متى', 'تأسست'), 54), (('في', 'أي'), 53), (('في', 'العالم'), 52), (('أي', 'عام'), 42), (('هل', 'يمكن'), 35)]
Most common trigrams: [(('في', 'أي', 'عام'), 42), (('من', 'هو', 'رئيس'), 28), (('ما', 'هي', 'أكبر'), 25), (('من', 'هو', 'أول'), 23), (('من', 'هو', 'مؤسس'), 22), (('من', 'هو', 'الرئيس'), 21), (('ما', 'هي', 'اكبر'), 19), (('من', 'هو', 'اول'), 18), (('ما', 'هي', 'جنسية'), 18), (('متى', 'صدر', 'فيلم'), 18)]
Validation Perplexity for Arabic: 3721.809419391342
