NLP Project for KU Course

In [1]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
import nltk
from nltk.util import ngrams
import math
nltk.download('punkt')
nltk.download('punkt_tab')
from nltk import FreqDist, ConditionalFreqDist


## Load the dataset
splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])


[nltk_data] Downloading package punkt to
[nltk_data]     /Users/andreasmelbye/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/andreasmelbye/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
  from .autonotebook import tqdm as notebook_tqdm


## Week 36

In [2]:
## Remove unwanted characters from the questions

def cleanDf(df):
    pattern = re.compile(r"[?؟,;\/\\\[\]#():]")
    # pattern_context = re.compile(r"[?؟,;\/\\\[\]#():.]")
    df['question'] = df['question'].apply(lambda x: pattern.sub("", x))
    df['context'] = df['context'].apply(lambda x: pattern.sub("", x))
    return df

In [3]:
langForStat = ['ar','ko','te']

numQuestions = []
totalWordCount = []
distinctWordCount = []
distinctCharCount = []

df_train_clean = cleanDf(df_train)
df_val_clean = cleanDf(df_val)

for lang in langForStat:
    numQuestions_train = df_train_clean[df_train_clean['lang'] == lang].shape[0]
    numQuestions_val = df_val_clean[df_val_clean['lang'] == lang].shape[0]
    numQuestions.append((lang, numQuestions_train, numQuestions_val))
    print(f"Language: {lang}, Train Questions: {numQuestions_train}, Validation Questions: {numQuestions_val}")

    # Compute word and character statistics
    df_train_lang = df_train_clean[df_train_clean['lang'] == lang].copy()
    df_val_lang = df_val_clean[df_val_clean['lang'] == lang]
    df_train_lang['wordcount'] = df_train_lang['question'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)


    maxId = df_train_lang['wordcount'].idxmax()
    longest_question = df_train_lang.loc[maxId, "question"]
    max_words = df_train_lang.loc[maxId, "wordcount"]

    # print(f"Language: {lang}")
    # print(f"  Longest train question (index {maxId}): {longest_question}")
    # print(f"  Word count: {max_words}")

    totalWordCount_train = df_train_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount_val = df_val_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount.append((lang, totalWordCount_train, totalWordCount_val))
    print(f"Language: {lang}, Train Total Words: {totalWordCount_train}, Validation Total Words: {totalWordCount_val}")



Language: ar, Train Questions: 2558, Validation Questions: 415
Language: ar, Train Total Words: 16202, Validation Total Words: 2621
Language: ko, Train Questions: 2422, Validation Questions: 356
Language: ko, Train Total Words: 11840, Validation Total Words: 1729
Language: te, Train Questions: 1355, Validation Questions: 384
Language: te, Train Total Words: 7668, Validation Total Words: 2299


In [4]:
def wordCount(df, lang):
    allWords = []
    df = df[df['lang'] == lang].copy()
    
    # df = df[df['answerable'] == False]
    df['question'] = df['question'].astype(str)

    for q in df['question']:
        allWords.extend(q.split()) 
    
    wordDict = dict(Counter(allWords))
    wordDict = dict(sorted(wordDict.items(), key=lambda item: item[1], reverse=True))

    return wordDict
            

for lang in langForStat:
    wordDict = wordCount(df_train_clean, lang)
    distinctWordCount.append((lang, len(wordDict)))
    print(f"Language: {lang}, Distinct Words: {len(wordDict)}")

    allChars = []
    for word in wordDict.keys():
        allChars.extend(list(word))
    
    charDict = dict(Counter(allChars))
    charDict = dict(sorted(charDict.items(), key=lambda item: item[1], reverse=True))
    distinctCharCount.append((lang, len(charDict)))
    print(f"Language: {lang}, Distinct Characters: {len(charDict)}")
    calculatedTotalWords = sum(wordDict.values())
    print(f"Language: {lang}, Calculated Total Words from Distinct Words: {calculatedTotalWords}")

    top5Words = list(wordDict.items())[:10]
    print(f"Language: {lang}, Top 5 Words: {top5Words}")

Language: ar, Distinct Words: 5427
Language: ar, Distinct Characters: 106
Language: ar, Calculated Total Words from Distinct Words: 16202
Language: ar, Top 5 Words: [('في', 592), ('من', 584), ('متى', 535), ('ما', 441), ('هو', 349), ('هل', 329), ('هي', 268), ('كم', 256), ('عدد', 161), ('أول', 157)]
Language: ko, Distinct Words: 4394
Language: ko, Distinct Characters: 819
Language: ko, Calculated Total Words from Distinct Words: 11840
Language: ko, Top 5 Words: [('가장', 527), ('무엇인가', 497), ('언제', 336), ('몇', 234), ('어디인가', 228), ('큰', 194), ('누구인가', 186), ('세상에서', 142), ('누구인가요', 105), ('무엇인가요', 95)]
Language: te, Distinct Words: 2420
Language: te, Distinct Characters: 91
Language: te, Calculated Total Words from Distinct Words: 7668
Language: te, Top 5 Words: [('ఎవరు', 274), ('ఏది', 192), ('ఎన్ని', 165), ('ఎప్పుడు', 154), ('ఏ', 142), ('ఎంత', 116), ('చిత్ర', 97), ('ఎక్కడ', 96), ('మొదటి', 86), ('ఉంది', 83)]


In [5]:
def arabicClassifier(question, context):
    goodWords = ['متى','ما','هو','هي','كم','عدد','أول','في']
    badWords = ['هل', 'يمكن']
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
        return True
    else:
        return True
        # return np.random.choice([True, False])

def koreanClassifier(question, context):
    goodWords = ['가장', '무엇인가', '언제', '몇']
    badWords = ['수 '] # '시차는', '중력과'
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
        return True
    else:
        return True
        # return np.random.choice([True, False])
    
def teluguClassifier(question, context):
    goodWords = []
    badWords = ['విస్తీర్ణం', 'జనాభా', 'ఆఫ్రికాలో']
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
            return True
    # if any(word in question for word in badWords):
    #     return False
    else:
        return True
        # return np.random.choice([True, False])

### --- Arabic ---
arabicDf = df_val_clean[df_val_clean['lang'] == 'ar'].copy()
arabicDf['prediction'] = arabicDf.apply(lambda row: arabicClassifier(row['question'], row['context']), axis=1)
accuracy = (arabicDf['answerable'] == arabicDf['prediction']).mean()
print(f"Arabic Classifier Accuracy (validation): {accuracy * 100:.2f}%")
print(f"True distribution in validation set: {arabicDf['answerable'].value_counts(normalize=True).to_dict()}")

### --- Korean ---
koreanDf = df_val_clean[df_val_clean['lang'] == 'ko'].copy()
koreanDf['prediction'] = koreanDf.apply(lambda row: koreanClassifier(row['question'], row['context']), axis=1)
accuracy = (koreanDf['answerable'] == koreanDf['prediction']).mean()
print(f"Korean Classifier Accuracy (validation): {accuracy * 100:.2f}%")
print(f"True distribution in validation set: {koreanDf['answerable'].value_counts(normalize=True).to_dict()}")

### --- Telugu ---
teluguDf = df_val_clean[df_val_clean['lang'] == 'te'].copy()
teluguDf['prediction'] = teluguDf.apply(lambda row: teluguClassifier(row['question'], row['context']), axis=1)
accuracy = (teluguDf['answerable'] == teluguDf['prediction']).mean()
print(f"Telugu Classifier Accuracy (validation): {accuracy * 100:.2f}%")
print(f"True distribution in validation set: {teluguDf['answerable'].value_counts(normalize=True).to_dict()}")

Arabic Classifier Accuracy (validation): 96.87%
True distribution in validation set: {True: 0.8746987951807229, False: 0.12530120481927712}
Korean Classifier Accuracy (validation): 94.94%
True distribution in validation set: {True: 0.9466292134831461, False: 0.05337078651685393}
Telugu Classifier Accuracy (validation): 79.17%
True distribution in validation set: {True: 0.7578125, False: 0.2421875}


In [6]:
def overlap_ratio(question, context):
    q_words = set(question.split())
    c_words = set(context.split())
    if not q_words: return 0
    return len(q_words & c_words) / len(q_words)


import re
def contains_digit(text):
    return bool(re.search(r"\d+", text))


def arabicClassifier(question, context):
    goodWords = ['متى','ما','هو','هي','كم','عدد','أول','في']
    badWords = ['هل', 'يمكن']
    
    # Rule 1: keyword spotting
    if any(word in question for word in badWords):
        return False
    if any(word in question for word in goodWords):
        return True
    
    # # Rule 2: overlap
    # if overlap_ratio(question, context) < 0.1:
    #     return False
    
    # # Rule 3: digits
    # if contains_digit(question) and not contains_digit(context):
    #     return False
    
    # Rule 4: context length
    if len(context.split()) < 25:
        return False
    
    return True

# Arabic
arabicDf = df_val_clean[df_val_clean['lang'] == 'ar'].copy()
arabicDf['prediction'] = arabicDf.apply(lambda row: arabicClassifier(row['question'], row['context']), axis=1)
arabic_acc = (arabicDf['answerable'] == arabicDf['prediction']).mean()
print(f"Arabic Classifier Accuracy (validation): {arabic_acc*100:.2f}%")

Arabic Classifier Accuracy (validation): 96.87%


In [7]:
df_val_clean

Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,ఒరెగాన్ రాష్ట్రంలోని అతిపెద్ద నగరం ఏది,Portland is the largest city in the U.S. state...,te,True,0,Portland,
1,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,"The word cholera is from ""kholera"" from χολή ""...",te,True,99,Indian subcontinent,
2,కలరా వ్యాధిని మొదటగా ఏ దేశంలో కనుగొన్నారు,Since it became widespread in the 19th century...,te,True,451,England,
3,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I occurred from 1914 to 1918. In ter...,te,True,26,1914,
4,మొదటి ప్రపంచ యుద్ధం ఎప్పుడు మొదలయింది,World War I often abbreviated as WWI or WW1 al...,te,True,155,28 July 1914,
...,...,...,...,...,...,...,...
3006,2011 జనగణన ప్రకారం రెయ్యలగడ్ద గ్రామములో పురుషు...,Reyyalagadda is a village belonging to Gangara...,te,True,378,37,37
3007,2011 జనాభా లెక్కల ప్రకారం బూతుమిల్లిపాడు గ్రామ...,Boothumillipadu is a village in Gannavaram man...,te,True,308,433,433
3008,2011 జనాభా లెక్కల ప్రకారం మల్లవేముల గ్రామ జనాభ...,Mallavemula is a village belonging to Chagalam...,te,False,-1,1131,1131
3009,2011 నాటికి రష్యా దేశ ప్రధాన మంత్రి ఎవరు,Andria Urushadze born April 25 1968 is a Geor...,te,False,-1,Vladimir Putin,వ్లాదిమిర్ పుతిన్


## Week 37

In [8]:
arabicDf_train = df_train_clean[df_train_clean['lang'] == 'ar'].copy()
teluguDf_train = df_train_clean[df_train_clean['lang'] == 'te'].copy()
koreanDf_train = df_train_clean[df_train_clean['lang'] == 'ko'].copy()

arabicDf_val = df_val_clean[df_val_clean['lang'] == 'ar'].copy()
teluguDf_val = df_val_clean[df_val_clean['lang'] == 'te'].copy()
koreanDf_val = df_val_clean[df_val_clean['lang'] == 'ko'].copy()

In [9]:
# arabicDf_train = df_train[df_train['lang'] == 'ar']
# teluguDf_train = df_train[df_train['lang'] == 'te']
# koreanDf_train = df_train[df_train['lang'] == 'ko']

# arabicDf_val = df_val[df_val['lang'] == 'ar']
# teluguDf_val = df_val[df_val['lang'] == 'te']
# koreanDf_val = df_val[df_val['lang'] == 'ko']

In [10]:
def compute_probability(question, unigram_fd, bigram_fd):
    tokens = nltk.word_tokenize(question)
    bigrams = list(ngrams(tokens, 2))
    prob = 1.0
    for bigram in bigrams:
        bigram_count = bigram_fd[bigram]
        unigram_count = unigram_fd[(bigram[0],)]
        if unigram_count > 0 and bigram_count > 0:
            prob *= bigram_count / unigram_count
        else:
            prob *= 1e-6
    return prob

def compute_logprob(question, unigram_fd, bigram_fd):
    tokens = nltk.word_tokenize(question)
    bigrams_list = list(ngrams(tokens, 2))
    log_prob = 0.0
    for bigram in bigrams_list:
        bigram_count = bigram_fd[bigram]
        unigram_count = unigram_fd[(bigram[0],)]
        if unigram_count > 0 and bigram_count > 0:
            prob = bigram_count / unigram_count
        else:
            prob = 1e-6  # smoothing for unseen bigrams
        log_prob += math.log(prob)
    return log_prob, len(tokens)  # return log_prob and number of tokens


In [11]:
def build_counts(corpus):
    allUnigrams = []
    allBigrams = []
    allTrigrams = []

    for text in corpus:
        tokens = nltk.word_tokenize(text)
        allUnigrams.extend(tokens)
        allBigrams.extend(list(ngrams(tokens, 2)))
        allTrigrams.extend(list(ngrams(tokens, 3)))

    unigram_fd = FreqDist(allUnigrams)
    bigram_fd = FreqDist(allBigrams)
    trigram_fd = FreqDist(allTrigrams)

    return unigram_fd, bigram_fd, trigram_fd


# build counts
unigram_fd, bigram_fd, trigram_fd = build_counts(df_train_clean['context'])
V = len(unigram_fd)

def conditional_prob_unigram(w, unigram_fd):
    return unigram_fd[w] / sum(unigram_fd.values())

def conditional_prob_bigram(w2, w1, bigram_fd, unigram_fd, k=0.0):
    # Add-k for bigram if you like, or simple MLE
    bi = bigram_fd[(w1,w2)]
    uni = unigram_fd[w1]
    if uni>0:
        return bi/uni
    return 1.0/V

def conditional_prob_trigram(w3, w1, w2, trigram_fd, bigram_fd, V, k=0.0):
    tri = trigram_fd[(w1,w2,w3)]
    bi  = bigram_fd[(w1,w2)]
    if bi>0:
        return (tri + k) / (bi + k*V)
    return 1.0/V

def sentence_logprob_interpolated(sentence, unigram_fd, bigram_fd, trigram_fd,
                                  V, lambdas=(0.1,0.3,0.6), k=0.0):
    # lambdas: (lambda_uni, lambda_bi, lambda_tri) must sum to 1
    lam1, lam2, lam3 = lambdas
    toks = nltk.word_tokenize(sentence)
    trigs = list(ngrams(toks, 3))
    logp = 0.0
    for w1,w2,w3 in trigs:
        p_uni = conditional_prob_unigram(w3, unigram_fd)
        p_bi  = conditional_prob_bigram(w3, w2, bigram_fd, unigram_fd, k)
        p_tri = conditional_prob_trigram(w3, w1, w2, trigram_fd, bigram_fd, V, k)
        p = lam1*p_uni + lam2*p_bi + lam3*p_tri
        logp += math.log(p)
    return logp, len(toks)

### Arabic

In [12]:
# --- TRAINING: unigram model ---
allUnigrams_ar = []

for q in arabicDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_ar.extend(tokens)

unigram_fd_ar = FreqDist(allUnigrams_ar)
total_tokens_train = sum(unigram_fd_ar.values())
V = len(unigram_fd_ar)  # vocabulary size

# print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under unigram model ---
def question_logprob_unigram(sentence, unigram_fd, total_tokens, V, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    log_prob = 0.0
    for w in tokens:
        count = unigram_fd[w]
        if count > 0:
            prob = count / total_tokens
        else:
            prob = smoothing  # unseen word → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_ar = 0.0
total_tokens_ar = 0

for q in arabicDf_val['question']:
    logp, n = question_logprob_unigram(q, unigram_fd_ar, total_tokens_train, V)
    total_log_prob_ar += logp
    total_tokens_ar += n

perplexity_uni_ar = math.exp(-total_log_prob_ar / total_tokens_ar)
# print("Validation Perplexity (Unigram) for Arabic:", perplexity_uni_ar)

In [13]:
# Make a list of all bigrams in arabicDf questions
allBigrams_ar = []
allUnigrams_ar = []
for q in arabicDf_train['question']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_ar.extend(bigrams)
    allUnigrams_ar.extend(unigrams)

unigram_fd_ar = FreqDist(allUnigrams_ar)
bigram_fd_ar = FreqDist(allBigrams_ar)
# print(unigram_fd_ar.most_common(10))
# print(bigram_fd_ar.most_common(10))
cfdist_ar = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_ar)
# print(cfdist_ar['ما'].most_common(10))

In [14]:
arabicDf_val['question_prob'] = arabicDf_val['question'].apply(lambda q: compute_probability(q, unigram_fd_ar, bigram_fd_ar))

In [15]:
# Calculate total log probability and total tokens over validation set
total_log_prob_ar = 0.0
total_tokens_ar  = 0
for q in arabicDf_val['question']:
    logp, n = compute_logprob(q, unigram_fd_ar, bigram_fd_ar)
    total_log_prob_ar += logp
    total_tokens_ar += n

# Perplexity
perplexity_bi_ar = math.exp(-total_log_prob_ar / total_tokens_ar)
# print("Validation Perplexity for Arabic:", perplexity_bi_ar)

In [16]:
### Trigram model - Arabic

# --- TRAINING: trigram + bigram + unigram ---
allTrigrams_ar = []
allBigrams_ar = []
allUnigrams_ar = []

for q in arabicDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_ar.extend(tokens)
    allBigrams_ar.extend(list(ngrams(tokens, 2)))
    allTrigrams_ar.extend(list(ngrams(tokens, 3)))

unigram_fd_ar = FreqDist(allUnigrams_ar)
bigram_fd_ar = FreqDist(allBigrams_ar)
trigram_fd_ar = FreqDist(allTrigrams_ar)

# print(f"Most common unigrams: {unigram_fd_ar.most_common(10)}")
# print(f"Most common bigrams: {bigram_fd_ar.most_common(10)}")
# print(f"Most common trigrams: {trigram_fd_ar.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under trigram model ---
def question_logprob(sentence, bigram_fd, trigram_fd, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    trigrams = list(ngrams(tokens, 3))
    log_prob = 0.0
    for w1, w2, w3 in trigrams:
        trigram_count = trigram_fd[(w1, w2, w3)]
        bigram_count = bigram_fd[(w1, w2)]
        if bigram_count > 0 and trigram_count > 0:
            prob = trigram_count / bigram_count
        else:
            prob = smoothing  # unseen trigram → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_ar = 0.0
total_tokens_ar = 0

for q in arabicDf_val['question']:
    logp, n = question_logprob(q, bigram_fd_ar, trigram_fd_ar)
    total_log_prob_ar += logp
    total_tokens_ar += n

perplexity_tri_ar = math.exp(-total_log_prob_ar / total_tokens_ar)
# print("Validation Perplexity (Trigram) for Arabic:", perplexity_tri_ar)

In [17]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in arabicDf_val['question']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
perplexity_inter_ar = math.exp(-total_log/total_tokens)
# print("Interpolated trigram perplexity for Arabic:", perplexity_inter_ar)

In [18]:
# Analysis of the different models
print(f"Unigram Perplexity (Arabic): {perplexity_uni_ar}")
print(f"Bigram Perplexity (Arabic): {perplexity_bi_ar}")
print(f"Trigram Perplexity (Arabic): {perplexity_tri_ar}")
print(f"Interpolated Trigram Perplexity (Arabic): {perplexity_inter_ar}")

Unigram Perplexity (Arabic): 3744.866432577014
Bigram Perplexity (Arabic): 3475.076581820621
Trigram Perplexity (Arabic): 3721.809419391342
Interpolated Trigram Perplexity (Arabic): 378.3794847875941


### Korean

In [19]:
# --- TRAINING: unigram model ---
allUnigrams_ko = []

for q in koreanDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_ko.extend(tokens)

unigram_fd_ko = FreqDist(allUnigrams_ko)
total_tokens_train = sum(unigram_fd_ko.values())
V = len(unigram_fd_ko)  # vocabulary size

# print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under unigram model ---
def question_logprob_unigram(sentence, unigram_fd, total_tokens, V, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    log_prob = 0.0
    for w in tokens:
        count = unigram_fd[w]
        if count > 0:
            prob = count / total_tokens
        else:
            prob = smoothing  # unseen word → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_ko = 0.0
total_tokens_ko = 0

for q in koreanDf_val['question']:
    logp, n = question_logprob_unigram(q, unigram_fd_ko, total_tokens_train, V)
    total_log_prob_ko += logp
    total_tokens_ko += n

perplexity_uni_ko = math.exp(-total_log_prob_ko / total_tokens_ko)
# print("Validation Perplexity (Unigram) for Korean:", perplexity_uni_ko)

In [20]:
# Make a list of all bigrams in arabicDf questions
allBigrams_ko = []
allUnigrams_ko = []
for q in koreanDf_train['question']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_ko.extend(bigrams)
    allUnigrams_ko.extend(unigrams)

unigram_fd_ko = FreqDist(allUnigrams_ko)
bigram_fd_ko = FreqDist(allBigrams_ko)
print(unigram_fd_ko.most_common(10))
print(bigram_fd_ko.most_common(10))
cfdist_ko = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_ko)



[(('가장',), 527), (('무엇인가',), 497), (('언제',), 336), (('몇',), 234), (('어디인가',), 228), (('큰',), 194), (('누구인가',), 186), (('세상에서',), 142), (('누구인가요',), 105), (('무엇인가요',), 95)]
[(('가장', '큰'), 172), (('세상에서', '가장'), 138), (('가장', '많은'), 66), (('나라는', '어디인가'), 64), (('몇', '년도에'), 63), (('사람은', '누구인가'), 55), (('가장', '높은'), 48), (('몇', '개의'), 44), (('지도자는', '누구인가'), 37), (('얼마나', '되나요'), 34)]


In [21]:
koreanDf_val['question_prob'] = koreanDf_val['question'].apply(lambda q: compute_probability(q, unigram_fd_ko, bigram_fd_ko))

In [22]:
# Calculate total log probability and total tokens over validation set
total_log_prob_ko = 0.0
total_tokens_ko = 0
for q in koreanDf_val['question']:
    logp, n = compute_logprob(q, unigram_fd_ko, bigram_fd_ko)
    total_log_prob_ko += logp
    total_tokens_ko += n

# Perplexity
perplexity_bi_ko = math.exp(-total_log_prob_ko / total_tokens_ko)
print("Validation Perplexity for Korean:", perplexity_bi_ko)

Validation Perplexity for Korean: 1061.2832744231378


In [23]:
### Trigram model - Korean

# --- TRAINING: trigram + bigram + unigram ---
allTrigrams_ko = []
allBigrams_ko = []
allUnigrams_ko = []

for q in koreanDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_ko.extend(tokens)
    allBigrams_ko.extend(list(ngrams(tokens, 2)))
    allTrigrams_ko.extend(list(ngrams(tokens, 3)))

unigram_fd_ko = FreqDist(allUnigrams_ko)
bigram_fd_ko = FreqDist(allBigrams_ko)
trigram_fd_ko = FreqDist(allTrigrams_ko)

# print(f"Most common unigrams: {unigram_fd_ko.most_common(10)}")
# print(f"Most common bigrams: {bigram_fd_ko.most_common(10)}")
# print(f"Most common trigrams: {trigram_fd_ko.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under trigram model ---
def question_logprob(sentence, bigram_fd, trigram_fd, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    trigrams = list(ngrams(tokens, 3))
    log_prob = 0.0
    for w1, w2, w3 in trigrams:
        trigram_count = trigram_fd[(w1, w2, w3)]
        bigram_count = bigram_fd[(w1, w2)]
        if bigram_count > 0 and trigram_count > 0:
            prob = trigram_count / bigram_count
        else:
            prob = smoothing  # unseen trigram → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_ko = 0.0
total_tokens_ko = 0

for q in koreanDf_val['question']:
    logp, n = question_logprob(q, bigram_fd_ko, trigram_fd_ko)
    total_log_prob_ko += logp
    total_tokens_ko += n

perplexity_tri_ko = math.exp(-total_log_prob_ko / total_tokens_ko)
# print("Validation Perplexity (Trigram) for Korean:", perplexity_tri_ko)

In [24]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in koreanDf_val['question']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
perplexity_inter_ko = math.exp(-total_log/total_tokens)
# print("Interpolated trigram perplexity for Telugu:", perplexity_inter_te)

In [25]:
# Analysis of the different models
print(f"Unigram Perplexity (Korean): {perplexity_uni_ko}")
print(f"Bigram Perplexity (Korean): {perplexity_bi_ko}")
print(f"Trigram Perplexity (Korean): {perplexity_tri_ko}")
print(f"Interpolated Trigram Perplexity (Korean): {perplexity_inter_ko}")

Unigram Perplexity (Korean): 4539.477360388252
Bigram Perplexity (Korean): 1061.2832744231378
Trigram Perplexity (Korean): 682.3894216748316
Interpolated Trigram Perplexity (Korean): 148.2215742629018


### Telugu

In [26]:
# --- TRAINING: unigram model ---
allUnigrams_te = []

for q in teluguDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_te.extend(tokens)

unigram_fd_te = FreqDist(allUnigrams_te)
total_tokens_train = sum(unigram_fd_te.values())
V = len(unigram_fd_te)  # vocabulary size

# print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under unigram model ---
def question_logprob_unigram(sentence, unigram_fd, total_tokens, V, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    log_prob = 0.0
    for w in tokens:
        count = unigram_fd[w]
        if count > 0:
            prob = count / total_tokens
        else:
            prob = smoothing  # unseen word → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_te = 0.0
total_tokens_te = 0

for q in teluguDf_val['question']:
    logp, n = question_logprob_unigram(q, unigram_fd_te, total_tokens_train, V)
    total_log_prob_te += logp
    total_tokens_te += n

perplexity_uni_te = math.exp(-total_log_prob_te / total_tokens_te)
# print("Validation Perplexity (Unigram) for Telugu:", perplexity_uni_te)

In [27]:
# Make a list of all bigrams in arabicDf questions
allBigrams_telugu = []
allUnigrams_telugu = []
for q in teluguDf_train['question']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_telugu.extend(bigrams)
    allUnigrams_telugu.extend(unigrams)

unigram_fd_te = FreqDist(allUnigrams_telugu)
bigram_fd_te = FreqDist(allBigrams_telugu)
# print(unigram_fd_te.most_common(10))
# print(bigram_fd_te.most_common(10))
cfdist_telugu = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_telugu)

In [28]:
teluguDf_val['question_prob'] = teluguDf_val['question'].apply(lambda q: compute_probability(q, unigram_fd_te, bigram_fd_te))

In [29]:
# Calculate total log probability and total tokens over validation set
total_log_prob_te = 0.0
total_tokens_te = 0
for q in teluguDf_val['question']:
    logp, n = compute_logprob(q, unigram_fd_te, bigram_fd_te)
    total_log_prob_te += logp
    total_tokens_te += n

# Perplexity
perplexity_bi_te = math.exp(-total_log_prob_te / total_tokens_te)
# print("Validation Perplexity for Telugu:", perplexity_bi_te)

In [30]:
### Trigram model - Telugu

# --- TRAINING: trigram + bigram + unigram ---
allTrigrams_te = []
allBigrams_te = []
allUnigrams_te = []

for q in teluguDf_train['question']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_te.extend(tokens)
    allBigrams_te.extend(list(ngrams(tokens, 2)))
    allTrigrams_te.extend(list(ngrams(tokens, 3)))

unigram_fd_te = FreqDist(allUnigrams_te)
bigram_fd_te = FreqDist(allBigrams_te)
trigram_fd_te = FreqDist(allTrigrams_te)

# print(f"Most common unigrams: {unigram_fd_te.most_common(10)}")
# print(f"Most common bigrams: {bigram_fd_te.most_common(10)}")
# print(f"Most common trigrams: {trigram_fd_te.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under trigram model ---
def question_logprob(sentence, bigram_fd, trigram_fd, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    trigrams = list(ngrams(tokens, 3))
    log_prob = 0.0
    for w1, w2, w3 in trigrams:
        trigram_count = trigram_fd[(w1, w2, w3)]
        bigram_count = bigram_fd[(w1, w2)]
        if bigram_count > 0 and trigram_count > 0:
            prob = trigram_count / bigram_count
        else:
            prob = smoothing  # unseen trigram → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_te = 0.0
total_tokens_te = 0

for q in teluguDf_val['question']:
    logp, n = question_logprob(q, bigram_fd_te, trigram_fd_te)
    total_log_prob_te += logp
    total_tokens_te += n

perplexity_tri_te = math.exp(-total_log_prob_te / total_tokens_te)
# print("Validation Perplexity (Trigram) for Telugu:", perplexity_tri_te)

In [31]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in teluguDf_val['question']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
perplexity_inter_te = math.exp(-total_log/total_tokens)
# print("Interpolated trigram perplexity for Telugu:", perplexity_inter_te)

In [32]:
# Analysis of the different models
print(f"Unigram Perplexity (Telugu): {perplexity_uni_te}")
print(f"Bigram Perplexity (Telugu): {perplexity_bi_te}")
print(f"Trigram Perplexity (Telugu): {perplexity_tri_te}")
print(f"Interpolated Trigram Perplexity (Telugu): {perplexity_inter_te}")

Unigram Perplexity (Telugu): 2567.776781647863
Bigram Perplexity (Telugu): 540.9787802403338
Trigram Perplexity (Telugu): 429.6502383147644
Interpolated Trigram Perplexity (Telugu): 191.84511292710405


### English

In [33]:
# --- TRAINING: unigram model ---
allUnigrams_en = []

for q in df_train['context']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_en.extend(tokens)

unigram_fd_en = FreqDist(allUnigrams_en)
total_tokens_train = sum(unigram_fd_en.values())
V = len(unigram_fd_en)  # vocabulary size

# print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under unigram model ---
def question_logprob_unigram(sentence, unigram_fd, total_tokens, V, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    log_prob = 0.0
    for w in tokens:
        count = unigram_fd[w]
        if count > 0:
            prob = count / total_tokens
        else:
            prob = smoothing  # unseen word → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_en = 0.0
total_tokens_en = 0

for q in df_val['context']:
    logp, n = question_logprob_unigram(q, unigram_fd_en, total_tokens_train, V)
    total_log_prob_en += logp
    total_tokens_en += n

perplexity_uni_en = math.exp(-total_log_prob_en / total_tokens_en)
# print("Validation Perplexity (Unigram) for English:", perplexity_en)

In [34]:
# Make a list of all bigrams in context
allBigrams_en = []
allUnigrams_en = []
for q in df_train_clean['context']:
    tokens = nltk.word_tokenize(q)
    bigrams = list(ngrams(tokens, 2))
    unigrams = list(ngrams(tokens, 1))
    allBigrams_en.extend(bigrams)
    allUnigrams_en.extend(unigrams)

unigram_fd_en = FreqDist(allUnigrams_en)
bigram_fd_en = FreqDist(allBigrams_en)
# print(unigram_fd_en.most_common(10))
# print(bigram_fd_en.most_common(10))
cfdist_en = ConditionalFreqDist((bigram[0], bigram) for bigram in allBigrams_en)

In [35]:
df_val['context_prob'] = df_val['context'].apply(lambda q: compute_probability(q, unigram_fd_en, bigram_fd_en))

In [36]:
# Calculate total log probability and total tokens over validation set
total_log_prob_en = 0.0
total_tokens_en = 0
for q in df_val['context']:
    logp, n = compute_logprob(q, unigram_fd_en, bigram_fd_en)
    total_log_prob_en += logp
    total_tokens_en += n

# Perplexity
perplexity_bi_en = math.exp(-total_log_prob_en / total_tokens_en)
# print("Validation Perplexity (Bigram) for English:", perplexity_bi_en)

In [37]:
### Trigram model - English

# --- TRAINING: trigram + bigram + unigram ---
allTrigrams_en = []
allBigrams_en = []
allUnigrams_en = []

for q in df_train['context']:
    tokens = nltk.word_tokenize(q)
    allUnigrams_en.extend(tokens)
    allBigrams_en.extend(list(ngrams(tokens, 2)))
    allTrigrams_en.extend(list(ngrams(tokens, 3)))

unigram_fd_en = FreqDist(allUnigrams_en)
bigram_fd_en = FreqDist(allBigrams_en)
trigram_fd_en = FreqDist(allTrigrams_en)

# print(f"Most common unigrams: {unigram_fd_en.most_common(10)}")
# print(f"Most common bigrams: {bigram_fd_en.most_common(10)}")
# print(f"Most common trigrams: {trigram_fd_en.most_common(10)}")

# --- FUNCTION: log-probability of a sentence under trigram model ---
def question_logprob(sentence, bigram_fd, trigram_fd, smoothing=1e-6):
    tokens = nltk.word_tokenize(sentence)
    trigrams = list(ngrams(tokens, 3))
    log_prob = 0.0
    for w1, w2, w3 in trigrams:
        trigram_count = trigram_fd[(w1, w2, w3)]
        bigram_count = bigram_fd[(w1, w2)]
        if bigram_count > 0 and trigram_count > 0:
            prob = trigram_count / bigram_count
        else:
            prob = smoothing  # unseen trigram → small probability
        log_prob += math.log(prob)
    return log_prob, len(tokens)

# --- VALIDATION: calculate perplexity ---
total_log_prob_en = 0.0
total_tokens_en = 0

for q in df_val['context']:
    logp, n = question_logprob(q, bigram_fd_en, trigram_fd_en)
    total_log_prob_en += logp
    total_tokens_en += n

perplexity_tri_en = math.exp(-total_log_prob_en / total_tokens_en)
# print("Validation Perplexity (Trigram) for English:", perplexity_tri_en)

In [38]:
# Evaluate perplexity on validation set with interpolation
total_log, total_tokens = 0.0, 0
for s in df_val['context']:
    lp, n = sentence_logprob_interpolated(s, unigram_fd, bigram_fd, trigram_fd, V,
                                          lambdas=(0.1,0.3,0.6), k=0.1)
    total_log += lp
    total_tokens += n
perplexity_inter_en = math.exp(-total_log/total_tokens)
# print("Interpolated trigram perplexity for English:", perplexity_inter_en)

In [39]:
# Analysis of the different models
print(f"Unigram Perplexity (English): {perplexity_uni_en}")
print(f"Bigram Perplexity (English): {perplexity_bi_en}")
print(f"Trigram Perplexity (English): {perplexity_tri_en}")
print(f"Interpolated Trigram Perplexity (English): {perplexity_inter_en}")

Unigram Perplexity (English): 1991.2012701142155
Bigram Perplexity (English): 1088.9448235837644
Trigram Perplexity (English): 8453.467175831895
Interpolated Trigram Perplexity (English): 720.8373580260715
