NLP Project for KU Course

In [116]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt


## Load the dataset
# Parquet file name of each split inside the coastalcph/tydi_xor_rc dataset.
splits = dict(train='train.parquet', validation='validation.parquet')
# Read both splits from the hf:// dataset path, training split first.
df_train, df_val = (
    pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits[split_name])
    for split_name in ("train", "validation")
)


In [117]:
## Remove unwanted characters from the questions

def cleanDf(df):
    """Return a copy of ``df`` with punctuation stripped from ``question``.

    The characters removed are the Latin and Arabic question marks, commas,
    semicolons, slashes, backslashes, square brackets, ``#``, parentheses and
    colons.

    The input frame is NOT modified: a new DataFrame is returned, so the raw
    data stays available and re-running the calling cell is idempotent.

    Args:
        df: DataFrame with a ``question`` column.

    Returns:
        A new DataFrame identical to ``df`` except for the cleaned
        ``question`` column. Non-string values (e.g. missing questions) are
        passed through unchanged.
    """
    pattern = re.compile(r"[?؟,;\/\\\[\]#():]")

    def strip_punctuation(text):
        # Leave missing / non-string values as they are instead of raising.
        return pattern.sub("", text) if isinstance(text, str) else text

    # assign() builds a new DataFrame, so the caller's frame is not mutated.
    return df.assign(question=df['question'].apply(strip_punctuation))

In [118]:
langForStat = ['ar','ko','te']

# Per-language accumulators; each entry is a tuple starting with the language code.
numQuestions = []
totalWordCount = []
distinctWordCount = []
distinctCharCount = []

df_train_clean = cleanDf(df_train)
df_val_clean = cleanDf(df_val)


def _countWords(text):
    """Number of whitespace-separated tokens in ``text``; 0 for non-string values."""
    return len(text.split()) if isinstance(text, str) else 0


for lang in langForStat:
    # Filter each split once and reuse the subset for every statistic below.
    # The train subset is copied because a 'wordcount' column is added to it.
    df_train_lang = df_train_clean[df_train_clean['lang'] == lang].copy()
    df_val_lang = df_val_clean[df_val_clean['lang'] == lang]

    numQuestions_train = df_train_lang.shape[0]
    numQuestions_val = df_val_lang.shape[0]
    numQuestions.append((lang, numQuestions_train, numQuestions_val))
    print(f"Language: {lang}, Train Questions: {numQuestions_train}, Validation Questions: {numQuestions_val}")

    # Compute word statistics
    df_train_lang['wordcount'] = df_train_lang['question'].apply(_countWords)

    # Longest training question. idxmax() raises on an empty selection, so
    # fall back to None when the language has no training rows.
    if df_train_lang.empty:
        maxId = longest_question = max_words = None
    else:
        maxId = df_train_lang['wordcount'].idxmax()
        longest_question = df_train_lang.loc[maxId, "question"]
        max_words = df_train_lang.loc[maxId, "wordcount"]

    # print(f"Language: {lang}")
    # print(f"  Longest train question (index {maxId}): {longest_question}")
    # print(f"  Word count: {max_words}")

    # Reuse the per-question counts instead of tokenising the train questions again.
    totalWordCount_train = df_train_lang['wordcount'].sum()
    totalWordCount_val = df_val_lang['question'].apply(_countWords).sum()
    totalWordCount.append((lang, totalWordCount_train, totalWordCount_val))
    print(f"Language: {lang}, Train Total Words: {totalWordCount_train}, Validation Total Words: {totalWordCount_val}")



Language: ar, Train Questions: 2558, Validation Questions: 415
Language: ar, Train Total Words: 16202, Validation Total Words: 2621
Language: ko, Train Questions: 2422, Validation Questions: 356
Language: ko, Train Total Words: 11840, Validation Total Words: 1729
Language: te, Train Questions: 1355, Validation Questions: 384
Language: te, Train Total Words: 7668, Validation Total Words: 2299


In [119]:
# from googletrans import Translator

def wordCount(df, lang, answerable=False):
    """Count word frequencies in the questions of one language.

    Args:
        df: DataFrame with ``lang``, ``answerable`` and ``question`` columns.
        lang: Value of the ``lang`` column to select.
        answerable: Keep only rows whose ``answerable`` value equals this
            flag. Defaults to ``False`` (unanswerable questions only), which
            is the filter this function has always applied. Pass ``None`` to
            count every question of the language.

    Returns:
        dict mapping each whitespace-separated token to its number of
        occurrences, ordered from most to least frequent; tokens with equal
        counts keep the order in which they were first seen.
    """
    mask = df['lang'] == lang
    if answerable is not None:
        mask = mask & (df['answerable'] == answerable)

    # Select and cast in one expression: nothing is assigned back into a
    # filtered slice, so pandas has no chained-assignment case to warn about
    # and the caller's frame is never touched.
    questions = df.loc[mask, 'question'].astype(str)

    counts = Counter(word for question in questions for word in question.split())

    # most_common() sorts by descending count and keeps first-seen order on ties.
    return dict(counts.most_common())
            

# Reset the accumulators here as well, so re-running this cell on its own
# does not append a second set of rows to lists created in an earlier cell.
distinctWordCount = []
distinctCharCount = []

# Number of most frequent words shown per language.
TOP_N_WORDS = 10

for lang in langForStat:
    # wordCount() is called with its defaults, so these statistics cover the
    # unanswerable validation questions of the language only.
    wordDict = wordCount(df_val_clean, lang)
    distinctWordCount.append((lang, len(wordDict)))
    print(f"Language: {lang}, Distinct Words: {len(wordDict)}")

    # Characters are tallied over the distinct words (each word once), which
    # is all that is needed for the distinct-character count reported below.
    charDict = dict(Counter("".join(wordDict)).most_common())
    distinctCharCount.append((lang, len(charDict)))
    print(f"Language: {lang}, Distinct Characters: {len(charDict)}")
    calculatedTotalWords = sum(wordDict.values())
    print(f"Language: {lang}, Calculated Total Words from Distinct Words: {calculatedTotalWords}")

    # The variable keeps its historical name but holds TOP_N_WORDS entries;
    # the printed label now states the number of words actually shown.
    top5Words = list(wordDict.items())[:TOP_N_WORDS]
    print(f"Language: {lang}, Top {TOP_N_WORDS} Words: {top5Words}")

Language: ar, Distinct Words: 240
Language: ar, Distinct Characters: 40
Language: ar, Calculated Total Words from Distinct Words: 377
Language: ar, Top 5 Words: [('هل', 50), ('في', 16), ('من', 8), ('يمكن', 6), ('مع', 3), ('هناك', 3), ('بين', 3), ('الهند', 3), ('أكثر', 3), ('تم', 3)]
Language: ko, Distinct Words: 72
Language: ko, Distinct Characters: 138
Language: ko, Calculated Total Words from Distinct Words: 95
Language: ko, Top 5 Words: [('있는가', 8), ('살', 3), ('수', 3), ('시차는', 2), ('중력과', 2), ('관련이', 2), ('있을까', 2), ('화성의', 2), ('대기에', 2), ('인간이', 2)]
Language: te, Distinct Words: 186
Language: te, Distinct Characters: 57
Language: te, Calculated Total Words from Distinct Words: 561
Language: te, Top 5 Words: [('ఎవరు', 25), ('ఏ', 20), ('ఏది', 19), ('ఎంత', 14), ('జనాభా', 11), ('నాటికి', 10), ('దేశం', 10), ('అతిపెద్ద', 9), ('దేశ', 8), ('విస్తీర్ణం', 8)]


In [None]:
def _keywordClassifier(question, goodWords, badWords):
    """Shared keyword rule behind the per-language answerability classifiers.

    ``word in question`` is a substring test when ``question`` is a string, so
    a keyword also matches when it occurs inside a longer word.

    Args:
        question: Question text to classify.
        goodWords: Keywords that mark the question as answerable. Checked
            first, so they win over ``badWords`` when both occur.
        badWords: Keywords that mark the question as unanswerable.

    Returns:
        True (answerable) if any good word occurs, otherwise False if any bad
        word occurs, otherwise True as the default prediction.
    """
    if any(word in question for word in goodWords):
        return True
    if any(word in question for word in badWords):
        return False
    # No keyword matched: fall back to predicting "answerable".
    return True


def arabicClassifier(question, context):
    """Predict whether an Arabic question is answerable from keyword cues.

    Args:
        question: Question text.
        context: Unused; accepted so every classifier can be called as
            ``classifier(question, context)``.

    Returns:
        bool prediction, True meaning answerable.
    """
    goodWords = ['متى','ما','هو','هي','كم','عدد','أول','في']
    badWords = ['هل', 'يمكن']
    return _keywordClassifier(question, goodWords, badWords)


def koreanClassifier(question, context):
    """Predict whether a Korean question is answerable from keyword cues.

    With an empty ``badWords`` list this currently returns True for every
    question.

    Args:
        question: Question text.
        context: Unused; accepted so every classifier can be called as
            ``classifier(question, context)``.

    Returns:
        bool prediction, True meaning answerable.
    """
    goodWords = ['가장', '무엇인가', '언제', '몇']
    badWords = [] # '시차는', '중력과'
    return _keywordClassifier(question, goodWords, badWords)


def teluguClassifier(question, context):
    """Predict whether a Telugu question is answerable from keyword cues.

    Args:
        question: Question text.
        context: Unused; accepted so every classifier can be called as
            ``classifier(question, context)``.

    Returns:
        bool prediction, True meaning answerable.
    """
    goodWords = []
    badWords = ['విస్తీర్ణం', 'జనాభా', 'ఆఫ్రికాలో']
    return _keywordClassifier(question, goodWords, badWords)

def _evaluateClassifier(df, lang, classifier, label):
    """Score a rule-based classifier on one language of ``df``.

    Prints the accuracy and the distribution of the true ``answerable``
    labels for the selected rows.

    Args:
        df: DataFrame with ``lang``, ``question``, ``context`` and
            ``answerable`` columns.
        lang: Value of the ``lang`` column to evaluate on.
        classifier: Callable ``classifier(question, context)`` returning a
            bool prediction.
        label: Language name used in the printed accuracy line.

    Returns:
        Tuple ``(langDf, accuracy)``: a copy of the selected rows with an
        added ``prediction`` column, and the fraction of rows where the
        prediction equals ``answerable``.
    """
    langDf = df[df['lang'] == lang].copy()
    # A list comprehension instead of DataFrame.apply(axis=1): it gives the
    # same per-row predictions and still yields a valid (empty) column when
    # no row matches the language.
    langDf['prediction'] = [
        classifier(question, context)
        for question, context in zip(langDf['question'], langDf['context'])
    ]
    accuracy = (langDf['answerable'] == langDf['prediction']).mean()
    print(f"{label} Classifier Accuracy (validation): {accuracy * 100:.2f}%")
    print(f"True distribution in validation set: {langDf['answerable'].value_counts(normalize=True).to_dict()}")
    return langDf, accuracy


### --- Arabic ---
arabicDf, accuracy = _evaluateClassifier(df_val_clean, 'ar', arabicClassifier, 'Arabic')

### --- Korean ---
koreanDf, accuracy = _evaluateClassifier(df_val_clean, 'ko', koreanClassifier, 'Korean')

### --- Telugu ---
teluguDf, accuracy = _evaluateClassifier(df_val_clean, 'te', teluguClassifier, 'Telugu')

Arabic Classifier Accuracy (validation): 91.81%
True distribution in validation set: {True: 0.8746987951807229, False: 0.12530120481927712}
Korean Classifier Accuracy (validation): 94.66%
True distribution in validation set: {True: 0.9466292134831461, False: 0.05337078651685393}
Telugu Classifier Accuracy (validation): 79.17%
True distribution in validation set: {True: 0.7578125, False: 0.2421875}
