NLP Project for KU Course

In [62]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt


## Load the dataset
splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])


In [63]:
## Remove unwanted characters from the questions

def cleanDf(df):
    
    pattern = re.compile(r"[?؟,;\/\\\[\]#():]")
    df['question'] = df['question'].apply(lambda x: pattern.sub("", x))
    return df

In [67]:
langForStat = ['ar','ko','te']

numQuestions = []
totalWordCount = []
distinctWordCount = []
distinctCharCount = []

df_train_clean = cleanDf(df_train)
df_train_val = cleanDf(df_val)

for lang in langForStat:
    numQuestions_train = df_train_clean[df_train_clean['lang'] == lang].shape[0]
    numQuestions_val = df_train_val[df_train_val['lang'] == lang].shape[0]
    numQuestions.append((lang, numQuestions_train, numQuestions_val))
    print(f"Language: {lang}, Train Questions: {numQuestions_train}, Validation Questions: {numQuestions_val}")

    # Compute word and character statistics
    df_train_lang = df_train_clean[df_train_clean['lang'] == lang].copy()
    df_val_lang = df_train_val[df_train_val['lang'] == lang]
    df_train_lang['wordcount'] = df_train_lang['question'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)


    maxId = df_train_lang['wordcount'].idxmax()
    longest_question = df_train_lang.loc[maxId, "question"]
    max_words = df_train_lang.loc[maxId, "wordcount"]

    # print(f"Language: {lang}")
    # print(f"  Longest train question (index {maxId}): {longest_question}")
    # print(f"  Word count: {max_words}")

    totalWordCount_train = df_train_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount_val = df_val_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount.append((lang, totalWordCount_train, totalWordCount_val))
    print(f"Language: {lang}, Train Total Words: {totalWordCount_train}, Validation Total Words: {totalWordCount_val}")



Language: ar, Train Questions: 2558, Validation Questions: 415
Language: ar, Train Total Words: 16202, Validation Total Words: 2621
Language: ko, Train Questions: 2422, Validation Questions: 356
Language: ko, Train Total Words: 11840, Validation Total Words: 1729
Language: te, Train Questions: 1355, Validation Questions: 384
Language: te, Train Total Words: 7668, Validation Total Words: 2299


In [150]:
from googletrans import Translator

def wordCount(df, lang):
    allWords = []
    df = df[df['lang'] == lang].copy()
    
    df = df[df['answerable'] == False]
    df['question'] = df['question'].astype(str)

    for q in df['question']:
        allWords.extend(q.split()) 
    
    wordDict = dict(Counter(allWords))
    wordDict = dict(sorted(wordDict.items(), key=lambda item: item[1], reverse=True))

    return wordDict
            

for lang in langForStat:
    wordDict = wordCount(df_train_clean, lang)
    distinctWordCount.append((lang, len(wordDict)))
    print(f"Language: {lang}, Distinct Words: {len(wordDict)}")

    allChars = []
    for word in wordDict.keys():
        allChars.extend(list(word))
    
    charDict = dict(Counter(allChars))
    charDict = dict(sorted(charDict.items(), key=lambda item: item[1], reverse=True))
    distinctCharCount.append((lang, len(charDict)))
    print(f"Language: {lang}, Distinct Characters: {len(charDict)}")
    calculatedTotalWords = sum(wordDict.values())
    print(f"Language: {lang}, Calculated Total Words from Distinct Words: {calculatedTotalWords}")

    top5Words = list(wordDict.items())[:10]
    print(f"Language: {lang}, Top 5 Words: {top5Words}")

Language: ar, Distinct Words: 1071
Language: ar, Distinct Characters: 52
Language: ar, Calculated Total Words from Distinct Words: 1808
Language: ar, Top 5 Words: [('هل', 247), ('في', 80), ('من', 52), ('يمكن', 31), ('على', 18), ('هو', 13), ('يوجد', 11), ('ان', 9), ('توجد', 9), ('يعتبر', 9)]
Language: ko, Distinct Words: 279
Language: ko, Distinct Characters: 309
Language: ko, Calculated Total Words from Distinct Words: 325
Language: ko, Top 5 Words: [('수', 11), ('있는가', 10), ('있을까', 8), ('승리했나요', 2), ('땅과', 2), ('물', 2), ('둘', 2), ('다에서', 2), ('살', 2), ('범주에', 2)]
Language: te, Distinct Words: 134
Language: te, Distinct Characters: 55
Language: te, Calculated Total Words from Distinct Words: 269
Language: te, Top 5 Words: [('ఏ', 16), ('ఏది', 10), ('అతిపెద్ద', 8), ('దేశం', 8), ('ఎంత', 7), ('ఎవరు', 7), ('దేశంలో', 5), ('నాటికి', 4), ('ఆఫ్రికాలో', 4), ('జనాభా', 4)]


In [163]:
def arabicClassifier(sentence):
    goodWords = ['متى','ما','هو','هي','كم','عدد','أول','في']
    badWords = ['من','هل']
    if any(word in sentence for word in goodWords):
        return True
    elif any(word in sentence for word in badWords):
        return False
    else:
        return True
    

## Example usage of the Arabic classifier
arabicDf = df_train_clean[df_train_clean['lang'] == 'ar'].copy()
arabicDf['prediction'] = arabicDf['question'].apply(arabicClassifier)

accuracy = (arabicDf['answerable'] == arabicDf['prediction']).mean()

print(f"Arabic Classifier Accuracy: {accuracy * 100:.2f}%")

Arabic Classifier Accuracy: 88.04%
