NLP Project for KU Course

In [40]:
import pandas as pd
import re
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

splits = {'train': 'train.parquet', 'validation': 'validation.parquet'}
df_train = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["train"])
df_val = pd.read_parquet("hf://datasets/coastalcph/tydi_xor_rc/" + splits["validation"])


In [41]:
def cleanDf(df):
    
    pattern = re.compile(r"[?؟,;\/\\\[\]#():]")
    df['question'] = df['question'].apply(lambda x: pattern.sub("", x))
    return df

In [42]:
langForStat = ['ar','ko','te']

numQuestions = []
totalWordCount = []
distinctWordCount = []
distinctCharCount = []

dfClean = cleanDf(df_train)


for lang in langForStat:
    numQuestions_train = df_train[df_train['lang'] == lang].shape[0]
    numQuestions_val = df_val[df_val['lang'] == lang].shape[0]
    numQuestions.append((lang, numQuestions_train, numQuestions_val))
    print(f"Language: {lang}, Train Questions: {numQuestions_train}, Validation Questions: {numQuestions_val}")

    # Compute word and character statistics
    df_train_lang = df_train[df_train['lang'] == lang].copy()
    df_val_lang = df_val[df_val['lang'] == lang]
    df_train_lang['wordcount'] = df_train_lang['question'].apply(lambda x: len(x.split()) if isinstance(x, str) else 0)


    maxId = df_train_lang['wordcount'].idxmax()
    longest_question = df_train_lang.loc[maxId, "question"]
    max_words = df_train_lang.loc[maxId, "wordcount"]

    print(f"Language: {lang}")
    print(f"  Longest train question (index {maxId}): {longest_question}")
    print(f"  Word count: {max_words}")

    totalWordCount_train = df_train_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount_val = df_val_lang['question'].apply(lambda x: len(x.split())).sum()
    totalWordCount.append((lang, totalWordCount_train, totalWordCount_val))
    print(f"Language: {lang}, Train Total Words: {totalWordCount_train}, Validation Total Words: {totalWordCount_val}")


df_train
dfClean

Language: ar, Train Questions: 2558, Validation Questions: 415
Language: ar
  Longest train question (index 11656): كم دامت فترة الخليفة العباسي الخامس أبو جعفر هارون بن محمد المهدي بن أبي جعفر المنصور
  Word count: 16
Language: ar, Train Total Words: 16202, Validation Total Words: 2807
Language: ko, Train Questions: 2422, Validation Questions: 356
Language: ko
  Longest train question (index 7022): 갈릴레이 위성Galilean moons 또는 갈릴레오 위성은 1610년 갈릴레오 갈릴레이가 목성 주변에서 발견한 4개의 위성을 뜻하나요
  Word count: 15
Language: ko, Train Total Words: 11840, Validation Total Words: 1729
Language: te, Train Questions: 1355, Validation Questions: 384
Language: te
  Longest train question (index 14146): 2019 వరకు లోక్‌సభ ఎన్నికలలో పులివెందుల నుండి అత్యధిక మెజారిటీ తో గెలిచిన వ్యక్తి ఎవరు
  Word count: 12
Language: te, Train Total Words: 7668, Validation Total Words: 2368


Unnamed: 0,question,context,lang,answerable,answer_start,answer,answer_inlang
0,উইকিলিকস কত সালে সর্বপ্রথম ইন্টারনেটে প্রথম তথ...,WikiLeaks () is an international non-profit or...,bn,True,182,2006,
1,দ্বিতীয় বিশ্বযুদ্ধে কোন দেশ পরাজিত হয়,The war in Europe concluded with an invasion o...,bn,True,48,Germany,
2,মার্কিন যুক্তরাষ্ট্রের সংবিধান অনুযায়ী মার্কিন...,Same-sex marriage in the United States expande...,bn,False,-1,no,
3,আরব-ইসরায়েলি যুদ্ধে আরবের মোট কয়জন সৈন্যের মৃ...,The exact number of Arab casualties is unknown...,bn,True,39,unknown,
4,বিশ্বে প্রথম পুঁজিবাদী সমাজ কবে গড়ে ওঠে,"As Thomas Hall (2000) notes, ""The Sung Empire ...",bn,True,1219,17th century,
...,...,...,...,...,...,...,...
15338,소말리아는 2차 개헌을 언제 했나요,"In February 2012, Somali government officials ...",ko,True,923,23 June 2012,
15339,세상에서 가장 먼저 시작된 교통수단은 무엇인가,The first earth tracks were created by humans ...,ko,True,160,animals,
15340,2019년 이집트의 지도자는 누구인가,"Abdel Fattah Saeed Hussein Khalil El-Sisi ( """"...",ko,True,0,Abdel Fattah Saeed Hussein Khalil El-Sisi,
15341,독일에서 가장 인구밀도가 높은 도시는 무엇인가,Munich (; ; ) is the capital and most populous...,ko,True,205,Berlin,


In [None]:
from deep_translator import GoogleTranslator

def wordCount(df, lang):
    allWords = []
    df = df[df['lang'] == lang].copy()
    
    df['question'] = df['question'].astype(str)

    for q in df['question']:
        allWords.extend(q.split()) 
    
    wordDict = dict(Counter(allWords))
    wordDict = dict(sorted(wordDict.items(), key=lambda item: item[1], reverse=True))

    return wordDict
            

for lang in langForStat:
    wordDict = wordCount(dfClean, lang)
    distinctWordCount.append((lang, len(wordDict)))
    print(f"Language: {lang}, Distinct Words: {len(wordDict)}")

    allChars = []
    for word in wordDict.keys():
        allChars.extend(list(word))
    
    charDict = dict(Counter(allChars))
    charDict = dict(sorted(charDict.items(), key=lambda item: item[1], reverse=True))
    distinctCharCount.append((lang, len(charDict)))
    print(f"Language: {lang}, Distinct Characters: {len(charDict)}")
    calculatedTotalWords = sum(wordDict.values())
    print(f"Language: {lang}, Calculated Total Words from Distinct Words: {calculatedTotalWords}")

    top5Words = list(wordDict.items())[:5]
    for word, count in top5Words:
        print(f"Word: {word}, Count: {count}")
        translator = Translator()
        translation = translator.translate(word, src=lang, dest='en')
        print(f"  English translation: {translation.text}")

    

ModuleNotFoundError: No module named 'googletrans'