In [1]:
'''Testing how many words from the test set, once transliterated into Dev, can be found as they are
in the collected corpora'''

'Testing how many words from the test set, once transliterated into Dev, can be found as they are\nin the collected corpora'

In [2]:
from indictrans import Transliterator
import os
import json
from collections import Counter, defaultdict
import pandas as pd

In [3]:
# Reading corpora
# Each file in DATA_DIR is read fully and split on single spaces; the resulting
# token list is stored under the part of the file name before the first ".".
DATA_DIR = "../../../data/crawled_cleaned/"
acc_data = dict()

for file in os.listdir(DATA_DIR):
    corpus_path = os.path.join(DATA_DIR, file)
    # Skip sub-directories and other non-file entries, which open() cannot read.
    if not os.path.isfile(corpus_path):
        continue
    lang = file.split(".")[0]
    # Context manager closes each handle right after reading (the previous
    # open(...).read() left every handle open until garbage collection).
    # NOTE(review): assumes the corpora are UTF-8 encoded -- confirm.
    with open(corpus_path, "r", encoding="utf-8") as corpus_file:
        acc_data[lang] = corpus_file.read().split(" ")

In [21]:
# Load the test-set lexicons from the word-frequency JSON file.
# The loop further down iterates `lexicons[lang]` as (word, frequency) pairs.
with open("../../eval_data/stats/word_frequencies_againsthin.json", mode="r") as lexicon_file:
    lexicons = json.loads(lexicon_file.read())


In [22]:
# Transliterator configured with source 'eng' and target 'hin'; used below to
# transliterate the test-set words before matching them against the corpora.
trn = Transliterator(source='eng', target='hin')

# stats[lang] -> dict of summary numbers for that language (created on first access).
stats = defaultdict(dict)

# common_all[lang] -> set of transliterated test words also present in the corpus.
common_all = {}

In [23]:
# For every language that has both a test lexicon and a corpus, transliterate the
# test words with `trn` and measure how much of the corpus / test set is covered
# by the words the two have in common. Results go into `stats` and `common_all`.
for lang in lexicons:

    if lang not in acc_data:
        print("LANG DOES NOT HAVE CORPUS: {}".format(lang))
        continue

    print("LANG: ", lang)

    test_freq = {word: freq for (word, freq) in lexicons[lang]}

    # Accumulate frequencies per transliterated form. Distinct source words can
    # transliterate to the same string; a plain dict comprehension would keep only
    # the last frequency and undercount the test totals.
    transliterated_test_freq = Counter()
    for word, freq in test_freq.items():
        transliterated_test_freq[trn.transform(word)] += freq

    corpus_freq = Counter(acc_data[lang])

    # Word types present both in the transliterated test set and in the corpus.
    common = transliterated_test_freq.keys() & corpus_freq.keys()
    common_all[lang] = set(common)

    # Token totals are computed once and reused for the fractions and the stats.
    corpus_total = sum(corpus_freq.values())
    test_total = sum(transliterated_test_freq.values())

    # Fraction of tokens accounted for by the common words; guard against an
    # empty corpus or lexicon so one language cannot abort the whole run.
    train_covered = sum(corpus_freq[word] for word in common) / corpus_total if corpus_total else 0.0
    test_covered = sum(transliterated_test_freq[word] for word in common) / test_total if test_total else 0.0

    stats[lang]["Total in corpus"] = corpus_total
    stats[lang]["Unique in corpus"] = len(corpus_freq)
    stats[lang]["Total in test"] = test_total
    stats[lang]["Unique in test"] = len(transliterated_test_freq)
    stats[lang]["Common"] = len(common)
    stats[lang]["Frac covered in corpus"] = round(train_covered, 2)
    stats[lang]["Frac covered in test"] = round(test_covered, 2)
    
    
    
    

LANG:  brajbhasha
LANG:  angika
LANG:  maithili
LANG:  magahi
LANG:  hindi-urdu
LANG:  awadhi
LANG:  rajasthani
LANG:  hariyanvi
LANG:  bhil
LANG:  chattisgarhi
LANG:  nepali
LANG:  bajjika
LANG:  koraku
LANG:  malwi
LANG:  sindhi
LANG DOES NOT HAVE CORPUS: bagheli
LANG:  bhojpuri
LANG:  garwali
LANG:  marathi
LANG:  kumaoni
LANG:  bundeli


In [24]:
# Build the summary table: `stats` is keyed by language, so after transposing
# each language becomes a row and each statistic a column.
df = pd.DataFrame(stats).T

In [25]:
# Cast the count columns to integers; the two "Frac covered" columns are left as they are.
for count_column in ("Total in corpus", "Unique in corpus", "Total in test", "Unique in test", "Common"):
    df[count_column] = df[count_column].astype('int')

In [19]:
# Display the per-language coverage statistics table.
df

Unnamed: 0,Total in corpus,Unique in corpus,Total in test,Unique in test,Common,Frac covered in corpus,Frac covered in test
brajbhasha,156986,30194,613,166,97,0.13,0.65
angika,1253545,91757,691,180,111,0.1,0.6
maithili,218491,41434,627,162,89,0.09,0.54
magahi,79405,16942,667,174,82,0.11,0.65
hindi-urdu,7100394,197355,673,172,165,0.25,0.98
awadhi,490877,53103,603,154,116,0.05,0.82
rajasthani,187708,34360,691,174,131,0.12,0.83
hariyanvi,232526,27431,611,159,125,0.14,0.85
bhil,27246,5557,649,179,69,0.12,0.49
chattisgarhi,83073,14463,591,142,98,0.16,0.75


In [26]:
# Print the statistics table as LaTeX source.
print(df.to_latex())

\begin{tabular}{lrrrrrrr}
\toprule
{} &  Total in corpus &  Unique in corpus &  Total in test &  Unique in test &  Common &  Frac covered in corpus &  Frac covered in test \\
\midrule
brajbhasha   &           156986 &             30194 &            299 &             161 &      93 &                    0.12 &                  0.65 \\
angika       &          1253545 &             91757 &            310 &             165 &     102 &                    0.09 &                  0.60 \\
maithili     &           218491 &             41434 &            273 &             147 &      81 &                    0.09 &                  0.54 \\
magahi       &            79405 &             16942 &            326 &             172 &      81 &                    0.11 &                  0.64 \\
hindi-urdu   &          7100394 &            197355 &            336 &             171 &     165 &                    0.25 &                  0.98 \\
awadhi       &           490877 &             53103 &            2

In [None]:
# Display the statistics table again.
df