## Load Conllu data

In [94]:
import os
from tqdm import tqdm

In [47]:
dcs_pth = 'data/raw/dcs/'
dirs = os.listdir(dcs_pth)
conllu_pths = {}
for d in tqdm(dirs):
    d_pth = os.path.join(dcs_pth,d)
    files = os.listdir(d_pth)
    conllu_pths[d_pth] = [os.path.join(d_pth,f) for f in files if '.conllu' in f]

100%|███████████████████████████████████████| 257/257 [00:00<00:00, 8611.16it/s]


In [48]:
from conllu import parse

### Create DCS corpus

In [49]:
outdir_pth = 'data/processed/'

In [50]:
for d_pth, f_pths in tqdm(conllu_pths.items()):
    txt = []
    for f_pth in f_pths:
        sentences = parse(open(f_pth, 'r', encoding='utf-8').read())
        for sent in sentences:
            if len(sent) == 0: continue
            txt.append(' '.join([token['lemma'] for token in sent if token['lemma'] != '_']))
    with open(f"{d_pth.replace('raw/dcs','processed')}.txt", 'w') as fp:
        fp.write('\n'.join(txt))

100%|█████████████████████████████████████████| 257/257 [02:07<00:00,  2.02it/s]


## GRETIL Processing

In [19]:
import regex as re

gretil_pth = 'data/raw/gretil_txt/'
files = os.listdir(gretil_pth)

In [18]:
from transformers import pipeline

pipe = pipeline(model="mahesh27/t5lemmatizer")
pipe("namaste rudra manyava utota iṣave namaḥ\n bāhubhyām uta te namaḥ</s>")

[{'generated_text': 'namas tvad rudra manyu uta iṣu namas bāhu uta tvad namas'}]

In [44]:
batch_size = 8

for f_name in tqdm(files):
    f_pth = os.path.join(gretil_pth, f_name)
    out_pth = os.path.join('data/processed/', f_name)
    txt = open(f_pth, 'r').read()
    txt = re.sub('[,.";_=\-@#\[\]\\^%\(\)X*+\|/]|[0-9]', '', txt)
    txt = re.sub('ṁ','ṃ',txt)
    txt = re.sub('\n','</s>\n',txt+'\n')
    lines = txt.split('\n')
    out_lines = pipe(lines, batch_size= batch_size, max_new_tokens=256)
    out_lines_txt = '\n'.join([x['generated_text'] for x in out_lines])
    with open(f"data/raw/gretil_lemmas/{f_name}",'w') as fp:
        fp.write(out_lines_txt)

  txt = re.sub('[,.";_=\-@#\[\]\\^%\(\)X*+\|/]|[0-9]', '', txt)
  0%|                                       | 3/1214 [01:55<12:59:03, 38.60s/it]
  txt = re.sub('[,.";_=\-@#\[\]\\^%\(\)X*+\|/]|[0-9]', '', txt)


KeyboardInterrupt: 

### Testing lemmatizer

In [211]:
import pandas as pd
tqdm.pandas()

In [228]:
f_pth = "test/sup_test.txt"

df = pd.read_csv(f_pth, sep='\t', names=['label', 'verse'])
df.dropna(inplace=True)
df.reset_index(inplace=True, drop=True)
df

Unnamed: 0,label,verse
0,__label__n27,ayaṁ sa yo divas pari raghuyāmā pavitra ā | si...
1,__label__n6,pra mā yuyujre prayujo janānāṁ vahāmi sma pūṣa...
2,__label__n5,nū indra rāye varivas kṛdhī na ā te mano vavṛt...
3,__label__n0,tvaṁ no asyā uṣaso vyuṣṭau tvaṁ sūra udite bod...
4,__label__n27,yathā pūrvebhyaḥ śatasā amṛdhraḥ sahasrasāḥ pa...
...,...,...
1419,__label__n24,śatam ahaṁ tirindire sahasram parśāv ā dade | ...
1420,__label__n5,ayuyutsann anavadyasya senām ayātayanta kṣitay...
1421,__label__n8,aśvā ived aruṣāsaḥ sabandhavaḥ śūrā iva prayud...
1422,__label__n5,ye te santi daśagvinaḥ śatino ye sahasriṇaḥ | ...


In [229]:
from transformers import pipeline

pipe = pipeline(model="mahesh27/t5lemmatizer")

pipe(''.join(df['verse'][0].split('||')[0].split('|'))+'</s>', max_length=64)

[{'generated_text': 'idam tad yad div pari raghu yāman pavitra ā sindhu ūrmi vi kṣar'}]

In [230]:
df['verse'] = df.progress_apply(lambda x: ''.join(x['verse'].split('||')[0].split('|'))+'</s>',axis=1)

100%|███████████████████████████████████| 1424/1424 [00:00<00:00, 187602.13it/s]


In [231]:
verses = df['verse'].tolist()

In [232]:
diatrics_corr = {'r'+'̣':'ṛ', 's'+'̣':'ṣ', 'r'+'̣'+'̄': 'ṝ', 't'+'̣':'ṭ', 'd'+'̣':'ḍ', 
                 'n'+'̣':'ṇ', 'l'+'̱':'ḻ', 'a'+'̄':'ā', 'i'+'̄':'ī', 'u'+'̄':'ū', 's'+'́':'ś',
                 'n'+'̇': 'ṅ', 'n'+'̃' : 'ñ',
                }

def corr_diatrics(sent):
    new_sent = ''
    sent = list(sent)
    i = 0
    while i<len(sent):
        if i+1 < len(sent):
            c2 = sent[i] + sent[i+1]
            if c2 in diatrics_corr:
                new_sent += diatrics_corr[c2]
                i += 2
                continue
        new_sent += sent[i]
        i += 1
    return new_sent

In [233]:
from itertools import batched
import math

new_verses = []
batch_size = 8
for batch in tqdm(batched(verses, batch_size), total=math.ceil(len(verses)/batch_size)):
    lemmas = pipe(list(batch), batch_size=batch_size, max_new_tokens=64)
    new_verses += [corr_diatrics(l['generated_text']) for l in lemmas]

100%|█████████████████████████████████████████| 178/178 [00:45<00:00,  3.95it/s]


In [234]:
df['verse'] = pd.Series(new_verses)

stop_words = open("stop_words.txt",'r').read().split('\n')
def remove_stop(sent):
    sent = sent.split()
    res = []
    for w in sent:
        if w not in stop_words:
            res.append(w)
    return ' '.join(res)

df['verse'] = df.apply(lambda x: remove_stop(x['verse']), axis=1)
df

Unnamed: 0,label,verse
0,__label__n27,div pari raghu yāman pavitra sindhu ūrmi vi kṣar
1,__label__n6,pra yuj prayuj jana vah sma pūṣan antareṇa adh...
2,__label__n5,nu indra rai varivas manas vṛt magha gomat aśv...
3,__label__n0,uṣas vyuṣṭi tvac sūra udi gopā janman nitya ta...
4,__label__n27,sā amṛdhra sā paryaya vāja indu pū suvita navy...
...,...,...
1419,__label__n24,tirindira parśu dā rādhas yādvan
1420,__label__n5,ayuyutt anavadya senā yātay kṣiti navagva vṛṣā...
1421,__label__n8,aśva id aruṣa sabandhu śūra prayudh pra uta yu...
1422,__label__n5,daśagvin śatin sahasrin aśva vṛṣan raghudru tūya


In [235]:
df.to_csv(f_pth, sep='\t', index=False, header=False)

## Stopwords

In [113]:
text = open('data/processed.txt','r').read()

In [114]:
words = text.split()

In [115]:
len(words)

7418939

In [116]:
w_df = pd.DataFrame({'words': words})

In [117]:
freq_df = w_df.groupby('words').size().reset_index(name='freq')

In [118]:
freq_df = freq_df.sort_values(by=['freq'], ascending=False).reset_index()

In [120]:
freq_df['words'][:200].tolist()

['tad',
 'ca',
 'iti',
 'mad',
 'eva',
 'yad',
 'na',
 'tvad',
 'idam',
 'etad',
 'bhū',
 'kṛ',
 'sarva',
 'as',
 'vac',
 'tu',
 'api',
 'mahat',
 'vai',
 'deva',
 'hi',
 'tatas',
 'tathā',
 'vā',
 'atha',
 'rājan',
 'agni',
 'iva',
 'gam',
 'evam',
 'dṛś',
 'su',
 'vid',
 'loka',
 'indra',
 'tatra',
 'yathā',
 'ka',
 'ātman',
 'sa',
 'dharma',
 'eka',
 'putra',
 'artha',
 'dā',
 'punar',
 'ādi',
 'yat',
 'brahman',
 'anya',
 'brū',
 'han',
 'śru',
 'karman',
 'enad',
 'ha',
 'a',
 'para',
 'sva',
 'ratha',
 'yajña',
 'tri',
 'jan',
 'sthā',
 'tva',
 'rūpa',
 'ah',
 'pitṛ',
 'tadā',
 'bala',
 'kāla',
 'rasa',
 'kāma',
 'manas',
 'kaścit',
 'tasmāt',
 'saha',
 'iha',
 'brāhmaṇa',
 'ā',
 'yaj',
 'hu',
 'puruṣa',
 'atra',
 'prāṇa',
 'yuj',
 'paśu',
 'ap',
 'grah',
 'soma',
 'yadi',
 'vāc',
 'go',
 'śata',
 'ṛṣi',
 'u',
 'iṣ',
 'dhā',
 'prāp',
 'adas',
 'phala',
 'vara',
 'bahu',
 'pā',
 'pūrva',
 'yā',
 'mā',
 'nara',
 'sama',
 'pāṇḍava',
 'tapas',
 'guṇa',
 'viśva',
 'śara',
 'sahasra',


In [122]:
stop_words = '\n'.join(freq_df['words'][:200].tolist())
with open("stop_words.txt", 'w') as fp:
    fp.write(stop_words)

In [124]:
text = open('data/processed.txt','r').read().split('\n')
stop_words = open('stop_words.txt','r').read().split()

In [125]:
processed_stopw = []
for line in text:
    words = line.split()
    new_line = []
    for w in words:
        if w not in stop_words:
            new_line.append(w)
    processed_stopw.append(' '.join(new_line))
with open('processed_stop.txt','w') as fp:
    fp.write('\n'.join(processed_stopw))