In [1]:
import re
import gc
import os 
from multiprocessing import Pool
from glob import glob
from googletrans import Translator
import itertools
from fastai.text import *
from gensim.models.phrases import Phrases, Phraser

In [2]:
# Corpus location: every *.txt file inside ./data, relative to the working directory.
# The trailing '' component keeps a path separator at the end of `path`.
path = os.path.join(os.getcwd(), 'data', '')
# glob returns matches in an arbitrary, filesystem-dependent order; sort them so
# the document order (and everything derived from it) is reproducible across runs.
files = sorted(glob(os.path.join(path, '*.txt')))

In [3]:
def rm_punc(text):
    """Return `text` with every '.', ',' and '!' character deleted.

    No other characters are touched; in particular whitespace is preserved.
    """
    return re.sub(r'[.,!]', '', text)

def tokenize_corpus(docs):
    """Split each document on whitespace.

    Returns a list with one token list per document, in the same order as `docs`.
    """
    tokenized = []
    for document in docs:
        tokenized.append(document.split())
    return tokenized

def get_vocab(tok_texts):
    """Build the vocabulary and both lookup tables for a tokenized corpus.

    Args:
        tok_texts: iterable of token lists, one per document.

    Returns:
        (vocabulary, vocabulary_size, word2idx, idx2word) where `vocabulary`
        is the sorted list of distinct tokens with '/0' placed in front at
        index 0 (presumably a padding/placeholder entry), and the two dicts
        map token -> index and index -> token.
    """
    distinct_tokens = set()
    for doc_tokens in tok_texts:
        distinct_tokens.update(doc_tokens)

    vocabulary = ['/0'] + sorted(distinct_tokens)

    word2idx, idx2word = {}, {}
    for position, word in enumerate(vocabulary):
        word2idx[word] = position
        idx2word[position] = word

    return vocabulary, len(vocabulary), word2idx, idx2word

In [4]:
translator = Translator()

# Read every corpus file as UTF-8. The `with` block closes each handle as soon
# as it has been read instead of leaving that to the garbage collector.
hin_texts = []
for file in files:
    with open(file, 'r', encoding="utf-8") as handle:
        hin_texts.append(handle.read())

# Translate each document to English (one request per document; the source
# language is left for the translator to detect).
eng_texts = [translator.translate(text, dest='en').text for text in hin_texts]

In [None]:
# Remove punctuation from every document, spreading the work over 3 worker processes.
# The `with` block shuts the workers down once both maps have returned, so no
# idle processes are left behind in the kernel. `pool` is unusable afterwards;
# any later cell that needs parallelism should create its own Pool.
with Pool(3) as pool:
    hin_texts = pool.map(rm_punc, hin_texts)
    eng_texts = pool.map(rm_punc, eng_texts)

In [5]:
# Whitespace-tokenize both corpora: one token list per document (Hindi first, then English).
hin_tok, eng_tok = map(tokenize_corpus, (hin_texts, eng_texts))

In [6]:
# Vocabulary, vocabulary size, word->index and index->word tables,
# first for the Hindi corpus (h*), then for the English corpus (e*).
(hvocab, h_vs, hw2i, hi2w), (evocab, e_vs, ew2i, ei2w) = map(get_vocab, (hin_tok, eng_tok))

In [12]:
def gen_skipg(wds, tokens, word2idx):
    """Generate skip-gram (center, context) index pairs for one document.

    For every position i and every distance j in 1..wds, the token at i is
    paired with the token at i+j (if it exists) and then with the token at
    i-j (if it exists). The flat pair lists are then padded with one extra
    pair at each end: the first and the last center index, each paired with
    context index 0 (the '/0' entry when `word2idx` comes from `get_vocab`).

    Args:
        wds: window size, i.e. the maximum distance between center and context.
        tokens: sequence of tokens of a single document.
        word2idx: mapping from token to integer index.

    Returns:
        (xs, ys): two equally long lists of center and context indices.
        Both are empty when the document yields no pairs (fewer than two
        tokens, or wds < 1); previously this case raised IndexError.

    Raises:
        KeyError: if a token is missing from `word2idx`.
    """
    # Look every token up once instead of on each window visit.
    ids = [word2idx[token] for token in tokens]
    n_tokens = len(ids)

    xs, ys = [], []
    for i, center in enumerate(ids):
        for j in range(1, wds + 1):
            # Keep the original ordering: forward neighbour first, then backward.
            if i + j < n_tokens:
                xs.append(center)
                ys.append(ids[i + j])
            if i - j > -1:
                xs.append(center)
                ys.append(ids[i - j])

    # Nothing to pad when no pair was produced (empty or one-token document).
    if not xs:
        return xs, ys

    # Boundary padding: duplicate the first/last center index and pair it with 0.
    xs = [xs[0]] + xs + [xs[-1]]
    ys = [0] + ys + [0]
    return xs, ys

def gen_data(wds, tokenized_corpus, word2idx):
    """Concatenate the skip-gram pairs of every document in the corpus.

    Args:
        wds: window size forwarded to `gen_skipg`.
        tokenized_corpus: list of token lists, one per document.
        word2idx: mapping from token to integer index.

    Returns:
        (xs, ys): flat lists of center and context indices, with documents
        laid out one after another in corpus order.
    """
    all_centers, all_contexts = [], []
    for doc_tokens in tokenized_corpus:
        doc_centers, doc_contexts = gen_skipg(wds, doc_tokens, word2idx)
        all_centers.extend(doc_centers)
        all_contexts.extend(doc_contexts)
    return all_centers, all_contexts

In [13]:
# Hindi skip-gram pairs, window of 4 tokens on each side of the center word.
hxs, hys = map(np.array, gen_data(4, hin_tok, hw2i))
hxs = hxs.reshape(-1, 1)  # centers as a column: one index per row

# English skip-gram pairs, same window size.
exs, eys = map(np.array, gen_data(4, eng_tok, ew2i))
exs = exs.reshape(-1, 1)  # centers as a column: one index per row