# Data Preparation for English to Hindi Translation

This Notebook works on cleaning and processing data for English to Hindi translation model

In [1]:
import numpy as np 
import pandas as pd 

import re
import string
from string import digits

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

import matplotlib.pyplot as plt
import os
from tqdm import tqdm

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# torch packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.optim as optim

# load and build datasets
import torchtext
from torchtext.data.functional import to_map_style_dataset
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets

# Data Preprocessing 

In [3]:
df = pd.read_csv("/kaggle/input/hindi-english-truncated-corpus/Hindi_English_Truncated_Corpus.csv")
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
df["english_sentence"] = df["english_sentence"].astype("str")
df["hindi_sentence"] = df["hindi_sentence"].astype("str")

In [5]:
df.loc[1, "english_sentence"]

"I'd like to tell you about one such child,"

In [6]:
df.dtypes

source              object
english_sentence    object
hindi_sentence      object
dtype: object

Choose data with max sentence less than 30 char

In [7]:
MAX_FILTER_LEN = 20

In [8]:
df["eng_len"] = df["english_sentence"].apply(lambda x: len(str(x).split(" ")))
df["hin_len"] = df["hindi_sentence"].apply(lambda x: len(str(x).split(" ")))

In [9]:
df

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...",12,14
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...,9,11
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।,10,9
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते,12,11
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।,9,8
...,...,...,...,...,...
127602,indic2012,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...,15,15
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।,6,7
127604,tides,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द...",36,34
127605,tides,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .,11,10


In [10]:
def check_english(text):
    list1 = text.split(" ")
    l="abcdefghijklmnopqrstuvwxyz0987654321ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    l2=[char for char in l]
    l2 = set(l2)
    for char in list1:
        for alp in char:
            if alp in l2:
                return True
    return False

In [11]:
def check_hindi(text):
    list1 = text.split(" ")
    l="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    l2=[char for char in l]
    l2 = set(l2)
    for char in list1:
        for alp in char:
            if alp in l2:
                return False
    return True

In [12]:
df["eng_check"] = df["english_sentence"].apply(check_english)
df["hi_check"] = df["hindi_sentence"].apply(check_hindi)
df = df[df["eng_check"] == True]
df = df[df["hi_check"] == True]

In [13]:
d1=df[df["eng_check"]==False]
# df[df["eng_check"]==False].shape

In [14]:
d1

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len,eng_check,hi_check


In [15]:
text="फिर मैंने उसे Roald Dahl की “James and the Giant Peach”"

In [16]:
# d1.loc[127571,"hindi_sentence"]

In [17]:
df = df[(df['eng_len'] <= MAX_FILTER_LEN) & (df['hin_len'] <= MAX_FILTER_LEN)]
print(f"Size of dataset to use: {df.shape}")

Size of dataset to use: (83733, 7)


In [18]:
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n' + '।'
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))

In [19]:
# df['english_sentence'] = df['english_sentence'].apply(lambda x: "[start] " + x + " [end]")
# df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: "[start] " + x + " [end]")
# df.head()

In [20]:
def normalize(line, is_english = True):
    if is_english:
        line = str(line.strip().lower())
    else:
        line = str(line.strip())

    line = re.sub(r"^([^ \w])(?!\s)", r"\1 ", line)
    line = re.sub(r"(\s[^ \w])(?!\s)", r"\1 ", line)
    line = re.sub(r"(?!\s)([^ \w])$", r" \1", line)
    line = re.sub(r"(?!\s)([^ \w]\s)", r" \1", line)
    return line


In [21]:
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=200):
    df = df.sample(frac=1.,random_state=seed)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[:train_end, :]
    validate = df.iloc[train_end:validate_end, :]
    test = df.iloc[validate_end:, :]
    return train, validate, test

In [22]:
train_df, val_df, test_df = train_validate_test_split(
                                        df, 
                                        train_percent=.8, 
                                        validate_percent=.15, 
                                        seed=7)
print(f"train dataset shape : {train_df.shape}")
print(f"val dataset shape : {val_df.shape}")
print(f"test dataset shape : {test_df.shape}")

train dataset shape : (66986, 7)
val dataset shape : (12559, 7)
test dataset shape : (4188, 7)


In [23]:
train_df

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len,eng_check,hi_check
11873,indic2012,this is universal that mahabharath crosses by ...,यह सर्वमान्य है कि महाभारत का आधुनिक रुप कई अव...,9,15,True,True
94260,indic2012,In the East and SouthEast of the Sindhu there ...,सिन्धु नदी के पश्चिम और दक्षिणपश्चिम में बलोचि...,13,12,True,True
63854,ted,Thomas Edison would have been very very comfor...,थोमस एडीसन के लिए बहुत सुविधापूर्ण होता,8,7,True,True
31412,ted,“I've lost it,“मैंने दिमाग खो दिया है,3,5,True,True
111582,ted,in a country 81 percent Hindu,एक ऐसे देश में जहा ८१ प्रतिशत हिन्दू है,6,9,True,True
...,...,...,...,...,...,...,...
67130,indic2012,City administration,नागर प्रशासन,2,2,True,True
27087,tides,They will give you a form on which you should ...,आपको उनसे जो फ़ार्म मिलेगा उसमें आफको अपने नु...,17,16,True,True
66145,tides,We do not know how long he stayed in Mangaliveda,हमें नहीं पता कि वह कब तक मंगलीवेडा में रहे,11,11,True,True
33604,tides,This might be the first step towards his extra...,यह शायद उसके भारत को प्रत्यर्पित करने में पहल ...,12,12,True,True


In [24]:
def create_iterable(sample_df, lang1, lang2):
    """
    This converts pandas dataframe into list of tuples
    consisting of (german sentences, english sentences).
    
    This iterable in used in our data preparation
    """
    sample_iter = sample_df.to_dict(orient='records')
    out_iter = [(dict1[lang1], dict1[lang2]) for dict1 in sample_iter]
    print(f"length of iterable: {len(out_iter)}")
    return out_iter

In [25]:
train_df["english_sentence"]

11873     this is universal that mahabharath crosses by ...
94260     In the East and SouthEast of the Sindhu there ...
63854     Thomas Edison would have been very very comfor...
31412                                         “I've lost it
111582                        in a country 81 percent Hindu
                                ...                        
67130                                   City administration
27087     They will give you a form on which you should ...
66145     We do not know how long he stayed in Mangaliveda 
33604     This might be the first step towards his extra...
79331     Official place made for election advertisement...
Name: english_sentence, Length: 66986, dtype: object

In [26]:
from enum import Enum

# class syntax
class Translate_dir(Enum):
    E2H = 1
    H2E = 2


In [27]:
direction = "E2H"
lang1 = "english_sentence"
lang2 = "hindi_sentence"

In [28]:
lang1, lang2

('english_sentence', 'hindi_sentence')

In [29]:
train_iter = create_iterable(train_df, lang1, lang2)
val_iter = create_iterable(val_df, lang1, lang2)
test_iter = create_iterable(test_df, lang1, lang2)

length of iterable: 66986
length of iterable: 12559
length of iterable: 4188


In [30]:
train_iter[0][0]

'this is universal that mahabharath crosses by different crises'

In [31]:
val_iter[0][0]

"And that's more intelligent in a way"

In [32]:
test_iter[0][0]

'She remained in Vrindavan for many days and then went to Dwarika'

In [33]:
for from_tuple in train_iter:
    print(from_tuple[0])
    break

this is universal that mahabharath crosses by different crises


# Build Vocabulary

In [34]:
import spacy
eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text

In [35]:
def engTokenize(text):
    """
    Tokenize an English text and return a list of tokens
    """
    return [token.text for token in eng.tokenizer(text)]


In [36]:
def hiTokenize(text):
    """
    Tokenize a Hindi text and return a list of tokens
    """
    # Remove the fullstop from hindi sentences which is '।'
    char = '।'
    return [token for token in text.split() if token != char]


In [37]:
text = "to invoice hospitals"
print(engTokenize(text))

['to', 'invoice', 'hospitals']


In [38]:
text = "हॉस्पिटलों को बिल कैसे भेजें ।"
print(hiTokenize(text))

['हॉस्पिटलों', 'को', 'बिल', 'कैसे', 'भेजें']


In [39]:
def yield_tokens(data_iter, tokenizer, index: int):
    """
    Return the tokens for the appropriate language.

    Args:
        data_iter:    text here 
        tokenizer:    tokenizer for the language
        index:        index of the language in the tuple 

    Yields:
        sequences based on index       
    """
    c=0 
    for from_tuple in data_iter:
        if c<5:
            print(from_tuple[index])
            c +=1
        yield tokenizer(from_tuple[index])

In [40]:
def build_vocabulary(
                    eng, 
                    train_iter, 
                    val_iter, 
                    test_iter, 
                    direction,
                    min_freq: int = 2):
    """
    Input Args:
    eng : English Tokenizer
    train_iter : Train dataset iterable 
    val_iter : Val dataset iterable
    test_iter : Test dataset iterable
    direction : for english to hindi translation = E2H and H2E for vice versa
    """
  
    def engTokenize(text):
        """
        Tokenize an English text and return a list of tokens
        """
        return [token.text for token in eng.tokenizer(text)]


    def hiTokenize(text):
        """
        Tokenize a Hindi text and return a list of tokens
        """
        # Remove the fullstop from hindi sentences which is '।'
        char = '।'
        return [token for token in text.split() if token != char]


    train = train_iter
    val = val_iter
    test = test_iter
    
    if direction == Translate_dir.E2H.name:
        english_index = 0
        hindi_index = 1
    else:
        hindi_index = 0
        english_index = 1
        
    print(f"english index : {english_index}")
    print(f"hindi index : {hindi_index}")

    # generate source vocabulary
    vocab_src = build_vocab_from_iterator(
        yield_tokens(train + val + test, 
                     engTokenize, 
                     index=english_index), # tokens for each English sentence 
        min_freq=min_freq, 
        specials=["<bos>", "<eos>", "<pad>", "<unk>"],
    )

    print("Building English Vocabulary...")

    # generate target vocabulary
    vocab_trg = build_vocab_from_iterator(
        yield_tokens(train + val + test, 
                     engTokenize, 
                     index=hindi_index), # tokens for each Hindi sentence 
        min_freq=2, # 
        specials=["<bos>", "<eos>", "<pad>", "<unk>"],
    )
    print("Building Hindi Vocabulary...")

    # set default token for out-of-vocabulary words (OOV)
    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_trg.set_default_index(vocab_trg["<unk>"])

    return vocab_src, vocab_trg

In [41]:
def load_vocab(
        eng, 
        train_iter, 
        val_iter, 
        test_iter, 
        direction,
        min_freq: int = 2):
    """
    Args:
        eng:     English tokenizer
        min_freq:     minimum frequency needed to include a word in the vocabulary
        direction : for english to hindi translation = E2H and H2E for vice versa

    Returns:
        vocab_src:    Source vocabulary
        vocab_trg:     Target vocabulary       
    """
    if not os.path.exists("vocab.pt"):
        # build the Hindi/English vocabulary if it does not exist
        vocab_src, vocab_trg = build_vocabulary(eng, 
                                                train_iter, 
                                                val_iter, 
                                                test_iter, 
                                                direction,
                                                min_freq)
        # save it to a file
        torch.save((vocab_src, vocab_trg), "vocab.pt")
    else:
        # load the vocab if it exists
        vocab_src, vocab_trg = torch.load("vocab.pt")

    print("Finished.\nVocabulary sizes:")
    print("\tSource:", len(vocab_src))
    print("\tTarget:", len(vocab_trg))
    return vocab_src, vocab_trg

In [42]:
vocab_src, vocab_trg = load_vocab(
                                eng, 
                                train_iter, 
                                val_iter, 
                                test_iter, 
                                direction,)

english index : 0
hindi index : 1
this is universal that mahabharath crosses by different crises
In the East and SouthEast of the Sindhu there lies desert of Baluchistan
Thomas Edison would have been very very comfortable
“I've lost it
in a country 81 percent Hindu
Building English Vocabulary...
यह सर्वमान्य है कि महाभारत का आधुनिक रुप कई अवस्थाओं से गुजर कर बना है
सिन्धु नदी के पश्चिम और दक्षिणपश्चिम में बलोचिस्तान का इलाका मरुस्थल है
थोमस एडीसन के लिए बहुत सुविधापूर्ण होता
“मैंने दिमाग खो दिया है
एक ऐसे देश में जहा ८१ प्रतिशत हिन्दू है
Building Hindi Vocabulary...
Finished.
Vocabulary sizes:
	Source: 22529
	Target: 22658


In [43]:
dir(vocab_src)

['T_destination',
 '__annotations__',
 '__call__',
 '__class__',
 '__contains__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattr__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__jit_unused_properties__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__prepare_scriptable__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_apply',
 '_backward_hooks',
 '_backward_pre_hooks',
 '_buffers',
 '_call_impl',
 '_compiled_call_impl',
 '_forward_hooks',
 '_forward_hooks_always_called',
 '_forward_hooks_with_kwargs',
 '_forward_pre_hooks',
 '_forward_pre_hooks_with_kwargs',
 '_get_backward_hooks',
 '_get_backward_pre_hooks',
 '_get_name',
 '_is_full_backward_hook',
 '_load_from_state_dict',
 '_load_state_dict_post_hooks',
 '_load_state_dict_pre_hooks

In [44]:
v1 = vocab_trg.vocab.get_stoi()

In [45]:
vocab_trg.vocab.get_stoi()["01481"]

18270

In [46]:
vocab_src.vocab.get_stoi()["01481"]

15999

In [47]:
l = [(k,v) for k,v in v1.items()]
l.sort(key = lambda x: x[0])
l

[(' ', 14),
 ('  ', 593),
 ('   ', 2684),
 ("'", 52),
 ('0', 9843),
 ('000', 15759),
 ('01', 15760),
 ('0114', 2685),
 ('0117', 18266),
 ('0121', 18267),
 ('01273', 18268),
 ('0131', 18269),
 ('01481', 18270),
 ('01524', 18271),
 ('01624', 18272),
 ('01634', 18273),
 ('01787', 18274),
 ('020', 3283),
 ('028', 18275),
 ('031', 18276),
 ('05', 18277),
 ('0800', 15761),
 ('0808', 18278),
 ('0845', 9844),
 ('0870', 8209),
 ('1', 473),
 ('10', 794),
 ('100', 2145),
 ('1000', 3642),
 ('10000', 4282),
 ('100000', 8210),
 ('101', 18279),
 ('103', 11261),
 ('1046', 11262),
 ('1076', 18280),
 ('1091939', 11263),
 ('11', 1508),
 ('110', 8936),
 ('1100', 15762),
 ('112', 18281),
 ('113', 15763),
 ('12', 954),
 ('120', 8937),
 ('1200', 18282),
 ('12000', 15764),
 ('120000', 18283),
 ('121', 18284),
 ('122', 18285),
 ('1226', 18286),
 ('123', 15765),
 ('124', 18287),
 ('124ए', 18288),
 ('125', 8211),
 ('12891', 11264),
 ('13', 1600),
 ('130', 11265),
 ('13000', 18289),
 ('132', 11266),
 ('132139', 1

In [48]:
vocab_src.vocab.get_stoi()

{'śaunakīya': 22527,
 'zeitgeist': 22523,
 'zamindari': 22521,
 'yogas': 22516,
 'yielded': 22514,
 'yhe': 22513,
 'yearbook': 22510,
 'yajurveda': 22508,
 'yajur': 22507,
 'yajuh': 22506,
 'y': 22504,
 'x': 22502,
 'writs': 22500,
 'wreck': 22495,
 'worshipping': 22494,
 'worshippers': 22493,
 'worldquality': 22489,
 'worldfamous': 22488,
 'wordgroups': 22486,
 'woody': 22485,
 'womens': 22483,
 'wolves': 22481,
 'withholds': 22480,
 'withhold': 22479,
 'wisely': 22476,
 'wiping': 22475,
 'wining': 22474,
 'windshield': 22473,
 'windmills': 22472,
 'winch': 22471,
 'wilds': 22468,
 'wilderness': 22466,
 'wikidictionary': 22465,
 'widgets': 22463,
 'widest': 22462,
 'whosoever': 22460,
 'wholesome': 22459,
 'whitewashed': 22458,
 'whish': 22457,
 'whipped': 22456,
 'whine': 22455,
 'wheelleg': 22453,
 'whale': 22449,
 'welltodo': 22446,
 'wellknit': 22444,
 'welldefined': 22443,
 'welder': 22441,
 'weed': 22436,
 'wedge': 22435,
 'webs': 22434,
 'webpages': 22433,
 'webcam': 22432,
 'w