# Data Preparation for English to Hindi Translation

This Notebook works on cleaning and processing data for English to Hindi translation model

In [1]:
import numpy as np 
import pandas as pd 

import re
import string
from string import digits

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

import matplotlib.pyplot as plt
import os
from tqdm import tqdm

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

In [2]:
# torch packages
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch import Tensor
import torch.optim as optim

# load and build datasets
import torchtext
from torchtext.data.functional import to_map_style_dataset
from torch.nn.functional import pad
from torch.utils.data import DataLoader
from torchtext.vocab import build_vocab_from_iterator
import torchtext.datasets as datasets

# Data Preprocessing 

In [3]:
df = pd.read_csv("/kaggle/input/hindi-english-truncated-corpus/Hindi_English_Truncated_Corpus.csv")
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [4]:
df["english_sentence"] = df["english_sentence"].astype("str")
df["hindi_sentence"] = df["hindi_sentence"].astype("str")

In [5]:
df.loc[1, "english_sentence"]

"I'd like to tell you about one such child,"

In [6]:
df.dtypes

source              object
english_sentence    object
hindi_sentence      object
dtype: object

Choose data with max sentence less than 30 char

In [7]:
MAX_FILTER_LEN = 20

In [8]:
df["eng_len"] = df["english_sentence"].apply(lambda x: len(str(x).split(" ")))
df["hin_len"] = df["hindi_sentence"].apply(lambda x: len(str(x).split(" ")))

In [9]:
df

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...",12,14
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...,9,11
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।,10,9
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते,12,11
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।,9,8
...,...,...,...,...,...
127602,indic2012,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...,15,15
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।,6,7
127604,tides,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द...",36,34
127605,tides,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .,11,10


In [10]:
max(df["hin_len"]), max(df["eng_len"])

(418, 398)

In [11]:
# Since both the columns could contain hindi or english sentences

def check_english(text):
    list1 = text.split(" ")
    l="abcdefghijklmnopqrstuvwxyz0987654321ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    l2=[char for char in l]
    l2 = set(l2)
    # Make sure the sentence is in English by checking if any letter from the sentence contains english language
    for char in list1:
        for alp in char:
            if alp in l2:
                return True
    return False

In [12]:
def check_hindi(text):
    list1 = text.split(" ")
#     l="abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ"
    l="abcdefghijklmnopqrstuvwxyz0987654321ABCDEFGHIJKLMNOPQRSTUVWXYZ"
    l2=[char for char in l]
    l2 = set(l2)
    # Make sure the sentence is in Hindi by checking if any letter from the 
    # sentence contains english language and marking the sentence as False
    for char in list1:
        for alp in char:
            if alp in l2:
                return False
    return True

In [13]:
df["eng_check"] = df["english_sentence"].apply(check_english)
df["hi_check"] = df["hindi_sentence"].apply(check_hindi)
df = df[df["eng_check"] == True]
df = df[df["hi_check"] == True]

In [14]:
d1=df[df["eng_check"]==False]
# df[df["eng_check"]==False].shape

In [15]:
d1

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len,eng_check,hi_check


In [16]:
text="फिर मैंने उसे Roald Dahl की “James and the Giant Peach”"

In [17]:
df = df[(df['eng_len'] <= MAX_FILTER_LEN) & (df['hin_len'] <= MAX_FILTER_LEN)]
print(f"Size of dataset to use: {df.shape}")

Size of dataset to use: (78801, 7)


In [18]:
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n' + '।'
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))

In [19]:
# df['english_sentence'] = df['english_sentence'].apply(lambda x: "[start] " + x + " [end]")
# df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: "[start] " + x + " [end]")
# df.head()

In [20]:
def normalize(line, is_english = True):
    if is_english:
        line = str(line.strip().lower())
    else:
        line = str(line.strip())

    line = re.sub(r"^([^ \w])(?!\s)", r"\1 ", line)
    line = re.sub(r"(\s[^ \w])(?!\s)", r"\1 ", line)
    line = re.sub(r"(?!\s)([^ \w])$", r" \1", line)
    line = re.sub(r"(?!\s)([^ \w]\s)", r" \1", line)
    return line


In [21]:
def train_validate_test_split(df, train_percent=.8, validate_percent=.1, seed=200):
    df = df.sample(frac=1.,random_state=seed)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[:train_end, :]
    validate = df.iloc[train_end:validate_end, :]
    test = df.iloc[validate_end:, :]
    return train, validate, test

In [22]:
train_df, val_df, test_df = train_validate_test_split(
                                        df, 
                                        train_percent=.8, 
                                        validate_percent=.15, 
                                        seed=7)
print(f"train dataset shape : {train_df.shape}")
print(f"val dataset shape : {val_df.shape}")
print(f"test dataset shape : {test_df.shape}")

train dataset shape : (63040, 7)
val dataset shape : (11820, 7)
test dataset shape : (3941, 7)


In [23]:
train_df

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len,eng_check,hi_check
33984,tides,Paying council tax,आप कौंसिल कर भर रहे है,4,7,True,True
83709,tides,They make the fires in sixteen different spots...,वे सोलह भिन्न स्थानों पर और चार भिन्न समूहों म...,14,14,True,True
99260,tides,Two days later he spoke for the last time to ...,दो दिन बाद इंडियन लीजन से उन्होंने आखिरी बार म...,15,12,True,True
18790,ted,And now with one dollar,और अब आप एक डॉलर में,5,6,True,True
20337,ted,And I'm going to play it all the way through,और अब मैं इसे बिना रुके लगातार बजाऊँगा,10,8,True,True
...,...,...,...,...,...,...,...
118265,indic2012,She was the leader of many national movements ...,उन्होंने अनेक राष्ट्रीय आंदोलनों का नेतृत्व कि...,13,11,True,True
69778,tides,These shapes indicate regional developments an...,ये आकार क्षेत्रीय विकास और कालानुक्रमिक विकास ...,10,13,True,True
90209,indic2012,God must love so deeply of the same to happen ...,इतना गहरा प्रेम करो कि वही तुम्हारे लिए परमात्...,12,11,True,True
84675,tides,Is this true,क्या यह सच है,4,5,True,True


In [24]:
def create_iterable(sample_df, lang1, lang2):
    """
    This converts pandas dataframe into list of tuples
    consisting of (german sentences, english sentences).
    
    This iterable in used in our data preparation
    """
    sample_iter = sample_df.to_dict(orient='records')
    out_iter = [(dict1[lang1], dict1[lang2]) for dict1 in sample_iter]
    print(f"length of iterable: {len(out_iter)}")
    return out_iter

In [25]:
train_df["english_sentence"]

33984                                   Paying council tax 
83709     They make the fires in sixteen different spots...
99260     Two days later  he spoke for the last time to ...
18790                               And now with one dollar
20337          And I'm going to play it all the way through
                                ...                        
118265    She was the leader of many national movements ...
69778     These shapes indicate regional developments an...
90209     God must love so deeply of the same to happen ...
84675                                         Is this true 
43879     This spirit appears to be lacking in our crick...
Name: english_sentence, Length: 63040, dtype: object

In [26]:
from enum import Enum

# class syntax
class Translate_dir(Enum):
    E2H = 1
    H2E = 2


In [27]:
direction = "E2H"
lang1 = "english_sentence"
lang2 = "hindi_sentence"

In [28]:
lang1, lang2

('english_sentence', 'hindi_sentence')

In [29]:
train_iter = create_iterable(train_df, lang1, lang2)
val_iter = create_iterable(val_df, lang1, lang2)
test_iter = create_iterable(test_df, lang1, lang2)

length of iterable: 63040
length of iterable: 11820
length of iterable: 3941


In [30]:
train_iter[0][0]

'Paying council tax '

In [31]:
val_iter[0][0]

'This for example is my tumble dryer'

In [32]:
test_iter[0][0]

'Journalists encounter scuffles among artistes  with each wanting to be interviewed '

In [33]:
for from_tuple in train_iter:
    print(from_tuple[0])
    break

Paying council tax 


# Build Vocabulary

In [34]:
import spacy
eng = spacy.load("en_core_web_sm") # Load the English model to tokenize English text

In [35]:
def engTokenize(text):
    """
    Tokenize an English text and return a list of tokens
    """
    return [token.text for token in eng.tokenizer(text)]


In [36]:
def hiTokenize(text):
    """
    Tokenize a Hindi text and return a list of tokens
    """
    # Remove the fullstop from hindi sentences which is '।'
    char = '।'
    return [token for token in text.split() if token != char]


In [37]:
text = "to invoice hospitals"
print(engTokenize(text))

['to', 'invoice', 'hospitals']


In [38]:
text = "हॉस्पिटलों को बिल कैसे भेजें ।"
print(hiTokenize(text))

['हॉस्पिटलों', 'को', 'बिल', 'कैसे', 'भेजें']


In [39]:
def yield_tokens(data_iter, tokenizer, index: int):
    """
    Return the tokens for the appropriate language.

    Args:
        data_iter:    text here 
        tokenizer:    tokenizer for the language
        index:        index of the language in the tuple 

    Yields:
        sequences based on index       
    """
    c=0 
    for from_tuple in data_iter:
#         if c<5:
#             print(from_tuple[index])
#             c +=1
        yield tokenizer(from_tuple[index])

In [40]:
def build_vocabulary(
                    eng, 
                    train_iter, 
                    val_iter, 
                    test_iter, 
                    direction,
                    min_freq: int = 2):
    """
    Input Args:
    eng : English Tokenizer
    train_iter : Train dataset iterable 
    val_iter : Val dataset iterable
    test_iter : Test dataset iterable
    direction : for english to hindi translation = E2H and H2E for vice versa
    """
  
    def engTokenize(text):
        """
        Tokenize an English text and return a list of tokens
        """
        return [token.text for token in eng.tokenizer(text)]


    def hiTokenize(text):
        """
        Tokenize a Hindi text and return a list of tokens
        """
        # Remove the fullstop from hindi sentences which is '।'
        char = '।'
        return [token for token in text.split() if token != char]


    train = train_iter
    val = val_iter
    test = test_iter
    
    if direction == Translate_dir.E2H.name:
        english_index = 0
        hindi_index = 1
    else:
        hindi_index = 0
        english_index = 1
        
    print(f"english index : {english_index}")
    print(f"hindi index : {hindi_index}")

    # generate source vocabulary
    vocab_src = build_vocab_from_iterator(
        yield_tokens(train + val + test, 
                     engTokenize, 
                     index=english_index), # tokens for each English sentence 
        min_freq=min_freq, 
        specials=["<bos>", "<eos>", "<pad>", "<unk>"],
    )

    print("Building English Vocabulary...")

    # generate target vocabulary
    vocab_trg = build_vocab_from_iterator(
        yield_tokens(train + val + test, 
                     engTokenize, 
                     index=hindi_index), # tokens for each Hindi sentence 
        min_freq=2, # 
        specials=["<bos>", "<eos>", "<pad>", "<unk>"],
    )
    print("Building Hindi Vocabulary...")

    # set default token for out-of-vocabulary words (OOV)
    vocab_src.set_default_index(vocab_src["<unk>"])
    vocab_trg.set_default_index(vocab_trg["<unk>"])

    return vocab_src, vocab_trg

In [41]:
def load_vocab(
        eng, 
        train_iter, 
        val_iter, 
        test_iter, 
        direction,
        min_freq: int = 2):
    """
    Args:
        eng:     English tokenizer
        min_freq:     minimum frequency needed to include a word in the vocabulary
        direction : for english to hindi translation = E2H and H2E for vice versa

    Returns:
        vocab_src:    Source vocabulary
        vocab_trg:     Target vocabulary       
    """
    if not os.path.exists("vocab.pt"):
        # build the Hindi/English vocabulary if it does not exist
        vocab_src, vocab_trg = build_vocabulary(eng, 
                                                train_iter, 
                                                val_iter, 
                                                test_iter, 
                                                direction,
                                                min_freq)
        # save it to a file
        torch.save((vocab_src, vocab_trg), "vocab.pt")
    else:
        # load the vocab if it exists
        vocab_src, vocab_trg = torch.load("vocab.pt")

    print("Finished.\nVocabulary sizes:")
    print("\tSource:", len(vocab_src))
    print("\tTarget:", len(vocab_trg))
    return vocab_src, vocab_trg

In [42]:
vocab_src, vocab_trg = load_vocab(
                                eng, 
                                train_iter, 
                                val_iter, 
                                test_iter, 
                                direction,)

english index : 0
hindi index : 1
Building English Vocabulary...
Building Hindi Vocabulary...
Finished.
Vocabulary sizes:
	Source: 21236
	Target: 21256


In [43]:
# View target vocabulary
trg_vocab_dict = vocab_trg.vocab.get_stoi()

In [44]:
# View source vocabulary in this case English
src_vocab_dict = vocab_src.vocab.get_stoi()

In [45]:
# Create dictionary where there are same keys in the source and target vocab dictionary
S2D_dict = {}
for k, v in src_vocab_dict.items():
    if k in trg_vocab_dict:
        S2D_dict[k] = [v, trg_vocab_dict[k]]
        
mixed_keys = list(S2D_dict.keys())
print(f"common keys are : {mixed_keys}")

common keys are : ['<', '>', '<unk>', '”', '<bos>', '  ', '♫', '“', '‘', '<eos>', '<pad>', '   ', ' ', 'ऋ', "'"]
