# Data Preparation for English to Hindi Translation

This Notebook works on cleaning and processing data for English to Hindi translation model

In [1]:
import numpy as np 
import pandas as pd 

import re
import string
from string import digits

import nltk
from nltk.corpus import stopwords
from nltk import word_tokenize

import matplotlib.pyplot as plt
import os

pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

# Data Preprocessing 

In [2]:
df = pd.read_csv("/kaggle/input/hindi-english-truncated-corpus/Hindi_English_Truncated_Corpus.csv")
df.head()

Unnamed: 0,source,english_sentence,hindi_sentence
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर..."
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।


In [3]:
df["english_sentence"] = df["english_sentence"].astype("str")
df["hindi_sentence"] = df["hindi_sentence"].astype("str")

In [4]:
df.loc[1, "english_sentence"]

"I'd like to tell you about one such child,"

In [5]:
df.dtypes

source              object
english_sentence    object
hindi_sentence      object
dtype: object

Choose data with max sentence less than 30 char

In [6]:
MAX_FILTER_LEN = 30

In [7]:
df["eng_len"] = df["english_sentence"].apply(lambda x: len(str(x).split(" ")))
df["hin_len"] = df["hindi_sentence"].apply(lambda x: len(str(x).split(" ")))

In [8]:
df

Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len
0,ted,politicians do not have permission to do what ...,"राजनीतिज्ञों के पास जो कार्य करना चाहिए, वह कर...",12,14
1,ted,"I'd like to tell you about one such child,",मई आपको ऐसे ही एक बच्चे के बारे में बताना चाहू...,9,11
2,indic2012,This percentage is even greater than the perce...,यह प्रतिशत भारत में हिन्दुओं प्रतिशत से अधिक है।,10,9
3,ted,what we really mean is that they're bad at not...,हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते,12,11
4,indic2012,.The ending portion of these Vedas is called U...,इन्हीं वेदों का अंतिम भाग उपनिषद कहलाता है।,9,8
...,...,...,...,...,...
127602,indic2012,Examples of art deco construction can be found...,आर्ट डेको शैली के निर्माण मैरीन ड्राइव और ओवल ...,15,15
127603,ted,and put it in our cheeks.,और अपने गालों में डाल लेते हैं।,6,7
127604,tides,"As for the other derivatives of sulphur , the ...","जहां तक गंधक के अन्य उत्पादों का प्रश्न है , द...",36,34
127605,tides,its complicated functioning is defined thus in...,Zरचना-प्रकिया को उसने एक पहेली में यों बांधा है .,11,10


In [9]:
df = df[(df['eng_len'] <= MAX_FILTER_LEN) & (df['hin_len'] <= MAX_FILTER_LEN)]
print(f"Size of dataset to use: {df.shape}")

Size of dataset to use: (106939, 5)


In [10]:
filters = '!"#$%&()*+,-./:;=?@[\\]^_`{|}~\t\n'
df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))
df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['english_sentence']=df['english_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hindi_sentence']=df['hindi_sentence'].apply(lambda x: ''.join(ch for ch in x if ch not in filters))


In [11]:
df['english_sentence'] = df['english_sentence'].apply(lambda x: "[start] " + x + " [end]")
df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: "[start] " + x + " [end]")
df.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['english_sentence'] = df['english_sentence'].apply(lambda x: "[start] " + x + " [end]")
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['hindi_sentence'] = df['hindi_sentence'].apply(lambda x: "[start] " + x + " [end]")


Unnamed: 0,source,english_sentence,hindi_sentence,eng_len,hin_len
0,ted,[start] politicians do not have permission to ...,[start] राजनीतिज्ञों के पास जो कार्य करना चाहि...,12,14
1,ted,[start] I'd like to tell you about one such ch...,[start] मई आपको ऐसे ही एक बच्चे के बारे में बत...,9,11
2,indic2012,[start] This percentage is even greater than t...,[start] यह प्रतिशत भारत में हिन्दुओं प्रतिशत स...,10,9
3,ted,[start] what we really mean is that they're ba...,[start] हम ये नहीं कहना चाहते कि वो ध्यान नहीं...,12,11
4,indic2012,[start] The ending portion of these Vedas is c...,[start] इन्हीं वेदों का अंतिम भाग उपनिषद कहलात...,9,8


In [12]:
def normalize(eng, hin):
    """Normalize a line of text and split into two at the tab character"""
    line = str(eng.strip().lower()) + "\t" + str(hin.strip())
    line = re.sub(r"^([^ \w])(?!\s)", r"\1 ", line)
    line = re.sub(r"(\s[^ \w])(?!\s)", r"\1 ", line)
    line = re.sub(r"(?!\s)([^ \w])$", r" \1", line)
    line = re.sub(r"(?!\s)([^ \w]\s)", r" \1", line)
    eng, hind = line.split('\t')
    return eng, hin

text_pairs = []
for index in df.index:
    text_pairs.append(normalize(df['english_sentence'][index], df['hindi_sentence'][index]))
print(len(text_pairs))

106939


In [13]:
df.loc[3, "english_sentence"], df.loc[3, "hindi_sentence"]

("[start] what we really mean is that they're bad at not paying attention [end]",
 '[start] हम ये नहीं कहना चाहते कि वो ध्यान नहीं दे पाते [end]')