In [1]:
import collections
import numpy as np
import json

from keras.preprocessing.text import Tokenizer
from keras.utils import pad_sequences
from keras.models import Model, Sequential
from keras.layers import Input, Dense, Embedding, GRU, LSTM, Bidirectional, Dropout, Activation, TimeDistributed, RepeatVector
from keras.optimizers import Adam
from keras.losses import sparse_categorical_crossentropy

import pandas as pd

In [2]:
data = pd.read_csv('hindi_english_parallel.csv\hindi_english_parallel.csv')
data

Unnamed: 0,hindi,english
0,अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें,Give your application an accessibility workout
1,एक्सेर्साइसर पहुंचनीयता अन्वेषक,Accerciser Accessibility Explorer
2,निचले पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the bottom panel
3,ऊपरी पटल के लिए डिफोल्ट प्लग-इन खाका,The default plugin layout for the top panel
4,उन प्लग-इनों की सूची जिन्हें डिफोल्ट रूप से नि...,A list of plugins that are disabled by default
...,...,...
1561836,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...,स्पष्टीकरण.–जहां इस उपधारा के अधीन हानि और लाभ...
1561837,मैंने गौर किया है कि यह न केवल अपने महत्त्वपूर...,है। I note that this is a landmark meeting – n...
1561838,उन्होंने मेरे समक्ष जो प्रदर्शन किया उसमें से ...,है। In the presentations that they made before...
1561839,खाद्य और जल सुरक्षा; पर्यावरण की दृष्टि से वहन...,्त है। Issues such as food and water security;...


In [3]:
english_sentences = data.iloc[:10000,1].values
english_sentences = list(english_sentences)

In [4]:
hindi_sentences = data.iloc[:10000,0].values
hindi_sentences = list(hindi_sentences)

In [5]:
for i in range(len(english_sentences)):
    if type(english_sentences[i]) != str:
        english_sentences[i] = 'security'
        

In [6]:
for i in range(len(hindi_sentences)):
    if type(hindi_sentences[i]) != str:
       hindi_sentences[i] = 'सुरक्षा'

In [7]:
sum = 0
for i in range(len(hindi_sentences)):
    if type(hindi_sentences[i]) != str:
        print(hindi_sentences[i], i)
        sum += 1
sum

0

In [8]:
print(hindi_sentences[7])
print(english_sentences[7])

सीमांत (बोर्डर) के रंग को हाइलाइट करें
Highlight border color


In [9]:
print(len(english_sentences))
print(len(hindi_sentences))

10000
10000


In [10]:
english_words_counter = collections.Counter([word for sentence in english_sentences for word in sentence.split()])
hindi_words_counter = collections.Counter([word for sentence in hindi_sentences for word in sentence.split()])

print('{} English words.'.format(len([word for sentence in english_sentences for word in sentence.split()])))
print('{} unique English words.'.format(len(english_words_counter)))
print('10 Most common words in the English dataset:')
print('"' + '" "'.join(list(zip(*english_words_counter.most_common(10)))[0]) + '"')

print()
print('{} hindi words.'.format(len([word for sentence in hindi_sentences for word in sentence.split()])))
print('{} unique hindi words.'.format(len(hindi_words_counter)))
print('10 Most common words in the hindi dataset:')
print('"' + '" "'.join(list(zip(*hindi_words_counter.most_common(10)))[0]) + '"')

34471 English words.
2484 unique English words.
10 Most common words in the English dataset:
"the" "of" "_" "a" "to" "Move" "~" "onto" "Remove" "s"

40928 hindi words.
2351 unique hindi words.
10 Most common words in the hindi dataset:
"का" "को" "के" "करें" "(_" "में" "एक" "की" "पर" "ले"


In [11]:
def tokenize(x):
    tokenizer = Tokenizer()
    tokenizer.fit_on_texts(x)
    return tokenizer.texts_to_sequences(x), tokenizer

text_sentences = [
    'The quick brown fox jumps over the lazy dog .',
    'By Jove , my quick study of lexicography won a prize .',
    'This is a short sentence .'
]

text_tokenized, text_tokenizer = tokenize(text_sentences)
print(text_tokenizer.word_index)
print()

for sample_i, (sent, token_sent) in enumerate(zip(text_sentences, text_tokenized)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('Input: {}'.format(sent))
    print(' Output: {}'.format(token_sent))
    

{'the': 1, 'quick': 2, 'a': 3, 'brown': 4, 'fox': 5, 'jumps': 6, 'over': 7, 'lazy': 8, 'dog': 9, 'by': 10, 'jove': 11, 'my': 12, 'study': 13, 'of': 14, 'lexicography': 15, 'won': 16, 'prize': 17, 'this': 18, 'is': 19, 'short': 20, 'sentence': 21}

Sequence 1 in x
Input: The quick brown fox jumps over the lazy dog .
 Output: [1, 2, 4, 5, 6, 7, 1, 8, 9]
Sequence 2 in x
Input: By Jove , my quick study of lexicography won a prize .
 Output: [10, 11, 12, 2, 13, 14, 15, 16, 3, 17]
Sequence 3 in x
Input: This is a short sentence .
 Output: [18, 19, 3, 20, 21]


In [12]:
def pad(x, length=None):
    if length is None:
        length = max([len(sentence) for sentence in x])
    return pad_sequences(x, maxlen=length, padding='post')

test_pad = pad(text_tokenized)
for sample_i, (token_sent, pad_sent) in enumerate(zip(text_tokenized, test_pad)):
    print('Sequence {} in x'.format(sample_i + 1))
    print('  Input:  {}'.format(np.array(token_sent)))
    print('  Output: {}'.format(pad_sent))

Sequence 1 in x
  Input:  [1 2 4 5 6 7 1 8 9]
  Output: [1 2 4 5 6 7 1 8 9 0]
Sequence 2 in x
  Input:  [10 11 12  2 13 14 15 16  3 17]
  Output: [10 11 12  2 13 14 15 16  3 17]
Sequence 3 in x
  Input:  [18 19  3 20 21]
  Output: [18 19  3 20 21  0  0  0  0  0]


In [13]:
def preprocess(x, y):
    preprocess_x, x_tk = tokenize(x)
    preprocess_y, y_tk = tokenize(y)

    preprocess_x = pad(preprocess_x)
    preprocess_y = pad(preprocess_y)

    preprocess_y = preprocess_y.reshape(*preprocess_y.shape,1)

    return preprocess_x, preprocess_y, x_tk, y_tk

preproc_english_sentences, preproc_hindi_sentences, english_tokenizer, hindi_tokenizer = preprocess(english_sentences, hindi_sentences)

max_english_sequence_length = preproc_english_sentences.shape[1]
max_hindi_sequence_length = preproc_hindi_sentences.shape[1]
english_vocab_size = len(english_tokenizer.word_index)
hindi_vocab_size = len(hindi_tokenizer.word_index)

print('Data Preprocessed')
print("Max English sentence length:", max_english_sequence_length)
print("Max hindi sentence length:", max_hindi_sequence_length)
print("English vocabulary size:", english_vocab_size)
print("hindi vocabulary size:", hindi_vocab_size)

Data Preprocessed
Max English sentence length: 41
Max hindi sentence length: 48
English vocabulary size: 1558
hindi vocabulary size: 2038


In [14]:
def logits_to_text(logits, tokenizer):
    index_to_words = {id: word for word, id in tokenizer.word_index.items()}
    index_to_words[0] = '<PAD>'

    return ''.join([index_to_words[prediction] for prediction in np.argmax(logits,1)])


In [15]:
for i in range(preproc_hindi_sentences.shape[0]):
    for j in range(preproc_hindi_sentences.shape[1]):
        if preproc_hindi_sentences[i][j] > 2037:
            preproc_hindi_sentences[i][j] = 0

In [16]:
for i in range(preproc_english_sentences.shape[0]):
    for j in range(preproc_english_sentences.shape[1]):
        if preproc_english_sentences[i][j] > 1557:
            preproc_english_sentences[i][j] = 0

In [17]:
sum = 0
for i in range(preproc_hindi_sentences.shape[0]):
    for j in range(preproc_hindi_sentences.shape[1]):
        if preproc_hindi_sentences[i][j] > 2037:
            sum += 1

In [18]:
def bidirectional_embed_model(input_shape, output_sequence_length, english_vocab_size, hindi_vocab_size):
    
    # Hyperparameters
    learning_rate = 0.01
    
    # Build the layers
    model = Sequential()
    model.add(Embedding(english_vocab_size, 256, input_length=input_shape[1], input_shape=input_shape[1:]))
    model.add(Bidirectional(GRU(256, return_sequences=True)))
    model.add(TimeDistributed(Dense(1024, activation='relu')))
    model.add(Dropout(0.5))
    model.add(TimeDistributed(Dense(hindi_vocab_size, activation='softmax')))
    
    # Compile model
    model.compile(loss = sparse_categorical_crossentropy,
                  optimizer = Adam(learning_rate),
                  metrics = ['accuracy'])
    
    return model

tmp_x = pad(preproc_english_sentences, max_hindi_sequence_length)
tmp_x = tmp_x.reshape((-1, preproc_hindi_sentences.shape[-2]))

# Build the model
embed_rnn_model = bidirectional_embed_model(
    tmp_x.shape,
    max_hindi_sequence_length,
    english_vocab_size,
    hindi_vocab_size)

print(embed_rnn_model.summary())

embed_rnn_model.fit(tmp_x, preproc_hindi_sentences, batch_size=512, epochs=5, validation_split=0.2)

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 48, 256)           398848    
                                                                 
 bidirectional (Bidirectiona  (None, 48, 512)          789504    
 l)                                                              
                                                                 
 time_distributed (TimeDistr  (None, 48, 1024)         525312    
 ibuted)                                                         
                                                                 
 dropout (Dropout)           (None, 48, 1024)          0         
                                                                 
 time_distributed_1 (TimeDis  (None, 48, 2038)         2088950   
 tributed)                                                       
                                                        

<keras.callbacks.History at 0x1f0ad111ee0>

In [19]:
(tmp_x[4000])[0]

12

In [20]:
# Print prediction(s)
print("Prediciton:")
print(logits_to_text(embed_rnn_model.predict(tmp_x[:1])[0], hindi_tokenizer))

print("\nCorrect Translation:")
print(hindi_sentences[:1])

print('\nOriginal text:')
print(english_sentences[:1])

Prediciton:
<PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD><PAD>

Correct Translation:
['अपने अनुप्रयोग को पहुंचनीयता व्यायाम का लाभ दें']

Original text:
['Give your application an accessibility workout']


In [21]:
embed_rnn_model.save('english_to_hindi_model')
# Serialize English Tokenizer to JSON
with open('english_tokenizer_1.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(english_tokenizer.to_json(), ensure_ascii=False))
    
# Serialize Hindi Tokenizer to JSON
with open('hindi_tokenizer.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(hindi_tokenizer.to_json(), ensure_ascii=False))
    
# Save max lengths
max_hindi_sequence_length_json = max_hindi_sequence_length
with open('sequence_length_hindi.json', 'w', encoding='utf8') as f:
    f.write(json.dumps(max_hindi_sequence_length_json, ensure_ascii=False))



INFO:tensorflow:Assets written to: english_to_hindi_model\assets


INFO:tensorflow:Assets written to: english_to_hindi_model\assets
