<a href="https://colab.research.google.com/github/nikhilcss97/Keras_practise/blob/master/eng_to_french.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [74]:
# Shell escape: list the current working directory (the load cell reads 'fra.txt' from it).
!ls

fra.txt  sample_data


In [93]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from keras.models import Sequential, Model
from keras.layers import Dense, LSTM, CuDNNLSTM, Input, Embedding, TimeDistributed, Flatten, Dropout
from keras.callbacks import ModelCheckpoint


import string
import operator

Using TensorFlow backend.


In [0]:
# Number of sentence pairs to keep from the top of the file.
MAX_SENTENCE_PAIRS = 5000

# Load the tab-separated sentence pairs; the file has no header row, so the
# columns are labelled 0 and 1.
#  * nrows stops parsing after the rows we actually keep instead of reading
#    the whole file and discarding the rest with .head().
#  * usecols keeps only the first two columns, so a copy of the file that
#    carries extra trailing columns (e.g. an attribution column) does not
#    break the later two-name column assignment.
data = pd.read_csv(
    'fra.txt',
    sep='\t',
    header=None,
    usecols=[0, 1],
    nrows=MAX_SENTENCE_PAIRS,
)

In [0]:
# Case-fold every cell of the frame. `lower` is the unbound str method, so it
# is called as lower(cell) for each element.
lower = str.lower
data = data.applymap(lower)

In [0]:
# Characters to strip: the ASCII punctuation marks listed in string.punctuation.
exclude = set(string.punctuation)


def rm_punc(s):
    """Return `s` with every character that appears in `exclude` removed."""
    kept_chars = [ch for ch in s if ch not in exclude]
    return ''.join(kept_chars)


# Apply the punctuation filter to every cell of the frame.
data = data.applymap(rm_punc)


In [0]:
def rm_digits(s):
    """Return `s` with every character for which str.isdigit() is true removed."""
    return ''.join(filter(lambda ch: not ch.isdigit(), s))


# Apply the digit filter to every cell of the frame.
data = data.applymap(rm_digits)

In [80]:
# Label the two columns: English sentence and French sentence.
data.columns = ['en', 'fr']

# Preview only the first rows rather than rendering the whole frame.
data.head(10)

Unnamed: 0,en,fr
0,go,va
1,hi,salut
2,run,cours
3,run,courez
4,wow,ça alors
5,fire,au feu
6,help,à laide
7,jump,saute
8,stop,ça suffit
9,stop,stop


In [0]:
# Wrap every sentence (in both columns) with a start marker and an end marker.
def append(s):
    """Return `s` with 'SOS_ ' put in front of it and ' _EOS' put after it."""
    return ''.join(['SOS_ ', s, ' _EOS'])


# NOTE: `data` is reassigned from itself, so running this cell a second time
# wraps the already-wrapped sentences again.
data = data.applymap(append)

In [0]:
%%capture
# Word-frequency tables: token -> number of times it has been seen so far.
en_words = dict()
fr_words = dict()


def _count_words(sent, counts):
  """Add one to counts[word] for each whitespace-separated word of `sent`.

  `counts` is updated in place; nothing is returned.
  """
  for word in sent.split():
    counts[word] = counts.get(word, 0) + 1


def add_en_word(sent):
  """Tally the words of one English sentence into the global `en_words`."""
  # The global is looked up at call time, so a later rebinding of `en_words`
  # is honoured.
  _count_words(sent, en_words)


def add_fr_word(sent):
  """Tally the words of one French sentence into the global `fr_words`."""
  _count_words(sent, fr_words)

# Feed every sentence of the first column to the English counter, then every
# sentence of the second column to the French counter.
for en_sentence in data.iloc[:, 0]:
    add_en_word(en_sentence)
for fr_sentence in data.iloc[:, 1]:
    add_fr_word(fr_sentence)

In [0]:
# Lists of (word, count) tuples in ascending order of count. The sort is
# stable, so words with equal counts keep their existing relative order.
sorted_en_words = sorted(en_words.items(), key=lambda pair: pair[1])
sorted_fr_words = sorted(fr_words.items(), key=lambda pair: pair[1])

# Rebuild both frequency tables from the sorted pairs.
en_words = dict(sorted_en_words)
fr_words = dict(sorted_fr_words)

In [0]:
# Running maxima of the per-sentence word counts; updated by the helpers below.
max_en = 0
max_fr = 0


def find_max_length_en(sent):
    """Raise the global `max_en` to the word count of `sent` if that is larger."""
    global max_en
    n_words = len(sent.split())
    max_en = max(max_en, n_words)


def find_max_length_fr(sent):
    """Raise the global `max_fr` to the word count of `sent` if that is larger."""
    global max_fr
    n_words = len(sent.split())
    max_fr = max(max_fr, n_words)

In [0]:
%%capture
# Scan the first column to update max_en, then the second column to update max_fr.
for en_sentence in data.iloc[:, 0]:
    find_max_length_en(en_sentence)
for fr_sentence in data.iloc[:, 1]:
    find_max_length_fr(fr_sentence)

In [86]:
# Number of sentence pairs, i.e. rows of the frame.
num_samples = len(data)
num_samples

5000

In [0]:
# Vocabulary lists: the keys of each frequency table in sorted (string
# comparison) order. The position of a word in these lists becomes its
# integer id.
input_words = sorted(en_words)
target_words = sorted(fr_words)

In [89]:
# Size of the target vocabulary (number of distinct tokens in `target_words`).
len(target_words)

2755

In [0]:
# Two-way lookup tables between tokens and integer ids; a token's id is its
# position in the corresponding sorted vocabulary list.
# NOTE(review): ids start at 0, and the input arrays are zero-initialised
# elsewhere in this notebook, so id 0 appears to double as the padding value.
# Confirm whether that is intended.
en_token_to_int = {token: idx for idx, token in enumerate(input_words)}
en_int_to_token = dict(enumerate(input_words))

fr_token_to_int = {token: idx for idx, token in enumerate(target_words)}
fr_int_to_token = dict(enumerate(target_words))

In [0]:
# Zero-filled training arrays.
#  * encoder/decoder inputs: one slot per sample per time step.
#  * decoder targets: one vector of length len(target_words) per sample per
#    time step.
encoder_input_data = np.zeros(shape=(num_samples, max_en), dtype=np.float32)
decoder_input_data = np.zeros(shape=(num_samples, max_fr), dtype=np.float32)
decoder_target_data = np.zeros(
    shape=(num_samples, max_fr, len(target_words)),
    dtype=np.float32,
)

In [0]:
# Fill the training arrays one sentence pair (row i) at a time.
for i, (en_sent, fr_sent) in enumerate(zip(data['en'], data['fr'])):
    # Encoder input: the id of each English token at its time step.
    for t, word in enumerate(en_sent.split()):
        encoder_input_data[i, t] = en_token_to_int[word]

    # Decoder input: the id of each French token at its time step.
    # Decoder target: the same sequence shifted one step earlier and one-hot
    # encoded, so the target at step t-1 is the token fed in at step t.
    fr_ids = [fr_token_to_int[word] for word in fr_sent.split()]
    for t, token_id in enumerate(fr_ids):
        decoder_input_data[i, t] = token_id
        if t >= 1:
            decoder_target_data[i, t - 1, token_id] = 1.

In [0]:
# Hyper-parameters shared by the encoder and decoder cells.
EMBEDDING_SIZE= 300   # Length of the vector that we will get from the embedding layer
UNITS         = 1024  # Hidden layers dimension 
DROPOUT       = 0.2   # Rate of the dropout layers
BATCH_SIZE    = 50    # Batch size
EPOCHS        = 30    # Number of epochs

# Encoder input: max_en values per sample (the integer token ids).
encoder_input = Input(shape=(max_en,))

# Embed the token ids, apply dropout at every time step, then run the first
# recurrent layer, which emits an output for every time step.
encoder_embedding = Embedding(input_dim = len(input_words), output_dim = EMBEDDING_SIZE)(encoder_input)
encoder_dropout   = (TimeDistributed(Dropout(rate = DROPOUT)))(encoder_embedding)
encoder_LSTM      = CuDNNLSTM(UNITS, return_sequences=True)(encoder_dropout)

# Second recurrent layer: return_state=True makes the call return its output
# plus two state tensors, unpacked here as state_h and state_c.
encoder_LSTM2_layer = CuDNNLSTM(UNITS, return_state=True)
encoder_outputs, state_h, state_c = encoder_LSTM2_layer(encoder_LSTM)

# Only the two state tensors are collected for later use; encoder_outputs is
# not used again in this cell.
encoder_states = [state_h, state_c]

In [0]:
# Decoder input: max_fr values per sample (the integer token ids).
decoder_input = Input(shape=(max_fr,))

# Hidden layers of the decoder.
# Each layer object is kept in its own variable, separately from the tensor it
# produces.
decoder_embedding_layer = Embedding(input_dim = len(target_words), output_dim = EMBEDDING_SIZE)
decoder_embedding = decoder_embedding_layer(decoder_input)

# Dropout applied at every time step of the embedded sequence.
decoder_dropout_layer = (TimeDistributed(Dropout(rate = DROPOUT)))
decoder_dropout = decoder_dropout_layer(decoder_embedding)

# First recurrent layer: starts from the encoder's states and emits an output
# for every time step.
decoder_LSTM_layer = CuDNNLSTM(UNITS, return_sequences=True)
decoder_LSTM = decoder_LSTM_layer(decoder_dropout, initial_state = encoder_states)

# Second recurrent layer: also returns its states, which are discarded here.
decoder_LSTM_2_layer = CuDNNLSTM(UNITS, return_sequences=True, return_state=True)
decoder_LSTM_2,_,_ = decoder_LSTM_2_layer(decoder_LSTM)

# Output layer of the decoder: a softmax over len(target_words) classes,
# applied to the per-time-step output of the second recurrent layer.
decoder_dense = Dense(len(target_words), activation='softmax')
decoder_outputs = decoder_dense(decoder_LSTM_2)

In [104]:
# Training model: takes the encoder and decoder inputs and produces the
# decoder's softmax output.
model = Model(
    inputs=[encoder_input, decoder_input],
    outputs=decoder_outputs,
)
model.summary()


__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_5 (InputLayer)            (None, 6)            0                                            
__________________________________________________________________________________________________
embedding_5 (Embedding)         (None, 6, 300)       374400      input_5[0][0]                    
__________________________________________________________________________________________________
input_6 (InputLayer)            (None, 12)           0                                            
__________________________________________________________________________________________________
time_distributed_5 (TimeDistrib (None, 6, 300)       0           embedding_5[0][0]                
__________________________________________________________________________________________________
embedding_

In [106]:
# Train with RMSprop against categorical cross-entropy on the one-hot decoder
# targets.
model.compile(optimizer='rmsprop', loss='categorical_crossentropy')

# Keep the History object returned by fit() instead of letting it echo as a
# bare repr, so the per-epoch loss stays available after training.
history = model.fit(
    [encoder_input_data, decoder_input_data],
    decoder_target_data,
    batch_size=BATCH_SIZE,
    epochs=EPOCHS,
)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<keras.callbacks.History at 0x7f38d2305ef0>