<a href="https://colab.research.google.com/github/martinpius/Applied-Predictive-Modeling2/blob/master/Language_Model_English_French_Translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Detect whether the notebook runs on Google Colab and, if so, mount Google Drive.
# The google.colab import lives inside the try block so that running the notebook
# anywhere else falls through to the COLAB = False branch instead of crashing.
try:
  from google.colab import drive
  drive.mount("/content/drive", force_remount = True)
  import tensorflow
  COLAB = True
  print(f"You are using Goole colab with tensorflow version: {tensorflow.__version__}")
except Exception:
  # Exception (not a bare except) so KeyboardInterrupt/SystemExit still propagate.
  COLAB = False
  print("Not connected!")
  

Mounted at /content/drive
You are using Goole colab with tensorflow version: 2.3.0


In [2]:
def time_set(x):
  """Format a duration given in seconds as "h: mmm: ss.ss".

  Args:
    x: elapsed time in seconds (int or float).

  Returns:
    A string with the whole hours, the minutes zero-padded to three digits
    and the seconds zero-padded with two decimals.
  """
  # Round once up front so the seconds field can never be displayed as 60.00.
  x = round(x, 2)
  h, remainder = divmod(x, 60 * 60)
  m, s = divmod(remainder, 60)
  # Seconds keep their fractional part; truncating them to int would make the
  # ".2f" in the format spec always print ".00".
  return f"{int(h)}: {int(m):>03}: {s:>05.2f}"

In [3]:
#Machine translation of short English sentences into French sentences
#This is a character-level encoder-decoder LSTM
#We start with the encoder LSTM, which produces no output at any time step except the last
#The final time step results in a context vector for the whole sentence
#The decoder takes the context vector as input and outputs the probability of each possible character
#The most probable character is selected as the first entry of the decoder
#It is also fed to the next time step as input, conditional on the context vector
#The procedure repeats until the last character of the decoder network is reached.


In [4]:
import numpy as np
import time
import os
import tensorflow as tf
from tensorflow import keras

In [5]:
#Download the dataset
#Fetch the English-French archive with curl and extract it in the working directory.
#The recorded output of this cell shows unzip inflating _about.txt and fra.txt.
!!curl -O http://www.manythings.org/anki/fra-eng.zip
!!unzip fra-eng.zip



['Archive:  fra-eng.zip',
 '  inflating: _about.txt              ',
 '  inflating: fra.txt                 ']

In [6]:
#Setting the hyperparameters
#Data source
path = 'fra.txt'     #text file read by the pre-processing cell
num_sample = 10000   #upper bound on the number of lines read from `path`
#Model size
latent_dim = 256     #number of units in the encoder and decoder LSTM layers
#Optimisation
batch = 64           #batch_size passed to model.fit
epochs = 100         #number of epochs passed to model.fit


In [7]:
#Data pre-processing
#Read the sentence pairs and collect the character vocabularies of both languages.
input_texts = []
target_texts = []
input_chars = set()
output_chars = set()
with open(path, 'r', encoding = 'utf-8') as f:
  lines = f.read().split('\n')
#Build the data by iterating over each line and each character within each line.
#The last element of `lines` is never used (presumably the empty string that follows
#the file's trailing newline).
for line in lines[:min(num_sample, len(lines) - 1)]:
  #Only the first two tab-separated fields are needed. Slicing instead of unpacking
  #exactly three fields also accepts files without a third column, and avoids binding
  #`_`, which IPython uses for the last cell output.
  input_text, target_text = line.split('\t')[:2]
  #"\t" marks the start and "\n" the end of every target sequence
  target_text = '\t' + target_text + '\n'
  input_texts.append(input_text)
  target_texts.append(target_text)
  input_chars.update(input_text)
  output_chars.update(target_text)

#Sorted lists give every character a stable integer index
input_chars = sorted(input_chars)
output_chars = sorted(output_chars)
num_encoder_tokens = len(input_chars)
num_decoder_tokens = len(output_chars)
max_encoder_seq_len = max(len(text) for text in input_texts)
max_decoder_seq_len = max(len(text) for text in target_texts)

print(f"Sample size: {len(input_texts)}\n num_unique_input_tokens: {len(input_chars)}\n num_unique_output_tokens: {len(output_chars)}\n maximum seq_len for input: {max_encoder_seq_len}\n maximum seq_len for decoder: {max_decoder_seq_len}")

#character -> index lookups used to one-hot encode the sentences
input_token_index = {char: i for i, char in enumerate(input_chars)}
target_token_index = {char: i for i, char in enumerate(output_chars)}
#Create the numpy containers for the one-hot encoded data.
#Every array has shape (number of sentences, maximum sequence length, vocabulary size).
encoder_input_data = np.zeros(
    (len(input_texts), max_encoder_seq_len, num_encoder_tokens), dtype = 'float32')
decoder_input_data = np.zeros(
    (len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype = 'float32')
target_output_data = np.zeros(
    (len(input_texts), max_decoder_seq_len, num_decoder_tokens), dtype = 'float32')

#Positions after the end of a sentence are padded with the space character
encoder_pad = input_token_index[" "]
decoder_pad = target_token_index[" "]

for i, (input_text, target_text) in enumerate(zip(input_texts, target_texts)):
  for t, char in enumerate(input_text):
    encoder_input_data[i, t, input_token_index[char]] = 1.0
  #Pad the remaining time steps with the space token. The original indexing lacked the
  #comma after the slice ("t+1:idx" instead of "t+1:, idx"), so it sliced time steps up
  #to the space index rather than selecting the space feature.
  #len(input_text) is used instead of the leftover loop variable t so that an empty
  #sentence is padded from position 0.
  encoder_input_data[i, len(input_text):, encoder_pad] = 1.0
  for t, char in enumerate(target_text):
    decoder_input_data[i, t, target_token_index[char]] = 1.0
    if t > 0:
      #The target is the decoder input shifted one step ahead; it omits the start character
      target_output_data[i, t - 1, target_token_index[char]] = 1.0
  decoder_input_data[i, len(target_text):, decoder_pad] = 1.0
  #The target sequence is one step shorter than the decoder input, so padding starts one step earlier
  target_output_data[i, len(target_text) - 1:, decoder_pad] = 1.0



Sample size: 10000
 num_unique_input_tokens: 71
 num_unique_output_tokens: 93
 maximum seq_len for input: 15
 maximum seq_len for decoder: 59


In [8]:
#Encoder Model
#The time dimension is None, so the encoder accepts one-hot sequences of any length
encoder_input = keras.Input(shape = (None, num_encoder_tokens))
#return_state=True makes the LSTM return its final hidden and cell states next to its output
encoder_lstm = keras.layers.LSTM(units = latent_dim, return_state=True)
encoder_out, state_h, state_c = encoder_lstm(encoder_input)
encoder_states = [state_h, state_c] #Keep only the two states; encoder_out is not used in this cell
#Decoder Model
decoder_input = keras.Input(shape = (None, num_decoder_tokens))
#return_sequences=True yields an output for every time step; the returned states are discarded here
decoder_lstm = keras.layers.LSTM(units = latent_dim, return_state=True, return_sequences= True)
#The decoder starts from the encoder's final states
decoder_out,_,_ = decoder_lstm(decoder_input, initial_state = encoder_states)
decoder_dense = keras.layers.Dense(units = num_decoder_tokens, activation = 'softmax')#Softmax over the num_decoder_tokens target characters
decoder_out = decoder_dense(decoder_out)
model = keras.Model(inputs = [encoder_input, decoder_input], outputs = decoder_out)#Single training model taking both the encoder and the decoder inputs


In [9]:
#Print the layers, output shapes and parameter counts of the training model
model.summary()

Model: "functional_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            [(None, None, 71)]   0                                            
__________________________________________________________________________________________________
input_2 (InputLayer)            [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
lstm (LSTM)                     [(None, 256), (None, 335872      input_1[0][0]                    
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  358400      input_2[0][0]                    
                                                                 lstm[0][1]            

In [10]:
#Train the model
#Inputs are the one-hot encoder and decoder sequences; the target is target_output_data.
#The last 20% of the samples are held out for validation (validation_split).
model.compile(
    optimizer = 'Adam',
    loss = 'categorical_crossentropy',
    metrics = ['accuracy'],
)
model.fit(
    x = [encoder_input_data, decoder_input_data],
    y = target_output_data,
    batch_size = batch,
    epochs = epochs,
    validation_split = 0.2,
    verbose = 2,
)
#Persist the trained model under the name 'LanguageModel' so the prediction cell can reload it
model.save('LanguageModel')


Epoch 1/100
125/125 - 2s - loss: 1.3963 - accuracy: 0.7123 - val_loss: 1.1357 - val_accuracy: 0.6913
Epoch 2/100
125/125 - 1s - loss: 0.9368 - accuracy: 0.7419 - val_loss: 0.9703 - val_accuracy: 0.7328
Epoch 3/100
125/125 - 1s - loss: 0.7982 - accuracy: 0.7842 - val_loss: 0.8280 - val_accuracy: 0.7705
Epoch 4/100
125/125 - 1s - loss: 0.6810 - accuracy: 0.8092 - val_loss: 0.7272 - val_accuracy: 0.7910
Epoch 5/100
125/125 - 1s - loss: 0.6133 - accuracy: 0.8252 - val_loss: 0.6730 - val_accuracy: 0.8052
Epoch 6/100
125/125 - 1s - loss: 0.5738 - accuracy: 0.8341 - val_loss: 0.6398 - val_accuracy: 0.8142
Epoch 7/100
125/125 - 1s - loss: 0.5427 - accuracy: 0.8427 - val_loss: 0.6176 - val_accuracy: 0.8198
Epoch 8/100
125/125 - 1s - loss: 0.5193 - accuracy: 0.8480 - val_loss: 0.5978 - val_accuracy: 0.8236
Epoch 9/100
125/125 - 1s - loss: 0.5003 - accuracy: 0.8533 - val_loss: 0.5853 - val_accuracy: 0.8259
Epoch 10/100
125/125 - 1s - loss: 0.4839 - accuracy: 0.8574 - val_loss: 0.5655 - val_accura

In [14]:
#The above model overfits (validation loss stays above training loss); regularization could improve it considerably
#Predictions: rebuild separate encoder and decoder models from the saved training model
mymodel = keras.models.load_model('LanguageModel')#Load the saved model from disk/drive
encoder_inputs = mymodel.input[0] #First model input: the encoder's one-hot sequence
encoder_outputs, state_h_enc, state_c_enc = mymodel.layers[2].output #layers[2] is the encoder LSTM ("lstm" in model.summary()); it yields its output plus both states
encoder_states1 = [state_h_enc, state_c_enc] #The encoder states become the decoder's initial states
encoder_model = keras.Model(inputs = encoder_inputs, outputs = encoder_states1) #Maps an input sequence to its context (state) vectors
### Do the same for the decoder: retrieve its layers and rewire them into a stand-alone model
decoder_inputs = mymodel.input[1]#Second model input: the decoder's one-hot sequence
#The decoder states are explicit inputs here so they can be fed back in step by step
decoder_input_state_h = keras.Input(shape = (latent_dim,), name = 'input_3')
decoder_input_state_c = keras.Input(shape = (latent_dim,), name = 'input_4')
decoder_states_inputs = [decoder_input_state_h, decoder_input_state_c]
decoder_lstm1 = mymodel.layers[3] #layers[3] is the decoder LSTM ("lstm_1" in model.summary())
decoder_outputs, decoder_state_h, decoder_state_c =decoder_lstm1(decoder_inputs, initial_state = decoder_states_inputs)
decoder_states = [decoder_state_h, decoder_state_c]
#NOTE(review): layers[4] is presumably the softmax Dense layer - confirm against the full model.summary()
decoder_dense = mymodel.layers[4]
decoder_outputs = decoder_dense(decoder_outputs)
#Inputs: decoder sequence + both states; outputs: character probabilities + updated states
decoder_model = keras.Model([decoder_inputs]+decoder_states_inputs, [decoder_outputs]+decoder_states)

#Reverse lookups (index -> character) to turn predicted indices back into text
reverse_input_char_index = dict((i, char) for char, i in input_token_index.items())
reverse_target_char_index = dict((i, char) for char, i in target_token_index.items())


def decode_sequence(input_seq):
    """Greedily decode one encoded source sequence into a target string.

    The encoder states for `input_seq` initialise the decoder, which is then
    run one character at a time: the most probable character is appended to
    the result and fed back as the next decoder input.

    Args:
        input_seq: encoder input for a single sentence (batch of size 1).

    Returns:
        The decoded characters joined into one string. Decoding stops once the
        "\n" stop character has been produced (it is kept in the result) or
        once the result is longer than max_decoder_seq_len.
    """
    # The encoder's state vectors are the decoder's starting states.
    states = encoder_model.predict(input_seq)

    # The first decoder input is the start character "\t".
    next_index = target_token_index["\t"]
    pieces = []
    total_len = 0
    while True:
        # One-hot input of length 1 holding the previously chosen character.
        step_input = np.zeros((1, 1, num_decoder_tokens))
        step_input[0, 0, next_index] = 1.0

        probs, h, c = decoder_model.predict([step_input] + states)

        # Greedy choice: take the most probable character of the last step.
        next_index = np.argmax(probs[0, -1, :])
        char = reverse_target_char_index[next_index]
        pieces.append(char)
        total_len += len(char)

        # Carry the updated decoder states into the next step.
        states = [h, c]

        if char == "\n" or total_len > max_decoder_seq_len:
            break
    return "".join(pieces)






In [15]:
#Print the layers, output shapes and parameter counts of the stand-alone inference decoder
decoder_model.summary()

Model: "functional_7"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, None, 93)]   0                                            
__________________________________________________________________________________________________
input_3 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
input_4 (InputLayer)            [(None, 256)]        0                                            
__________________________________________________________________________________________________
lstm_1 (LSTM)                   [(None, None, 256),  358400      input_2[0][0]                    
                                                                 input_3[0][0]         

In [16]:
#Print the layers, output shapes and parameter counts of the stand-alone inference encoder
encoder_model.summary()

Model: "functional_5"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_1 (InputLayer)         [(None, None, 71)]        0         
_________________________________________________________________
lstm (LSTM)                  [(None, 256), (None, 256) 335872    
Total params: 335,872
Trainable params: 335,872
Non-trainable params: 0
_________________________________________________________________


In [19]:
#Generating the predicted (decoded) sentences
#The first 20 samples are decoded; they are part of the data the model was fitted on,
#so this is a sanity check rather than an evaluation on unseen sentences.
for seq_index in range(20):
    # Re-add a leading batch axis of size 1, as decode_sequence works on one sentence at a time.
    input_seq = np.expand_dims(encoder_input_data[seq_index], axis=0)
    decoded_sentence = decode_sequence(input_seq)
    english_sentence = input_texts[seq_index]
    print("-----------*----------")
    print("English sentence:", english_sentence)
    print("French sentence:", decoded_sentence)

-----------*----------
English sentence: Go.
French sentence: Va !

-----------*----------
English sentence: Hi.
French sentence: Salut !

-----------*----------
English sentence: Hi.
French sentence: Salut !

-----------*----------
English sentence: Run!
French sentence: Courez !

-----------*----------
English sentence: Run!
French sentence: Courez !

-----------*----------
English sentence: Who?
French sentence: Qui ?

-----------*----------
English sentence: Wow!
French sentence: Ça alors !

-----------*----------
English sentence: Fire!
French sentence: Au feu !

-----------*----------
English sentence: Help!
French sentence: À l'aide !

-----------*----------
English sentence: Jump.
French sentence: Saute.

-----------*----------
English sentence: Stop!
French sentence: Ça suffit !

-----------*----------
English sentence: Stop!
French sentence: Ça suffit !

-----------*----------
English sentence: Stop!
French sentence: Ça suffit !

-----------*----------
English sentence: Wait!