# Project: French-to-English Translation

In [None]:
!wget http://www.manythings.org/anki/fra-eng.zip

# Extracting the dataset


In [None]:
!unzip ./fra-eng.zip

# Importing the required libraries

In [None]:
import numpy as np
import re
from sklearn.model_selection import train_test_split
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import EarlyStopping,ModelCheckpoint
from tensorflow.keras.models import Model,Sequential
from tensorflow.keras.layers import Dense,LSTM,Input,Embedding,TimeDistributed,RepeatVector
from nltk.translate.bleu_score import SmoothingFunction,corpus_bleu
# smoothing function handed to every corpus_bleu call in bleu_score
smoothie = SmoothingFunction().method4

# Data cleaning

In [None]:
data_path = './fra.txt' # path of the tab-separated sentence-pair file that is read below
num_sentences = 20000 # maximum number of sentence pairs taken from the dataset


In [None]:
# Read the whole dataset file and split it into raw lines.
# The file contains accented French characters, so the encoding is stated
# explicitly instead of relying on the platform default, which is not UTF-8
# on every system.
with open(data_path, 'r', encoding='utf-8') as f:
    lines = f.read().split('\n')


In [None]:
# number of raw lines obtained from the file
len(lines)

190207

In [None]:
# Data cleaning: build two parallel lists with the cleaned French (source)
# and English (target) sentences.

# Characters that are NOT kept.  Only the lowercase letters of the respective
# language, the digits 0-9, spaces, apostrophes and hyphens survive; all other
# characters (punctuation, unprintable characters, ...) are deleted.
# The digit range is 0-9 so that "0" is not dropped from numbers such as "10".
ENGLISH_DISALLOWED = re.compile(r"[^a-z 0-9'-]")
FRENCH_DISALLOWED = re.compile(r"[^a-zàâãçéèêëîïôœùûüÿ 0-9'-]")

c = 0  # number of sentence pairs collected so far

source_texts, target_texts = [], []
for line in lines:  # going through each line
    if c == num_sentences:  # stop once the requested number of pairs is reached
        break
    if '\t' not in line:  # not a sentence pair (e.g. the empty last line)
        continue

    # Layout of a line: english <TAB> french [<TAB> attribution].
    # Only the first two fields are used, so lines with or without the
    # trailing attribution column are both accepted.
    fields = line.lower().rstrip().split('\t')
    if len(fields) < 2:
        continue
    op_data, ip_data = fields[0], fields[1]

    # Drop the final character only when it is not a letter or digit (i.e.
    # the closing punctuation mark), so a sentence that ends without
    # punctuation does not lose its last letter.
    if ip_data and not ip_data[-1].isalnum():
        ip_data = ip_data[:-1]
    if op_data and not op_data[-1].isalnum():
        op_data = op_data[:-1]

    source_text = FRENCH_DISALLOWED.sub("", ip_data.strip())
    target_text = ENGLISH_DISALLOWED.sub("", op_data.strip())

    source_texts.append(source_text)
    target_texts.append(target_text)
    c += 1

In [None]:
# Preview the first ten cleaned pairs: French sentence followed by its English counterpart
for idx in range(10):
    print(f"{source_texts[idx]} {target_texts[idx]}")

va go
marche go
bouge go
salut hi
salut hi
cours run
courez run
prenez vos jambes à vos cous run
file run
filez run


In [None]:
# Hold out 20% of the sentence pairs for testing; the fixed random_state keeps
# the split identical between runs
source_train, source_test, target_train, target_test = train_test_split(
    source_texts,
    target_texts,
    test_size=0.2,
    random_state=0,
)

# Defining the helper functions

In [None]:
def create_tokenizer(texts):
    """Return a Keras ``Tokenizer`` fitted on ``texts``.

    The tokenizer is created with ``'<UNK>'`` as its out-of-vocabulary token.
    """
    fitted = Tokenizer(oov_token='<UNK>')
    fitted.fit_on_texts(texts)
    return fitted

def one_hot(pad_seq,max_sent_length,num_vocab):
    """One-hot encode a batch of padded integer sequences.

    Parameters
    ----------
    pad_seq : 2-D array-like of non-negative ints, one row per sentence.
    max_sent_length : size of the second axis of the result; every row of
        ``pad_seq`` must have at most this many entries.
    num_vocab : size of the last axis of the result; every token id must be
        smaller than this value.

    Returns
    -------
    numpy.ndarray of shape ``(len(pad_seq), max_sent_length, num_vocab)`` that
    holds 1 at ``[i, j, pad_seq[i][j]]`` and 0 everywhere else.
    """
    target_data_one_hot = np.zeros((len(pad_seq),max_sent_length,num_vocab))
    token_ids = np.asarray(pad_seq)
    if token_ids.size == 0:
        # nothing to mark; np.indices below needs a 2-D, non-empty shape
        return target_data_one_hot
    # A single fancy-indexed assignment replaces the nested Python loop:
    # sentence_idx / position_idx hold the (i, j) coordinates of every token.
    sentence_idx, position_idx = np.indices(token_ids.shape)
    target_data_one_hot[sentence_idx, position_idx, token_ids] = 1
    return target_data_one_hot

def encoding_text(tokenizer,text,max_length):
    """Turn sentences into integer sequences and pad them to ``max_length``.

    ``text`` is converted with ``tokenizer.texts_to_sequences`` and the result
    is passed to ``pad_sequences`` with ``maxlen=max_length``.
    """
    return pad_sequences(tokenizer.texts_to_sequences(text), maxlen=max_length)

def max_length(text):
    """Return the largest number of whitespace-separated words in any sentence of ``text``."""
    word_counts = [len(sentence.split()) for sentence in text]
    return max(word_counts)

# Preparing the training and testing data

In [None]:
# preparing source tokenizer and getting relevant information
source_tokenizer = create_tokenizer(source_train)
source_vocab = source_tokenizer.word_index
num_source_vocab = len(source_vocab)+1 # +1 leaves room for index 0, the value used for padding
# The maximum length is measured in tokens produced by the tokenizer itself.
# Counting whitespace-separated words undercounts whenever the tokenizer splits
# a word further (its default filters include '-', so "fais-en" becomes two
# tokens); pad_sequences would then silently cut tokens off such sentences.
max_source_length = max(len(seq) for seq in source_tokenizer.texts_to_sequences(source_train))

# preparing target tokenizer and getting relevant information
target_tokenizer = create_tokenizer(target_train)
target_vocab = target_tokenizer.word_index
num_target_vocab = len(target_vocab)+1 # +1 leaves room for index 0, the value used for padding
max_target_length = max(len(seq) for seq in target_tokenizer.texts_to_sequences(target_train))

# preparing the training data
source_train_seq_pad = encoding_text(source_tokenizer,source_train,max_source_length) # padding of the source sentences
target_train_seq_pad = encoding_text(target_tokenizer,target_train,max_target_length) # padding of the target sentences
target_train_seq_pad = one_hot(target_train_seq_pad,max_target_length,num_target_vocab) # one hot encoding of the padded target sentences

# preparing the test data (encoded with the tokenizers fitted on the training data)
source_test_seq_pad = encoding_text(source_tokenizer,source_test,max_source_length) # padding of the source sentences
target_test_seq_pad = encoding_text(target_tokenizer,target_test,max_target_length) # padding of the target sentences
target_test_seq_pad = one_hot(target_test_seq_pad,max_target_length,num_target_vocab) # one hot encoding of the padded target sentences

print(num_source_vocab,num_target_vocab,max_source_length,max_target_length)

5969 3189 11 5


In [None]:
# Sequence-to-sequence network: the embedded source sentence is summarised by
# the first LSTM into one vector, that vector is repeated once per target
# position, and the second LSTM plus a per-position softmax layer produce a
# distribution over the target vocabulary at every position.
model = Sequential()
for layer in (
    Input(shape=(max_source_length,)),
    Embedding(num_source_vocab, 512, mask_zero=True),
    LSTM(512, return_sequences=False),
    RepeatVector(max_target_length),
    LSTM(512, return_sequences=True),
    TimeDistributed(Dense(num_target_vocab, activation='softmax')),
):
    model.add(layer)

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc'],
)

model.summary()



Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 11, 512)           3056128   
_________________________________________________________________
lstm (LSTM)                  (None, 512)               2099200   
_________________________________________________________________
repeat_vector (RepeatVector) (None, 5, 512)            0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 5, 512)            2099200   
_________________________________________________________________
time_distributed (TimeDistri (None, 5, 3189)           1635957   
Total params: 8,890,485
Trainable params: 8,890,485
Non-trainable params: 0
_________________________________________________________________


In [None]:
filepath = './fre2eng.h5'  # file the checkpoint callback writes the model to

# ModelCheckpoint: with save_best_only=True and mode='max' the file is only
# rewritten when 'val_acc' improves
checkpoint = ModelCheckpoint(
    filepath,
    monitor='val_acc',
    verbose=1,
    save_best_only=True,
    mode='max',
)

# EarlyStopping: watches 'val_acc' (patience=5, min_delta=0.01) so fitting can
# end before all epochs have run
es = EarlyStopping(monitor='val_acc', patience=5, min_delta=0.01)

history = model.fit(
    source_train_seq_pad,
    target_train_seq_pad,
    epochs=50,
    batch_size=128,
    validation_data=(source_test_seq_pad, target_test_seq_pad),
    verbose=1,
    callbacks=[checkpoint, es],
)

Epoch 1/50

Epoch 00001: val_acc improved from -inf to 0.42780, saving model to ./fre2eng.h5
Epoch 2/50

Epoch 00002: val_acc improved from 0.42780 to 0.45690, saving model to ./fre2eng.h5
Epoch 3/50

Epoch 00003: val_acc improved from 0.45690 to 0.49435, saving model to ./fre2eng.h5
Epoch 4/50

Epoch 00004: val_acc improved from 0.49435 to 0.51845, saving model to ./fre2eng.h5
Epoch 5/50

Epoch 00005: val_acc improved from 0.51845 to 0.54370, saving model to ./fre2eng.h5
Epoch 6/50

Epoch 00006: val_acc improved from 0.54370 to 0.55955, saving model to ./fre2eng.h5
Epoch 7/50

Epoch 00007: val_acc improved from 0.55955 to 0.57820, saving model to ./fre2eng.h5
Epoch 8/50

Epoch 00008: val_acc improved from 0.57820 to 0.59825, saving model to ./fre2eng.h5
Epoch 9/50

Epoch 00009: val_acc improved from 0.59825 to 0.60755, saving model to ./fre2eng.h5
Epoch 10/50

Epoch 00010: val_acc improved from 0.60755 to 0.61590, saving model to ./fre2eng.h5
Epoch 11/50

Epoch 00011: val_acc improved

In [None]:
# Load the weights stored at `filepath`, the file the ModelCheckpoint callback
# writes during training, so the evaluation below uses the best saved model
model.load_weights(filepath)

In [None]:
# Inverse lookup table (token id -> word) built from the target tokenizer's
# word -> id mapping; needed to decode a predicted sequence back into words
target_vocab_idx = {
    token_id: word
    for word, token_id in target_tokenizer.word_index.items()
}

def predict_sequence(model,sent,vocab_idx):
    """Translate one encoded sentence and return the decoded words as a string.

    Parameters
    ----------
    model : object whose ``predict`` returns, for a batch of one sentence, an
        array of shape ``(1, positions, vocabulary)`` with per-word scores.
    sent : 1-D numpy array holding the padded token ids of the source sentence.
    vocab_idx : dict mapping target token id -> word.

    Returns
    -------
    str : the predicted words joined by single spaces.  Positions predicted as
    id 0 (padding) are skipped; decoding stops at the first id that has no
    entry in ``vocab_idx``.
    """
    # reshape(1, -1) adds the batch axis from the sentence's own length, so the
    # function does not depend on a global maximum-length variable.
    # verbose=0 keeps per-sentence progress output out of evaluation loops.
    prediction = model.predict(sent.reshape(1, -1), verbose=0)[0]
    integers = np.argmax(prediction, axis=-1)  # best-scoring id per position
    target = []
    for i in integers:
        if i == 0:
            continue
        # .get() returns None for an unknown id, which makes the stop
        # condition below reachable (plain indexing would raise KeyError)
        word = vocab_idx.get(int(i))
        if word is None:
            break
        target.append(word)

    return ' '.join(target)


In [None]:
def bleu_score(model,ip,ip_raw,op_raw,vocab_idx):
    """Translate every encoded sentence and print corpus-level BLEU scores.

    Parameters
    ----------
    model : trained model, passed on to ``predict_sequence``.
    ip : iterable of encoded (padded) source sentences.
    ip_raw : source sentences as text, aligned with ``ip`` (for display only).
    op_raw : reference translations as text, aligned with ``ip``.
    vocab_idx : dict mapping target token id -> word.

    Prints a progress indicator, up to ten example translations and the
    cumulative BLEU-1 to BLEU-4 scores.  Returns nothing.
    """
    translations = []  # predicted sentences as plain strings, for display
    hypotheses = []    # predicted sentences as token lists
    references = []    # per sentence: a list holding its tokenised reference
    total = len(ip)
    for i,sent in enumerate(ip):

        if i%10 == 0: # to print the progress
            print('\rprogress ',(i+1)*100//total,'%',sep='',end='',flush = True)

        translation = predict_sequence(model,sent,vocab_idx)

        translations.append(translation)
        # corpus_bleu expects token lists: each hypothesis is a list of words
        # and each sentence has a list of references.  Passing raw strings
        # makes it iterate over characters and score those instead of words.
        hypotheses.append(translation.split())
        references.append([op_raw[i].split()])

    print()
    # printing the first ten sentences (fewer when less than ten are available)
    for i in range(min(10, len(translations))):
        print('French_sentence -',ip_raw[i],' | ',
            'English_actual_sentence -',op_raw[i],' | ',
            'English_predicted_sentence -',translations[i])

    print()
    # printing the BLEU_score
    print('BLEU_SCORE')
    # cumulative n-gram weights; each tuple sums to 1 (thirds for BLEU-3)
    ngram_weights = (
        (1.0, 0, 0, 0),
        (0.5, 0.5, 0, 0),
        (1/3, 1/3, 1/3, 0),
        (0.25, 0.25, 0.25, 0.25),
    )
    for order, weights in enumerate(ngram_weights, start=1):
        score = corpus_bleu(references, hypotheses, weights=weights,
                            smoothing_function=smoothie, auto_reweigh=False)
        print('BLEU score-%d: %f' % (order, score))

# Evaluating the model on training dataset

In [None]:
# encoded training sentences go to the model; the raw training text is used for display and as BLEU references
bleu_score(model,source_train_seq_pad,source_train,target_train,target_vocab_idx)

progress 99%
French_sentence - que vous êtes grandes  |  English_actual_sentence - how tall you are  |  English_predicted_sentence - how tall you are
French_sentence - j'ai rencontré mes amis  |  English_actual_sentence - i met my friends  |  English_predicted_sentence - i met my friends
French_sentence - restez assis sans bouger  |  English_actual_sentence - sit still  |  English_predicted_sentence - sit still
French_sentence - fais-en simplement l'expérience  |  English_actual_sentence - just try it out  |  English_predicted_sentence - just try it out
French_sentence - j'utilise firefox  |  English_actual_sentence - i use firefox  |  English_predicted_sentence - i use firefox
French_sentence - demande de l'aide  |  English_actual_sentence - call for help  |  English_predicted_sentence - call for help
French_sentence - garez-vous s'il vous plaît  |  English_actual_sentence - please pull over  |  English_predicted_sentence - please pull over
French_sentence - quelqu'un rigola  |  Engli

# Evaluating the model on testing dataset

In [None]:

# encoded test sentences go to the model; the raw test text is used for display and as BLEU references
bleu_score(model,source_test_seq_pad,source_test,target_test,target_vocab_idx)

progress 99%
French_sentence - ce n'était pas une course  |  English_actual_sentence - it wasn't a race  |  English_predicted_sentence - it didn't a bike
French_sentence - je me suis dégonflée  |  English_actual_sentence - i wimped out  |  English_predicted_sentence - i got american
French_sentence - vous êtes créatives  |  English_actual_sentence - you're creative  |  English_predicted_sentence - you're silly
French_sentence - c'était nécessaire  |  English_actual_sentence - it was necessary  |  English_predicted_sentence - that was stupid
French_sentence - c'était vague  |  English_actual_sentence - it was vague  |  English_predicted_sentence - it was likes enticing
French_sentence - oust  |  English_actual_sentence - get out  |  English_predicted_sentence - get out
French_sentence - je me suis remise  |  English_actual_sentence - i recovered  |  English_predicted_sentence - i recovered
French_sentence - elles travaillent toutes les deux  |  English_actual_sentence - they both work  