In [71]:
from __future__ import unicode_literals, print_function, division
from io import open
import unicodedata
import re
import random
import os
import time

import pandas as pd
import numpy as np

import torch
import torch.nn as nn
from torch import optim
import torch.nn.functional as F
from torch.autograd import Variable

import numpy as np
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

import gc
from tqdm import tqdm
import math

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
#device = torch.device('cpu')
device

device(type='cpu')

Reference: https://github.com/sayarghoshroy/Statistical-Machine-Translation/blob/master/SMT_English_to_Hindi.ipynb

 Clean Data Functions

In [72]:
# Turn a Unicode string to plain ASCII, thanks to
# https://stackoverflow.com/a/518232/2809427
def unicodeToAscii(s):
    return ''.join(
        c for c in unicodedata.normalize('NFD', s)
        if unicodedata.category(c) != 'Mn'
    )

# Lowercase, trim, and remove non-letter characters
def normalizeString(s):
    s = unicodeToAscii(s.lower().strip())
    s = re.sub(r"([.!?])", r" \1", s)
    s = re.sub(r"[^a-zA-Z!?]+", r" ", s)
    return s.strip()

In [73]:
start = time.time()

if os.path.isfile('./data/valid_subset.csv'):
    # cleaned, filtered by length, 10% dataset
    df = pd.read_csv('./data/valid_subset.csv', index_col=False)
    
elif os.path.isfile('./data/valid_cleaned_data.csv'):
    # cleaned, filtered by length dataset
    df = pd.read_csv('./data/valid_cleaned_data.csv', index_col=False)
    
elif os.path.isfile('./data/cleaned_data.csv'):
    # cleaned dataset
    df = pd.read_csv('./data/cleaned_data.csv', index_col=False)
else:

    df = pd.read_csv('./data/en-fr.csv')

end = time.time()
display(end - start)

df.head()

0.257213830947876

Unnamed: 0,en,fr
0,site map,plan du site
1,feedback,retroaction
2,credits,credits
3,francais,english
4,what is light ?,qu est ce que la lumiere ?


In [74]:
# Clean data only if not available    

start = time.time()
df.dropna(inplace=True)

if not os.path.isfile('./data/cleaned_data.csv'):
    df['en'] = df['en'].apply(lambda x: normalizeString(str(x)))
    df['fr'] = df['fr'].apply(lambda x: normalizeString(str(x)))
    df.to_csv('./data/cleaned_data.csv', index=False)
    
end = time.time()
display(end-start)

0.02581310272216797

In [75]:
### Filter dataset by length
MAX_LENGTH = 35

if not os.path.isfile('./data/valid_cleaned_data.csv'):
    # Assume df is your DataFrame and 'column_name' is the column you want to check
    string_length_required = 35  # Length of string in the column
    df = df[df['en'].str.len() <= string_length_required]
    df = df[df['fr'].str.len() <= string_length_required]

    
    df.to_csv('./data/valid_cleaned_data.csv', index=False)


df

Unnamed: 0,en,fr
0,site map,plan du site
1,feedback,retroaction
2,credits,credits
3,francais,english
4,what is light ?,qu est ce que la lumiere ?
...,...,...
392285,regional programmes,les programmes regionaux
392286,papular genital lesions,pertes vaginales
392287,chancroid haemophilus ducreyi,chancre mou h mophilis ducreyi
392288,turnor lake b province,turnor lake b province


Tokenize and Create Vocabulary

In [76]:
df_train = df.sample(frac=0.9)

df_test = pd.concat([df, df_train])
df_test.drop_duplicates(keep=False, inplace=True)
df_train.reset_index(inplace=True, drop=True)
df_test.reset_index(inplace=True, drop=True)

In [77]:
import nltk
from nltk.tokenize import word_tokenize

english_sentences = df_train['en'].tolist()
french_sentences = df_train['fr'].tolist()

tokenized_en = [word_tokenize(sentence) for sentence in english_sentences]
tokenized_fr = [word_tokenize(sentence) for sentence in french_sentences]

In [78]:
from collections import Counter
import numpy as np

# Flatten lists and count word frequencies
en_words = Counter(word for sentence in tokenized_en for word in sentence)
fr_words = Counter(word for sentence in tokenized_fr for word in sentence)

en_vocab = len(en_words)
fr_vocab = len(fr_words)


In [79]:
# creating the 't'
t = {}
uniform = 1 / (en_vocab * fr_vocab)

Expectation-Maximization Algorithm (from reference)

In [80]:
n_iters = 0
max_iters = 50

fine_tune = 1
has_converged = False
if not os.path.isfile('translation_model.pkl'):
    while n_iters < max_iters and has_converged == False:
        has_converged = True
        max_change = -1

        n_iters += 1
        count = {}
        total = {}
        for index in range(len(tokenized_en)):
            s_total = {}
            for en_word in tokenized_en[index]:
                s_total[en_word] = 0
                for fr_word in tokenized_fr[index]:
                    if (en_word, fr_word) not in t:
                        t[(en_word, fr_word)] = uniform
                    s_total[en_word] += t[(en_word, fr_word)]

            for en_word in tokenized_en[index]:
                for fr_word in tokenized_fr[index]:
                    if (en_word, fr_word) not in count:
                        count[(en_word, fr_word)] = 0
                    count[(en_word, fr_word)] += (t[(en_word, fr_word)] / s_total[en_word])

                    if fr_word not in total:
                        total[fr_word] = 0
                    total[fr_word] += (t[(en_word, fr_word)] / s_total[en_word])

        # estimating the probabilities

        if fine_tune == 0:
            updated = {}

            for index in range(len(tokenized_en)):
                for fr_word in tokenized_fr[index]:
                    for en_word in tokenized_en[index]:
                        if (en_word, fr_word) in updated:
                            continue
                        updated[(en_word, fr_word)] = 1
                        if abs(t[(en_word, fr_word)] - count[(en_word, fr_word)] / total[fr_word]) > 0.01:
                            has_converged = False
                            max_change = max(max_change, abs(t[(en_word, fr_word)] - count[(en_word, fr_word)] / total[fr_word]))
                        t[(en_word, fr_word)] = count[(en_word, fr_word)] / total[fr_word]
        
        elif fine_tune == 1:
                # train it only for 1000 most frequent words in English and French
                max_words = 1000
                n_fr_words = 0
                updates = 0

                for fr_word_tuples in sorted(fr_words.items(), key = lambda k:(k[1], k[0]), reverse = True):
                    fr_word = fr_word_tuples[0]
                    n_fr_words += 1
                    if n_fr_words > max_words:
                        break
                    n_en_words = 0
                    for en_word_tuples in sorted(en_words.items(), key = lambda k:(k[1], k[0]), reverse = True):
                        en_word = en_word_tuples[0]
                        n_en_words += 1
                        if n_en_words > max_words:
                            break
                        if (en_word, fr_word) not in count or fr_word not in total:
                            continue
                            # assume = uniform in this case
                        else:
                            if abs(t[(en_word, fr_word)] - count[(en_word, fr_word)] / total[fr_word]) > 0.005:
                                has_converged = False
                                max_change = max(max_change, abs(t[(en_word, fr_word)] - count[(en_word, fr_word)] / total[fr_word]))
                            t[(en_word, fr_word)] = count[(en_word, fr_word)] / total[fr_word]
                

                print("Iteration " + str(n_iters) + " Completed, Maximum Change: " + str(max_change))

Iteration 1 Completed, Maximum Change: 0.3873120883833188
Iteration 2 Completed, Maximum Change: 0.3779131577679606
Iteration 3 Completed, Maximum Change: 0.11644947148332752
Iteration 4 Completed, Maximum Change: 0.06989713947912918
Iteration 5 Completed, Maximum Change: 0.05399860149211433
Iteration 6 Completed, Maximum Change: 0.039101991368349964
Iteration 7 Completed, Maximum Change: 0.02915428347836463
Iteration 8 Completed, Maximum Change: 0.02500380364812338
Iteration 9 Completed, Maximum Change: 0.021059834762957474
Iteration 10 Completed, Maximum Change: 0.017462948788182775
Iteration 11 Completed, Maximum Change: 0.014428741448640092
Iteration 12 Completed, Maximum Change: 0.013170636150230397
Iteration 13 Completed, Maximum Change: 0.012005883242324877
Iteration 14 Completed, Maximum Change: 0.010935993145647804
Iteration 15 Completed, Maximum Change: 0.009958588791638634
Iteration 16 Completed, Maximum Change: 0.009068769131450716
Iteration 17 Completed, Maximum Change: 0.

Save Data in pickle file

In [81]:
import pickle
# saving the translation model
file = open("translation_model.pkl","wb")
pickle.dump(t, file)
file.close()

In [82]:
import pickle
# using the model trained until convergence
model_name = "translation_model.pkl"
pickle_in = open(model_name,"rb")
t = pickle.load(pickle_in)

Display best translation pairs

In [83]:
limit = 400
for element in sorted(t.items(), key = lambda k:(k[1], k[0]), reverse = True):
  print(element)
  limit -= 1
  if limit <= 0:
    break


(('html', 'html'), 0.765667329718205)
(('pdf', 'pdf'), 0.7593277191290286)
(('kb', 'ko'), 0.7393645135843259)
(('gst', 'tps'), 0.7099352737425695)
(('nation', 'nation'), 0.7002959511812783)
(('management', 'gestion'), 0.6947445719917178)
(('cihr', 'irsc'), 0.675069494995577)
(('hst', 'tvh'), 0.6744432862372122)
(('vision', 'vision'), 0.671790013757021)
(('first', 'first'), 0.6701625260213313)
(('x', 'x'), 0.6685504513606617)
(('qc', 'qc'), 0.6682243701989722)
(('institution', 'institution'), 0.6581979148755476)
(('date', 'date'), 0.6551523569329727)
(('april', 'avril'), 0.6541978536570784)
(('european', 'europeenne'), 0.6541393685921115)
(('strategy', 'strategie'), 0.6529762803486532)
(('criteria', 'criteres'), 0.6498389717918145)
(('related', 'connexes'), 0.6495501983735816)
(('financial', 'financiers'), 0.6475069678284671)
(('industry', 'industrie'), 0.6474918690748956)
(('principles', 'principes'), 0.6448987757527863)
(('rule', 'regle'), 0.6421718906774232)
(('quality', 'qualite'), 

In [86]:
english_sentence_tests = df_test['en'].tolist()
french_sentence_tests = df_test['fr'].tolist()

tokenized_en_test = [word_tokenize(sentence) for sentence in english_sentence_tests]
tokenized_fr_test = [word_tokenize(sentence) for sentence in french_sentence_tests]

Testing Code and BLEU Metrics

In [107]:
import nltk

def keywithmaxval(d):
     """ a) create a list of the dict's keys and values; 
         b) return the key with the max value"""  
     v = list(d.values())
     k = list(d.keys())
     return k[v.index(max(v))]


limit = 1
i = 0
all_preds = {}
for en_sent in tokenized_en_test:
    if limit == 21:
        break
    pred = []
    for en_word in en_sent:
        filtered_t = {key: value for key, value in t.items() if key[0] == en_word}
        if len(filtered_t) > 0:
            best_en, best_fr = keywithmaxval(filtered_t)
        else:
            best_fr = 'UNKNOWN!!!'
        pred.append(best_fr)
    
    actual = tokenized_fr_test[i]
    
    BLEUscore = nltk.translate.bleu_score.sentence_bleu([actual], pred) 

    
    print("Sentence " + str(limit) + "/20")
    print("Prediction:", pred)
    print("Actual:", actual)
    print("BLEU Score:", BLEUscore)
    
    i += 1
    limit += 1

    all_preds[(' '.join(pred), ' '.join(actual))] = BLEUscore

"""
# displaying the most confident translation pairs
limit = 40
for element in sorted(all_preds.items(), key = lambda k:(k[1], k[0]), reverse = True):
  print(element)
  limit -= 1
  if limit <= 0:
    break
"""



The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


Sentence 1/20
Prediction: ['sont', 'nous', 'solitude', '?']
Actual: ['sommes', 'nous', 'seuls', '?']
BLEU Score: 1.5319719891192393e-231
Sentence 2/20
Prediction: ['it', 'etait', 'the', 'first', 'vrai', 'photometre']
Actual: ['c', 'est', 'le', 'premier', 'vrai', 'photometre']
BLEU Score: 7.57965434483665e-155
Sentence 3/20
Prediction: ['quels', 'quel', 'nous', 'faire', 'conge', 'it', '?']
Actual: ['que', 'ferions', 'nous', 'sans', 'elle', '?']
BLEU Score: 1.331960397810445e-231
Sentence 4/20
Prediction: ['the', 'faut', 'etude', 'x', 'photons']
Actual: ['le', 'soleil', 'vu', 'sous', 'les', 'rayons', 'x']
BLEU Score: 8.166726842395623e-232
Sentence 5/20
Prediction: ['UNKNOWN!!!', 'votre', 'declaration', 'sont', 'droit']
Actual: ['amerindien', 'vos', 'parents', 'ont', 'raison']
BLEU Score: 0
Sentence 6/20
Prediction: ['technicien', 'vous', 're', 'merci', 'droit']
Actual: ['technicien', 'tu', 'as', 'raison']
BLEU Score: 1.2183324802375697e-231
Sentence 7/20
Prediction: ['les', 'soignent', 

'\n# displaying the most confident translation pairs\nlimit = 40\nfor element in sorted(all_preds.items(), key = lambda k:(k[1], k[0]), reverse = True):\n  print(element)\n  limit -= 1\n  if limit <= 0:\n    break\n'