# Build an efficient supervised word translator for English to French and French to English

Based on : "Exploiting Similarities among Languages for Machine Translation" of Tomas Mikolov, Quoc V. Le & Ilya Sutskever (2013)

In [20]:
import io

import numpy as np
from numpy import dot
from numpy.linalg import norm

import sklearn
from sklearn.metrics.pairwise import cosine_similarity

In [21]:
# load function for pretrained versions of word embeddings
def load_embeddings(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

#### FRENCH to ENGLISH

* source language = french
* target language = english

In [22]:
eng_path = '/Users/louismonier/Downloads/Monolingual/wiki.en.vec' 
fr_path = '/Users/louismonier/Downloads/Monolingual/wiki.fr.vec'
nmax = 50000  # maximum number of word embeddings to load

# Monolingual word embeddings 
src_embeddings, src_id2word, src_word2id = load_embeddings(fr_path, nmax) # source = french 
tgt_embeddings, tgt_id2word, tgt_word2id = load_embeddings(eng_path, nmax) # target = english

In [23]:
# read ground-truth bilingual dictionaries function
def load_dic(path):
    dico_full = {}
    vectors_src=[]
    vectors_tgt = []
    with io.open(path,'r',encoding='utf_8') as f:
        for i,line in enumerate(f):
            word_src, word_tgt = line.rstrip().split(' ',1)
            if word_tgt in tgt_word2id :
                dico_full[word_src]=word_tgt
    for key in dico_full.keys() :
            vectors_src.append(src_embeddings[src_word2id[key]])
            vectors_tgt.append(tgt_embeddings[tgt_word2id[dico_full[key]]])
    X = np.vstack(vectors_src)
    Z = np.vstack (vectors_tgt)
    return dico_full,X,Z

In [45]:
# train & test bilingual dictionaries
path_train = r'/Users/louismonier/Downloads/Monolingual/fr-en.0-5000.txt' 
path_test = r'/Users/louismonier/Downloads/Monolingual/fr-en.5000-6500.txt'
dico_train, X_train, Z_train = load_dic(path_train)
dico_test, X_test, Z_test = load_dic(path_test) 

#### Learning a linear mapping from a source (french) to a target (english) embedding space thanks to a translation Matrix W 

Suppose we are given a set of word pairs and their associated vector representations ${ \{x_i , z_i\} }^{n}_{i=1}$ , where $x_i$ ∈ $R^{d1}$ is the distributed representation of word i in the source language, and $z_i$ ∈ $R^{d2}$ is the vector representation of its translation.

It is our goal to find a transformation matrix W such that W xi approximates $z_i$ . In practice, W can be learned by the following optimization problem :

$$ \underset{W}{min} \sum_{i=1}^{n} \| W x_i - z_i \|^2 $$ 

which we solve with gradient descent (GD), stochastic gradient descent (SGD) or mini-batch gradient descent (BGD).

In [49]:
# function to minimize
def C(W,X,Z):
    S=0
    for i in range(X.shape[0]):
        S=S+np.linalg.norm(np.dot(W,X[i])-Z[i])**2
    return S

In [50]:
# gradient of the function minimize
def dC_dW(W,X,Z):
    S=0
    for i in range(X.shape[0]):
        S=S+2*np.outer((np.dot(W,X[i])-Z[i]),X[i])
    return S

In [51]:
# GD function
def gradientDescent(eta,N): 
    W = np.random.rand(300,300) # random initialisation of W
    value_C = []
    acc_train = []
    acc_test = [] 
    
    for t in range(N): 
        if (t%10==0): # every 10 examples to go a little bit faster
            #dico_pred_test = prediction_dict(dico_test,W)  
            #acc_test.append(accuracy(dico_pred_test,dico_test))
            dico_pred_train = prediction_dict(dico_train,W)  
            acc_train.append(accuracy(dico_pred_train,dico_train))
        W -= eta*dC_dW(W,X_train,Z_train)
        value_C.append(C(W,X_train,Z_train))
    
    return (W,value_C,acc_test,acc_train)

In [52]:
# SGC or MGD
def SGD(eta,N,nb):
    W = np.random.rand(300,300) # random initialisation of W
    grad=0
    value_C = []
    
    for t in range(N):
        if nb>1 : # MGD
            l = np.random.choice(len(dico_train),nb) # size of batch
            for p in l :  
                grad += (2*np.outer((np.dot(tmp_W,X_train[p])-Z_train[p]),X_train[p]))
        else : # SGD
            l = np.random.randint(low=0,high=len(dico_train)) 
            grad += (2*np.outer((np.dot(tmp_W,X_train[l])-Z_train[l]),X_train[l]))
        grad = (1/nb)*grad
        W -= eta*grad
        value_C.append(C(W,X_train,Z_train))
        
    return (W,value_C)

### Training the model

### Testing the model

At the prediction time, for any given new word and its continuous vector representation x, we can map it to the other language space by computing $z = Wx$. Then, we find the word whose representation is closest to z in the target language space, using cosine similarity as the distance metric.

In [None]:
def prediction(W, new_word):
    x = src_embeddings[src_word2id[new_word]] # vector representation of the new word in the source space
    z = np.dot(W,x) # vector representation of the translated word in the target space

    # representation closest to z in the target language space, using cosine similarity as the distance metric
    z_pred = np.argmax(sklearn.metrics.pairwise.cosine_similarity(z.reshape(1,300),tgt_embeddings)) 
    
    return tgt_id2word[z_pred] # return the id of the translated word

In [54]:
# construct a translation French to English dictionary
def prediction_dict(dico, W):
    dico_pred={}
    i=0
    for word in dico.keys() :
        if (i%10==0):
            print(i)
        dico_pred[word] = prediction(W, word)
        i += 1
    return dico_pred 

In [55]:
# measure of the accuracy of the dictionnarty
def accuracy(dico_pred, dico):
    c = 0
    for key in dico.keys():
        if dico[key] == dico_pred[key]:
            c += 1
    return(c/len(dico)) # nb de mots bien prédits/nb de mots total

In [56]:
import matplotlib.pyplot as plt
    
W, val, aac_test, acc_train = gradientDescent(0.001, 30)

plt.plot(range(len(val)),val)
plt.ylabel('Cost Function')
plt.xlabel('Iteration')
plt.title("Cost function")
plt.legend()
plt.show()

dico_pred = prediction_dict(dico_test)  
accuracy_test = accuracy(dico_pred,dico_test)

plt.plot(range(0,len(val),10),aac_test)
plt.ylabel('Accuracy TestSet')
plt.xlabel('Iteration')
plt.legend()
plt.show()

0
10
20
30
40
50
60
70
80
90
100
110
120
130
140
150
160
170
180
190
200
210
220
230
240
250
260
270
280
290
300
310
320
330
340
350
360
370
380
390
400
410
420
430
440
450
460
470
480
490
500
510
520
530
540
550
560
570
580
590
600
610
620
630
640
650
660
670
680
690
700
710
720
730
740
750
760
770
780
790
800
810
820
830
840
850
860
870
880
890
900
910
920
930
940
950
960
970
980
990
1000
1010
1020
1030
1040
1050
1060
1070
1080
1090
1100
1110
1120
1130
1140
1150
1160
1170
1180
1190
1200
1210
1220
1230
1240
1250
1260
1270
1280
1290
1300
1310
1320
1330
1340
1350
1360
1370
1380
1390
1400
1410
1420
1430
1440
1450
1460
1470
1480
1490
1500
1510
1520
1530
1540
1550
1560
1570
1580
1590
1600
1610
1620
1630
1640
1650
1660
1670
1680
1690
1700
1710
1720
1730
1740
1750
1760
1770
1780
1790
1800
1810
1820
1830
1840
1850
1860
1870
1880
1890
1900
1910
1920
1930
1940
1950
1960
1970
1980
1990
2000
2010
2020
2030
2040
2050
2060
2070
2080
2090
2100
2110
2120
2130
2140
2150
2160
2170
2180
2190
2200
2210
2

KeyboardInterrupt: 

## Adressing the inconsistency

Based on "Normalized Word Embedding and Orthogonal Transform for Bilingual Word Translation" of Chao Xing, Dong Wang, Chao Liu & Yiye Lin (2015)

In [None]:
# function to minimize
def C_ortho(W,X,Z):
    S=0
    for i in range(X.shape[0]):
        S+=(W.dot(X[i])).reshape(1,300).dot(Z[i].reshape(300,1))
    return S

In [None]:
# gradient of function to minimize
def dC_dW_ortho(W,X,Z):
    S=0
    for i in range(X.shape[0]):
        S+=np.outer(X[i],Z[i])
    return S

In [None]:
# orthogonal GD function
def orthogonal_GD(alpha,N)
    W = np.random.rand(300,300) # random initialisation of W
    value_C_ortho = []
    
    for t in range(N):
        W += alpha*dC_dW_ortho(tmp_W,X_train,Z_train)
        value_C_ortho.append(C_ortho(W,X_train,Z_train))
        
    #rajouter contrainte d'orthogonalité sur W 
    print(value_C_ortho)
    print(dC_dW_ortho(W,X_train,Z_train))
    print(np.linalg.norm(dC_dW_ortho(W,X_train,Z_train)))
    return(W,value_C_ortho)