# Build an efficient unsupervised word translator

Based on : "Word Translation Without Parallel Data" by Alexis Conneau, Guillaume Lample, Marc Aurelio Ranzato, Ludovic Denoyer & Hervé Jégou (2017)

In [64]:
import io

import numpy as np
import matplotlib.pyplot as plt

import torch
from torch import nn
import torch.optim as optim
import torch.nn.functional as F
#from torch.autograd.variable import Variable

#from sklearn.metrics.pairwise import cosine_similarity

In [65]:
# load function for pretrained versions of word embeddings
def load_embeddings(emb_path, nmax=50000):
    vectors = []
    word2id = {}
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        next(f)
        for i, line in enumerate(f):
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            assert word not in word2id, 'word found twice'
            vectors.append(vect)
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {v: k for k, v in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [108]:
##########
eng_path = '/Users/louismonier/Downloads/Monolingual/wiki.en.vec' 
fr_path = '/Users/louismonier/Downloads/Monolingual/wiki.fr.vec'
##########
#eng_path = '/Users/vince/DataProjetNLP/wiki.en.vec'
#fr_path = '/Users/vince/DataProjetNLP/wiki.fr.vec'
##########
nmax = 50000  # maximum number of word embeddings to load

# load monolingual word embeddings 
src_embeddings, src_id2word, src_word2id = load_embeddings(fr_path, nmax) # source = french 
tgt_embeddings, tgt_id2word, tgt_word2id = load_embeddings(eng_path, nmax) # target = english

In [67]:
# load ground-truth bilingual dictionaries function
def load_dic(path):
    dico_full = {}
    vectors_src=[]
    vectors_tgt = []
    with io.open(path,'r',encoding='utf_8') as f:
        for i,line in enumerate(f):
            word_src, word_tgt = line.rstrip().split(' ',1)
            if word_tgt in tgt_word2id :
                dico_full[word_src]=word_tgt
    for key in dico_full.keys() :
            vectors_src.append(src_embeddings[src_word2id[key]])
            vectors_tgt.append(tgt_embeddings[tgt_word2id[dico_full[key]]])
    X = np.vstack(vectors_src)
    Z = np.vstack (vectors_tgt)
    return dico_full,X,Z

In [68]:
# train & test bilingual dictionaries
##########
path_train = r'/Users/louismonier/Downloads/Monolingual/fr-en.0-5000.txt' 
path_test = r'/Users/louismonier/Downloads/Monolingual/fr-en.5000-6500.txt'
##########
#path_train = '/Users/vince/DataProjetNLP/fr-en.0-5000.txt' 
#path_test = '/Users/vince/DataProjetNLP/fr-en.5000-6500.txt'
##########

dico_train, X_train, Z_train = load_dic(path_train)
dico_test, X_test, Z_test = load_dic(path_test)

# convert embeddings vectors into torch tensors 
print(type(X_train[0]))
X_train, Z_train, X_test, Z_test = map(torch.tensor, (X_train, Z_train, X_test, Z_test)) 
print(type(X_train[0]))

<class 'numpy.ndarray'>
<class 'torch.Tensor'>


In [69]:
print(X_train.shape[0], "training samples")
print(X_test.shape[0], "test samples")
dim = X_train.shape[1]
print("Vectors dimension :", dim)

4971 training samples
1483 test samples
Vectors dimension : 300


## Build the discriminator 

Recall what is the objective of the discriminator here : ...

In [72]:
class Discriminator(nn.Module):
    def __init__(self, dim):
        super(Discriminator,self).__init__()
        self.h1 = nn.Linear(dim, 2048,bias=True) # 1st hidden layer
        self.h2 = nn.Linear(2048,2048,bias=True) # 2nd hidden layer
        self.out = nn.Linear(2048,1,bias=True) # output layer
        
    def forward(self, x):
        x = self.h1(F.dropout(x, p = 0.1)) # dropout pour ajouter du bruit
        x = F.leaky_relu(self.h1(x), negative_slope=0.2) 
        x = F.leaky_relu(self.h2(x), negative_slope=0.2)
        y = F.sigmoid(self.out(x)) # ouput = proba
        return y

In [73]:
def LossDisc(y, y_pred):
    return(-math.log((y_pred**y)*(1-y_pred)**(1-y)))

# or : 
LossD = nn.BCELoss()

## Build the generator 

Recall what is the objective of the discriminator here : ...

In [74]:
# simple linear function 
# can be seen at a neural network whose weights are elements of W 
class Generator(nn.Module):
    def __init__(self, dim):
        super(Generator, self).__init__()
        self.l1 = nn.Linear(dim, dim)

    def forward(self,x):
        y = self.l1(x)
        return y

In [75]:
def LossMap(y, y_pred):
    return(-math.log((y_pred**(1-y))*(1-y_pred)**y))

# or :
LossG = nn.BCELoss()

## It's time to train 

In [76]:
discrim = Discriminator(dim)
gen = Generator(dim)

In [77]:
optimD = optim.SGD(discrim.parameters(), lr=0.1)
optimG = optim.SGD(gen.parameters(), lr=0.1)

In [114]:
rand_src_word_id = torch.Tensor(32).random_(X_train.shape[0]).long()
src_word_emb = src_embeddings[rand_src_word_id.numpy()]
src_word_emb = torch.tensor(src_word_emb,dtype=torch.float)
print(src_word_emb.shape)
print(type(src_word_emb))
src_word_emb = torch.tensor(src_word_emb,dtype=torch.float)
print(type(src_word_emb))

(32, 300)
<class 'numpy.ndarray'>
<class 'torch.Tensor'>


In [62]:
#for epoch in range(3): #3 Epochs 

niter = 10
for iteration in range(niter):
    #if iteration % 10 == 0 :
        #print("epoch = %d, iteration = %d"%(epoch,iteration))
    
    # DISCRIMINATOR TRAINING
    for i in range(3):
        # set the training mode to True
        discrim.train()
        
        # set descrim gradient to zero before computation at every step
        optimD.zero_grad()
        
        # generate 32 random words from the source 
        rand_src_word_id = torch.Tensor(32).random_(nmax).long()
        src_word_emb = src_embeddings[rand_src_word_id.numpy()]
        src_word_emb = torch.tensor(src_word_emb, dtype=torch.float) # conversion to tensor
    
        wsrc_gen = gen(src_word_emb)
    
        # generate 32 random words from the target
        rand_tgt_word_id = torch.Tensor(32).random_(nmax).long()
        tgt_word_emb = tgt_embeddings[rand_tgt_word_id.numpy()]
        tgt_word_emb = torch.tensor(tgt_word_emb, dtype=torch.float) # conversion to tensor

        # concatenation of Wx and y aka traductions and tgt words
        input_tensor = torch.cat([src_mult_mapper,target_lang_word_emb],0)

        # output tensor is the answer the discriminator should give
        output_tensor = torch.Tensor(64).zero_().float()
        # we can smooth the answer by creating thresholds (# 0.8 # smoothing 80% # smoothing 20%)
        output_tensor[:32] = 1 # discrim should predict 100% proba of belonging to the src 
        output_tensor[32:] = 0 # discrim should predict 100% proba of belonging to the tgt 
        
        # prediction of the discriminator
        prediction = discriminator(input_tensor)
        
        # compute loss & propogate backward
        loss_discrim = LossD(prediction, output_tensor)
        # loss_discrim = LossDisc(output_tensor, prediction)
        
        loss_discrim.backward()
        optimD.step()

    ### TO CONTINUE HERE 
    # mapping training 
    discriminator.eval()
    #Set gradient to zero before computation at every step
    optimizer2.zero_grad()
    src_lang_word_id = torch.Tensor(32).random_(50000).long()
    src_lang_word_emb = src_embedding_learnable(src_lang_word_id).cuda()
    target_lang_word_id = torch.Tensor(32).random_(50000).long()
    target_lang_word_emb = target_embedding_learnable(target_lang_word_id).cuda()
    src_mult_mapper = mapper(src_lang_word_emb)
    input_tensor = torch.cat([src_mult_mapper,target_lang_word_emb],0).cuda()
    output_tensor = torch.Tensor(64).zero_().float().cuda()
    output_tensor[:32] = 1 -0.2 #Smoothing
    output_tensor[32:] = 0.2
    prediction = discriminator(input_tensor).cuda()
    loss = criterion2(prediction,1-output_tensor).cuda()
    loss.backward()
    optimizer2.step()
    mapping_tensor = mapper.linear1.weight.data
    mapping_tensor.copy_((1.01) * mapping_tensor - 0.01 * mapping_tensor.mm(mapping_tensor.t().mm(mapping_tensor)))


#Validation through proxy parralel dictionary construction (both directions) and CSLS
src_emb_map_validation = mapper(src_embedding_learnable.weight.cuda()).cuda()
target_emb_map_validation = target_embedding_learnable.weight.cuda()
src_emb_map_validation = src_emb_map_validation/src_emb_map_validation.norm(2, 1, keepdim=True).expand_as(src_emb_map_validation)
target_emb_map_validation = target_emb_map_validation/target_emb_map_validation.norm(2, 1, keepdim=True).expand_as(target_emb_map_validation)
src_to_target_dictionary = top_words(src_emb_map_validation,target_emb_map_validation)
target_to_src_dictionary = top_words(target_emb_map_validation,src_emb_map_validation)
dictionary = proxy_construct_dictionary(src_emb_map_validation,target_emb_map_validation,src_to_target_dictionary,target_to_src_dictionary)
if dictionary is None:
    mean_cosine = -1e9
else:
    mean_cosine = (src_emb_map_validation[dictionary[:, 0]] * target_emb_map_validation[dictionary[:, 1]]).sum(1).mean()

# Dampenining by 0.95
optimizer1.param_groups[0]['lr'] = 0.95*optimizer1.param_groups[0]['lr']
optimizer2.param_groups[0]['lr'] = 0.95*optimizer2.param_groups[0]['lr']
#Divide by 2 if validation decreases
if mean_cosine > 0 and mean_cosine < validation_tracker :
    optimizer1.param_groups[0]['lr'] = 0.5*optimizer1.param_groups[0]['lr']
    optimizer2.param_groups[0]['lr'] = 0.5*optimizer2.param_groups[0]['lr']
    validation_tracker = max(mean_cosine,validation_tracker)
print(epoch,mean_cosine)




epoch = 0, iteration = 0


NameError: name 'disc' is not defined

In [18]:
n_epochs = 1

for n in range(n_epochs):
    i = 0
    for i in range lenx, y in train_loader:
        y_pred = model(x)
        loss = criterion(y_pred,y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        i+=1
        if i % 100 ==0 :
            print("Epoch [{}/{}], [{}/{}], batch loss = {}".format(n+1,n_epochs,i*bs,n_data_train,loss))      
            
    acc = 0    
    with torch.no_grad():
        for x, y in test_loader:
            y_pred = model(x)
            y_pred = torch.argmax(y_pred,dim=1)
            acc += (y_pred == y).sum().item()
        acc = acc/n_data_test
    print("Epoch [{}/{}], test accuracy = {}".format(n+1,n_epochs,acc))

Discriminator(
  (h1): Linear(in_features=300, out_features=2048, bias=True)
  (h2): Linear(in_features=2048, out_features=2048, bias=True)
  (out): Linear(in_features=2048, out_features=1, bias=True)
)

In [31]:
# function to minimize 
# normalisation added
def C(W,X,Z):
    S = 0
    S = sum(np.linalg.norm(np.dot(X, W.T) - Z, axis=1)**2) 
    return S