# Build an efficient unsupervised word translator

Based on: "Word Translation Without Parallel Data" by Alexis Conneau, Guillaume Lample, Marc'Aurelio Ranzato, Ludovic Denoyer & Hervé Jégou (2017)

We only use two large monolingual corpora, one in the source language and one in the target language. Our method leverages adversarial training to learn a linear mapping from the source embedding space to the target embedding space. How does it work?

* In a two-player game, a discriminator is trained to distinguish between the mapped source embeddings and the target embeddings, while the mapping (which can be seen as a generator) is jointly trained to fool the discriminator.

In [2]:
import torch
from torch import nn
import torch.nn.functional as F

import numpy as np
from scipy.stats import special_ortho_group
import matplotlib.pyplot as plt

In [3]:
# load function for pretrained versions of word embeddings
def load_embeddings(emb_path, nmax=50000):
    """Load pretrained word vectors from a text embedding file.

    The first line of the file is treated as a header and skipped. Every
    following line must hold a word, a single space, and then the
    space-separated components of its vector.

    Args:
        emb_path: Path to the embedding file. It is decoded as UTF-8 and
            undecodable bytes are ignored.
        nmax: Reading stops as soon as this many words have been loaded.

    Returns:
        A tuple ``(embeddings, id2word, word2id)``: the vectors stacked row
        by row in file order, the index -> word mapping, and the
        word -> index mapping.

    Raises:
        AssertionError: If the same word appears twice in the file.
        ValueError: If no vector could be read (nothing to stack) or a line
            does not contain both a word and a vector.
    """
    vectors = []
    word2id = {}
    # The built-in open() is used instead of io.open(): both are the same
    # function in Python 3, and `io` is never imported in this file.
    with open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as f:
        # Skip the header line; the default keeps an empty file from raising
        # StopIteration here.
        next(f, None)
        for line in f:
            word, vect = line.rstrip().split(' ', 1)
            assert word not in word2id, 'word found twice'
            vectors.append(np.fromstring(vect, sep=' '))
            # Ids are assigned in order of appearance, so they match the row
            # index of the stacked matrix.
            word2id[word] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = {idx: word for word, idx in word2id.items()}
    embeddings = np.vstack(vectors)
    return embeddings, id2word, word2id

In [4]:
# Cap on the number of word embeddings read from each file.
nmax = 50000

# Locations of the monolingual embedding files.
fr_path = '/Users/louismonier/Downloads/Monolingual/wiki.fr.vec'
eng_path = '/Users/louismonier/Downloads/Monolingual/wiki.en.vec'

# French is the source language, English the target language.
src_embeddings, src_id2word, src_word2id = load_embeddings(fr_path, nmax=nmax)
tgt_embeddings, tgt_id2word, tgt_word2id = load_embeddings(eng_path, nmax=nmax)

In [5]:
# load ground-truth bilingual dictionaries function
def load_dic(path):
    """Load a ground-truth bilingual dictionary and its aligned embeddings.

    Each line of the file holds a source word and its target translation
    separated by whitespace. A pair is kept only when both words have an
    embedding, i.e. the source word is in the module-level ``src_word2id``
    and the target word is in ``tgt_word2id``. When a source word occurs on
    several lines, the last kept translation wins.

    Args:
        path: Path to the dictionary file (read as UTF-8).

    Returns:
        A tuple ``(dico_full, X, Z)``: the source -> target word mapping, the
        source embeddings stacked row by row, and the target embeddings
        stacked in the same order, so row ``i`` of ``X`` and row ``i`` of
        ``Z`` form a translation pair.

    Raises:
        ValueError: If no pair of the file is covered by both vocabularies.
    """
    dico_full = {}
    # The built-in open() is used instead of io.open(): both are the same
    # function in Python 3, and `io` is never imported in this file.
    with open(path, 'r', encoding='utf_8') as f:
        for line in f:
            # Split on any whitespace so tab-separated files work too.
            fields = line.rstrip().split(None, 1)
            if len(fields) != 2:
                continue  # blank or malformed line
            word_src, word_tgt = fields
            # Both sides must be known: the source vocabulary is truncated
            # as well, so an unchecked source word would raise KeyError
            # when its embedding is looked up below.
            if word_src in src_word2id and word_tgt in tgt_word2id:
                dico_full[word_src] = word_tgt
    if not dico_full:
        raise ValueError(
            'no dictionary pair from %r is covered by both vocabularies' % (path,))
    vectors_src = [src_embeddings[src_word2id[src]] for src in dico_full]
    vectors_tgt = [tgt_embeddings[tgt_word2id[tgt]] for tgt in dico_full.values()]
    X = np.vstack(vectors_src)
    Z = np.vstack(vectors_tgt)
    return dico_full, X, Z

In [6]:
# train & test bilingual dictionaries
# Training dictionary file (French -> English word pairs).
path_train = r'/Users/louismonier/Downloads/Monolingual/fr-en.0-5000.txt'
# Test dictionary file, currently disabled.
#path_test = r'/Users/louismonier/Downloads/Monolingual/fr-en.5000-6500.txt'
# dico_train maps source words to target words; X_train / Z_train hold the
# matching source / target embeddings, one row per dictionary entry.
dico_train, X_train, Z_train = load_dic(path_train)
#dico_test, X_test, Z_test = load_dic(path_test) 

In [8]:
# Width of the training matrix, i.e. the size of one source embedding.
dsrc = X_train.shape[1]
# Initial mapping: a random dsrc x dsrc matrix drawn from the special
# orthogonal group.
W = special_ortho_group.rvs(dim=dsrc)

In [None]:
class MLP_discriminator(nn.Module):
    """Binary discriminator: dropout -> linear -> LeakyReLU -> linear -> sigmoid.

    Args:
        dim_input: Size of each input embedding.
        disc_drop: Dropout probability applied to the input. It used to be
            read from an undefined global; 0.1 is the value listed in this
            notebook's hyperparameter notes.
    """

    def __init__(self, dim_input, disc_drop=0.1):
        super().__init__()
        self.dropout1 = nn.Dropout(disc_drop)
        self.linear_1 = nn.Linear(dim_input, 2048, bias=True)
        self.linear_2 = nn.Linear(2048, 1, bias=True)

    def forward(self, x):
        """Return one value in (0, 1) per input row.

        Args:
            x: Tensor whose last dimension equals ``dim_input``.

        Returns:
            Tensor with the same leading dimensions as ``x`` and a last
            dimension of size 1.
        """
        # The dropout layer was created but never applied; it is placed on
        # the input here. NOTE(review): confirm this is the intended position.
        y = self.dropout1(x)
        # F.leaky_relu is the functional form; `F.LeakyReLU` does not exist.
        y = F.leaky_relu(self.linear_1(y))
        # nn.Sigmoid(...) would build a module instead of applying it.
        return torch.sigmoid(self.linear_2(y))

In [None]:
#class generator(nn.Module):
#    def __init__(self, dim_input):
       # super(generator, self).__init__()

#ini
#W = special_ortho_group.rvs(dsrc)       

#beta = 0.01
# update to keep W orthogonal
#W = (1 + beta)*W - beta*np.dot(np.dot(W, W.T), W)


In [None]:
#batch_size = 64
#disc_drop = 0.1
#smoothing = 0.2
#k = 1  # number of discriminator training iterations
#lr = 0.1
#decay = 0.99