In [25]:
import io

from sklearn.metrics.pairwise import cosine_similarity
from sklearn.manifold import TSNE
from statistics import stdev
import time
import numpy as np
import matplotlib.pyplot as plt
import math

In [63]:
from __future__ import print_function
#%matplotlib inline
import argparse
import os
import random
import torch
import torch.nn as nn
import torch.nn.parallel
import torch.backends.cudnn as cudnn
import torch.optim as optim
import torch.utils.data
import torchvision.datasets as dset
import torchvision.transforms as transforms
import torchvision.utils as vutils
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.animation as animation
from IPython.display import HTML
import torch.nn.functional as F

from torch.autograd.variable import Variable



In [27]:
# load function for pretrained versions of word embeddings
def load_embeddings(emb_path, nmax=50000):
    """Read up to `nmax` word vectors from a text embedding file.

    The first line of the file is skipped (assumed to be a header). Every
    following line is expected to be ``word v1 v2 ... vd`` separated by
    single spaces.

    Parameters
    ----------
    emb_path : str
        Path to the embedding file, read as UTF-8 (undecodable bytes ignored).
    nmax : int
        Stop after this many words have been loaded.

    Returns
    -------
    embeddings : numpy.ndarray
        One row per loaded word, in file order.
    id2word : dict
        Row index -> word.
    word2id : dict
        Word -> row index.

    Raises
    ------
    AssertionError
        If the same word appears twice among the loaded lines.
    """
    word2id = {}
    rows = []
    with io.open(emb_path, 'r', encoding='utf-8', newline='\n', errors='ignore') as handle:
        next(handle)  # discard the first line
        for raw_line in handle:
            token, values = raw_line.rstrip().split(' ', 1)
            parsed = np.fromstring(values, sep=' ')
            assert token not in word2id, 'word found twice'
            rows.append(parsed)
            # ids are assigned in reading order: 0, 1, 2, ...
            word2id[token] = len(word2id)
            if len(word2id) == nmax:
                break
    id2word = dict((index, token) for token, index in word2id.items())
    embeddings = np.vstack(rows)
    return embeddings, id2word, word2id

In [28]:
# Directory holding the monolingual embedding files. Set the NLP_DATA_DIR
# environment variable to point elsewhere instead of editing a user-specific
# absolute path; the default keeps the original location.
DATA_DIR = os.environ.get('NLP_DATA_DIR', '/Users/vince/DataProjetNLP')

eng_path = os.path.join(DATA_DIR, 'wiki.en.vec')
fr_path = os.path.join(DATA_DIR, 'wiki.fr.vec')


nmax = 50000  # maximum number of word embeddings to load

# load monolingual word embeddings
src_embeddings, src_id2word, src_word2id = load_embeddings(fr_path, nmax) # source = french
tgt_embeddings, tgt_id2word, tgt_word2id = load_embeddings(eng_path, nmax) # target = english

In [29]:
# load ground-truth bilingual dictionaries function
def load_dic(path):
    """Load a bilingual dictionary and the matching embedding matrices.

    Each non-blank line of the file is ``source_word<whitespace>target_word``
    (space- or tab-separated). A pair is kept only when BOTH words are present
    in the loaded vocabularies (the module-level `src_word2id` and
    `tgt_word2id`); previously only the target word was checked, so an
    out-of-vocabulary source word raised KeyError.

    When a source word appears on several lines, the last in-vocabulary
    translation wins, because pairs are stored in a dict keyed by source word.

    Parameters
    ----------
    path : str
        Path to the UTF-8 dictionary file.

    Returns
    -------
    dico_full : dict
        Source word -> target word for the kept pairs.
    X : numpy.ndarray
        Rows of `src_embeddings` for the kept source words.
    Z : numpy.ndarray
        Rows of `tgt_embeddings` for their translations, aligned with `X`.

    Raises
    ------
    ValueError
        If no pair of the file is covered by both vocabularies.
    """
    dico_full = {}
    with io.open(path,'r',encoding='utf_8') as f:
        for line in f:
            # split(None, 1) accepts any whitespace separator and yields
            # fewer than two fields for blank or malformed lines
            fields = line.rstrip().split(None, 1)
            if len(fields) != 2:
                continue
            word_src, word_tgt = fields
            if word_src in src_word2id and word_tgt in tgt_word2id:
                dico_full[word_src] = word_tgt
    if not dico_full:
        raise ValueError('no pair in %r is covered by both vocabularies' % (path,))
    # dict iteration order is shared by keys and values, so rows stay aligned
    X = np.vstack([src_embeddings[src_word2id[src]] for src in dico_full])
    Z = np.vstack([tgt_embeddings[tgt_word2id[tgt]] for tgt in dico_full.values()])
    return dico_full,X,Z

In [30]:
# train & test bilingual dictionaries
# The dictionary files are looked up in the same directory as the embedding
# files, so the location is derived from eng_path instead of repeating a
# user-specific absolute path.
dico_dir = os.path.dirname(eng_path)
path_train = os.path.join(dico_dir, 'fr-en.0-5000.txt')
path_test = os.path.join(dico_dir, 'fr-en.5000-6500.txt')
dico_train, X_train, Z_train = load_dic(path_train)
dico_test, X_test, Z_test = load_dic(path_test)
print(type(X_train[0]))
# NOTE(review): torch.tensor keeps the NumPy dtype of the arrays; if they are
# float64 they will not match the default float32 weights of nn.Linear —
# confirm before feeding them to the networks.
X_train, Z_train, X_test, Z_test = map(torch.tensor, (X_train, Z_train, X_test, Z_test)) #TRANSFORM INTO TORCH TENSORS
print(type(X_train[0]))

<class 'numpy.ndarray'>
<class 'torch.Tensor'>


In [31]:
# function to minimize
def C(W,X,Z):
    """Return the summed squared alignment error sum_i ||W x_i - z_i||^2.

    Parameters
    ----------
    W : array of shape (d_tgt, d_src)
        Linear map applied to each row of `X`.
    X : array of shape (n, d_src)
        Source vectors, one per row.
    Z : array of shape (n, d_tgt)
        Target vectors aligned row by row with `X`.

    Returns
    -------
    Scalar: the squared Euclidean norm of every row of ``X W^T - Z``, summed
    over rows. No normalisation of the inputs or of the result is applied.
    """
    residual = np.dot(X, W.T) - Z
    # np.sum reduces in NumPy rather than iterating the array in Python
    return np.sum(np.linalg.norm(residual, axis=1) ** 2)

In [32]:
# Report how many aligned pairs each split holds and record the embedding
# width in `dim`.
print(len(X_train), "training samples")
print(len(X_test), "test samples")
dim = X_train.shape[-1]
print("Vectors Dimension :", dim)

4971 training samples
1483 test samples
Vectors Dimension : 300


In [33]:
# Check the shape obtained by viewing the training matrix with its two
# dimensions swapped.
# NOTE(review): `view` reinterprets the same row-major data under a new shape;
# it does not transpose. If a transpose of X_train was intended, `X_train.t()`
# is the call that produces it — confirm the intent.
print(X_train.shape)
y = X_train.view(X_train.shape[1],-1)
print(y.shape)

torch.Size([4971, 300])
torch.Size([300, 4971])


In [78]:
# Small random fixtures: two batches of 10 vectors of width 300 with values
# drawn by torch.rand, plus fixed 0/1 labels stored as (10, 1) float columns.
testX_train = torch.rand(1, 300, 10).view(10, 300)
testX_test = torch.rand(1, 300, 10).view(10, 300)
testY_test = torch.as_tensor([0, 1, 1, 0, 1, 0, 1, 0, 1, 0], dtype=torch.float).reshape(-1, 1)
testY_train = torch.as_tensor([0, 0, 1, 0, 1, 0, 0, 1, 1, 1], dtype=torch.float).reshape(-1, 1)

In [79]:
# Display the test labels; they were reshaped to a single column with view(-1,1).
testY_test

tensor([[0.],
        [1.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.],
        [1.],
        [0.]])

In [34]:
class Discriminator(nn.Module):
    """Fully connected scorer: input -> 2048 -> 2048 -> 1 with ReLU in between.

    Parameters
    ----------
    d : int, shape or tensor
        Width of the input vectors. An int is used as is; for a shape
        (tuple / torch.Size) or anything exposing `.shape` (e.g. a sample
        batch) the last dimension is used. The constructor previously ignored
        this argument and read the module-level `dim` instead.

    `forward` returns the raw output of the last linear layer, shape
    (batch, 1); no sigmoid is applied.
    """

    def __init__(self,d):
        super(Discriminator,self).__init__()
        in_features = self._input_dim(d)
        self.h1 = nn.Linear(in_features,2048,bias=True)
        self.h2 = nn.Linear(2048,2048,bias=True)
        self.out = nn.Linear(2048,1,bias=True)

    @staticmethod
    def _input_dim(d):
        """Resolve the constructor argument to an integer feature width."""
        if isinstance(d, int):
            return d
        # tensors/arrays expose .shape; tuples and torch.Size are shapes already
        return int(getattr(d, 'shape', d)[-1])

    def forward(self,x):
        # flatten everything after the batch dimension
        y = x.view(x.shape[0],-1)
        y = F.relu(self.h1(y))
        y = F.relu(self.h2(y))
        y = self.out(y)
        return y

In [57]:
# Show the layer layout. The constructor argument is a feature width, so pass
# the fixture's second dimension and not the data tensor itself.
print(Discriminator(testX_test.shape[1]))

Discriminator(
  (h1): Linear(in_features=300, out_features=2048, bias=True)
  (h2): Linear(in_features=2048, out_features=2048, bias=True)
  (out): Linear(in_features=2048, out_features=1, bias=True)
)


In [76]:
# Smoke test: build a discriminator for the fixture width and score the batch.
# The module is called directly (not via .forward) so that nn.Module's
# __call__ machinery, including any registered hooks, runs.
disc = Discriminator(testX_test.shape[1])
disc(testX_test)

tensor([[ 0.0335],
        [ 0.0647],
        [ 0.0696],
        [ 0.0376],
        [ 0.0509],
        [ 0.0293],
        [ 0.0388],
        [-0.0084],
        [ 0.0494],
        [ 0.0951]], grad_fn=<AddmmBackward>)

In [77]:
# Put the discriminator in training mode; train() returns the module itself,
# which is why the cell displays its layer summary.
disc.train()

Discriminator(
  (h1): Linear(in_features=300, out_features=2048, bias=True)
  (h2): Linear(in_features=2048, out_features=2048, bias=True)
  (out): Linear(in_features=2048, out_features=1, bias=True)
)

In [35]:
class Mapping(nn.Module):
    """Single square linear layer applied to the input vectors.

    Parameters
    ----------
    d : int, shape or tensor
        Width of the vectors. An int is used as is; for a shape or anything
        exposing `.shape` the last dimension is used. The width was previously
        hard-coded to 300 regardless of `d`.

    The layer is stored as `linear1`, the name `forward` reads. It used to be
    registered as `l1`, so every forward pass raised AttributeError.
    """

    def __init__(self,d):
        super(Mapping,self).__init__()
        size = d if isinstance(d, int) else int(getattr(d, 'shape', d)[-1])
        self.linear1 = nn.Linear(size,size)

    def forward(self,x):
        return self.linear1(x)

In [36]:
def LossDisc(y, y_pred, eps=1e-12):
    """Binary cross-entropy of one prediction, from the discriminator's side.

    Computes ``-(y*log(p) + (1-y)*log(1-p))`` where ``p`` is `y_pred` clamped
    to ``[eps, 1-eps]``.

    Parameters
    ----------
    y : number
        Target label (1 or 0, or a smoothed value in between).
    y_pred : number
        Predicted probability that the label is 1.
    eps : float
        Clamp margin. Without it a saturated prediction (0 or 1) on the wrong
        side made math.log(0) raise "math domain error".

    Returns
    -------
    float
        The loss; large but finite for a saturated wrong prediction.
    """
    p = min(max(y_pred, eps), 1 - eps)
    # sum of logs rather than log of a product of powers: same value, but the
    # product cannot underflow to 0 before the log is taken
    return -(y * math.log(p) + (1 - y) * math.log(1 - p))

In [37]:
# Sanity check: label 1 with predicted probability 0.01.
LossDisc(1,0.01)

4.605170185988091

In [38]:
def LossMap(y, y_pred, eps=1e-12):
    """Binary cross-entropy of one prediction with the labels flipped.

    Computes ``-((1-y)*log(p) + y*log(1-p))`` where ``p`` is `y_pred` clamped
    to ``[eps, 1-eps]``, i.e. the discriminator loss evaluated against the
    opposite label.

    Parameters
    ----------
    y : number
        True label (1 or 0, or a smoothed value in between).
    y_pred : number
        Predicted probability that the label is 1.
    eps : float
        Clamp margin. Without it a saturated prediction (0 or 1) made
        math.log(0) raise "math domain error".

    Returns
    -------
    float
        The loss; large but finite for saturated predictions.
    """
    p = min(max(y_pred, eps), 1 - eps)
    return -((1 - y) * math.log(p) + y * math.log(1 - p))
LossMap(1,0.99)

4.605170185988091

In [39]:
# Build the two networks for 300-dimensional vectors and an SGD optimizer
# over the discriminator's parameters.
discrim = Discriminator(300)
gen = Mapping(300)
optimizer = torch.optim.SGD(discrim.parameters(),lr = 0.1)
# Keep a reference to the loss function itself: LossDisc is a plain function
# with required arguments, so calling it here with none raised TypeError.
criterion = LossDisc

In [None]:
# Adversarial training: for every mapping update the discriminator is updated
# three times on a batch of 32 mapped source vectors and 32 target vectors;
# after each epoch a validation score is computed and the learning rates decay.
#
# NOTE(review): this cell uses names that are not defined in the visible part
# of the notebook and must exist before it runs: src_embedding_learnable,
# target_embedding_learnable, mapper, discriminator, criterion1, criterion2,
# optimizer1, optimizer2, top_words, proxy_construct_dictionary and
# validation_tracker. It also calls .cuda() unconditionally.
# NOTE(review): `disc` and `map` below are created but not used by the loop
# (it works on `discriminator` and `mapper`), and `map` shadows the builtin
# used by the tensor-conversion cell — consider dropping or renaming them.
disc = Discriminator(testX_test.shape)
map = Mapping(testX_test.shape)
for epoch in range(3): #3 Epochs 
    for iteration in range(30):
        if iteration % 10 == 0 :
            print("epoch = %d, iteration = %d"%(epoch,iteration))
        # discriminator trained 3 times for every mapping training
        for i in range(3):
            # toggle the model that is actually trained below (was disc.train())
            discriminator.train()
            #Set gradient to zero before computation at every step
            # zero the optimizer that is stepped below (was optimizer.zero_grad())
            optimizer1.zero_grad()
            src_lang_word_id = torch.Tensor(32).random_(50000).long()
            src_lang_word_emb = src_embedding_learnable(src_lang_word_id).cuda()
            target_lang_word_id = torch.Tensor(32).random_(50000).long()
            target_lang_word_emb = target_embedding_learnable(target_lang_word_id).cuda()
            src_mult_mapper = mapper(src_lang_word_emb).cuda()
            input_tensor = torch.cat([src_mult_mapper,target_lang_word_emb],0).cuda()
            # first 32 rows are mapped source vectors, last 32 are target vectors
            output_tensor = torch.Tensor(64).zero_().float().cuda()
            output_tensor[:32] = 1 -0.2 #Smoothing
            output_tensor[32:] = 0.2
            prediction = discriminator(input_tensor).cuda()
            #Compute loss and propagate backward
            loss = criterion1(prediction,output_tensor).cuda()
            loss.backward()
            optimizer1.step()

        # mapping training 
        discriminator.eval()
        #Set gradient to zero before computation at every step
        optimizer2.zero_grad()
        src_lang_word_id = torch.Tensor(32).random_(50000).long()
        src_lang_word_emb = src_embedding_learnable(src_lang_word_id).cuda()
        target_lang_word_id = torch.Tensor(32).random_(50000).long()
        target_lang_word_emb = target_embedding_learnable(target_lang_word_id).cuda()
        src_mult_mapper = mapper(src_lang_word_emb)
        input_tensor = torch.cat([src_mult_mapper,target_lang_word_emb],0).cuda()
        output_tensor = torch.Tensor(64).zero_().float().cuda()
        output_tensor[:32] = 1 -0.2 #Smoothing
        output_tensor[32:] = 0.2
        prediction = discriminator(input_tensor).cuda()
        # the mapping is trained against the flipped labels
        loss = criterion2(prediction,1-output_tensor).cuda()
        loss.backward()
        optimizer2.step()
        # in-place update W <- 1.01*W - 0.01*W(W^T W) on the mapping weights
        mapping_tensor = mapper.linear1.weight.data
        mapping_tensor.copy_((1.01) * mapping_tensor - 0.01 * mapping_tensor.mm(mapping_tensor.t().mm(mapping_tensor)))


    #Validation through proxy parallel dictionary construction (both directions) and CSLS
    src_emb_map_validation = mapper(src_embedding_learnable.weight.cuda()).cuda()
    target_emb_map_validation = target_embedding_learnable.weight.cuda()
    # L2-normalise every row so the dot products below are cosine similarities
    src_emb_map_validation = src_emb_map_validation/src_emb_map_validation.norm(2, 1, keepdim=True).expand_as(src_emb_map_validation)
    target_emb_map_validation = target_emb_map_validation/target_emb_map_validation.norm(2, 1, keepdim=True).expand_as(target_emb_map_validation)
    src_to_target_dictionary = top_words(src_emb_map_validation,target_emb_map_validation)
    target_to_src_dictionary = top_words(target_emb_map_validation,src_emb_map_validation)
    dictionary = proxy_construct_dictionary(src_emb_map_validation,target_emb_map_validation,src_to_target_dictionary,target_to_src_dictionary)
    if dictionary is None:
        mean_cosine = -1e9
    else:
        mean_cosine = (src_emb_map_validation[dictionary[:, 0]] * target_emb_map_validation[dictionary[:, 1]]).sum(1).mean()

    # Dampening by 0.95
    optimizer1.param_groups[0]['lr'] = 0.95*optimizer1.param_groups[0]['lr']
    optimizer2.param_groups[0]['lr'] = 0.95*optimizer2.param_groups[0]['lr']
    #Divide by 2 if validation decreases
    if mean_cosine > 0 and mean_cosine < validation_tracker :
        optimizer1.param_groups[0]['lr'] = 0.5*optimizer1.param_groups[0]['lr']
        optimizer2.param_groups[0]['lr'] = 0.5*optimizer2.param_groups[0]['lr']
    # Track the best score every epoch. This update used to sit inside the
    # branch above, where mean_cosine < validation_tracker made max() a no-op,
    # so the tracker never moved.
    validation_tracker = max(mean_cosine,validation_tracker)
    print(epoch,mean_cosine)




In [18]:
# Generic supervised loop: one optimisation pass over train_loader per epoch,
# then accuracy over test_loader.
# NOTE(review): model, criterion, optimizer, train_loader, test_loader, bs,
# n_data_train and n_data_test must be defined before this cell runs; the
# loaders, bs and the n_data_* counts are not defined in the visible part of
# the notebook.
n_epochs = 1

for n in range(n_epochs):
    # i counts processed batches starting at 1. The original loop header was
    # not valid Python ("for i in range lenx, y in train_loader:").
    for i, (x, y) in enumerate(train_loader, start=1):
        y_pred = model(x)
        loss = criterion(y_pred,y)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        if i % 100 ==0 :
            # log the scalar value instead of the tensor repr
            print("Epoch [{}/{}], [{}/{}], batch loss = {}".format(n+1,n_epochs,i*bs,n_data_train,loss.item()))

    acc = 0
    with torch.no_grad():
        for x, y in test_loader:
            y_pred = model(x)
            # predicted class = index of the largest score along dim 1
            y_pred = torch.argmax(y_pred,dim=1)
            acc += (y_pred == y).sum().item()
        acc = acc/n_data_test
    print("Epoch [{}/{}], test accuracy = {}".format(n+1,n_epochs,acc))

Discriminator(
  (h1): Linear(in_features=300, out_features=2048, bias=True)
  (h2): Linear(in_features=2048, out_features=2048, bias=True)
  (out): Linear(in_features=2048, out_features=1, bias=True)
)