In [54]:
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch.nn import functional as F

In [5]:
#src langauge is en. Target language is es.
#Load pre-trained embeddings. Using fast-text embeddings on wiki.

#Parameters
max_vocab = 10000
emb_dim = 300

In [6]:
#Processing the src words first
src_lang = 'en'
src_emd_path = 'data/wiki.en.vec'

src_word2id = {}
src_embeddings = []

with open(src_emd_path) as f:
    for i,line in enumerate(f):
        if i==0:
            split = line.split()
            assert len(split) == 2
            assert emb_dim == int(split[1])
        else:
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            if np.linalg.norm(vect)==0: #to avoid null embeddings
                vect[0] = 0.01
            assert word not in src_word2id
            assert vect.shape == (emb_dim, )
            src_word2id[word] = len(src_word2id)
            src_embeddings.append(vect[None,:])
        if i > max_vocab:
            break
            
src_id2word = {}
src_id2word = {v: k for k,v in src_word2id.items()}
src_embeddings = np.concatenate(src_embeddings,0)
src_embeddings = torch.from_numpy(src_embeddings).float()
src_embeddings = src_embeddings.cuda()

In [8]:
src_emb = nn.Embedding(len(src_word2id), emb_dim, sparse=True)
src_emb.weight.data.copy_(src_embeddings)


-2.3167e-02 -4.2483e-03 -1.0572e-01  ...   8.9398e-02 -1.5900e-02  1.4866e-01
-1.1112e-01 -1.3859e-03 -1.7780e-01  ...   6.3374e-02 -1.2161e-01  3.9339e-02
-6.5334e-02 -9.3031e-02 -1.7571e-02  ...   1.6642e-01 -1.3079e-01  3.5397e-02
                ...                   ⋱                   ...                
 3.4931e-02 -2.5885e-02 -2.5731e-01  ...  -2.5291e-02  1.9552e-01  1.5896e-01
-1.6874e-01 -8.9404e-04 -4.0138e-02  ...   2.5356e-01 -2.0092e-02  2.0918e-01
-9.0509e-02  3.2207e-01 -6.3597e-01  ...   1.4948e-01  1.6384e-01  3.8038e-01
[torch.FloatTensor of size 10001x300]

In [9]:
from src.dictionary import Dictionary

In [12]:
src_dico = Dictionary(src_id2word, src_word2id, src_lang)

In [13]:
# Setting up embeddings for target language

tgt_lang = 'es'
tgt_emd_path = 'data/wiki.es.vec'


tgt_word2id = {}
tgt_id2word = {}
tgt_embeddings = []



with open(tgt_emd_path) as f:
    for i,line in enumerate(f):
        if i==0:
            split = line.split()
            assert len(split) == 2
            assert emb_dim == int(split[1])
        else:
            word, vect = line.rstrip().split(' ', 1)
            vect = np.fromstring(vect, sep=' ')
            if np.linalg.norm(vect)==0: #to avoid null embeddings
                vect[0] = 0.01
            assert word not in tgt_word2id
            assert vect.shape == (emb_dim, )
            tgt_word2id[word] = len(tgt_word2id)
            tgt_embeddings.append(vect[None,:])
        if i > max_vocab:
            break

tgt_id2word = {v:k for k,v in tgt_word2id.items()}

tgt_dico = Dictionary(tgt_id2word, tgt_word2id, tgt_lang)
tgt_embeddings = np.concatenate(tgt_embeddings,0)
tgt_embeddings = torch.from_numpy(tgt_embeddings).float()
tgt_embeddings = tgt_embeddings.cuda()

tgt_emb = nn.Embedding(len(tgt_word2id), emb_dim, sparse=True)
tgt_emb.weight.data.copy_(tgt_embeddings)


-1.3075e-01 -8.7659e-02 -1.1427e-01  ...  -4.0476e-02 -1.2293e-02  4.2569e-02
-3.6446e-01  9.5962e-02 -1.6188e-01  ...  -1.4986e-01  2.3584e-01  1.8541e-01
-5.9110e-02 -8.3343e-02 -9.3019e-02  ...  -5.4064e-02  1.7285e-01  1.6713e-01
                ...                   ⋱                   ...                
-1.3761e-01 -3.4233e-01 -1.4767e-01  ...   1.1511e-01 -2.3792e-03 -2.4035e-01
 2.0302e-01 -2.0129e-01 -1.2699e-01  ...  -3.2187e-01  1.5285e-01  7.9039e-02
-3.8139e-01 -7.2348e-01 -6.2569e-02  ...  -6.2603e-01  2.7355e-01  6.8966e-02
[torch.FloatTensor of size 10001x300]

In [14]:
mapping = nn.Linear(emb_dim, emb_dim, bias=False)
mapping.weight.data.copy_(torch.diag(torch.ones(emb_dim)))


    1     0     0  ...      0     0     0
    0     1     0  ...      0     0     0
    0     0     1  ...      0     0     0
       ...          ⋱          ...       
    0     0     0  ...      1     0     0
    0     0     0  ...      0     1     0
    0     0     0  ...      0     0     1
[torch.FloatTensor of size 300x300]

In [15]:
disc_layers = 2
disc_dim_hidden = 2048
disc_dropout = 0
disc_inp_dropout = 0.1

In [46]:
#Discriminator class
class Discriminator(nn.Module):
    def __init__(self):
        super(Discriminator, self).__init__()
        
        self.emb_dim = emb_dim
        self.disc_layers = disc_layers
        self.disc_dim_hidden = disc_dim_hidden
        self.disc_dropout = disc_dropout
        self.disc_inp_dropout = disc_inp_dropout
        
        layers = [nn.Dropout(self.disc_inp_dropout)]
        for i in range(self.disc_layers + 1):
            input_dim = self.emb_dim if i == 0 else self.disc_dim_hidden
            output_dim = 1 if i==self.disc_layers else self.disc_dim_hidden
            layers.append(nn.Linear(input_dim, output_dim))
            if i < self.disc_layers:
                layers.append(nn.LeakyReLU(0.2))
                layers.append(nn.Dropout(self.disc_dropout))
        layers.append(nn.Sigmoid())
        self.layers = nn.Sequential(*layers)
        
    def forward(self, x):
        return self.layers(x).view(-1)                           

In [47]:
discriminator = Discriminator()

In [48]:
src_emb.cuda()
tgt_emb.cuda()
mapping.cuda()
discriminator.cuda()

Discriminator(
  (layers): Sequential(
    (0): Dropout(p=0.1)
    (1): Linear(in_features=300, out_features=2048)
    (2): LeakyReLU(0.2)
    (3): Dropout(p=0)
    (4): Linear(in_features=2048, out_features=2048)
    (5): LeakyReLU(0.2)
    (6): Dropout(p=0)
    (7): Linear(in_features=2048, out_features=1)
    (8): Sigmoid()
  )
)

In [74]:
# Normalize embeddings before the training
#Add it later

In [49]:
from torch import optim

In [50]:
optim_fn = optim.SGD
optim_params = {'lr': 0.1}

In [51]:
map_optimizer = optim_fn(mapping.parameters(), **optim_params)
disc_optimizer = optim_fn(discriminator.layers.parameters(), **optim_params)

In [None]:
epoch_size = 1000000
batch_size = 32
disc_steps = 5
disc_most_freq = 75000
disc_smooth = 0.1
map_beta = 0.001

In [53]:
def get_disc_xy(volatile):
    """
    Get discriminator input batch / output target.
    """
    # select random word IDs
    bs = batch_size
    mf = disc_most_freq
    assert mf <= min(len(src_dico), len(tgt_dico))
    src_ids = torch.LongTensor(bs).random_(mf)
    tgt_ids = torch.LongTensor(bs).random_(mf)
    src_ids = src_ids.cuda()
    tgt_ids = tgt_ids.cuda()

    # get word embeddings
    src_emb = src_emb(Variable(src_ids, volatile=True))
    tgt_emb = tgt_emb(Variable(tgt_ids, volatile=True))
    src_emb = mapping(Variable(src_emb.data, volatile=volatile))
    tgt_emb = Variable(tgt_emb.data, volatile=volatile)

    # input / target
    x = torch.cat([src_emb, tgt_emb], 0)
    y = torch.FloatTensor(2 * bs).zero_()
    y[:bs] = 1 - disc_smooth
    y[bs:] = dis_smooth
    y = Variable(y.cuda())

    return x, y

In [55]:
# Need to study the underlying math.
def orthogonalize():
    """
    Orthogonalize the mapping.
    """
    W = mapping.weight.data
    beta = map_beta
    W.copy_((1 + beta) * W - beta * W.mm(W.transpose(0, 1).mm(W)))

In [None]:
def eval_dis(self, to_log):
    """
    Evaluate discriminator predictions and accuracy.
    """
    bs = 128
    src_preds = []
    tgt_preds = []

    discriminator.eval()

    for i in range(0, src_emb.num_embeddings, bs):
        emb = Variable(src_emb.weight[i:i + bs].data, volatile=True)
        preds = discriminator(mapping(emb))
        src_preds.extend(preds.data.cpu().tolist())

    for i in range(0,tgt_emb.num_embeddings, bs):
        emb = Variable(tgt_emb.weight[i:i + bs].data, volatile=True)
        preds = discriminator(emb)
        tgt_preds.extend(preds.data.cpu().tolist())

    src_pred = np.mean(src_preds)
    tgt_pred = np.mean(tgt_preds)
    print("Discriminator source / target predictions: %.5f / %.5f"
                % (src_pred, tgt_pred))

    src_accu = np.mean([x >= 0.5 for x in src_preds])
    tgt_accu = np.mean([x < 0.5 for x in tgt_preds])
    dis_accu = ((src_accu * src_emb.num_embeddings + tgt_accu * tgt_emb.num_embeddings) /
                    (src_emb.num_embeddings + tgt_emb.num_embeddings))
    print("Discriminator source / target / global accuracy: %.5f / %.5f / %.5f"
                    % (src_accu, tgt_accu, dis_accu))

In [None]:
# Learning loop for Adversarial Training

def adv_training(epochs):
    for epoch in range(epochs):
        n_words_proc = 0
        for n_iter in range(0,epoch_size, batch_size):
            for _ in range(disc_steps): # Discriminator training
                discriminator.train() # sets the module in training mode ex adds dropout and batchnorm
                x,y = get_disc_xy(volatile=True)
                preds = discriminator(Variable(x.data))
                loss = F.binary_cross_entropy(preds,y)
                disc_optimizer.zero_grad()
                loss.backward()
                disc_optimizer.step()
                # Can add clipping if needed
            
            #Mapping step
            discriminator.eval() # Puts the module in evaluation mode.
            x, y = get_disc_xy(volatile=False)
            preds = discriminator(Variable(x.data))
            loss = F.binary_cross_entropy(preds, 1-y)
            map_optimizer.zero_grad()
            loss.backward()
            map_optmizer.step()
            orthogonalize()
            n_words_proc += 2 * batch_size
            
        # embeddings / discriminator evaluation
        
            
            
            
                
        
        
