In [1]:
%matplotlib inline
import torch, torch.nn as nn
from torch.autograd import Variable
from torch import optim
import torch.nn.functional as F
import re
import pickle
import collections
import bcolz
import pickle
from gensim.models import KeyedVectors
import numpy as np
from keras.preprocessing.sequence import pad_sequences
from sklearn import model_selection
import math
import torch

Using Theano backend.
 https://github.com/Theano/Theano/wiki/Converting-to-the-new-gpu-back-end%28gpuarray%29

Using gpu device 0: GeForce GTX 1060 3GB (CNMeM is disabled, cuDNN not available)


In [2]:
path = '/run/media/backman/yay/giga-fren/'
fname = path + 'giga-fren.release2.fixed'
en_fname = fname + '.en/data'
fr_fname = fname + '.fr/data'

In [3]:
re_eq = re.compile('^(Wh[^?.!]+\?)')
re_fq = re.compile('^([^?.!]+\?)')

In [4]:
lines = ((re_eq.search(eq), re_fq.search(fq))
        for eq, fq in zip(open(en_fname), open(fr_fname)))

In [5]:
qs = [(e.group(), f.group()) for e,f in lines if e and f]
len(qs)

52331

In [6]:
qs[:6]

[('What is light ?', 'Qu’est-ce que la lumière?'),
 ('Who are we?', 'Où sommes-nous?'),
 ('Where did we come from?', "D'où venons-nous?"),
 ('What would we do without it?', 'Que ferions-nous sans elle ?'),
 ('What is the absolute location (latitude and longitude) of Badger, Newfoundland and Labrador?',
  'Quelle sont les coordonnées (latitude et longitude) de Badger, à Terre-Neuve-etLabrador?'),
 ('What is the major aboriginal group on Vancouver Island?',
  'Quel est le groupe autochtone principal sur l’île de Vancouver?')]

In [7]:
pickle.dump(qs, open(path+'qs.pkl', 'wb'))

In [8]:
en_qs, fr_qs = zip(*qs)

In [9]:
re_apos = re.compile(r"(\w)'s\b")         # make 's a separate word
re_mw_punc = re.compile(r"(\w[’'])(\w)")  # other ' in a word creates 2 words
re_punc = re.compile("([\"().,;:/_?!—])") # add spaces around punctuation
re_mult_space = re.compile(r"  *")        # replace multiple spaces with just one

def simple_toks(sent):
    sent = re_apos.sub(r"\1 's", sent)
    sent = re_mw_punc.sub(r"\1 \2", sent)
    sent = re_punc.sub(r" \1 ", sent).replace('-', ' ')
    sent = re_mult_space.sub(' ', sent)
    return sent.lower().split()

In [10]:
fr_qtoks = list(map(simple_toks, fr_qs))

In [11]:
en_qtoks = list(map(simple_toks, en_qs))

In [12]:
PAD = 0
SOS = 1

def toks2ids(sents):
    voc_cnt = collections.Counter(t for sent in sents for t in sent)
    vocab = sorted(voc_cnt, key = voc_cnt.get, reverse = True)
    vocab.insert(PAD, "<PAD>")
    vocab.insert(SOS, "<SOS>")
    w2id = {w:i for i,w in enumerate(vocab)}
    ids = [[w2id[t] for t in sent] for sent in sents]
    return ids, vocab, w2id, voc_cnt

In [13]:
fr_ids, fr_vocab, fr_w2id, fr_counts = toks2ids(fr_qtoks)
en_ids, en_vocab, en_w2id, en_counts = toks2ids(en_qtoks)

In [14]:
glove_loc = '/run/media/backman/yay/glove/6B.100d'
en_vecs, en_wv_word, en_wv_idx = bcolz.open(glove_loc+'.dat')[:], pickle.load(open(glove_loc+'_words.pkl','rb'), encoding='latin1'), pickle.load(open(glove_loc+'_idx.pkl','rb'), encoding='latin1')
en_w2v = {w: en_vecs[en_wv_idx[w]] for w in en_wv_word}
n_en_vec, dim_en_vec = en_vecs.shape

In [15]:
w2v_path = '/run/media/backman/yay/frWac_non_lem_no_postag_no_phrase_200_cbow_cut0.bin'
fr_model = KeyedVectors.load_word2vec_format(w2v_path, binary=True,unicode_errors='ignore')
ft_voc = fr_model.vocab
dim_fr_vec = 200

In [16]:
def create_emb(w2v, targ_vocab, dim_vec):
    vocab_size = len(targ_vocab)
    emb = np.zeros((vocab_size, dim_vec))    
    found = 0
    
    for i, word in enumerate(targ_vocab):
        try: 
            emb[i] = w2v[word]
            found+=1
        except KeyError: emb[i] = np.random.normal(scale=0.6, size=(dim_vec,))
    return emb, found

In [17]:
en_embds, found = create_emb(en_w2v, en_vocab, dim_en_vec)

In [18]:
fr_embds, found = create_emb(fr_model, fr_vocab, dim_fr_vec)

In [19]:
maxlen = 30
en_padded = pad_sequences(en_ids, maxlen, 'int64', 'post', 'post')
fr_padded = pad_sequences(fr_ids, maxlen, 'int64', 'post', 'post')

In [20]:
fr_train, fr_test, en_train, en_test = model_selection.train_test_split(
    fr_padded, en_padded, test_size=0.1)

In [21]:
def get_batch(x, y, batch_size = 16):
    idxs = np.random.permutation(len(x))[:batch_size]
    return x[idxs], y[idxs]

In [22]:
def Var(*sz): return Variable(Arr(*sz), requires_grad = True).cuda()

In [23]:
def Arr(*sz): return torch.randn(sz)/math.sqrt(sz[0])

In [24]:
def long_t(arr): return Variable(torch.LongTensor(arr)).cuda()

In [25]:
def encode(inp, encoder):
    batch_size, input_length = inp.size()
    hidden = encoder.initHidden(batch_size).cuda()
    enc_outputs, hidden = encoder(inp, hidden)
    return long_t([SOS]*batch_size), enc_outputs, hidden

In [26]:
class AttnDecoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2, p=0.1):
        super(AttnDecoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs)
        self.W1 = Var(hidden_size, hidden_size)
        self.W2 = Var(hidden_size, hidden_size)
        self.W3 = Var(emb_size+hidden_size, hidden_size)
        self.b2 = Var(hidden_size)
        self.b3 = Var(hidden_size)
        self.V = Var(hidden_size)
        self.gru = nn.GRU(hidden_size, hidden_size, num_layers=2)
        self.out = nn.Linear(hidden_size, output_size)
        
    def forward(self, inp, hidden, enc_outputs):
        emb_inp = self.emb(inp)
        print(enc_outputs, self.W1)
        w1e = enc_outputs.bmm(self.W1)
        w2h = torch.mm(hidden[-1], self.W2) + self.b2
        u = F.tanh(w1e + w2h)
        a = self.V*u
        a = F.softmax(a)
        Xa = a * enc_outputs
        res =  torch.mm(torch.cat([emb_inp, Xa.squeeze(1)], 1),self.W3)
        res = res + self.b3
        res, hidden = self.gru(res, hidden)
        res = F.log_softmax(self.out(res))
        return res, hidden

In [27]:
def train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit):
    decoder_input, encoder_outputs, hidden = encode(inp, encoder)
    target_length = targ.size()[1]
    
    enc_opt.zero_grad(); dec_opt.zero_grad()
    loss = 0
    
    for di in range(target_length):
        decoder_output, hidden = decoder(decoder_input, hidden, encoder_outputs)
        decoder_input = targ[:, di]
        loss += crit(decoder_output, decoder_input)
        
    loss.backward()
    enc_opt.step(); dec_opt.step()
    return loss.data[0] / target_length

In [28]:
def req_grad_params(o):
    return (p for p in o.parameters() if p.requires_grad)

In [29]:
def trainEpochs(encoder, decoder, n_epochs, print_every = 1000, lr=0.01):
    loss_total = 0
    
    enc_opt = optim.RMSprop(req_grad_params(encoder), lr = lr)
    dec_opt = optim.RMSprop(decoder.parameters(), lr = lr)
    crit = nn.NLLLoss().cuda()

    for epoch in range(n_epochs):
        fra, eng = get_batch(fr_train, en_train, 64)
        inp = long_t(fra)
        targ = long_t(eng)
        loss = train(inp, targ, encoder, decoder, enc_opt, dec_opt, crit)
        loss_total += loss
        
        if epoch % print_every == print_every -1:
            print('%d %d%% %.4f' % (epoch, epoch / n_epochs * 100, loss_total / print_every ))
            loss_total = 0

In [30]:
def create_emb(emb_mat, non_trainable=False):
    output_size, emb_size = emb_mat.size()
    emb = nn.Embedding(output_size, emb_size)
    emb.load_state_dict({'weight': emb_mat})
    if non_trainable:
        for param in emb.parameters(): 
            param.requires_grad = False
    return emb, emb_size, output_size

In [31]:
class EncoderRNN(nn.Module):
    def __init__(self, embs, hidden_size, n_layers=2):
        super(EncoderRNN, self).__init__()
        self.emb, emb_size, output_size = create_emb(embs, True)
        self.n_layers = n_layers
        self.hidden_size = hidden_size
        self.gru = nn.GRU(emb_size, hidden_size, batch_first=True, num_layers=n_layers)
#                          ,bidirectional=True)
        
    def forward(self, input, hidden):
        return self.gru(self.emb(input), hidden)

    def initHidden(self, batch_size):
        return Variable(torch.zeros(self.n_layers, batch_size, self.hidden_size))

In [32]:
fr_emb_t = torch.FloatTensor(fr_embds).cuda()
en_emb_t = torch.FloatTensor(en_embds).cuda()

In [33]:
hidden_size = 128
encoder = EncoderRNN(fr_emb_t, hidden_size).cuda()
decoder = AttnDecoderRNN(en_emb_t, hidden_size).cuda()

In [34]:
trainEpochs(encoder, decoder, 10000, print_every=500, lr=0.005)


Variable containing:
( 0 ,.,.) = 
  3.2357e-02  6.4462e-03  5.5324e-02  ...   9.9334e-03 -4.7622e-02  1.1210e-01
  1.3140e-01 -8.3232e-03  7.6303e-02  ...   8.6676e-03 -5.2049e-02  1.4854e-01
  1.2980e-01  1.0073e-01  2.4533e-02  ...   5.1474e-02  4.2675e-02  1.2873e-01
                 ...                   ⋱                   ...                
  8.8317e-02  8.4998e-02 -7.4410e-02  ...  -5.0137e-01 -1.8891e-01 -1.0301e-01
  8.9909e-02  8.4162e-02 -7.5480e-02  ...  -5.0195e-01 -1.8963e-01 -1.0332e-01
  9.1080e-02  8.3537e-02 -7.6225e-02  ...  -5.0236e-01 -1.9022e-01 -1.0355e-01

( 1 ,.,.) = 
  2.0529e-02  6.1334e-03  5.2608e-02  ...  -9.1129e-02 -6.7575e-02  9.8476e-02
  6.8003e-02 -3.0885e-02  1.2180e-01  ...  -1.3500e-01 -1.1992e-01  1.7945e-01
  8.7006e-02 -5.9687e-02  1.3113e-01  ...  -1.8265e-01 -1.6623e-01  2.0441e-01
                 ...                   ⋱                   ...                
  9.1378e-02  8.3980e-02 -7.6538e-02  ...  -5.0251e-01 -1.8995e-01 -1.0395e-01
  9.

RuntimeError: matrix and matrix expected at /pytorch/torch/lib/THC/generic/THCTensorMathBlas.cu:237