In [1]:
import torch
import torch.nn as nn
import numpy as np
import pandas as pd
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torchvision import transforms, utils
import pickle as pkl
from torch.autograd import Variable
from torch.nn.utils.rnn import pack_padded_sequence
import time
import torch.nn.functional as F
import random
import torch.optim as optim
from comet_ml import Experiment


In [2]:
# Seed every random number generator the notebook can draw from so that
# repeated runs start from the same state.
manualSeed = 0
random.seed(manualSeed)
# NumPy keeps its own global generator, independent of `random` and torch.
np.random.seed(manualSeed)
# Kept last so the cell still displays the torch generator it returns.
torch.manual_seed(manualSeed)

<torch._C.Generator at 0x7fdb48132270>

In [3]:
# Column names are supplied explicitly, so pandas reads the first line of
# every file as data rather than as a header row.
column_names = ["previous", "current", "label"]
train_set = pd.read_csv('data/rnn/train-tokenized.txt', names=column_names)
val_set = pd.read_csv('data/rnn/val-tokenized.txt', names=column_names)
test_set = pd.read_csv('data/rnn/test-tokenized.txt', names=column_names)

In [4]:
def to_balanced_set(dataset, seed=None):
    """Return a shuffled subset of `dataset` with as many label-0 as label-1 rows.

    The larger class is shuffled and truncated to the size of the smaller
    one, so the result is balanced whichever class is in the majority.
    When positives are the minority (or the classes are already the same
    size) all positive rows are kept untouched.

    Args:
        dataset: DataFrame with a "label" column holding 0 / 1.
        seed: random state used for every shuffle. Defaults to the
            notebook-level `manualSeed`.

    Returns:
        A new shuffled DataFrame. Index labels may repeat, because a
        down-sampled class has its index reset before concatenation.
    """
    if seed is None:
        seed = manualSeed

    dataset_pos = dataset[dataset["label"] == 1]
    dataset_neg = dataset[dataset["label"] == 0]
    n_per_class = min(dataset_pos.shape[0], dataset_neg.shape[0])

    # Positives are only down-sampled when they outnumber the negatives.
    if dataset_pos.shape[0] > n_per_class:
        dataset_pos = dataset_pos.sample(random_state=seed, frac=1.).reset_index(drop=True)[:n_per_class]
    dataset_neg = dataset_neg.sample(random_state=seed, frac=1.).reset_index(drop=True)[:n_per_class]

    # Final shuffle so the two classes are interleaved.
    return pd.concat((dataset_pos, dataset_neg)).sample(random_state=seed, frac=1.)

In [5]:
# Run every split through to_balanced_set.
# For a quick debugging run, slice the results afterwards
# (e.g. [:1000] for train, [:100] for val / test).
train_balanced, val_balanced, test_balanced = map(
    to_balanced_set, (train_set, val_set, test_set))

In [6]:
# Useful for
# - debug
# - choosing voc. size
# Pickle data is a byte stream, so the files are opened in binary mode, and
# the `with` blocks close each handle as soon as it has been read.
# NOTE: unpickling can execute arbitrary code - only load files you trust.
with open('data/rnn/wordcount.pkl', 'rb') as pkl_file:
    word_count = pkl.load(pkl_file)
with open('data/rnn/idx2word.pkl', 'rb') as pkl_file:
    idx2word = pkl.load(pkl_file)
with open('data/rnn/word2idx.pkl', 'rb') as pkl_file:
    word2idx = pkl.load(pkl_file)

# Hyperparameters

In [43]:
# --- Data loading ---
workers = 1         # passed as num_workers to every DataLoader
batchSize = 50      # passed as batch_size to every DataLoader
voc_size = 10000    # vocabulary cut-off handed to IronyDataset

# --- Optimisation ---
epochs = 100        # number of passes of the training loop
lr = 1e-3           # learning rate given to the optimizer
clip = 1.0          # threshold handed to clip_grad_norm in train()
log_interval = 10   # train() reports every `log_interval` batches

# --- Recurrent model settings ---
# NOTE(review): these look like RNNModel constructor arguments; the cell that
# builds the model currently instantiates AttentionModel with literal values.
model_type = "GRU"  # "RNN_TANH", "RNN_RELU", "GRU"
em_size = 10
nhid = 200
n_layers = 2
dropout = 0.0
bidirectional = True

In [44]:
def collate(batch):
    """Merge (sentence, label) samples into one zero-padded, length-sorted batch.

    Sentences have different lengths, so each one is copied into a row of a
    zero-filled matrix. Rows are ordered by decreasing length, which is the
    ordering pack_padded_sequence expects (see the PyTorch documentation).

    Args:
        batch: sequence of samples; `sample[0]` is a 1-D tensor of token ids
            and `sample[1][0]` is the label.

    Returns:
        (x, lengths, labels_long):
            x           - LongTensor, len(batch) x longest sentence, zero padded
            lengths     - LongTensor of sentence lengths, sorted descending
            labels_long - LongTensor of labels in the same row order as `x`
    """
    n_items = len(batch)
    sentence_lengths = [sample[0].size(0) for sample in batch]
    raw_labels = [sample[1][0] for sample in batch]

    longest = max(sentence_lengths)
    lengths, order = torch.sort(torch.LongTensor(sentence_lengths), dim=0, descending=True)

    x = torch.zeros(n_items, longest).long()
    labels_long = torch.LongTensor(n_items)
    # `order[row]` is the position in `batch` of the row-th longest sentence.
    for row, src in enumerate(order):
        x[row, :lengths[row]] = batch[src][0]
        labels_long[row] = raw_labels[src]

    return x, lengths, labels_long

class idx_to_sentence(object):
    """Turns sequences of token ids back into text through an id -> word lookup."""

    def __init__(self, idx2word):
        # Lookup indexed by token id; each entry is the word to emit.
        self.idx2word = idx2word

    def array_to_str(self, s_idx):
        """Return the words of `s_idx` in order, each followed by one space."""
        return "".join(self.idx2word[token] + " " for token in s_idx)

    def print_array(self, s_idx):
        """Print the text produced by array_to_str for `s_idx`."""
        decoded = self.array_to_str(s_idx)
        print(decoded)
    
    
class IronyDataset(Dataset):
    """Dataset yielding (token-id sentence, label) pairs from a tokenized DataFrame."""

    def __init__(self, dataset, voc_size, transform=None):
        """
        Args:
            dataset (DataFrame): rows with 'previous' and 'current' columns
                (space-separated token ids) and a 'label' column.
            voc_size (int): vocabulary size; any token id >= voc_size is
                replaced by id 0.
            transform (callable, optional): accepted for interface
                compatibility; it is currently not applied.
        """
        self.voc_size = voc_size
        self.dataset = dataset

    def idx_str_to_idx(self, idx):
        """Convert one token id (string or number) to an int within the vocabulary."""
        idx = int(idx)
        # Use this instance's own cut-off rather than a notebook-level global.
        if idx >= self.voc_size:
            # Return <UNK> token !
            return 0
        return idx

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return self.dataset.shape[0]

    def __getitem__(self, idx):
        """Return (sentence, label) for row `idx`.

        `sentence` is a LongTensor holding the 'previous' tokens followed by
        the 'current' tokens; `label` is a one-element LongTensor.
        """
        elem = self.dataset.iloc[idx]
        # The last piece of 'previous' and the first and last pieces of
        # 'current' are dropped - presumably empty strings left by
        # leading / trailing separators (TODO confirm against the data files).
        prev = elem['previous'].split(" ")[:-1]
        current = elem['current'].split(" ")[1:-1]
        label = elem['label']
        sentence = torch.LongTensor([self.idx_str_to_idx(i) for i in prev] + [self.idx_str_to_idx(i) for i in current])
        label = torch.LongTensor([label])
        return sentence, label

In [45]:
# Decoder for turning token-id sequences back into text.
sentence_parser = idx_to_sentence(idx2word)

# Wrap every balanced split with the same vocabulary cut-off.
train_wrapper = IronyDataset(train_balanced, voc_size)
val_wrapper = IronyDataset(val_balanced, voc_size)
test_wrapper = IronyDataset(test_balanced, voc_size)

# All three loaders share identical settings (shuffling included).
loader_options = dict(batch_size=batchSize, shuffle=True, num_workers=workers, collate_fn=collate)
train_loader = DataLoader(train_wrapper, **loader_options)
val_loader = DataLoader(val_wrapper, **loader_options)
test_loader = DataLoader(test_wrapper, **loader_options)

In [46]:
# Pull a single batch for inspection. The builtin next() works on both
# Python 2 and Python 3, whereas the iterator's .next() method is Python 2 only.
a = iter(train_loader)
text, length, label = next(a)

Process Process-14:
Traceback (most recent call last):
    self.run()
  File "/home/lucas/anaconda2/lib/python2.7/multiprocessing/process.py", line 258, in _bootstrap
  File "/home/lucas/anaconda2/lib/python2.7/multiprocessing/process.py", line 114, in run
    self._target(*self._args, **self._kwargs)
  File "/home/lucas/anaconda2/lib/python2.7/site-packages/torch/utils/data/dataloader.py", line 34, in _worker_loop
    r = index_queue.get()
  File "/home/lucas/anaconda2/lib/python2.7/multiprocessing/queues.py", line 378, in get
    return recv()
  File "/home/lucas/anaconda2/lib/python2.7/site-packages/torch/multiprocessing/queue.py", line 21, in recv
    buf = self.recv_bytes()
KeyboardInterrupt


In [47]:
# Decoder instance that evaluate() uses to print misclassified sentences.
textfollower = idx_to_sentence(idx2word)

In [48]:
# Sanity check: decode and print the first row of the batch fetched above.
textfollower.print_array(text[0].numpy())

Best news coming from Saudi since <UNK> <UNK> <eos> Good, now the women who run away because they aren't allowed to drive have a better chance of reaching safety instead of being thrown in prison for having too much of their face <UNK> <eos> 


# Model and training

In [49]:
def unpad_input(input, length):
    cat = []
    batch_size = input.size(0)

    for i in range(batch_size):
        cat += [input[i,:length[i]]]

    unpadded_input = torch.cat((cat), 0)
    return unpadded_input

class _attn(nn.Module):
    def __init__(self, in_size, mid_size, heads):
        super(_attn, self).__init__()
        self.in_size = in_size
        self.fc1 = nn.Linear(in_size, mid_size)
        self.fc2 = nn.Linear(mid_size, heads)
    
    def forward(self, input, length):
        # x : batch_size x max_len x in_size
        # length : batch_size
        x = unpad_input(input, length)
        alphas = self.fc2(F.relu(self.fc1(x)))
        
        low_idx = 0
        out = []
        
        idx = 0
        for i in length:
            high_idx = low_idx + i
            out += [F.softmax(alphas[low_idx:high_idx]).transpose(0, 1).mm(input[idx,:length[idx]]).view(1, -1)]
            idx += 1
            low_idx += i
        
        return torch.cat((out), 0)
    
class _multihead_attn(nn.Module):
    
    def __init__(self):
        super(_multihead_attn, self).__init__()
    
    def forward(self, x, length):
        pass

class AttentionModel(nn.Module):
    """Binary classifier built from an embedding and two attention-pooling branches.

    Branch one pools the embedded tokens with `attn1` and projects the result
    back to `in_size` through `fc1` + ReLU. Branch two pools the same
    embeddings with `attn2`. Both results are concatenated and mapped to a
    single sigmoid output by `fc2`.
    """

    def __init__(self, voc_size, in_size, mid_size, heads, dropout=0.5):
        """
        Args:
            voc_size: number of rows of the embedding table.
            in_size: embedding size.
            mid_size: hidden width inside each attention block.
            heads: number of attention heads per block.
            dropout: dropout probability applied to the embeddings.
        """
        super(AttentionModel, self).__init__()
        pooled_size = in_size * heads  # width of one attention block's output
        self.emb = nn.Embedding(voc_size, in_size)
        self.attn1 = _attn(in_size, mid_size, heads)
        self.fc1 = nn.Linear(pooled_size, in_size)
        self.attn2 = _attn(in_size, mid_size, heads)
        self.fc2 = nn.Linear(in_size + pooled_size, 1)
        self.drop = nn.Dropout(dropout)

    def forward(self, x, length):
        """Return one sigmoid score per row of `x`.

        Args:
            x: padded batch of token ids.
            length: valid length of every row of `x`.
        """
        embedded = self.drop(self.emb(x))
        projected = F.relu(self.fc1(self.attn1(embedded, length)))
        pooled = self.attn2(embedded, length)
        features = torch.cat((projected, pooled), 1)
        return F.sigmoid(self.fc2(features))

In [50]:
class RNNModel(nn.Module):
    """Sequence classifier: embedding -> recurrent layers -> linear layer -> sigmoid.

    The final hidden states of every layer (and direction) are concatenated
    per sample and fed to the linear decoder.
    """

    def __init__(self, rnn_type, ntoken, ninp, nhid, nlayers, dropout=0.5, bidirectional=True):
        """
        Args:
            rnn_type: 'LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU'.
            ntoken: number of rows of the embedding table.
            ninp: embedding size.
            nhid: hidden size of the recurrent layers.
            nlayers: number of stacked recurrent layers.
            dropout: dropout probability (embeddings, between recurrent
                layers, and on the final hidden state).
            bidirectional: whether the recurrent layers run in both directions.

        Raises:
            ValueError: if `rnn_type` is not one of the supported names.
        """
        super(RNNModel, self).__init__()
        self.drop = nn.Dropout(dropout)
        self.encoder = nn.Embedding(ntoken, ninp)
        if rnn_type in ['LSTM', 'GRU']:
            self.rnn = getattr(nn, rnn_type)(ninp, nhid, nlayers, dropout=dropout, batch_first=False, bidirectional=bidirectional)
        else:
            try:
                nonlinearity = {'RNN_TANH': 'tanh', 'RNN_RELU': 'relu'}[rnn_type]
            except KeyError:
                raise ValueError( """An invalid option for `--model` was supplied,
                                 options are ['LSTM', 'GRU', 'RNN_TANH' or 'RNN_RELU']""")
            # `bidirectional` must be forwarded here as well, otherwise the
            # decoder below is sized for twice the features the RNN produces.
            self.rnn = nn.RNN(ninp, nhid, nlayers, nonlinearity=nonlinearity, dropout=dropout, bidirectional=bidirectional)

        # One final hidden state of size nhid per layer, doubled when bidirectional.
        decoder_in_size = nhid * nlayers
        if bidirectional:
            decoder_in_size *= 2
        self.decoder = nn.Linear(decoder_in_size, 1)

        self.init_weights()

        self.rnn_type = rnn_type
        self.nhid = nhid
        self.nlayers = nlayers
        self.bidirectional = bidirectional

    def init_weights(self):
        """Uniformly initialise embedding and decoder weights; zero the decoder bias."""
        initrange = 0.1
        self.encoder.weight.data.uniform_(-initrange, initrange)
        self.decoder.bias.data.fill_(0)
        self.decoder.weight.data.uniform_(-initrange, initrange)

    def forward(self, input, length, hidden):
        """Return one sigmoid score per sample.

        Args:
            input: batch_size x max_len padded token ids, rows sorted by
                decreasing length (required by pack_padded_sequence).
            length: valid length of every row.
            hidden: initial hidden state, e.g. from init_hidden.

        Returns:
            batch_size x 1 tensor of scores.
        """
        batch_size = input.size(0)
        emb = self.drop(self.encoder(input))
        emb = pack_padded_sequence(emb, length, batch_first=True)
        h_ts, h_T = self.rnn(emb, hidden)

        if self.rnn_type == "LSTM":
            # The LSTM returns (hidden state, cell state); only the former is used.
            (h_T, c_T) = h_T
        # h_T is (layers * directions, batch, nhid). The batch axis has to be
        # moved first before flattening, otherwise each output row would mix
        # the hidden states of different samples.
        h_T = h_T.transpose(0, 1).contiguous().view(batch_size, -1)
        output = self.drop(h_T)
        decoded = F.sigmoid(self.decoder(output))
        return decoded

    def init_hidden(self, bsz):
        """Return a zero initial hidden state for a batch of `bsz` samples.

        The result is a single tensor of shape (layers * directions, bsz, nhid),
        or a pair of such tensors for an LSTM. It is allocated with the same
        type as the model parameters.
        """
        weight = next(self.parameters()).data
        h_size = self.nlayers
        if self.bidirectional:
            h_size *= 2

        if self.rnn_type == 'LSTM':
            return (Variable(weight.new(h_size, bsz, self.nhid).zero_()),
                    Variable(weight.new(h_size, bsz, self.nhid).zero_()))
        else:
            return Variable(weight.new(h_size, bsz, self.nhid).zero_())

In [56]:
# Attention classifier trained with binary cross-entropy and RMSprop.
# NOTE(review): the sizes are literals here; they do not come from the
# hyperparameter cell (e.g. the embedding has 100000 rows, voc_size is separate).
model = AttentionModel(voc_size=100000, in_size=100, mid_size=200, heads=10, dropout=.8)
criterion = nn.BCELoss()
optimizer = optim.RMSprop(model.parameters(), lr=lr)
print(model)

AttentionModel (
  (emb): Embedding(100000, 100)
  (attn1): _attn (
    (fc1): Linear (100 -> 200)
    (fc2): Linear (200 -> 10)
  )
  (fc1): Linear (1000 -> 100)
  (attn2): _attn (
    (fc1): Linear (100 -> 200)
    (fc2): Linear (200 -> 10)
  )
  (fc2): Linear (1100 -> 1)
  (drop): Dropout (p = 0.8)
)


In [57]:
def train():
    """Run one pass over `train_loader`, updating the global `model`.

    Relies on notebook-level names: model, criterion, optimizer,
    train_loader, clip, log_interval, lr and the loop variable `epoch`.
    Every `log_interval` batches (batch 0 excluded) it prints the
    accumulated loss divided by `log_interval` and the time per batch,
    then resets both counters.
    """
    model.train()
    running_loss = 0
    window_start = time.time()
    n_batches = len(train_loader)

    for batch_idx, (text, length, label) in enumerate(train_loader):
        model.zero_grad()

        inputs = Variable(text)
        targets = Variable(label.float())
        scores = model(inputs, length.cpu().numpy()).squeeze()
        loss = criterion(scores, targets)
        running_loss += loss.data

        loss.backward()
        # Rescale the gradients so their norm does not exceed `clip`.
        torch.nn.utils.clip_grad_norm(model.parameters(), clip)
        optimizer.step()

        if batch_idx == 0 or batch_idx % log_interval != 0:
            continue

        cur_loss = running_loss[0] / log_interval
        elapsed = time.time() - window_start
        print('| epoch {:3d} | {:5d}/{:5d} batches | lr {:.6f} | ms/batch {:5.2f} | '
                'loss {:5.2f}'.format(
            epoch, batch_idx, n_batches, lr,
            elapsed * 1000 / log_interval, cur_loss))
        running_loss = 0
        window_start = time.time()

In [58]:
def evaluate(data_loader):
    """Score the global `model` on `data_loader` and print a summary.

    Relies on notebook-level names: model, criterion and textfollower.
    While iterating, up to five false positives and up to five false
    negatives are decoded and printed with their true / predicted labels.

    Args:
        data_loader: loader yielding (text, length, label) batches.

    Returns:
        The loss averaged over the batches of `data_loader`.
    """
    model.eval()
    total_loss = 0
    correct = 0

    max_printed = 5  # quota of printed examples per error type
    fake_negative_printed = 0
    fake_positive_printed = 0

    for batch_idx, (text, length, label) in enumerate(data_loader):
        batch_size = text.size(0)
        data = Variable(text, volatile=True)
        label = Variable(label.float(), volatile=True)
        out = model(data, length.cpu().numpy())
        # Flatten the (batch, 1) scores so they line up with the (batch,)
        # labels, matching how train() feeds the criterion.
        loss = criterion(out.view(-1), label)
        pred = out.data.round()
        correct_batch = pred.eq(label.data.view_as(pred)).cpu()
        n_correct = correct_batch.sum()
        correct += n_correct

        quota_left = fake_negative_printed < max_printed or fake_positive_printed < max_printed
        if n_correct < batch_size and quota_left:
            for i in range(batch_size):
                if correct_batch[i][0] != 0:
                    continue
                # Each error type has its own quota: a false positive must not
                # use up a false-negative slot once its own quota is full.
                is_false_positive = pred[i][0] == 1
                if is_false_positive and fake_positive_printed < max_printed:
                    fake_positive_printed += 1
                elif not is_false_positive and fake_negative_printed < max_printed:
                    fake_negative_printed += 1
                else:
                    continue
                textfollower.print_array(text[i,:length[i]].numpy())
                print("True : %d, Predicted : %d" % (label[i].data[0], pred[i][0]))

        total_loss += loss.data

    total_loss /= len(data_loader)

    # The header always reads "Test set", whichever loader was passed in.
    print('\nTest set: Average loss: {:.4f}, Accuracy: {:.1f}/{:.0f} ({:.0f}%)\n'.format(
        total_loss[0], correct, len(data_loader.dataset),
        100. * correct / len(data_loader.dataset)))

    return total_loss[0]

In [None]:
best_val_loss = np.inf

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train()
    val_loss = evaluate(val_loader)
    print('-' * 89)
    print('| end of epoch {:3d} | time: {:5.2f}s | valid loss {:5.2f}'.format(epoch, (time.time() - epoch_start_time), val_loss))
    print('-' * 89)
    # Remember the best validation loss seen so far (no checkpoint is written here).
    if not best_val_loss or val_loss < best_val_loss:
        best_val_loss = val_loss
    else:
        # Anneal the learning rate if no improvement has been seen in the validation dataset.
        # The rate is changed on the existing optimizer: building a new one
        # would switch the algorithm away from the one chosen at set-up and
        # throw away its accumulated state.
        lr /= 1.5
        for param_group in optimizer.param_groups:
            param_group['lr'] = lr

| epoch   1 |    10/ 3195 batches | lr 0.001000 | ms/batch 249.10 | loss  1.20
| epoch   1 |    20/ 3195 batches | lr 0.001000 | ms/batch 237.16 | loss  0.82
| epoch   1 |    30/ 3195 batches | lr 0.001000 | ms/batch 191.95 | loss  0.76
| epoch   1 |    40/ 3195 batches | lr 0.001000 | ms/batch 196.03 | loss  0.81
| epoch   1 |    50/ 3195 batches | lr 0.001000 | ms/batch 189.52 | loss  0.75
| epoch   1 |    60/ 3195 batches | lr 0.001000 | ms/batch 210.92 | loss  0.77
| epoch   1 |    70/ 3195 batches | lr 0.001000 | ms/batch 243.44 | loss  0.74
| epoch   1 |    80/ 3195 batches | lr 0.001000 | ms/batch 258.69 | loss  0.72
| epoch   1 |    90/ 3195 batches | lr 0.001000 | ms/batch 203.44 | loss  0.73
| epoch   1 |   100/ 3195 batches | lr 0.001000 | ms/batch 184.07 | loss  0.74
| epoch   1 |   110/ 3195 batches | lr 0.001000 | ms/batch 263.41 | loss  0.74
| epoch   1 |   120/ 3195 batches | lr 0.001000 | ms/batch 322.44 | loss  0.72
| epoch   1 |   130/ 3195 batches | lr 0.001000 | ms

In [None]:
# Manual intervention: halve the learning rate of the optimizer in use.
# The value is written into the existing optimizer so that its type and
# accumulated state are kept. Note that every run of this cell halves `lr` again.
lr /= 2
for param_group in optimizer.param_groups:
    param_group['lr'] = lr

In [55]:
# Score the current model on the validation loader; the returned average
# loss is displayed as the cell's output.
evaluate(val_loader)

Canada should have a zone for refugees and any Canadian that is <UNK> should be required to live there. <eos> He doesn't have to be <UNK> <UNK> him 6 feet down from the surface of Canada is <UNK> <eos> 
True : 0, Predicted : 1
In any of the GTA <UNK> if I <UNK> a car or shot at it, and they <UNK> off, I would chase them and finish them <UNK> <eos> <UNK> <UNK> <eos> 
True : 0, Predicted : 1
If the economy is in good <UNK> then why hasn't the <UNK> raised interest rates yet? <eos> Are wages really the best way to <UNK> an <UNK> <UNK> <eos> 
True : 0, Predicted : 1
As I've grown older I've realized that Santa likes rich kids more than everyone else <eos> Wow this is very <UNK> I have never heard this before. <eos> 
True : 1, Predicted : 0
They ain't <UNK> if you own em <eos> those <UNK> are beautiful i want to cop but dont know where to find them <eos> 
True : 0, Predicted : 1
<UNK> was a <UNK> on the <UNK> <UNK> <eos> Op doesnt know the basic <UNK> <UNK> this subreddit in a <UNK> i suppo

0.6841297149658203