In [16]:
%load_ext autoreload
%autoreload 2
import numpy as np
import pickle
from utils import generator, read_batch
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
%matplotlib inline
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch.optim import Adam
from torch.utils.data import DataLoader, Dataset
from utils import read_batch

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [8]:
p_file_idxes = 'data/file_idxes.npy'
p_tokenizer = 'data/tokenizer.pickle'
p_label2idx = 'data/label2idx.pickle'
p_label_counter = 'data/label_counter'
p_word_vectors = './data/word_vectors.npy'
data_path = './data'

In [9]:
EMBEDDING_DIM = 200

In [10]:
file_idxes = np.load(p_file_idxes)
word_vectors = np.load(p_word_vectors)
with open(p_label2idx, 'rb') as f:
    label2idx = pickle.load(f)
with open(p_label_counter, 'rb') as f:
    label_counter = pickle.load(f)
with open(p_tokenizer, 'rb') as f:
    tokenizer = pickle.load(f)

In [11]:
for i, (seq, lab) in enumerate(generator(file_idxes, data_path, label2idx)):
    print(seq, lab)
    if i > 0:
        break

[[ 120  313]
 [   0  394]
 [   0  529]
 [   0 1275]] [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
[[293]
 [626]
 [109]] [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]


In [90]:
class TextsDataset(Dataset):
    def __init__(self, file_idxes, data_path, label2idx):
        self.file_idxes = file_idxes
        self.data_path = data_path
        self.label2idx = label2idx
        
    def __len__(self):
        return len(self.file_idxes)
    
    def __getitem__(self, idx):
        batch_x, batch_y = read_batch(self.data_path, self.file_idxes[idx], self.label2idx)
        return LongTensor(batch_x), FloatTensor(batch_y)

In [91]:
class ConvClassifier(nn.Module):
    def __init__(self, kernel_size, n_filters, embedding_dim, word_vectors, n_labels, freeze_embeds=True):
        super(ConvClassifier, self).__init__()
        self.embedding = nn.Embedding.from_pretrained(word_vectors, freeze_embeds)
        self.conv = nn.Conv2d(1, n_filters, [kernel_size, embedding_dim])
        self.linear = nn.Linear(n_filters, n_labels)
        
    def forward(self, batch_in):
        batch_in = torch.unsqueeze(batch_in, 1)
        print(batch_in.size())
        x = self.embedding(batch_in)
        print(x.size())
        x = self.conv(x)
        print(x.size())
        x = torch.squeeze(x, -1)
        print(x.size())
        x = nn.MaxPool1d(x.size()[2])(x)
        print(x.size())
        x = torch.squeeze(x, 2)
        print(x.size())
        logits = self.linear(x)
        return logits

In [95]:
dataset = TextsDataset(file_idxes, data_path, label2idx)
dataloader = DataLoader(dataset, num_workers=1, collate_fn=lambda l: l[0])
clf = ConvClassifier(2, 10, 200, FloatTensor(word_vectors), len(label2idx))
optimizer = Adam(filter(lambda p: p.requires_grad, clf.parameters()))
criterion = nn.BCEWithLogitsLoss()

for batch_x, batch_y in dataloader:
    logits = clf(batch_x)
    loss = criterion(logits, batch_y)
    print(loss.data.numpy())
    optimizer.zero_grad()
    loss.backward()
    optimizer.step()
    break

torch.Size([4, 1, 2])
torch.Size([4, 1, 2, 200])
torch.Size([4, 10, 1, 1])
torch.Size([4, 10, 1])
torch.Size([4, 10, 1])
torch.Size([4, 10])
0.7032218
