In [1]:
from get_datasets import get_data
# Location handed to get_data; its result is unpacked into the train /
# validation / test datasets and the vocabulary object.
DATA_DIR = '../dl_group_project/SentimentClassifier/'
train_dataset, val_dataset, test_dataset, vocab = get_data(DATA_DIR)

In [2]:
from torch.utils.data import DataLoader

def sorted_collate(batch):
    """Collate ``(sequence, label)`` pairs, ordered from longest to shortest.

    Args:
        batch: Iterable of ``(sequence, label)`` pairs. Each sequence must
            support ``len()`` and be convertible by ``torch.LongTensor``;
            each label must be a tensor (the labels are stacked).

    Returns:
        Tuple ``(x, y)``: ``x`` is a list of ``LongTensor`` sequences in
        decreasing-length order, ``y`` is the matching labels stacked and
        given a trailing axis of size 1.
    """
    sequences, targets = zip(*batch)
    lengths = [len(seq) for seq in sequences]
    # argsort is ascending; reversing it walks the batch longest-first.
    longest_first = np.argsort(lengths)[::-1]

    x, picked_targets = [], []
    for idx in longest_first:
        x.append(torch.LongTensor(sequences[idx]))
        picked_targets.append(targets[idx])

    # Stack the labels into one vector, then add a column axis.
    y = torch.stack(picked_targets)[:, None]
    return x, y

# Evaluation loader: fixed order (no shuffling), 1024 examples per batch,
# batches assembled by sorted_collate.
test_loader = DataLoader(
    test_dataset,
    batch_size=1024,
    shuffle=False,
    collate_fn=sorted_collate,
)

In [3]:
import torch
from torch import nn

class IMDbRNN(nn.Module):
    """GRU classifier over variable-length token-id sequences.

    Pipeline: embedding lookup -> GRU -> linear head applied to the GRU's
    final hidden state.

    Args:
        vocab_size: Number of rows in the embedding table.
        embedding_size: Width of each token embedding (the GRU input size).
        hidden_size: Width of the GRU hidden state.
        num_classes: Number of outputs produced by the linear head.
    """

    def __init__(self, vocab_size, embedding_size, hidden_size, num_classes):
        super().__init__()
        self.rnn = nn.GRU(embedding_size, hidden_size)
        self.cls = nn.Linear(hidden_size, num_classes)
        self.emb = nn.Embedding(vocab_size, embedding_size)

    def prepare_sequence(self, x):
        """Embed a list of token-id sequences and pad them into one batch.

        Args:
            x: List (or tuple) of 1-D integer tensors, one per example.

        Returns:
            Tuple ``(padded, lengths)``: ``padded`` has shape
            ``(batch, max_len, embedding_size)`` and ``lengths`` is a CPU
            ``LongTensor`` holding each sequence's length, in input order.
        """
        lengths = [len(xi) for xi in x]
        # One embedding lookup over the concatenated ids, then cut the result
        # back into per-example pieces using plain Python ints as split sizes.
        embedded = self.emb(torch.cat(x).to(self.emb.weight.device))
        pieces = torch.split(embedded, lengths)
        # The pad value is arbitrary here: forward() packs the batch, which
        # drops every padded time step before the GRU sees it.
        padded = nn.utils.rnn.pad_sequence(pieces, batch_first=True, padding_value=5)
        return padded, torch.LongTensor(lengths)

    def forward(self, x):
        """Return one row of head outputs per input sequence, in input order.

        Args:
            x: List (or tuple) of 1-D integer tensors, one per example. The
                batch does not need to be sorted by length.

        Returns:
            Tensor of shape ``(batch, num_classes)``.
        """
        padded, lengths = self.prepare_sequence(x)
        # enforce_sorted=False lets the packing sort internally, and the GRU
        # returns the final hidden state in the caller's original order.
        packed = nn.utils.rnn.pack_padded_sequence(
            padded, lengths, batch_first=True, enforce_sorted=False
        )
        _, hidden = self.rnn(packed)
        # hidden[0] selects the first entry along the leading axis of the
        # final hidden state, giving one state vector per example.
        return self.cls(hidden[0])

In [4]:
# Prefer the first GPU, but fall back to the CPU so the notebook still runs
# on machines without CUDA.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Positional arguments: vocab_size=2000, embedding_size=32, hidden_size=16,
# num_classes=1.
model = IMDbRNN(2000, 32, 16, 1)
model.to(device)
# map_location remaps the stored tensors onto `device`, so a checkpoint saved
# on a GPU also loads on a CPU-only host.
# NOTE(review): torch.load unpickles the file; only load trusted checkpoints.
model.load_state_dict(torch.load('simple_model.trch', map_location=device))
model.eval();

In [5]:
import numpy as np
from tqdm import tqdm_notebook

# Run the model over the whole test loader with gradients disabled and
# gather per-batch outputs and targets as NumPy arrays.
batch_outputs, batch_targets = [], []
with torch.no_grad():
    for inputs, targets in tqdm_notebook(test_loader):
        outputs = model(inputs)
        batch_outputs.append(outputs.detach().cpu().numpy())
        batch_targets.append(targets.detach().cpu().numpy())

# Join the per-batch arrays along the first axis.
predictions = np.concatenate(batch_outputs)
labels = np.concatenate(batch_targets)

HBox(children=(IntProgress(value=0, max=25), HTML(value='')))




In [6]:
from sklearn.metrics import roc_auc_score

# ROC AUC of the model outputs against the test labels; as the cell's last
# expression, the value is displayed as the cell output.
roc_auc_score(y_true=labels, y_score=predictions)

0.9341942943999999