## Packages

In [1]:
import os
import numpy as np

In [33]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader

import spacy

## Import Data

In [3]:
train_texts_neg = []
for filename in os.listdir('./data/train/neg'):
    file = open('./data/train/neg/' + filename, 'r')
    train_texts_neg.append(file.read())
    
train_texts_pos = []
for filename in os.listdir('./data/train/pos'):
    file = open('./data/train/pos/' + filename, 'r')
    train_texts_pos.append(file.read())
    
test_texts_neg = []
for filename in os.listdir('./data/test/neg'):
    file = open('./data/test/neg/' + filename, 'r')
    test_texts_neg.append(file.read())
    
test_texts_pos = []
for filename in os.listdir('./data/test/pos'):
    file = open('./data/test/pos/' + filename, 'r')
    test_texts_pos.append(file.read())

In [4]:
# subsample for the purpose of speed
train_texts_neg = train_texts_neg[:100]
train_texts_pos = train_texts_pos[:100]
test_texts_neg = test_texts_neg[:100]
test_texts_pos = test_texts_pos[:100]

In [5]:
print(len(train_texts_neg))
print(len(train_texts_pos))
print(len(test_texts_neg))
print(len(test_texts_pos))

100
100
100
100


## Tokenize

In [6]:
tok = spacy.load('en')

In [7]:
train_texts_neg_tok = [tok(line) for line in train_texts_neg]
train_texts_pos_tok = [tok(line) for line in train_texts_pos]
test_texts_neg_tok = [tok(line) for line in test_texts_neg]
test_texts_pos_tok = [tok(line) for line in test_texts_pos]

## Remove Stop Words and Punctuation

In [8]:
train_texts_neg_stop = [[word for word in sent if not (word.is_stop or word.is_punct)] for sent in train_texts_neg_tok]
train_texts_pos_stop = [[word for word in sent if not (word.is_stop or word.is_punct)] for sent in train_texts_pos_tok]
test_texts_neg_stop = [[word for word in sent if not (word.is_stop or word.is_punct)] for sent in test_texts_neg_tok]
test_texts_pos_stop = [[word for word in sent if not (word.is_stop or word.is_punct)] for sent in test_texts_pos_tok]

In [9]:
# should also remove all the .<br> and the like, but no point in it now

## Lemmatize

In [10]:
# train_texts_neg_lem = [[word.lemma_ for word in sent if word.has_vector and not (word.is_stop or word.is_punct)] for sent in train_texts_neg_stop]
# train_texts_pos_lem = [[word.lemma_ for word in sent if word.has_vector and not (word.is_stop or word.is_punct)] for sent in train_texts_pos_stop]
# test_texts_neg_lem = [[word.lemma_ for word in sent if word.has_vector and not (word.is_stop or word.is_punct)] for sent in test_texts_neg_stop]
# test_texts_pos_lem = [[word.lemma_ for word in sent if word.has_vector and not (word.is_stop or word.is_punct)] for sent in test_texts_pos_stop]

In [11]:
train_texts_neg_lem = train_texts_neg_stop
train_texts_pos_lem = train_texts_pos_stop
test_texts_neg_lem = test_texts_neg_stop
test_texts_pos_lem = test_texts_pos_stop

## Create Vocab Dicts

In [12]:
train_texts_neg_vect = [[word for word in sent if word.has_vector] for sent in train_texts_neg_lem]
train_texts_pos_vect = [[word for word in sent if word.has_vector] for sent in train_texts_pos_lem]
test_texts_neg_vect = [[word for word in sent if word.has_vector] for sent in test_texts_neg_lem]
test_texts_pos_vect = [[word for word in sent if word.has_vector] for sent in test_texts_pos_lem]

In [13]:
all_words  = set(
    [word for sentence in train_texts_neg_lem for word in sentence] + \
    [word for sentence in train_texts_pos_lem for word in sentence] + \
    [word for sentence in test_texts_neg_lem for word in sentence] + \
    [word for sentence in test_texts_pos_lem for word in sentence])

In [14]:
word2ix = {word: i for i, word in enumerate(all_words)}

## Pad Sequences

In [15]:
pad_str = 'pad'
pad_int = len(all_words)

In [16]:
word2ix[pad_str] = pad_int
ix2word = {i: word for word, i in word2ix.items()}

In [17]:
max_len = max(
[max([len(sent) for sent in train_texts_neg_vect])] + 
[max([len(sent) for sent in train_texts_pos_vect])] +
[max([len(sent) for sent in test_texts_neg_vect])] + 
[max([len(sent) for sent in test_texts_pos_vect])])

In [18]:
def pad_seq(sent, max_len):
    return sent + [pad_str]*(max_len - len(sent))

In [19]:
train_texts_neg_pad = [pad_seq(sent, max_len) for sent in train_texts_neg_vect]
train_texts_pos_pad = [pad_seq(sent, max_len) for sent in train_texts_pos_vect]
test_texts_neg_pad = [pad_seq(sent, max_len) for sent in test_texts_neg_vect]
test_texts_pos_pad = [pad_seq(sent, max_len) for sent in test_texts_pos_vect]

## Change to Padded Integer Lists of Tensors

In [20]:
train_texts_neg_int = [[word2ix[word] for word in sent] for sent in train_texts_neg_pad]
train_texts_pos_int = [[word2ix[word] for word in sent] for sent in train_texts_pos_pad]
test_texts_neg_int = [[word2ix[word] for word in sent] for sent in test_texts_neg_pad]
test_texts_pos_int = [[word2ix[word] for word in sent] for sent in test_texts_pos_pad]

In [21]:
train_texts_neg_tens = [torch.tensor(sent, dtype=torch.long) for sent in train_texts_neg_int]
train_texts_pos_tens = [torch.tensor(sent, dtype=torch.long) for sent in train_texts_pos_int]
test_texts_neg_tens = [torch.tensor(sent, dtype=torch.long) for sent in test_texts_neg_int]
test_texts_pos_tens = [torch.tensor(sent, dtype=torch.long) for sent in test_texts_pos_int]

## Prepare weight matrix with Spacy's word vectors

In [166]:
weights_matrix = np.zeros((len(all_words)+1, 384)) ## 384 is Spacy's builtin vector length

In [167]:
for i, word in enumerate(all_words):
    weights_matrix[i] = word.vector

In [168]:
# this function will create the layer and return the proper sizes
def create_emb_layer(weights_matrix, non_trainable=False):
    num_embeddings, embedding_dim = weights_matrix.shape
    emb_layer = nn.Embedding(num_embeddings, embedding_dim)
    emb_layer.load_state_dict({'weight': torch.Tensor(weights_matrix)})
    if non_trainable:
        emb_layer.weight.requires_grad = False

    return emb_layer, num_embeddings, embedding_dim

## Package Training and Test Data

In [169]:
X_train = train_texts_neg_tens + train_texts_pos_tens
X_test = test_texts_neg_tens + test_texts_pos_tens
y_train = [0] * len(train_texts_neg_pairs) + [1] * len(train_texts_pos_pairs)
y_test = [0] * len(test_texts_neg_pairs) + [1] * len(test_texts_pos_pairs)

In [170]:
y_train = [torch.tensor(tag, dtype=torch.float) for tag in y_train]
y_test = [torch.tensor(tag, dtype=torch.float) for tag in y_test]

In [171]:
class IMDBSentimentDataset(Dataset):
    
    def __init__(self, X, y):
        self.X = X
        self.y = y
        
    def __len__(self):
        return len(self.X)
        
    def __getitem__(self, idx):
        return {'text': self.X[idx], 'tag': self.y[idx]}

In [172]:
X_train_dataset = IMDBSentimentDataset(X_train, y_train)
X_test_dataset = IMDBSentimentDataset(X_test, y_test)

In [173]:
trainloader = DataLoader(X_train_dataset, batch_size=32, shuffle=True, num_workers=2)
testloader = DataLoader(X_test_dataset, batch_size=32, shuffle=True, num_workers=2)

# Model 

We'll begin with a very simple model, just to test things

In [174]:
embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix)

In [175]:
class simpleRNN(nn.Module):

    def __init__(self):
        super(simpleRNN, self).__init__()
        self.size_hidden = 64
        self.embedding, num_embeddings, embedding_dim = create_emb_layer(weights_matrix)
        self.GRU = nn.GRU(embedding_dim, self.size_hidden, 1, batch_first=True)
        self.lin = nn.Linear(self.size_hidden, 1)
        self.sigmoid = nn.Sigmoid()
        
    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly
        # why they have this dimensionality.
        # The axes semantics are (num_layers, minibatch_size, hidden_dim)
        return (torch.zeros(1, 1, self.size_hidden),
                torch.zeros(1, 1, self.size_hidden))

    def forward(self, inputs):
        embeds = self.embedding(inputs)
        out, hidden = self.GRU(embeds)
        hid = hidden.view((-1,self.size_hidden))
        probs = self.sigmoid(self.lin(hid)).view(-1)
        return probs

In [176]:
model = simpleRNN()
loss_function = nn.BCELoss()
optimizer = optim.Adam(model.parameters())

## Pre-Evaluate

In [177]:
with torch.no_grad():
    preds = np.array([])
    labels = np.array([])
    for batch in testloader:
        preds = np.append(preds, model(batch['text']).numpy())
        labels = np.append(labels, batch['tag'].numpy())

In [178]:
preds = np.where(preds > 0.5, 1, 0)

In [179]:
print("Pre-Accuracy is {}".format(np.mean(preds == labels)))

Pre-Accuracy is 0.5


## Train

In [180]:
loss_acc =  0
for epoch in range(5):
    for i_batch, sample_batched in enumerate(trainloader):
        print("now in epoch {} in batch {}".format(epoch, i_batch))
        
        # Step 1. Remember that Pytorch accumulates gradients.
        # We need to clear them out before each instance
        model.zero_grad()
        
        model.hidden = model.init_hidden()

        # Step 3. Run our forward pass.
        texts = sample_batched['text']
        tags = sample_batched['tag']
        
        preds = model(texts)

        # Step 4. Compute the loss, gradients, and update the parameters by
        #  calling optimizer.step()
        loss = loss_function(preds, tags)
        loss_acc += loss
        loss.backward()
        optimizer.step()
    print("in epoch {} the loss is {}".format(epoch, loss_acc))
    loss_acc = 0

now in epoch 0 in batch 0
now in epoch 0 in batch 1
now in epoch 0 in batch 2
now in epoch 0 in batch 3
now in epoch 0 in batch 4
now in epoch 0 in batch 5
now in epoch 0 in batch 6
in epoch 0 the loss is 4.868297100067139
now in epoch 1 in batch 0
now in epoch 1 in batch 1
now in epoch 1 in batch 2
now in epoch 1 in batch 3
now in epoch 1 in batch 4
now in epoch 1 in batch 5
now in epoch 1 in batch 6
in epoch 1 the loss is 4.838747978210449
now in epoch 2 in batch 0
now in epoch 2 in batch 1
now in epoch 2 in batch 2
now in epoch 2 in batch 3
now in epoch 2 in batch 4
now in epoch 2 in batch 5
now in epoch 2 in batch 6
in epoch 2 the loss is 4.830099105834961
now in epoch 3 in batch 0
now in epoch 3 in batch 1
now in epoch 3 in batch 2
now in epoch 3 in batch 3
now in epoch 3 in batch 4
now in epoch 3 in batch 5
now in epoch 3 in batch 6
in epoch 3 the loss is 4.858771800994873
now in epoch 4 in batch 0
now in epoch 4 in batch 1
now in epoch 4 in batch 2
now in epoch 4 in batch 3
now 

## Post-Evaluate

In [182]:
with torch.no_grad():
    preds = np.array([])
    labels = np.array([])
    for batch in testloader:
        preds = np.append(preds, model(batch['text']).numpy())
        labels = np.append(labels, batch['tag'].numpy())

In [183]:
preds = np.where(preds > 0.5, 1, 0)

In [184]:
print("Post-Accuracy is {}".format(np.mean(preds == labels)))

Post-Accuracy is 0.5
