In [1]:
import zipfile
import string
from collections import Counter
import numpy as np
from torch.utils.data import TensorDataset, DataLoader
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

torch.manual_seed(1)

<torch._C.Generator at 0x7f4878016af0>

In [2]:
# read in and preprocess data

def read_input(input_file):
    with zipfile.ZipFile(input_file) as z:
        with z.open(input_file[:-4], 'r') as f:
            for i, line in enumerate(f):
                # skip header
                if i == 0: continue
                decoded_line = line.decode('utf-8').split('\t')
                # lower case and remove punctuation
                line = [l.lower() for l in decoded_line[2] if l not in string.punctuation]
                line = ''.join(line).split()
                yield (line, int(decoded_line[-1].rstrip('\n')))

def preprocess(input_file):
    # get list of data points
    data = list(read_input('train.tsv.zip'))
    # get rid of 0-length reviews
    data = [(review, label) for (review, label) in data if len(review) > 0]
    
    return data
    
file = 'train.tsv.zip'
data = preprocess(file)

In [3]:
# encode reviews

all_reviews = [" ".join(review) for review,label in data]
all_labels = np.array([label for review,label in data])
all_words = " ".join(all_reviews).split()
all_reviews = [review.split() for review in all_reviews]

word_counts = Counter(all_words)
word_list = word_counts.most_common(len(all_words))
word_to_ix = {word:i+1 for i, (word,count) in enumerate(word_list)}
ix_to_word = {ix:word for word, ix in word_to_ix.items()}
encoded_reviews = [[word_to_ix[word] for word in review] for review in all_reviews]

In [4]:
# make all reviews the same length

def pad_data(encoded_reviews):
    max_length = len(max(encoded_reviews, key = len))
    reviews = []
    for review in encoded_reviews:
        if len(review) < max_length:
            reviews.append([0]*(max_length - len(review)) + review)
        else:
            reviews.append(review)
    return np.array(reviews)

padded_reviews = pad_data(encoded_reviews)

In [5]:
# split into training, validation, & test sets

train_index = int(len(padded_reviews)*0.7)
valid_index = int(len(padded_reviews)*0.85)

train_reviews = padded_reviews[:train_index]
valid_reviews = padded_reviews[train_index:valid_index]
test_reviews = padded_reviews[valid_index:]

# get labels for all 3 sets
train_labels = all_labels[:train_index]
valid_labels = all_labels[train_index:valid_index]
test_labels = all_labels[valid_index:]

In [6]:
# get dataloaders

train_data = TensorDataset(torch.LongTensor(train_reviews), torch.Tensor(train_labels))
valid_data = TensorDataset(torch.LongTensor(valid_reviews), torch.Tensor(valid_labels))
test_data = TensorDataset(torch.LongTensor(test_reviews), torch.Tensor(test_labels))

train_loader = DataLoader(train_data, batch_size = 50, shuffle = True, drop_last = True)
valid_loader = DataLoader(valid_data, batch_size = 50, shuffle = True, drop_last = True)
test_loader = DataLoader(test_data, batch_size = 50, shuffle = True, drop_last = True)

In [7]:
# create model

class LSTMSentiment(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, output_dim, vocab_size, layers):
        super(LSTMSentiment, self).__init__()
        self.vocab_size = vocab_size
        self.layers = layers
        self.hidden_dim = hidden_dim
        
        self.word_embeddings = nn.Embedding(vocab_size + 1, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, layers, batch_first = True)
        self.hidden2label = nn.Linear(hidden_dim, output_dim)

    def forward(self, inputs):
        embeds = self.word_embeddings(inputs)
        lstm_out, h = self.lstm(embeds)
        label_space = self.hidden2label(lstm_out)
        label_scores = F.log_softmax(label_space, dim=1)
        softmax_last = label_scores[:, -1]

        return softmax_last

In [8]:
# train model

vocab_size = len(word_to_ix)
embedding_dim = 50
hidden_dim = 50
output_dim = 5
layers = 2
num_epochs = 7
step = 0

model = LSTMSentiment(embedding_dim, hidden_dim, output_dim, vocab_size, layers)
model.cuda()

loss_function = nn.NLLLoss()
optimizer = optim.SGD(model.parameters(), lr=0.1)

for epoch in range(num_epochs):
    for inputs, labels in train_loader:
        model.zero_grad()

        inputs, labels = inputs.cuda(), labels.cuda()

        label_scores = model(inputs)
        loss = loss_function(label_scores, labels.long())
        loss.backward()
        optimizer.step()
        
        # print loss and accuracy for training and validation sets every 2200 steps
        if (step % 2200) == 0:            
            v_num_correct = 0
            valid_losses = []
            
            for v_inputs, v_labels in valid_loader:
                v_inputs, v_labels = v_inputs.cuda(), v_labels.cuda()

                v_output = model(v_inputs)
                
                # get loss
                v_loss = loss_function(v_output, v_labels.long())
                valid_losses.append(v_loss.item())
                
                # get accuracy
                v_predictions = [list(pred).index(max(list(pred))) for pred in v_output]
                v_correct = [int(p == i) for p, i in zip(v_predictions, list(v_labels))]
                v_num_correct += np.sum(v_correct)
                
            t_num_correct = 0    
            for t_inputs, t_labels in train_loader:
                t_inputs, t_labels = t_inputs.cuda(), t_labels.cuda()

                t_output = model(t_inputs)
                
                # get accuracy
                t_predictions = [list(pred).index(max(list(pred))) for pred in t_output]
                t_correct = [int(p == i) for p, i in zip(t_predictions, list(t_labels))]
                t_num_correct += np.sum(t_correct)

            print("Epoch: {}/{}".format((epoch+1), num_epochs),
                  "Training Loss: {:.4f}".format(loss.item()),
                  "Validation Loss: {:.4f}".format(np.mean(valid_losses)),
                  "Training Accuracy: {:.6f}".format(t_num_correct/len(train_loader.dataset)),
                  "Validation Accuracy: {:.6f}".format(v_num_correct/len(valid_loader.dataset)))
        step += 1

Epoch: 1/7 Training Loss: 3.8707 Validation Loss: 3.8579 Training Accuracy: 0.222096 Validation Accuracy: 0.219398
Epoch: 2/7 Training Loss: 0.1021 Validation Loss: 0.0963 Training Accuracy: 0.516146 Validation Accuracy: 0.498441
Epoch: 3/7 Training Loss: 0.0370 Validation Loss: 0.0541 Training Accuracy: 0.517226 Validation Accuracy: 0.501047
Epoch: 4/7 Training Loss: 0.0152 Validation Loss: 0.0227 Training Accuracy: 0.518490 Validation Accuracy: 0.500790
Epoch: 5/7 Training Loss: 0.0058 Validation Loss: 0.0120 Training Accuracy: 0.517785 Validation Accuracy: 0.496347
Epoch: 6/7 Training Loss: 0.0091 Validation Loss: 0.0155 Training Accuracy: 0.517364 Validation Accuracy: 0.496261
Epoch: 7/7 Training Loss: 0.0036 Validation Loss: 0.0057 Training Accuracy: 0.517876 Validation Accuracy: 0.497714


In [9]:
# test on test set

num_correct = 0
test_losses = []
        
for inputs, labels in test_loader:
    inputs, labels = inputs.cuda(), labels.cuda()
    output = model(inputs)
                
    # get loss
    loss = loss_function(output, labels.long())
    test_losses.append(loss.item())
                
    # get accuracy
    predictions = [list(pred).index(max(list(pred))) for pred in output]
    correct = [int(p == i) for p, i in zip(predictions, list(labels))]
    num_correct += np.sum(correct)
    
print("Test Loss: {:.4f}".format(np.mean(test_losses)),
      "Test Accuracy: {:.6f}".format(num_correct/len(test_loader.dataset)))

Test Loss: 0.0098 Test Accuracy: 0.481735
