In [None]:
# Mount Google Drive into the Colab runtime so the project folder is
# reachable as an ordinary filesystem path.
from google.colab import drive
drive.mount('/content/drive')
# Project root on Drive. Later cells build file paths by plain string
# concatenation (root_folder + name), so the trailing slash is required.
root_folder = "/content/drive/My Drive/CS 182/CS182-Spring2020-NLP-Project/"

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [None]:
import os
import sys
sys.path.append(root_folder)
from collections import Counter
import torch as th
from torch import nn
import torch.nn.functional as F
import torch.optim as optim
import word_embed as embed
import numpy as np
import json
import random

Loading word2vec model
Finished loading word2vec model


In [None]:
def getStar(pred):
  """Return a one-hot list marking the highest-scoring entry of ``pred``.

  Args:
    pred: Per-class scores as a tensor, list, or array. Any shape is
      accepted; the scores are flattened before the maximum is located, so
      a ``(1, C)`` model output yields a length-``C`` result.

  Returns:
    A list of ints, all 0 except a single 1 at the argmax position.
  """
  # as_tensor lets plain lists/arrays through; reshape(-1) guarantees that
  # the flat argmax index and the output length refer to the same axis.
  scores = th.as_tensor(pred).reshape(-1)
  # Convert to a Python int explicitly instead of relying on a 0-dim tensor
  # being implicitly coerced when used as a list index.
  best = int(th.argmax(scores))
  one_hot = [0] * len(scores)
  one_hot[best] = 1
  return one_hot

def isCorrect(predicted, actual):
  """Tell whether two score/one-hot vectors peak at the same position.

  Both arguments are converted to NumPy arrays and compared by the index of
  their maximum element.
  """
  predicted_index = np.array(predicted).argmax()
  actual_index = np.array(actual).argmax()
  return predicted_index == actual_index

In [None]:
# Model and training hyperparameters. These are module-level globals that
# the LSTM class and train() read directly.
hidden_size = 256  # LSTM hidden units per direction
num_layers = 2  # number of stacked LSTM layers
batch_size = 128  # reviews per optimizer step in train()
epochs = 50  # passes over the training file in train()
lr = 5e-5  # Adam learning rate
input_size = 300  # per-token feature width fed to the LSTM; presumably the word2vec embedding size -- TODO confirm against word_embed
output_size = 5  # one logit per star rating (train() uses labels stars - 1)
dropout = 0.1  # dropout value passed to nn.LSTM

# Use the GPU when one is visible, otherwise fall back to the CPU.
device = th.device("cuda" if th.cuda.is_available() else "cpu")
print(device)

class LSTM(nn.Module):
    """Bidirectional stacked LSTM encoder with a linear classification head.

    Sizes, depth and dropout come from the module-level hyperparameters
    ``input_size``, ``hidden_size``, ``num_layers``, ``dropout`` and
    ``output_size``.
    """

    def __init__(self):
        super().__init__()
        self.lstm = nn.LSTM(
            input_size,
            hidden_size,
            num_layers,
            batch_first=True,
            dropout=dropout,
            bidirectional=True,
        )
        # The head consumes both directions' final states, hence 2 * hidden_size.
        self.linear = nn.Linear(2 * hidden_size, output_size)

    def forward(self, x):
        """Map a batch-first sequence batch to one row of class logits each.

        The per-step outputs and the cell state are discarded; only the final
        hidden states are used.
        """
        _, (final_hidden, _) = self.lstm(x)
        # The last two slices of h_n belong to the top layer: forward
        # direction at [-2], backward direction at [-1].
        top_forward = final_hidden[-2]
        top_backward = final_hidden[-1]
        features = th.cat([top_forward, top_backward], dim=1)
        return self.linear(features)

cuda


In [None]:
def train(model_name_to_save, trained_model_name = None):
    """Train the LSTM star-rating classifier and checkpoint it every epoch.

    Reads ``dataset/training_data.jsonl`` under ``root_folder`` (one JSON
    object per line with ``"text"`` and ``"stars"`` fields), embeds each review
    with ``embed.embed_review`` and optimises a class-weighted cross-entropy
    loss with Adam for ``epochs`` passes.

    Args:
        model_name_to_save: Checkpoint file name, relative to ``root_folder``,
            that the model's state dict is written to after each epoch.
        trained_model_name: Optional checkpoint file name, relative to
            ``root_folder``, to resume from. ``None`` starts from a freshly
            initialised model.
    """
    model = LSTM()
    if trained_model_name is not None:
        # map_location lets a checkpoint written on a GPU be resumed on
        # whatever device is available now.
        model.load_state_dict(th.load(root_folder + trained_model_name, map_location=device))
    model.to(device)
    model.train()

    # Inverse-frequency class weights, normalised to sum to 1. The numerators
    # add up to the 373507 denominator, so these look like the per-star
    # fractions of the training set, in label order 0..4 (1..5 stars).
    weights = th.tensor(
        [91365 / 373507, 25058 / 373507, 24085 / 373507, 50803 / 373507, 182196 / 373507],
        dtype=th.float32,
        device=device,
    )
    weights = 1.0 / weights
    weights = weights / weights.sum()
    loss_fn = nn.CrossEntropyLoss(weight=weights)
    loss_fn.to(device)

    optimizer = optim.Adam(model.parameters(), lr = lr)

    with open(root_folder + 'dataset/training_data.jsonl', 'r') as file:
        # Iterate lazily and skip blank lines so a trailing newline cannot
        # crash json.loads.
        data = [json.loads(line) for line in file if line.strip()]

    # Ceiling division: the final, possibly smaller, batch is trained on too.
    num_batches = (len(data) + batch_size - 1) // batch_size
    for epoch in range(epochs):
        # Reshuffle the whole dataset so every epoch sees different batches;
        # shuffling only inside a batch does not change the batch's loss.
        random.shuffle(data)

        for batch_count, start in enumerate(range(0, len(data), batch_size), start=1):
            examples = data[start:start + batch_size]

            # Reviews are embedded on the fly rather than cached across epochs.
            sequences = [
                th.as_tensor(embed.embed_review(example["text"]), dtype=th.float32, device=device)
                for example in examples
            ]
            # Star ratings 1..5 become class indices 0..4.
            labels = [int(example["stars"]) - 1 for example in examples]

            # Zero-pad to the longest review in the batch. NOTE(review): the
            # LSTM runs over the padding as well because sequences are not
            # packed -- consider pack_padded_sequence.
            padded = nn.utils.rnn.pad_sequence(sequences, batch_first=True)
            targets = th.tensor(labels, dtype=th.long, device=device)

            # forward and backward passes
            optimizer.zero_grad()
            # No squeeze: the output keeps its batch dimension so a final
            # batch of size 1 still has the shape the loss expects.
            output = model(padded)
            loss = loss_fn(output, targets)
            loss.backward()
            optimizer.step()

            progress = str(round(batch_count / num_batches, 2))
            print("Epoch: " + str(epoch) + "\tBatch: " + str(batch_count) + "\tProgress: " + progress + "\tLoss: " + str(loss.item()))

        # Checkpoint after every epoch so an interrupted run loses at most one.
        th.save(model.state_dict(), root_folder + model_name_to_save)

In [None]:
def validate(model_name):
    """Evaluate a saved checkpoint on the test set, printing running metrics.

    Loads ``root_folder + model_name`` into a fresh LSTM, predicts a star
    rating for every record in ``dataset/test_data.jsonl`` and prints, for each
    example, the prediction, the true rating, the running accuracy and the
    running sum of absolute rating errors.

    Args:
        model_name: Checkpoint file name, relative to ``root_folder``.
    """
    model = LSTM()
    # map_location lets a checkpoint written on a GPU be evaluated on
    # whatever device is available now.
    model.load_state_dict(th.load(root_folder + model_name, map_location=device))
    model.to(device)
    model.eval()

    with open(root_folder + 'dataset/test_data.jsonl', 'r') as file:
        # Skip blank lines so a trailing newline cannot crash json.loads.
        data = [json.loads(line) for line in file if line.strip()]

    total_diff = 0
    correct = 0
    count = 0
    # Inference only: no_grad avoids building an autograd graph per review.
    with th.no_grad():
        for elem in data:
            actual = int(elem["stars"])
            # unsqueeze(0) adds the batch dimension for a single review.
            tensor = th.as_tensor(
                embed.embed_review(elem["text"]), dtype=th.float32, device=device
            ).unsqueeze(0)

            output = model(tensor)
            # was 0-indexed, so adding 1
            prediction = int(th.argmax(output)) + 1

            total_diff += abs(prediction - actual)
            correct += 1 if prediction == actual else 0
            count += 1
            print("Predicted:", prediction,
                "\tActual:", elem["stars"],
                "\tAccuracy:", correct / count,
                "\tDiff:", total_diff)

In [None]:
# Report the active device, then evaluate the saved checkpoint on the test
# data; validate() prints one line of running metrics per review.
print(device)
validate("model128.pt")
# train("model128.pt", "model128.pt")

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
Predicted: 2 	Actual: 1.0 	Accuracy: 0.6954238094253397 	Diff: 19495
Predicted: 4 	Actual: 5.0 	Accuracy: 0.6954094292803971 	Diff: 19496
Predicted: 5 	Actual: 4.0 	Accuracy: 0.6953950497301544 	Diff: 19497
Predicted: 4 	Actual: 5.0 	Accuracy: 0.6953806707745751 	Diff: 19498
Predicted: 5 	Actual: 5.0 	Accuracy: 0.6953869693774166 	Diff: 19498
Predicted: 5 	Actual: 5.0 	Accuracy: 0.6953932677197916 	Diff: 19498
Predicted: 1 	Actual: 2.0 	Accuracy: 0.6953788896929598 	Diff: 19499
Predicted: 1 	Actual: 3.0 	Accuracy: 0.695364512260679 	Diff: 19501
Predicted: 5 	Actual: 5.0 	Accuracy: 0.695370810676701 	Diff: 19501
Predicted: 5 	Actual: 5.0 	Accuracy: 0.6953771088322858 	Diff: 19501
Predicted: 5 	Actual: 5.0 	Accuracy: 0.6953834067274494 	Diff: 19501
Predicted: 4 	Actual: 5.0 	Accuracy: 0.695369030390738 	Diff: 19502
Predicted: 1 	Actual: 1.0 	Accuracy: 0.695375328192512 	Diff: 19502
Predicted: 5 	Actual: 5.0 	Accuracy: 0.695