In [None]:
import torch
import numpy as np
from torch.utils.data import DataLoader, TensorDataset
from torch import Tensor
import pandas as pd
from sklearn.utils import shuffle
from processor import Processor
from sklearn.metrics import accuracy_score
from lstm import LSTM
import itertools

# Read the three tab-separated splits, in train / valid / test order. Paths are
# relative to the notebook's working directory.
train, valid, test = map(pd.read_table, ("train.tsv", "valid.tsv", "test.tsv"))

In [None]:
# Run the project Processor over every split. The helpers are called only for
# their effect (return values are discarded) -- presumably they modify the
# frames in place; confirm in processor.py. The original call order is kept:
# tokenize all splits first, then preprocess all splits.
for frame in (train, valid, test):
    Processor.tokenize(frame)
for frame in (train, valid, test):
    Processor.preprocess_text_data(frame)

# Encodings are built from the training split only; the "label" entry is kept
# for naming the classes when the test results are reported.
encodings = Processor.encode(train)
label_encodings = encodings["label"]

# Seed the shuffles so a Restart & Run All reproduces the same row order.
SEED = 42
train = shuffle(train, random_state=SEED)
valid = shuffle(valid, random_state=SEED)
test = shuffle(test, random_state=SEED)

# Read the labels positionally via .tolist(): after shuffling, the Series
# index is no longer 0..n-1, and handing the Series straight to torch.tensor
# makes torch probe it as a sequence, which depends on that index. .tolist()
# follows the shuffled row order, exactly like list(train["text"]) does for
# the inputs.
training_label = torch.tensor(train["label"].tolist())
valid_label = torch.tensor(valid["label"].tolist())
test_label = torch.tensor(test["label"].tolist())

In [None]:
def _pad_to_tensor(sequences):
    """Right-pad every sequence with 0 to the longest length in `sequences`.

    Returns a ``(rows, tensor)`` pair: ``rows`` is the list of equal-length
    tuples and ``tensor`` is ``torch.tensor(rows)``.
    """
    # zip_longest transposes the sequences while filling the short ones with
    # 0; the outer zip transposes back to one tuple per input sequence.
    rows = list(zip(*itertools.zip_longest(*sequences, fillvalue=0)))
    return rows, torch.tensor(rows)


# One entry per row of the "text" column (assumed to be sequences of numeric
# token ids -- confirm against Processor). Each split is padded to its own
# longest sequence.
data_train = list(train["text"])
data_valid = list(valid["text"])
data_test = list(test["text"])

padded, training_data = _pad_to_tensor(data_train)
valid_padded, valid_data = _pad_to_tensor(data_valid)
test_padded, test_data = _pad_to_tensor(data_test)

In [None]:
# Create custom data loaders
BATCH_SIZE = 32

# Training batches are reshuffled at the start of every epoch so the model
# does not see the same batch composition in the same order each time.
dataset = TensorDataset(training_data, training_label)
train_loader = DataLoader(dataset, batch_size=BATCH_SIZE, shuffle=True)

# Evaluation loaders keep a fixed order; shuffling would not change metrics.
dataset = TensorDataset(valid_data, valid_label)
valid_loader = DataLoader(dataset, batch_size=BATCH_SIZE)
dataset = TensorDataset(test_data, test_label)
test_loader = DataLoader(dataset, batch_size=BATCH_SIZE)

In [None]:
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# NOTE(review): `device` is selected here, but nothing in this cell moves the
# model or the tensors onto it.
lstm = LSTM(1778, 512, 512)  # constructor arguments are defined in lstm.py

criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(lstm.parameters(), lr=0.001)

NUM_EPOCHS = 4000
REPORT_EVERY = 10  # epochs between progress reports

for epoch in range(NUM_EPOCHS):
    # Reset the running totals each epoch so the reported figure is the mean
    # per-sample loss of this epoch alone, not a ratio polluted by sample
    # counts from earlier epochs.
    loss_all = 0.0
    total_train = 0

    for data, labels in train_loader:
        optimizer.zero_grad()

        y_pred = lstm(data)

        # CrossEntropyLoss returns the batch mean by default; weight it by the
        # batch size and accumulate, so a smaller final batch is averaged
        # correctly.
        loss = criterion(y_pred, labels)
        loss_all += loss.item() * labels.shape[0]
        total_train += labels.shape[0]

        loss.backward()
        optimizer.step()

    if epoch % REPORT_EVERY == 0:
        # Validation is only evaluated on reporting epochs; the forward pass
        # over the whole validation set is not needed otherwise.
        lstm.eval()
        with torch.no_grad():
            y_pred = lstm(valid_data)
            loss = criterion(y_pred, valid_label)
        lstm.train()

        # Predicted class = index of the largest score in each row.
        valid_pred = np.argmax(y_pred.cpu().numpy(), axis=1)

        print("Current Epoch: ", epoch)
        # max(..., 1) avoids a ZeroDivisionError if the loader yielded nothing.
        print("Training Loss: ", loss_all / max(total_train, 1))
        print("Validation Loss: ", loss.item())
        print("Validation Accuracy: {0:.2f}% ".format(accuracy_score(valid_label.numpy(), valid_pred) * 100))


In [None]:
from sklearn.metrics import classification_report, matthews_corrcoef

lstm.eval()
total = test_label.shape[0]

with torch.no_grad():
    # Scores are used as returned, one row per test example, matching how the
    # validation pass consumes them. No squeeze(0): it would collapse the
    # batch axis when the test set has exactly one row.
    y_pred = lstm(test_data)
    loss = criterion(y_pred, test_label)

# Predicted class per row, computed in one vectorized call.
predicted = y_pred.argmax(dim=1)
preds = predicted.tolist()
y_true = test_label.tolist()
correct = int((predicted == test_label).sum())

# label_encodings is expected to map label name -> integer id (the mapping
# produced by Processor.encode on the training split -- confirm in
# processor.py). classification_report needs a *list* of display names
# aligned with `labels`, so invert the mapping and order the names by id.
id_to_name = {idx: name for name, idx in label_encodings.items()}
class_ids = sorted(id_to_name)
target_names = [str(id_to_name[idx]) for idx in class_ids]

# Passing `labels` keeps names and rows aligned even if a class does not
# occur in the test split.
print(classification_report(y_true, preds, labels=class_ids, target_names=target_names))
print(matthews_corrcoef(y_true, preds))
print("Testing loss: {:.4f}".format(loss.item()))
print("Testing accuracy: {:.2f}%".format(correct / total * 100))
