In [1]:
# ## data exploration

# import pandas as pd
# from sklearn.feature_extraction.text import CountVectorizer

# # vocab size of train set
# df = pd.read_table('/content/drive/MyDrive/data/train.tsv')
# vectorizer = CountVectorizer()

# df = vectorizer.fit_transform(df['Review'])
# print('Vocab size of the train set: ', len(vectorizer.get_feature_names()))

In [2]:
# set seed
import torch

manual_seed = 572
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [3]:
## load and preprocess
from torchtext.data import Field
from torchtext.data import TabularDataset

# torchtext fields
TEXT = Field(sequential=True, tokenize='spacy', lower=True) 
LABEL = Field(sequential=False, unk_token = None)

# load data: train and val
train, dev = TabularDataset.splits(
    path='/content/drive/MyDrive/data', 
    train='train.tsv', validation='dev.tsv', 
    format='tsv',
    skip_header=True, 
    fields=[('reviews', TEXT), ('ratings', LABEL)])

# load data: test
test = TabularDataset(
  path="/content/drive/MyDrive/data/test.tsv",
  format='tsv',
  skip_header=True, 
  fields=[('reviews', TEXT)])

In [4]:
## Baseline model
from torchtext.vocab import Vectors
from torchtext.data import Iterator, BucketIterator

# import pretrained word embedding
vectors = Vectors(name='glove.42B.300d.txt', cache='/content/drive/MyDrive/data')

# build vocab, choose vocab size
TEXT.build_vocab(train, max_size=5000, min_freq=3, vectors=vectors) ##### try: max_size = 5000, 10_000, 20_000, all; min_freq=3, 5, none
LABEL.build_vocab(train)

# create splits, choose batch size
train_iter, dev_iter = BucketIterator.splits(
 (train, dev), 
 batch_sizes=(32,32), ##### try: (32,32), (64,32), (64,32), (64,64)
 sort_key=lambda x: len(x.reviews), 
 sort=True, 
 sort_within_batch=True
)

test_iter = Iterator(
  dataset = test, 
  sort = False, 
  batch_size = 32, ##### try: 32, 64
  sort_key=None, 
  shuffle=False, 
  sort_within_batch=False, 
  device = device, 
  train=False 
)

In [5]:
# create GRU class
import torch.nn as nn

class GRUmodel(nn.Module):
    def __init__(self, embedding_size, vocab_size, output_size, hidden_size, num_layers, dropout_p, nonlin):
        super(GRUmodel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size).from_pretrained(TEXT.vocab.vectors, freeze=False) ##### try: freeze = False, True
        
        self.gru_layer = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout_p)
        self.activation_fn = nonlin
        self.linear_layer = nn.Linear(hidden_size, output_size) 
        self.softmax_layer = nn.LogSoftmax(dim=1)
      
    def forward(self, x):
        out = self.embedding(x)
        out, _ = self.gru_layer(out)
        out = out[-1, :,:]
        out = self.activation_fn(out)
        out = self.linear_layer(out)
        out = self.softmax_layer(out)
        return out

In [6]:
# hyperparameters
HIDDEN_SIZE = 207 
NUM_LAYERS = 2
MAX_EPOCHS = 20 
LEARNING_RATE = 0.00413267391256542
NUM_CLASSES = 5 
EMBEDDING_SIZE = 300 
VOCAB_SIZE = len(TEXT.vocab)
DROPOUT_P = 0.023313434837778705
MOMENTUM = 0.99
NONLIN = nn.ReLU()

# set the seed
manual_seed = 333
torch.manual_seed(manual_seed)
if n_gpu > 0:
  torch.cuda.manual_seed(manual_seed)

# set model, define loss, optimizer
model = GRUmodel(EMBEDDING_SIZE, VOCAB_SIZE, NUM_CLASSES, HIDDEN_SIZE, NUM_LAYERS, DROPOUT_P, NONLIN)
model.to(device) 

criterion = nn.NLLLoss()
criterion.to(device)

optimizer = torch.optim.SGD(model.parameters(), lr=LEARNING_RATE, momentum=MOMENTUM)

In [7]:
## train and evaluate
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train_(loader):
    total_loss = 0.0
    num_sample = 0

    for batch in loader:
        # load current batch
        batch_input = batch.reviews
        batch_output = batch.ratings
        
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)

        # forward propagation
        model_outputs = model(batch_input)
        cur_loss = criterion(model_outputs, batch_output)
        total_loss += cur_loss.cpu().item()

        # backward propagation
        optimizer.zero_grad()
        cur_loss.backward()
        optimizer.step()

        num_sample += batch_output.shape[0]

    return total_loss/num_sample

def evaluate(loader):
    all_pred = []
    all_label = []

    with torch.no_grad():
        for batch in loader:
            # load current batch
            batch_input = batch.reviews
            batch_output = batch.ratings

            batch_input = batch_input.to(device)
            
            # forward propagation
            model_outputs = model(batch_input)

            # identify predicted class
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            all_pred.extend(predicted)
            all_label.extend(batch_output.cpu())
            
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return accuracy,f1score

In [8]:
# execute
for epoch in range(MAX_EPOCHS):

    train_loss = train_(train_iter)

    train_acc, train_f1 = evaluate(train_iter)
    val_acc, val_f1 = evaluate(dev_iter)
    
    print('Epoch [{}/{}], Loss: {:.4f}, Training Accuracy: {:.4f}, Validation Accuracy: {:.4f}'.format(epoch+1, MAX_EPOCHS, train_loss, train_acc, val_acc))

Epoch [1/20], Loss: 0.0421, Training Accuracy: 0.2732, Validation Accuracy: 0.2914
Epoch [2/20], Loss: 0.0346, Training Accuracy: 0.5298, Validation Accuracy: 0.5154
Epoch [3/20], Loss: 0.0302, Training Accuracy: 0.5165, Validation Accuracy: 0.5095
Epoch [4/20], Loss: 0.0276, Training Accuracy: 0.5977, Validation Accuracy: 0.5852
Epoch [5/20], Loss: 0.0257, Training Accuracy: 0.6388, Validation Accuracy: 0.6110
Epoch [6/20], Loss: 0.0249, Training Accuracy: 0.6382, Validation Accuracy: 0.5988
Epoch [7/20], Loss: 0.0244, Training Accuracy: 0.6753, Validation Accuracy: 0.6418
Epoch [8/20], Loss: 0.0234, Training Accuracy: 0.6690, Validation Accuracy: 0.6432
Epoch [9/20], Loss: 0.0226, Training Accuracy: 0.6308, Validation Accuracy: 0.5827
Epoch [10/20], Loss: 0.0232, Training Accuracy: 0.6840, Validation Accuracy: 0.6481
Epoch [11/20], Loss: 0.0221, Training Accuracy: 0.6510, Validation Accuracy: 0.6271
Epoch [12/20], Loss: 0.0227, Training Accuracy: 0.6834, Validation Accuracy: 0.6252
E