In [12]:
# set seed
import torch

manual_seed = 572
torch.manual_seed(manual_seed)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
n_gpu = torch.cuda.device_count()

In [13]:
## vocab size of data
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer

# vocab size of train set
df = pd.read_table('/content/drive/MyDrive/data/train.tsv')
vectorizer = CountVectorizer()

df = vectorizer.fit_transform(df['Review'])
vocab_num = len(vectorizer.get_feature_names())

In [14]:
## load and preprocess
from torchtext.data import Field
from torchtext.data import TabularDataset
from torchtext.vocab import Vectors

# torchtext fields
TEXT = Field(sequential=True, tokenize='spacy', lower=True) 
LABEL = Field(sequential=False, unk_token = None)

# load data: train and val
train, dev = TabularDataset.splits(
    path='/content/drive/MyDrive/data', 
    train='train.tsv', validation='dev.tsv', 
    format='tsv',
    skip_header=True, 
    fields=[('reviews', TEXT), ('ratings', LABEL)])

# load data: test
test = TabularDataset(
  path="/content/drive/MyDrive/data/test.tsv",
  format='tsv',
  skip_header=True, 
  fields=[('reviews', TEXT)])

# word embedding
vectors = Vectors(name='glove.42B.300d.txt', cache='/content/drive/MyDrive/data')

In [15]:
# train and evaluate
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score

def train_(loader,model,criterion,optimizer,device):
    total_loss = 0.0
    num_sample = 0

    for batch in loader:
        # load the current batch
        batch_input = batch.reviews
        batch_output = batch.ratings
        
        batch_input = batch_input.to(device)
        batch_output = batch_output.to(device)

        # forward propagation
        model_outputs = model(batch_input)
        cur_loss = criterion(model_outputs, batch_output)
        total_loss += cur_loss.cpu().item()

        # backward propagation
        optimizer.zero_grad()
        cur_loss.backward()
        optimizer.step()

        num_sample += batch_output.shape[0]
    return total_loss/num_sample

def evaluate(loader,model,criterion,device):
    all_pred=[]
    all_label = []
    
    with torch.no_grad(): 
        for batch in loader:
             # load the current batch
            batch_input = batch.reviews
            batch_output = batch.ratings

            batch_input = batch_input.to(device)

            # forward propagation
            model_outputs = model(batch_input)
            probabilities, predicted = torch.max(model_outputs.cpu().data, 1)
            all_pred.extend(predicted)
            all_label.extend(batch_output.cpu())
            
    accuracy = accuracy_score(all_label, all_pred)
    f1score = f1_score(all_label, all_pred, average='macro') 
    return accuracy, f1score

In [16]:
# create neural network
import torch.nn as nn

class GRUmodel(nn.Module):
    def __init__(self, embedding_size, vocab_size, output_size, hidden_size, num_layers, dropout_p, nonlin):
        super(GRUmodel, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embedding_size).from_pretrained(TEXT.vocab.vectors, freeze=False)
        
        self.gru_layer = nn.GRU(input_size=embedding_size, hidden_size=hidden_size, num_layers=num_layers, dropout=dropout_p)
        self.activation_fn = nonlin
        self.linear_layer = nn.Linear(hidden_size, output_size) 
        self.softmax_layer = nn.LogSoftmax(dim=1)
      
    def forward(self, x):
        out = self.embedding(x)
        out, _ = self.gru_layer(out)
        out = out[-1, :,:]
        out = self.activation_fn(out)
        out = self.linear_layer(out)
        out = self.softmax_layer(out)
        return out

In [17]:
## hyperparameter tuning
from torchtext.data import Iterator, BucketIterator
import scipy
import numpy as np

def random_search(num_iter):
    for i in range(num_iter):
        results = []
        config = {
            'max_vocab': scipy.stats.randint.rvs(5000, vocab_num),
            'min_freq': scipy.stats.randint.rvs(0,5),
            'batch_size': np.random.choice([32, 64]),
            'layers': scipy.stats.randint.rvs(2,6),
            'lr': scipy.stats.loguniform.rvs(10**-3,1),
            'nonlin' : np.random.choice([nn.ReLU(),nn.Tanh(), nn.Sigmoid()]),
            'dropout': scipy.stats.loguniform.rvs(0.01,0.9),
            'hidden_nodes': scipy.stats.randint.rvs(50,300),
            'max_epochs': scipy.stats.randint.rvs(5,20),
            'momentum': np.random.choice([0.99, 0.9, 0.5])
        }
        
        # build vocab, choose vocab size
        TEXT.build_vocab(train, max_size=config['max_vocab'], min_freq=config['min_freq'], vectors=vectors)
        LABEL.build_vocab(train)

        # create splits, choose batch size
        train_iter, dev_iter = BucketIterator.splits(
        (train, dev), 
        batch_sizes=(config['batch_size'], config['batch_size']),
        sort_key=lambda x: len(x.reviews), 
        sort=True, 
        sort_within_batch=True
        )

        test_iter = Iterator(
          dataset = test, 
          sort = False, 
          batch_size = config['batch_size'],
          sort_key=None, 
          shuffle=False, 
          sort_within_batch=False, 
          device = device, 
          train=False 
        )

        # constants
        NUM_CLASSES = 5 
        EMBEDDING_SIZE = 300 
        VOCAB_SIZE = len(TEXT.vocab)

        # initialize model
        model = GRUmodel(EMBEDDING_SIZE, VOCAB_SIZE, NUM_CLASSES, config['hidden_nodes'], config['layers'], config['dropout'], config['nonlin'])
        model.to(device)
        criterion = nn.NLLLoss()
        optimizer = np.random.choice([torch.optim.SGD(model.parameters(), lr=config['lr'], momentum=config['momentum']), 
                                      torch.optim.Adam(model.parameters(), lr=config['lr'])])
        
        # print configuration
        print("new config, iteration", i+1)
        config['optimizer'] = optimizer
        print(config)

        max_dev = 0
        best_epoch = 0
        for epoch in range(config['max_epochs']):
            train_loss = train_(train_iter,model,criterion,optimizer,device)

            train_acc, train_f1 = evaluate(train_iter,model,criterion,device)
            dev_acc, dev_f1 = evaluate(dev_iter,model,criterion,device)
                    
            if dev_acc > max_dev:
                max_dev = dev_acc
                best_epoch = epoch+1
                
            print('Epoch [{}/{}], Loss: {:.4f}, Training Accuracy: {:.4f}, Validation Accuracy: {:.4f}'.format(epoch+1, config['max_epochs'], train_loss, train_acc, dev_acc))
        results.append((max_dev,best_epoch,config))
        print("Best validation score for iterations #{}: {}".format(i+1,max_dev))
    return results

In [None]:
random_search(20)

new config, iteration 1
{'max_vocab': 41042, 'min_freq': 0, 'batch_size': 64, 'layers': 3, 'lr': 0.12227144096824473, 'nonlin': Tanh(), 'dropout': 0.7163376083095391, 'hidden_nodes': 179, 'max_epochs': 16, 'momentum': 0.99, 'optimizer': SGD (
Parameter Group 0
    dampening: 0
    lr: 0.12227144096824473
    momentum: 0.99
    nesterov: False
    weight_decay: 0
)}
Epoch [1/16], Loss: 0.0820, Training Accuracy: 0.2999, Validation Accuracy: 0.3255
Epoch [2/16], Loss: 0.1810, Training Accuracy: 0.2604, Validation Accuracy: 0.2733
Epoch [3/16], Loss: 0.2379, Training Accuracy: 0.3168, Validation Accuracy: 0.3236
Epoch [4/16], Loss: 0.2288, Training Accuracy: 0.2945, Validation Accuracy: 0.3050
Epoch [5/16], Loss: 0.2091, Training Accuracy: 0.1416, Validation Accuracy: 0.1415
Epoch [6/16], Loss: 0.1847, Training Accuracy: 0.2681, Validation Accuracy: 0.2714
Epoch [7/16], Loss: 0.1877, Training Accuracy: 0.3187, Validation Accuracy: 0.3324
Epoch [8/16], Loss: 0.1592, Training Accuracy: 0.27