# Lab 6: RNNs & Word Embeddings

Author: Ravi C, Ren Yi

Edit by Long Chen

In [60]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity
import gensim
from gensim.models import KeyedVectors
import nltk
nltk.download('punkt')
from nltk.tokenize import word_tokenize
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset

# If there's a GPU available...
if torch.cuda.is_available():    

    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

# If not...
else:
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")

print("Device being used: %s" %device)

No GPU available, using the CPU instead.
Device being used: cpu


[nltk_data] Downloading package punkt to /Users/longchen/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


## Goal:
- Understand the mechanics of RNNs in Pytorch
- Train RNN based neural networks on text data
- Basics of word embedding and how to use them

## Problem Setup

### Dataset
Download the two files in the data folder [here](https://drive.google.com/drive/folders/1KBUyfU87zz8eOZwr2ifDi2Z4LBHlSZ28?usp=sharing). Save the folder in the same directory as this notebook.

For the first part, we will be using the [First GOP Debate Twitter Sentiment dataset](https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras/data), which contains Tweets after the first GOP debate and their sentiments (among other stuff).

In [7]:
np.random.seed(1111)

df = pd.read_csv('data/Sentiment.csv')
df.head()

Unnamed: 0,id,candidate,candidate_confidence,relevant_yn,relevant_yn_confidence,sentiment,sentiment_confidence,subject_matter,subject_matter_confidence,candidate_gold,...,relevant_yn_gold,retweet_count,sentiment_gold,subject_matter_gold,text,tweet_coord,tweet_created,tweet_id,tweet_location,user_timezone
0,1,No candidate mentioned,1.0,yes,1.0,Neutral,0.6578,None of the above,1.0,,...,,5,,,RT @NancyLeeGrahn: How did everyone feel about...,,2015-08-07 09:54:46 -0700,629697200650592256,,Quito
1,2,Scott Walker,1.0,yes,1.0,Positive,0.6333,None of the above,1.0,,...,,26,,,RT @ScottWalker: Didn't catch the full #GOPdeb...,,2015-08-07 09:54:46 -0700,629697199560069120,,
2,3,No candidate mentioned,1.0,yes,1.0,Neutral,0.6629,None of the above,0.6629,,...,,27,,,RT @TJMShow: No mention of Tamir Rice and the ...,,2015-08-07 09:54:46 -0700,629697199312482304,,
3,4,No candidate mentioned,1.0,yes,1.0,Positive,1.0,None of the above,0.7039,,...,,138,,,RT @RobGeorge: That Carly Fiorina is trending ...,,2015-08-07 09:54:45 -0700,629697197118861312,Texas,Central Time (US & Canada)
4,5,Donald Trump,1.0,yes,1.0,Positive,0.7045,None of the above,1.0,,...,,156,,,RT @DanScavino: #GOPDebate w/ @realDonaldTrump...,,2015-08-07 09:54:45 -0700,629697196967903232,,Arizona


Let's first look at some basic intuition and stats of the data

In [8]:
# Training data is a string of words
df.loc[0, 'text']

'RT @NancyLeeGrahn: How did everyone feel about the Climate Change question last night? Exactly. #GOPDebate'

In [9]:
pd.DataFrame(df.groupby('sentiment').count()['text'])

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
Negative,8493
Neutral,3142
Positive,2236


For simplicity, 
- we only use ```X = 'sentiment'``` and ```y = 'text'``` from the original dataframe. 
- We only look at positive (1) and negative (0) tweets.

In [10]:
df = df[['sentiment', 'text']]
df = df[df['sentiment'] != 'Neutral']
df['sentiment'] = [1 if s == "Positive" else 0 for s in df['sentiment']]
df.groupby('sentiment').count()

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
0,8493
1,2236


In [11]:
train_data, test_data = train_test_split(df, test_size=0.10, random_state=42)
train_data.index = np.arange(len(train_data))
test_data.index = np.arange(len(test_data))
train_data.groupby('sentiment').count().apply(lambda x: 100 * x / float(x.sum()))

Unnamed: 0_level_0,text
sentiment,Unnamed: 1_level_1
0,79.152858
1,20.847142


### Input representations

#### Build vocabulary
We need to build a vocabulary using words in our training data. Any words in the test set that are not in our vocabulary will be replaced with an ```<UNK>``` token. We will also add a ```<PAD>``` token as padding.

For computational purposes, we'll only take words that appeared more than 3 times.

In [13]:
UNK = "<UNK>"
PAD = "<PAD>"

def build_vocab(sentences, min_count=3, max_vocab=None):
    """
    Build vocabulary from sentences (list of strings)
    """
    # keep track of the number of appearance of each word
    word_count = Counter()
    
    for sentence in sentences:
        # Regular expression operations: [] (indicate a set of characters), 
        sentence = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', sentence)
        word_count.update(word_tokenize(sentence.lower()))
    
    vocabulary = list([w for w in word_count if word_count[w] > min_count]) + [UNK, PAD]
    indices = dict(zip(vocabulary, range(len(vocabulary))))

    return vocabulary, indices

vocabulary, vocab_indices = build_vocab(train_data['text'])

print(len(vocabulary))

3069


## Model Time

#### Word representations
Next, we neeed to convert each word/token in the sentences into its index in the vocabulary so that pytorch can use it. We do this for both train and test set.

### DataLoader

In [14]:
class TweetDataset(Dataset):
    def __init__(self, vocab_index, df, label = 'sentiment'):
        self.vocab_index = vocab_index
        self.df = df
        self.label = label
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, key):
        sentence = self.df.loc[key, 'text']
        sentence = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', sentence)
        token_indices = np.array([self.vocab_index[word] if word in self.vocab_index else self.vocab_index['<UNK>'] for word in word_tokenize(sentence.lower())])
        return (torch.tensor(token_indices) , self.df.loc[key, self.label])


def pad_collate(batch):
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]
    
    # I want to    eat an     apple
    # I am   going to  sleep  PAD  
    # batch_first: output will be in B x T x * if True, or in T x B x * otherwise
    xx_pad = pad_sequence(xx, batch_first=True, padding_value=len(vocabulary)-1)

    return torch.as_tensor(xx_pad), torch.as_tensor(x_lens), torch.LongTensor(yy)
    

BATCH_SIZE = 32
# shuffle: set to True to have the data reshuffled at every epoch
train_loader = DataLoader(TweetDataset(vocab_indices, train_data),
                          batch_size=BATCH_SIZE,
                          shuffle=True,
                          collate_fn = pad_collate)
test_loader = DataLoader(TweetDataset(vocab_indices, test_data),
                         batch_size=BATCH_SIZE,
                         shuffle=True,
                         collate_fn = pad_collate)

Let's get a general idea of what an instance of training batch will be like.

In [30]:
sample_input = next(iter(train_loader))
print("Padded sequence".center(80, '*'))
print(sample_input[0][0])
print("Length of sequence".center(80, '*'))
print(sample_input[1])
print("Label of sequence".center(80, '*'))
print(sample_input[2])

********************************Padded sequence*********************************
tensor([  30,   17, 3067,   31,   95,  946,   87, 1029,    1, 1767, 1768,    5,
         265,   37,   93,   17, 1267,   48,  343,  594,   90,  354,   51,  627,
          93,   32, 3068, 3068, 3068, 3068])
*******************************Length of sequence*******************************
tensor([26, 30, 12, 22, 26, 20, 28, 22, 19, 27, 25, 21, 21, 15, 23, 30, 30, 23,
        26, 21, 13, 22, 15, 21, 25, 27, 28, 26, 22, 18, 19, 21])
*******************************Label of sequence********************************
tensor([0, 1, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0, 1, 0])


### RNN

For this lab, we will be exploring two variants of RNN: vanilla (or Elman) RNN and LSTM (Long-short term memory). In the following code block, please try to define your own model. Here are some hints.

- Each input word is represented by a vector of dimension ```embedding_dim```. Check out ```nn.Embedding``` to see how to initialize embeddings randomly.
- Your model should take the following input parameters
    - ```hidden_dim```: The number of features in the hidden state h of your RNN layer
    - ```output_dim```: Number of output classes
    - ```vocab_size``` Size of your vocabulary. 
    - ```embedding_dim```: Dimension of word embeddings
- Your model should consist of an RNN layer (you can use either ```nn.RNN``` or ```nn.LSTM```) followed by a linear layer.
- $h_{0}$ (and $c$ if you use LSTM) should be initialized as a zero vector of dimension ```hidden_dim```. You might want to check out ```nn.Parameter```

In [31]:
class RNN(nn.Module):
    def __init__(self, hidden_dim, output_dim, 
                 vocab_size, embedding_dim, rnn='LSTM'):
        super(RNN, self).__init__()
        
        self.emb = nn.Embedding(vocab_size, embedding_dim, padding_idx=vocab_size-1)
        self.hidden_dim = hidden_dim
        self.rnn_fn = rnn
        assert self.rnn_fn in ['LSTM', 'RNN']
        self.rnn = getattr(nn, rnn)(embedding_dim, hidden_dim, batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim)
        
    def forward(self, x, x_len):
        x = self.emb(x)
        
        # output:  tensor containing the output features (h_t) from the last layer of the RNN, tensor containing the hidden state for t = seq_len.
        # pack_padded_sequence: Packs a Tensor containing padded sequences of variable length.
        # enforce_sorted: if True, the input is expected to contain sequences sorted by length in a decreasing order. If False, the input will get sorted unconditionally.
        _, last_hidden = self.rnn(pack_padded_sequence(x, x_len, batch_first=True, enforce_sorted=False))
        if self.rnn_fn == 'LSTM':
            # (h,c)
            # c_0: tensor containing the initial cell state for each element in the batch.
            last_hidden = last_hidden[0]
        out = self.fc(last_hidden.view(-1, self.hidden_dim))
        return out

In [11]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


### Train and validation loop

In [32]:
def train(model, train_loader=train_loader, test_loader=test_loader, 
          learning_rate=0.001, num_epoch=10, print_every=100):
    # Training steps
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, weight_decay=10**(-5))
    for epoch in range(num_epoch):
        model.train()
        for i, (data, data_len, labels) in enumerate(train_loader):
            data, data_len, labels = data.to(device), data_len.to(device), labels.to(device)
            outputs = model(data, data_len)
            model.zero_grad()
            loss = loss_fn(outputs.squeeze(), labels)
            loss.backward()
            optimizer.step()

             # report performance
            if (i + 1) % print_every == 0:
                print('Train set | epoch: {:3d} | {:6d}/{:6d} batches | Loss: {:6.4f}'.format(
                    epoch, i + 1, len(train_loader), loss.item()))     
    
    # Evaluate after every epochh
        correct = 0
        total = 0
        model.eval()

        predictions = []
        truths = []

        with torch.no_grad():
            for i, (data, data_len, labels) in enumerate(test_loader):
                data, data_len, labels = data.to(device), data_len.to(device), labels.to(device)
                outputs = model(data, data_len)
                pred = outputs.data.max(-1)[1]
                predictions += list(pred.cpu().numpy())
                truths += list(labels.cpu().numpy())
                total += labels.size(0)
                correct += (pred == labels).sum()
                
            acc = (100 * correct / total)
            auc = roc_auc_score(truths, predictions)
            elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
            print('Test set | Accuracy: {:6.4f} | AUC: {:4.2f} | time elapse: {:>9}'.format(
                acc, auc, elapse))

Run the code block below to check your model performance.

In [33]:
torch.manual_seed(42)
rnn_model = RNN(40, 2, len(vocabulary), 50, rnn='RNN').to(device)
train(rnn_model)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.5330
Train set | epoch:   0 |    200/   302 batches | Loss: 0.5172
Train set | epoch:   0 |    300/   302 batches | Loss: 0.3056
Test set | Accuracy: 79.8695 | AUC: 0.53 | time elapse:  00:00:03
Train set | epoch:   1 |    100/   302 batches | Loss: 0.4811
Train set | epoch:   1 |    200/   302 batches | Loss: 0.2720
Train set | epoch:   1 |    300/   302 batches | Loss: 0.5370
Test set | Accuracy: 82.1062 | AUC: 0.61 | time elapse:  00:00:07
Train set | epoch:   2 |    100/   302 batches | Loss: 0.4949
Train set | epoch:   2 |    200/   302 batches | Loss: 0.4659
Train set | epoch:   2 |    300/   302 batches | Loss: 0.2792
Test set | Accuracy: 82.6654 | AUC: 0.65 | time elapse:  00:00:11
Train set | epoch:   3 |    100/   302 batches | Loss: 0.3749
Train set | epoch:   3 |    200/   302 batches | Loss: 0.2641
Train set | epoch:   3 |    300/   302 batches | Loss: 0.4357
Test set | Accuracy: 82.5722 | AUC: 0.69 | time elapse:  0

In [34]:
lstm_model = RNN(40, 2, len(vocabulary), 50, rnn='LSTM').to(device)
train(lstm_model)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.4345
Train set | epoch:   0 |    200/   302 batches | Loss: 0.4457
Train set | epoch:   0 |    300/   302 batches | Loss: 0.4866
Test set | Accuracy: 81.4539 | AUC: 0.59 | time elapse:  00:00:06
Train set | epoch:   1 |    100/   302 batches | Loss: 0.3053
Train set | epoch:   1 |    200/   302 batches | Loss: 0.4502
Train set | epoch:   1 |    300/   302 batches | Loss: 0.3433
Test set | Accuracy: 82.6654 | AUC: 0.63 | time elapse:  00:00:12
Train set | epoch:   2 |    100/   302 batches | Loss: 0.4614
Train set | epoch:   2 |    200/   302 batches | Loss: 0.3137
Train set | epoch:   2 |    300/   302 batches | Loss: 0.4248
Test set | Accuracy: 84.3430 | AUC: 0.67 | time elapse:  00:00:17
Train set | epoch:   3 |    100/   302 batches | Loss: 0.2623
Train set | epoch:   3 |    200/   302 batches | Loss: 0.2395
Train set | epoch:   3 |    300/   302 batches | Loss: 0.3912
Test set | Accuracy: 84.1566 | AUC: 0.71 | time elapse:  0

### Model predictions

In [35]:
def sentences_to_padded_index_sequences(words, sentences):
    for i, s in enumerate(sentences):
        # only take the first pad_length tokens
        token_indices = np.array([words[w] if w in words else words['<UNK>'] for w in re.findall(r"[\w']+|[.,!?;]", s.lower())])
    return token_indices, len(token_indices)

In [36]:
def test_sentence(sentence, model):
    model.eval()
    test_tensor, len_sent = sentences_to_padded_index_sequences(vocab_indices, [sentence])
    score = model(torch.LongTensor(test_tensor.astype(int)).unsqueeze(0).to(device), torch.as_tensor([len_sent]).to(device)).data.numpy().squeeze()
    label = np.argmax(score)
    return ("positive" if label == 1 else "negative", score[label])

In [37]:
test_sentence("Enjoyed the #GOPDebates and am looking forward to the #DemocraticDebates next.", lstm_model)

('positive', 2.4169722)

In [38]:
test_sentence("Donald Trump is a really nasty piece of work. Hope he disappears quickly. #GOPDebate", lstm_model)

('negative', 2.8372514)

## Word Embeddings and How to Use Them

When using deep learning methods on NLP tasks, we usually utilize [word embedding](https://en.wikipedia.org/wiki/Word_embedding). To put it briefly, word embedding represent words, or tokens, in a vocabulary as a distributed numerical vector. There are a lot of methods to obtain a word embedding, with some of the most famous shallow models being Word2Vec, GloVe, and FastText while the deeper models are BERT, RoBERTa, T5. It is not difficult to find a general purpose word embedding trained by one of the aforementioned methods on the Internet that's been trained with a massive amount of data. It is usually a good idea to use these pre-trained embedding to save yourself some time and computing resource.

In this lab, we will be using the [GloVe embedding](https://nlp.stanford.edu/projects/glove/) developed by Stanford,  one of the state-of-the-art word embedding. Please download the file ```glove.6B.50d.txt``` [here](https://drive.google.com/file/d/1JweINiA5JvTNLTm663LH8OdWssK2Kcid/view?usp=sharing).

In [40]:
from gensim.scripts.glove2word2vec import glove2word2vec
# load embedding

_ = glove2word2vec('data/glove.6B.50d.txt', 'tmp_file')
glove_embedding = KeyedVectors.load_word2vec_format('tmp_file')

  _ = glove2word2vec('data/glove.6B.50d.txt', 'tmp_file')


### Word embedding vectors

Now we can play around with these vectors to get a sense of how word embeddings can be used to represent words. Here's how you can look up a word embedding vector.

### Find similar words

The word embedding vectors can help us find words with similar meanings. Word similarities can be measured by [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). The function below looks up the most similar words to a given word:

In [41]:
glove_embedding.similar_by_word('hate', topn=5)

[('hatred', 0.7746836543083191),
 ('shame', 0.7489537000656128),
 ('racist', 0.7371559143066406),
 ('anyone', 0.7364715933799744),
 ('bigotry', 0.7300711870193481)]

### Word arithmetic

In [42]:
glove_embedding.similar_by_word(glove_embedding['worse'] - glove_embedding['better'] + glove_embedding['best'], topn=1)

[('worst', 0.8109660744667053)]

In [43]:
glove_embedding.similar_by_word(glove_embedding['king'] - glove_embedding['man'] + glove_embedding['woman'], topn=2)

[('king', 0.8859834671020508), ('queen', 0.8609582185745239)]

### Train an LSTM model withh GloVe embedding

Complete the code below. Replace the randomly generated embeddings withh GloVe embeddings. (Hint: check out ```nn.Embedding.weight```). 

In [67]:
class GloveDataset(Dataset):
    def __init__(self, embedding, df, label = 'sentiment'):
        self.embedding = embedding
        self.df = df
        self.label = label
    
    def __len__(self):
        return len(self.df)
    
    def __getitem__(self, key):
        sentence = self.df.loc[key,'text']
        sentence = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', sentence)
        
        # Deal with recent gensim updates
        if int(gensim.__version__[0]) == 4:
            token_indices = np.array([self.embedding[word] for word in word_tokenize(sentence.lower()) if word in self.embedding])
        else:
            token_indices = np.array([self.embedding[word] for word in word_tokenize(sentence.lower()) if word in self.embedding.vocab])
        
        if len(token_indices):
            return (torch.from_numpy(token_indices) , self.df.loc[key, self.label])
        else:
            return None


def pad_collate_glove(batch):
    batch = filter(lambda x:x is not None, batch)
    (xx, yy) = zip(*batch)
    x_lens = [len(x) for x in xx]

    xx_pad = pad_sequence(xx, batch_first=True, padding_value=0)

    return xx_pad, torch.as_tensor(x_lens), torch.LongTensor(yy)

In [68]:
gensim.__version__[0]

'4'

In [69]:
# Re-indexing tokens
train_loader_glove = DataLoader(GloveDataset(glove_embedding, train_data),
                                batch_size = BATCH_SIZE,
                                shuffle = True,
                                collate_fn = pad_collate_glove)
test_loader_glove = DataLoader(GloveDataset(glove_embedding, test_data),
                               batch_size=BATCH_SIZE,
                               shuffle=True,
                               collate_fn = pad_collate_glove)

In [70]:
glove_model = RNN(40, 2, len(vocabulary), 50, rnn='LSTM')
# nn.Identity can fill in gaps to provide a consistent architecture.
glove_model.emb = nn.Identity()
train(glove_model.to(device), train_loader=train_loader_glove, test_loader=test_loader_glove)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.4955
Train set | epoch:   0 |    200/   302 batches | Loss: 0.4322
Train set | epoch:   0 |    300/   302 batches | Loss: 0.6698
Test set | Accuracy: 80.5219 | AUC: 0.59 | time elapse:  00:00:05
Train set | epoch:   1 |    100/   302 batches | Loss: 0.2929
Train set | epoch:   1 |    200/   302 batches | Loss: 0.4945
Train set | epoch:   1 |    300/   302 batches | Loss: 0.5564
Test set | Accuracy: 82.9450 | AUC: 0.67 | time elapse:  00:00:10
Train set | epoch:   2 |    100/   302 batches | Loss: 0.4828
Train set | epoch:   2 |    200/   302 batches | Loss: 0.4426
Train set | epoch:   2 |    300/   302 batches | Loss: 0.3756
Test set | Accuracy: 82.9450 | AUC: 0.64 | time elapse:  00:00:15
Train set | epoch:   3 |    100/   302 batches | Loss: 0.1594
Train set | epoch:   3 |    200/   302 batches | Loss: 0.4944
Train set | epoch:   3 |    300/   302 batches | Loss: 0.3339
Test set | Accuracy: 82.5722 | AUC: 0.71 | time elapse:  0

In the last case we just used the embedding without training it. Let's try and train the GloVe embedding to see if that increases the performance.

In [72]:
import numpy as np
from tqdm import tqdm
# load embedding
emb_dim = 50
with open('data/glove.6B.50d.txt') as f:
    glove_embedding = []
    words = {}
    chars = {}
    idx2words = {}
    ordered_words = []

    for i, line in tqdm(enumerate(f)):
        s = line.split()
        glove_embedding.append(np.asarray(s[1:]))
        
        words[s[0]] = len(words)
        idx2words[i] = s[0]
        ordered_words.append(s[0])
        
# add unknown to word and char
glove_embedding.append(np.random.rand(emb_dim))
words["<UNK>"] = len(words)

# add padding
glove_embedding.append(np.zeros(emb_dim))
words["<PAD>"] = len(words)

chars["<UNK>"] = len(chars)
chars["<PAD>"] = len(chars)

glove_embedding = np.array(glove_embedding).astype(float)

400000it [00:05, 69768.13it/s]


In [73]:
train_loader_glove = DataLoader(TweetDataset(words, train_data),
                                batch_size=BATCH_SIZE,
                                shuffle=True,
                                collate_fn = pad_collate)
test_loader_glove = DataLoader(TweetDataset(words, test_data),
                               batch_size=BATCH_SIZE,
                               shuffle=True,
                               collate_fn = pad_collate)

In [74]:
glove_model = RNN(40, 2, len(glove_embedding), 50, rnn='LSTM')
glove_model.emb.weight.data.copy_(torch.from_numpy(glove_embedding))
train(glove_model, train_loader=train_loader_glove, test_loader=test_loader_glove)

Train set | epoch:   0 |    100/   302 batches | Loss: 0.6346
Train set | epoch:   0 |    200/   302 batches | Loss: 0.3434
Train set | epoch:   0 |    300/   302 batches | Loss: 0.5903
Test set | Accuracy: 80.8947 | AUC: 0.59 | time elapse:  00:00:22
Train set | epoch:   1 |    100/   302 batches | Loss: 0.4892
Train set | epoch:   1 |    200/   302 batches | Loss: 0.4271
Train set | epoch:   1 |    300/   302 batches | Loss: 0.4497
Test set | Accuracy: 83.8770 | AUC: 0.68 | time elapse:  00:00:44
Train set | epoch:   2 |    100/   302 batches | Loss: 0.3000
Train set | epoch:   2 |    200/   302 batches | Loss: 0.3587
Train set | epoch:   2 |    300/   302 batches | Loss: 0.1875
Test set | Accuracy: 84.6226 | AUC: 0.72 | time elapse:  00:01:07
Train set | epoch:   3 |    100/   302 batches | Loss: 0.2807
Train set | epoch:   3 |    200/   302 batches | Loss: 0.1064
Train set | epoch:   3 |    300/   302 batches | Loss: 0.1865
Test set | Accuracy: 83.9702 | AUC: 0.75 | time elapse:  0

The model seems to have overfit here. We can increase the regularization through weight decay/dropout to get better results.