# Lab 7: RNNs & Word Embeddings

In [None]:
# Notebook metadata: lab author and the course/term this lab was written for.
__author__ = "Ren Yi"
__version__ = "BMSC-GA 4493/BMIN-GA 3007, NYU, Spring 2019"

In [None]:
import re
import os
import time
import pandas as pd
import numpy as np
from collections import Counter
from sklearn.model_selection import train_test_split
from sklearn.metrics.pairwise import cosine_similarity

## Goal:
- Understand the mechanics of RNNs in PyTorch
- Train RNN based neural networks on text data
- Basics of word embedding and how to use them

## Problem Setup

### Dataset
Download the two files in the data folder [here](https://drive.google.com/drive/folders/1KBUyfU87zz8eOZwr2ifDi2Z4LBHlSZ28?usp=sharing). Save the folder in the same directory as this notebook.

For the first part, we will be using the [First GOP Debate Twitter Sentiment dataset](https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras/data), which contains Tweets after the first GOP debate and their sentiments (among other stuff).

In [None]:
# Seed NumPy's global RNG so NumPy-based randomness in this notebook is repeatable.
np.random.seed(1111)

# Load the tweet sentiment table and preview its first rows.
df = pd.read_csv('data/Sentiment.csv')
df.head()

Let's first look at some basic stats of the data

In [None]:
# Number of non-null tweet texts per sentiment label, shown as a one-column table.
df.groupby('sentiment')['text'].count().to_frame()

For simplicity, 
- we only use ```X = 'text'``` and ```y = 'sentiment'``` from the original dataframe. 
- We only look at positive (1) and negative (0) tweets.

In [None]:
# Keep only the label and the tweet text, and drop neutral tweets.
# .copy() makes the filtered frame independent of the original, so the column
# assignment below does not trigger pandas' SettingWithCopyWarning.
df = df.loc[df['sentiment'] != 'Neutral', ['sentiment', 'text']].copy()
# Binary target: 1 for "Positive", 0 for every other remaining label.
df['sentiment'] = (df['sentiment'] == "Positive").astype(int)
df.groupby('sentiment').count()

In [None]:
# Hold out 10% of the tweets as a test set (fixed random_state for a repeatable split).
train_data, test_data = train_test_split(df, test_size=0.10, random_state=42)
# Percentage of training tweets that fall in each sentiment class.
train_data.groupby('sentiment').count().apply(lambda x: 100 * x / float(x.sum()))

In [None]:
# Separate each partition into raw tweet text (inputs) and 0/1 sentiment (targets).
train_X = train_data['text']
train_y = train_data['sentiment']
test_X = test_data['text']
test_y = test_data['sentiment']

### Input representations

#### Build vocabulary
We need to build a vocabulary using words in our training data. Any words in the test set that are not in our vocabulary will be replaced with an ```<UNK>``` token. We will also add a ```<PAD>``` token as padding.

For computational purposes, we'll only take words that appeared more than 3 times.

In [None]:
UNK = "<UNK>"
PAD = "<PAD>"


def build_vocab(sentences, min_count=3, max_vocab=None):
    """
    Build vocabulary from sentences (iterable of strings).

    Each sentence is lower-cased and split into word tokens (runs of word
    characters and apostrophes) and the punctuation marks . , ! ? ;

    :param sentences: iterable of strings to collect tokens from
    :param min_count: a token is kept only if it appears strictly more than
        min_count times
    :param max_vocab: optional cap on the number of kept tokens (the two
        special tokens are not counted). When set, only the max_vocab most
        frequent tokens survive; ties go to the token seen first.
        None means no cap.
    :return: (vocabulary, indices). vocabulary is the list of kept tokens in
        first-seen order followed by UNK and PAD; indices maps every entry of
        vocabulary to its position in that list.
    :raises ValueError: if max_vocab is negative
    """
    if max_vocab is not None and max_vocab < 0:
        raise ValueError("max_vocab must be None or a non-negative integer")

    token_pattern = re.compile(r"[\w']+|[.,!?;]")

    # keep track of the number of appearance of each word
    word_count = Counter()
    for s in sentences:
        word_count.update(token_pattern.findall(s.lower()))

    kept = [w for w, count in word_count.items() if count > min_count]

    if max_vocab is not None and len(kept) > max_vocab:
        # sorted() is stable (also with reverse=True), so equally frequent
        # tokens keep their first-seen order and the cut-off is deterministic.
        most_frequent = set(
            sorted(kept, key=word_count.__getitem__, reverse=True)[:max_vocab]
        )
        kept = [w for w in kept if w in most_frequent]

    vocabulary = kept + [UNK, PAD]
    indices = {w: i for i, w in enumerate(vocabulary)}

    return vocabulary, indices
    
# Build the vocabulary from the training tweets only, then report its size
# (kept tokens plus the two special tokens).
vocabulary, vocab_indices = build_vocab(train_X)
print(len(vocabulary))

#### Word representations
Next, we need to convert each word/token in the sentences into its index in the vocabulary so that PyTorch can use it. We also pad (or truncate) our sentences to a fixed length of 25 tokens so that we can do batch processing. We do this for both the train and the test set.

In [None]:
def sentences_to_padded_index_sequences(words, sentences, pad_length=100):
    """
    Turn sentences into fixed-length rows of vocabulary indices.

    Each sentence is lower-cased and tokenized; tokens missing from `words`
    are mapped to the '<UNK>' index, sentences longer than pad_length are
    truncated, and shorter ones are filled up with the '<PAD>' index.

    :param words: dict mapping token -> index; must contain '<UNK>' and '<PAD>'
    :param sentences: sized iterable of strings
    :param pad_length: number of columns in the result
    :return: float numpy array of shape [len(sentences), pad_length]
    """
    token_pattern = r"[\w']+|[.,!?;]"
    padded_sequences = np.zeros((len(sentences), pad_length))

    for row, sentence in enumerate(sentences):
        # start from an all-PAD row, then overwrite its leading positions
        padded_row = np.full(pad_length, words['<PAD>'], dtype=float)

        # only the first pad_length tokens fit
        tokens = re.findall(token_pattern, sentence.lower())[:pad_length]
        token_ids = [words[tok] if tok in words else words['<UNK>'] for tok in tokens]

        padded_row[:len(token_ids)] = token_ids
        padded_sequences[row] = padded_row

    return padded_sequences

In [None]:
# Convert the raw tweets into fixed-length (25-token) index sequences using the
# vocabulary built from the training set.
train_X = sentences_to_padded_index_sequences(vocab_indices, train_data['text'], 25)
test_X = sentences_to_padded_index_sequences(vocab_indices, test_data['text'], 25)

In [None]:
# One padded row per training tweet: (number of training tweets, 25).
train_X.shape

## Model Time

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.metrics import roc_auc_score
from torch.utils.data import DataLoader, Dataset

### DataLoader

In [None]:
class TweetDataset(Dataset):
    """
    Map-style dataset pairing padded token-index sequences with integer labels.

    :param sentences: array-like of token indices, one row per example
        (presumably the output of sentences_to_padded_index_sequences)
    :param labels: array-like of class ids, one per example
    """
    def __init__(self, sentences, labels):
        # Ask for 64-bit integers explicitly: plain `int` is a 32-bit type on
        # some platforms (e.g. Windows with NumPy < 2), while
        # nn.CrossEntropyLoss requires int64 (Long) class targets.
        self.sentences = np.asarray(sentences).astype(np.int64)
        self.labels = np.asarray(labels).astype(np.int64)

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, key):
        """Return (LongTensor of token indices, integer label) for example `key`."""
        return (torch.tensor(self.sentences[key], dtype=torch.long), self.labels[key])

BATCH_SIZE = 32
# Training batches are reshuffled every epoch.
train_loader = DataLoader(TweetDataset(train_X, train_y),
                          batch_size=BATCH_SIZE,
                          shuffle=True)
# Evaluation does not depend on example order, so the test set is iterated in a
# fixed order; this also avoids drawing from the global RNG at every evaluation.
test_loader = DataLoader(TweetDataset(test_X, test_y),
                         batch_size=BATCH_SIZE,
                         shuffle=False)

### Train and validation loop

In [None]:
def train(model, train_loader=train_loader, test_loader=test_loader,
          learning_rate=0.001, num_epoch=10, print_every=100):
    """
    Train `model` with Adam and cross-entropy loss, evaluating after every epoch.

    :param model: module whose forward pass maps a batch of token indices to
        one row of class scores per example (extra singleton dimensions are
        tolerated)
    :param train_loader: iterable of (data, labels) training batches
    :param test_loader: iterable of (data, labels) evaluation batches
    :param learning_rate: Adam learning rate
    :param num_epoch: number of passes over train_loader
    :param print_every: print the current batch loss every this many batches

    Prints training loss during each epoch and test accuracy / AUC after it.
    Returns nothing; the model is updated in place and is left in eval mode.
    """
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
    for epoch in range(num_epoch):
        # Training steps
        # The evaluation below switches the model to eval mode, so training
        # mode has to be re-enabled at the start of every epoch.
        model.train()
        for i, (data, labels) in enumerate(train_loader):
            optimizer.zero_grad()
            # reshape rather than squeeze(): squeeze() would also drop the
            # batch dimension when a batch holds a single example.
            outputs = model(data).reshape(labels.size(0), -1)
            loss = loss_fn(outputs, labels)
            loss.backward()
            optimizer.step()

            # report performance
            if (i + 1) % print_every == 0:
                print('Train set | epoch: {:3d} | {:6d}/{:6d} batches | Loss: {:6.4f}'.format(
                    epoch, i + 1, len(train_loader), loss.item()))

        # Evaluate after every epoch
        model.eval()
        correct = 0
        total = 0
        predictions = []
        truths = []

        with torch.no_grad():
            for data, labels in test_loader:
                outputs = model(data).reshape(labels.size(0), -1)
                pred = outputs.argmax(dim=1)
                predictions.extend(pred.tolist())
                truths.extend(labels.tolist())
                total += labels.size(0)
                # .item() keeps the counter a plain Python int, so the
                # accuracy below is ordinary float division.
                correct += (pred == labels).sum().item()

        acc = 100.0 * correct / total
        # NOTE: AUC is computed from the hard class predictions, not from
        # continuous scores.
        auc = roc_auc_score(truths, predictions)
        elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
        print('Test set | Accuracy: {:6.4f} | AUC: {:4.2f} | time elapse: {:>9}'.format(
            acc, auc, elapse))

For this lab, we will be exploring two variants of RNN: the vanilla (or Elman) RNN and the LSTM (Long Short-Term Memory). In the following code block, please try to define your own model. Here are some hints.

- Each input word is represented by a vector of dimension ```embedding_dim```. Check out ```nn.Embedding``` to see how to initialize embeddings randomly.
- Your model should take the following input parameters
    - ```hidden_dim```: The number of features in the hidden state h of your RNN layer
    - ```output_dim```: Number of output classes
    - ```vocab_size```: Size of your vocabulary
    - ```embedding_dim```: Dimension of word embeddings
- Your model should consist of an RNN layer (you can use either ```nn.RNN``` or ```nn.LSTM```) followed by a linear layer.
- $h_{0}$ (and $c_{0}$ if you use LSTM) should be initialized as a zero vector of dimension ```hidden_dim```. You might want to check out ```nn.Parameter```

### RNN

In [None]:
class RNN(nn.Module):
    """
    Sentence classifier: embedding -> single-layer RNN/LSTM -> linear layer.

    :param hidden_dim: number of features in the recurrent hidden state
    :param output_dim: number of output classes
    :param vocab_size: number of rows in the embedding table
    :param embedding_dim: dimension of each word embedding
    :param rnn: 'RNN' for a vanilla (Elman) RNN or 'LSTM'
    :raises ValueError: if `rnn` is neither 'RNN' nor 'LSTM'
    """
    def __init__(self, hidden_dim, output_dim,
                 vocab_size, embedding_dim, rnn='LSTM'):
        super(RNN, self).__init__()
        if rnn not in ('RNN', 'LSTM'):
            raise ValueError("rnn must be 'RNN' or 'LSTM', got {!r}".format(rnn))

        self.hidden_dim = hidden_dim
        self.rnn_type = rnn

        # randomly initialised word embeddings
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        recurrent_cls = nn.LSTM if rnn == 'LSTM' else nn.RNN
        # batch_first=True: inputs arrive as [batch, seq_len, embedding_dim]
        self.rnn = recurrent_cls(embedding_dim, hidden_dim, batch_first=True)
        self.linear = nn.Linear(hidden_dim, output_dim)

    def init_hidden(self, batch_size):
        """
        Return the all-zero initial state for a batch.

        :return: h_0 of shape [1, batch_size, hidden_dim] for 'RNN', or the
            pair (h_0, c_0) of that shape for 'LSTM'
        """
        # new_zeros creates the state with the same dtype and device as the
        # model's parameters
        h0 = self.embedding.weight.new_zeros(1, batch_size, self.hidden_dim)
        if self.rnn_type == 'LSTM':
            return (h0, torch.zeros_like(h0))
        return h0

    def forward(self, x):
        """
        :param x: LongTensor of token indices, shape [batch, seq_len]
        :return: class scores (logits) of shape [batch, output_dim]
        """
        embedded = self.embedding(x)
        _, hidden = self.rnn(embedded, self.init_hidden(x.size(0)))
        if self.rnn_type == 'LSTM':
            # LSTM returns (h_n, c_n); classify from the hidden state h_n
            hidden = hidden[0]
        # hidden is [num_layers, batch, hidden_dim]; use the last layer's state
        return self.linear(hidden[-1])

Run the code block below to check your model performance. Using the parameters provided, you should be able to get about 0.6 AUC using vanilla RNN or about 0.7 AUC using LSTM after 10 training epochs.

In [None]:
# Fix the seed so the random weight initialisation is repeatable, then train a
# vanilla RNN classifier with the default training settings.
torch.manual_seed(111)
rnn_model = RNN(hidden_dim=40,
                output_dim=2,
                vocab_size=len(vocabulary),
                embedding_dim=50,
                rnn='RNN')
train(rnn_model)

In [None]:
# Same setup as the vanilla RNN run, but with an LSTM recurrent layer.
torch.manual_seed(111)
lstm_model = RNN(hidden_dim=40,
                 output_dim=2,
                 vocab_size=len(vocabulary),
                 embedding_dim=50,
                 rnn='LSTM')
train(lstm_model)

### Model predictions

In [None]:
def test_sentence(sentence, model, pad_length=25):
    """
    Classify a single raw sentence with a trained model.

    :param sentence: raw text to classify
    :param model: trained classifier that returns two class scores per example
    :param pad_length: length the sentence is padded/truncated to; defaults to
        25, the length used for the training and test sets, so the model sees
        the same input format as during training
    :return: tuple (label, score): label is "positive" if class 1 has the
        highest score and "negative" otherwise; score is the model's raw
        output value for that class
    """
    model.eval()
    padded = sentences_to_padded_index_sequences(vocab_indices, [sentence], pad_length)
    # explicit torch.long avoids relying on the platform's default integer width
    test_tensor = torch.tensor(padded, dtype=torch.long)
    # inference only: no autograd graph is needed
    with torch.no_grad():
        score = model(test_tensor).numpy().squeeze()
    label = np.argmax(score)

    return ("positive" if label == 1 else "negative", score[label])

In [None]:
# Classify a sample tweet with the trained LSTM; the result is a (label, score) tuple.
test_sentence("Enjoyed the #GOPDebates and am looking forward to the #DemocraticDebates next.", lstm_model)

In [None]:
# A second sample tweet, classified the same way; the result is a (label, score) tuple.
test_sentence("Donald Trump is a really nasty piece of work. Hope he disappears quickly. #GOPDebate", lstm_model)

## Word Embeddings and How to Use Them

When using deep learning methods on NLP tasks, we usually utilize [word embeddings](https://en.wikipedia.org/wiki/Word_embedding). Put briefly, a word embedding represents each word, or token, in a vocabulary as a distributed numerical vector. There are many methods for obtaining word embeddings, some of the most famous being Word2Vec, GloVe, and ELMo. It is not difficult to find general-purpose word embeddings on the Internet that were trained with one of these methods on a massive amount of data. It is usually a good idea to use such pre-trained embeddings to save yourself time and computing resources.

In this lab, we will be using the [GloVe embeddings](https://nlp.stanford.edu/projects/glove/) developed at Stanford, one of the state-of-the-art word embeddings. Please download the file ```glove.6B.50d.txt``` [here](https://drive.google.com/file/d/1JweINiA5JvTNLTm663LH8OdWssK2Kcid/view?usp=sharing).

In [None]:
import numpy as np
from tqdm import tqdm
# load embedding
emb_dim = 50
# The encoding is given explicitly so that loading does not depend on the
# platform's default text encoding.
with open('glove.6B/glove.6B.50d.txt', encoding='utf-8') as f:
    glove_embedding = []   # one vector per row, in file order
    words = {}             # token -> row index in glove_embedding
    chars = {}
    idx2words = {}         # row index -> token
    ordered_words = []     # tokens in file order

    for i, line in tqdm(enumerate(f)):
        # the first field is the token, the remaining fields are its vector
        s = line.split()
        glove_embedding.append(np.asarray(s[1:], dtype=float))

        words[s[0]] = len(words)
        idx2words[i] = s[0]
        ordered_words.append(s[0])

# add unknown to word and char
# The special tokens are registered in idx2words as well, so every row of
# glove_embedding can be mapped back to a token.
glove_embedding.append(np.random.rand(emb_dim))
words["<UNK>"] = len(words)
idx2words[words["<UNK>"]] = "<UNK>"

# add padding
glove_embedding.append(np.zeros(emb_dim))
words["<PAD>"] = len(words)
idx2words[words["<PAD>"]] = "<PAD>"

chars["<UNK>"] = len(chars)
chars["<PAD>"] = len(chars)

glove_embedding = np.array(glove_embedding, dtype=float)

Now we have three variables
- ```glove_embedding``` of shape [106687, 50] consisting of the actual vectors,
- ```words```, a dictionary consisting of each token in the vocabulary and its corresponding row in ```glove_embedding```, and
- ```idx2words```, a dictionary mapping each row index in ```glove_embedding``` back to its word

### Word embedding vectors

Now we can play around with these vectors to get a sense of how word embeddings can be used to represent words. Here's how you can look up a word embedding vector.

In [None]:
# Look up the row index of 'this' in `words`, then fetch that row of the embedding matrix.
glove_embedding[words['this']]

### Find similar words

The word embedding vectors can help us find words with similar meanings. Word similarities can be measured by [Cosine similarity](https://en.wikipedia.org/wiki/Cosine_similarity). The function below looks up the most similar words to a given word:

In [None]:
def find_nearest(ref_vec, words, embedding, topk=10):
    """
    Finds the top-k rows of `embedding` most similar to a reference vector in
    terms of cosine similarity, and returns them as words.

    Only the arguments are used; the function does not read any module-level
    embedding or lookup table.

    :param ref_vec: reference word vector, numpy array of shape [embedding_dim]
    :param words: dict, word to its index in the embedding
    :param embedding: numpy array of shape [V, embedding_dim]
    :param topk: number of top candidates to return
    :return: a list of up to top-k (word, cosine similarity) tuples, most
        similar first
    :raises KeyError: if one of the top-k rows has no entry in `words`
    """
    # compute cosine similarities against the embedding that was passed in
    scored_words = cosine_similarity(ref_vec.reshape(1, -1), embedding)[0]

    # sort the rows by similarity (best first) and keep the topk
    top_indices = np.argsort(-scored_words)[:topk].tolist()

    # invert `words` only for the rows that are actually returned
    wanted = set(top_indices)
    index_to_word = {index: word for word, index in words.items() if index in wanted}

    return [(index_to_word[w], scored_words[w]) for w in top_indices]

In [None]:
# The five rows of the GloVe matrix closest (by cosine similarity) to the vector for 'hate'.
find_nearest(glove_embedding[words['hate']], words, glove_embedding, topk=5)

### Word arithmetic

In [None]:
# Analogy as vector arithmetic, "better : best :: worse : ?":
# take 'worse', remove 'better', add 'best', then look up the closest word.
vec = glove_embedding[words['worse']] - glove_embedding[words['better']] + glove_embedding[words['best']]
find_nearest(vec, words, glove_embedding, topk=1)

In [None]:
# Analogy as vector arithmetic, "queen : king :: woman : ?":
# take 'king', remove 'queen', add 'woman', then look up the closest word.
vec = glove_embedding[words['king']] - glove_embedding[words['queen']] + glove_embedding[words['woman']]
find_nearest(vec, words, glove_embedding, topk=1)

### Train an LSTM model with GloVe embedding

Complete the code below. Replace the randomly generated embeddings with GloVe embeddings. (Hint: check out ```nn.Embedding.weight```). Using the parameters provided, you should be able to get about 0.75 AUC using GloVe embeddings after 10 training epochs. 

In [None]:
# Re-indexing tokens
train_X_glove = sentences_to_padded_index_sequences(words, train_data['text'], 25)
test_X_glove = sentences_to_padded_index_sequences(words, test_data['text'], 25)

train_loader_glove = DataLoader(TweetDataset(train_X_glove, train_y),
                                batch_size=BATCH_SIZE,
                                shuffle=True)
test_loader_glove = DataLoader(TweetDataset(test_X_glove, test_y),
                               batch_size=BATCH_SIZE,
                               shuffle=True)

In [None]:
torch.manual_seed(111)
glove_model = RNN(40, 2, len(glove_embedding), 50, rnn='LSTM')

# Replace the randomly initialised embedding table with the pre-trained GloVe
# vectors. The layer is located by type, so this does not depend on the
# attribute name chosen inside RNN.
embedding_layers = [m for m in glove_model.modules() if isinstance(m, nn.Embedding)]
if len(embedding_layers) != 1:
    raise RuntimeError(
        "expected exactly one nn.Embedding layer in the model, found {}".format(
            len(embedding_layers)))
with torch.no_grad():
    # copy_ casts the float64 NumPy values to the layer's dtype; the weights
    # stay trainable, so they are fine-tuned during training.
    embedding_layers[0].weight.copy_(torch.from_numpy(glove_embedding))

train(glove_model, train_loader=train_loader_glove, test_loader=test_loader_glove)