In [None]:
import os
import cv2
import time
import random
import numpy as np
import pandas as pd
from tqdm.notebook import tqdm
import matplotlib.pyplot as plt

import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
import torch.optim as optim

## The ```nn.RNN``` module
Some basic options for ```nn.RNN```
- ```input_size```: refers to size of embedding/feature vectors (i.e. number of channels)
- ```hidden_size```: desired dimensions of hidden state vector
- ```num_layers```: number of RNN layers stacked on top of each other
- ```batch_first```: If True, the input/output dimension is *(batch size, sequence length, embedding/feature vector size)*, otherwise it is *(sequence length, batch size, embedding/feature vector size)*

In [None]:
# assume we have a sequence of 300 dimensional vectors
# hidden state dimension will be 100
# single layer; batch_first=True means inputs are (batch size, sequence length, feature size)
basic_rnn = nn.RNN(input_size=300, hidden_size=100, num_layers=1, batch_first=True)

In [None]:
# what's in here?
# print the name and shape of every learnable parameter of the RNN
for name, param in basic_rnn.named_parameters():
    print(name, param.shape)

In [None]:
# assume we have batch_size=3 and a length 10 sequence of 300 dimensional vectors
# entries are random, so the values differ on every run unless a seed is set
input_seq = torch.rand((3, 10, 300))

In [None]:
# we get two outputs when we pass a batch to the RNN
# `output` is the tuple returned by the RNN; print the shape of each element
output = basic_rnn(input_seq)
for element in output:
    print(element.shape)

- The first output is a length ten sequence of 100 dimensional vectors (per datapoint in batch of size 3)
- These are all the hidden states as we passed the sequence through the RNN

In [None]:
# first element of the RNN output: the hidden state at every time step
output[0]

- The second output is a single 100 dimensional vector (per datapoint in batch of size 3)
- This is the *last* hidden state

In [None]:
# compare the returned last hidden state with the final time step of the full output;
# the difference should be all zeros
print(output[1] - output[0][:,-1,:])

We can give the RNN layer a second input: an initial hidden state

In [None]:
# a different initial hidden state changes the output slightly
# the initial state passed here has shape (1, 3, 100), matching the (1, batch, hidden) shape of the returned last state
basic_rnn(input_seq)[1] - basic_rnn(input_seq, torch.rand((1, 3, 100)))[1]

- We see two sets of weights if we do more than one layer
- Note that the $W_{ih}$ weight of the second layer is 100$\times$100 since the input vectors for the second layer of the RNN are 100-dimensional vectors

In [None]:
# same RNN as before but with two stacked layers; list every parameter with its shape
two_layer_basic_rnn = nn.RNN(input_size=300, hidden_size=100, num_layers=2, batch_first=True)
for name, param in two_layer_basic_rnn.named_parameters():
    print(name, param.shape)

- The shape of our output changes slightly
- The first element contains the hidden states of the top/last layer at every time step
- The second element contains the final hidden state of each of the two layers (this lets one use it as the initial hidden state of a new RNN)

In [None]:
# pass the same batch through the two-layer RNN and print the shape of each returned element
output = two_layer_basic_rnn(input_seq)
for element in output:
    print(element.shape)

In [None]:
# Vanilla RNN using nn.RNN
class Vanilla_RNN(nn.Module):
    """Single-layer ``nn.RNN`` followed by a linear read-out applied at every time step.

    Args:
        input_size: size of each input feature vector.
        hidden_size: size of the RNN hidden state.
        output_size: size of each output vector produced by ``g``.
        batch_first: forwarded to ``nn.RNN``. If True the input is
            (batch, sequence, features); otherwise (sequence, batch, features).
            Defaults to False, which is ``nn.RNN``'s own default.
    """

    def __init__(self, input_size, hidden_size, output_size, batch_first=False):
        super(Vanilla_RNN, self).__init__()
        self.rnn = nn.RNN(input_size, hidden_size, batch_first=batch_first)

        # here is our g function from the lecture slides
        # linear layer turning the i-th hidden state into the i-th output
        self.g = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return ``(outputs, last_hidden)``; ``outputs`` has ``output_size`` features per step."""
        out, hidden = self.rnn(x)
        out = self.g(out)

        return out, hidden

# input_seq is laid out as (batch, sequence, features), so the RNN has to be batch-first;
# otherwise the recurrence would run along the batch axis
v_rnn = Vanilla_RNN(300, 100, 50, batch_first=True)

In [None]:
# output dimension has changed because we applied a linear layer from 100-dim to 50-dim
# NOTE: the loop variable reuses (and overwrites) the name `output` from earlier cells
for output in v_rnn(input_seq):
    print(output.shape)

## Fancier RNN architectures

- `nn.GRU` works almost identically to the `nn.RNN` (more parameters inside the $f$ function)
- ``nn.LSTM`` is slightly different in that it also has a cell state. So the second output element is a tuple of *(final hidden state, final cell state)*

In [None]:
# GRU with the same sizes as the basic RNN; print the shape of each returned element
basic_gru = nn.GRU(input_size=300, hidden_size=100, num_layers=1, batch_first=True)
for output in basic_gru(input_seq):
    print(output.shape)

In [None]:
basic_lstm = nn.LSTM(input_size=300, hidden_size=100, num_layers=1, batch_first=True)
for output in basic_lstm(input_seq):
    if isinstance(output, torch.Tensor):
        # first element: hidden states for every time step
        print(output.shape)
    else:
        # second element: the (hidden state, cell state) tuple
        for name, ele in zip(('hidden', 'cell'), output):
            print(f'{name} state size:', ele.shape)

## Generating Text
- Idea: Take a text and use the shifted text as target

In [None]:
# load the pre-cleaned IMDB reviews (path is relative to the notebook's working directory)
# later cells read the 'cleaned' and 'sentiment' columns
df_clean = pd.read_csv('course_data/IMDB_cleaned.csv')
df_clean.head()

In [None]:
# count words, send infrequent to unknown
from collections import Counter

# tokenise every review on single spaces
reviews = [text.split(' ') for text in df_clean['cleaned']]

# word -> count, ordered from most to least frequent
token_counts = Counter(token for review in reviews for token in review)
word_freq = dict(token_counts.most_common())
print(len(word_freq))

min_freq = 50

# words seen more than min_freq times get consecutive indices starting at 1;
# every other word is mapped to the unknown index 0
word_dict = {}
i = 1
for word, freq in word_freq.items():
    if freq > min_freq:
        word_dict[word] = i
        i += 1
    else:
        word_dict[word] = 0

# dictionary length (largest index plus the unknown slot)
dict_length = max(word_dict.values()) + 1
dict_length

In [None]:
# clean out unknown tokens for simplicity
def _drop_unknown_tokens(row):
    """Rebuild one row, keeping only tokens whose vocabulary index is not 0."""
    kept = [token for token in row['cleaned'].split(' ') if word_dict[token] != 0]
    return {'cleaned': ' '.join(kept), 'sentiment': row['sentiment']}

df_cleaner = pd.DataFrame(list(df_clean.apply(_drop_unknown_tokens, axis=1)))

In [None]:
# clean out reviews that are too short
min_length = 12
print(len(df_clean))

def _is_long_enough(row):
    """True when the review has at least min_length space-separated tokens."""
    return len(row['cleaned'].split(' ')) >= min_length

keep_mask = df_cleaner.apply(_is_long_enough, axis=1)
df_cleaner = df_cleaner[keep_mask].reset_index(drop=True)
len(df_cleaner)

In [None]:
import random

# max length here will be maximum length of the sequence predicted
class IMDBDataset(Dataset):
    """Next-word-prediction dataset built from random windows of IMDB reviews.

    Each item is a pair ``(x, y)`` of long tensors of length ``max_length - 1``
    where ``y`` is ``x`` shifted one token to the right.

    Args:
        df: frame with a 'cleaned' column holding space-separated tokens.
        word_dict: mapping token -> integer index.
        max_length: window size in tokens (one more than the length of x and y).
    """

    def __init__(self, df, word_dict, max_length):
        self.df = df
        self.word_dict = word_dict
        self.sent_dict = {'negative': 0, 'positive': 1}
        self.max_len = max_length

    def __len__(self):
        return len(self.df)

    def __getitem__(self, idx):
        """Return a random (input, target) window from review ``idx``.

        Raises:
            ValueError: if the review has fewer than ``max_length`` tokens.
            KeyError: if a token is missing from ``word_dict``.
        """
        review = self.df.iloc[idx]['cleaned'].split(' ')

        if len(review) < self.max_len:
            raise ValueError(
                f'review {idx} has {len(review)} tokens, need at least {self.max_len}'
            )

        # randint is inclusive on both ends, so the window always fits inside the review
        start = random.randint(0, len(review) - self.max_len)

        # get the window as a list of integers
        window = [self.word_dict[token] for token in review[start:start + self.max_len]]

        # embedding likes long tensors; target is the input shifted by one token
        x = torch.tensor(window[:-1], dtype=torch.long)
        y = torch.tensor(window[1:], dtype=torch.long)
        return x, y
# windows of 10 tokens -> inputs and targets of length 9
ds = IMDBDataset(df_cleaner, word_dict, 10)

# target is the input review shifted over one
# i.e. predict next word from first part of the sequence
# the window start is random, so this shows a different sample on every run
next(iter(ds))

In [None]:
dl = DataLoader(ds, batch_size=1000, shuffle=True)

# walk through every batch once without doing any work (shows how long one pass over the data takes)
for element in tqdm(dl):
    pass

In [None]:
# create a model to generate a synthetic review
class LSTM_Gen(nn.Module):
    """Embedding -> LSTM -> linear language model used to generate text.

    Args:
        word_dict: mapping token -> integer index (0 is the unknown index).
        embedding_size: dimension of the token embeddings.
        hidden_size: dimension of the LSTM hidden and cell states.
    """

    def __init__(self, word_dict, embedding_size, hidden_size):
        super(LSTM_Gen, self).__init__()
        self.word_dict = word_dict
        self.hidden_size = hidden_size

        # integer to word dictionary (several words share index 0, so label it explicitly)
        self.idx2word = {index: word for word, index in self.word_dict.items()}
        self.idx2word[0] = 'UNK'

        # length of dictionary
        dict_length = max(word_dict.values()) + 1

        # embed the words
        self.emb = nn.Embedding(dict_length, embedding_size)

        # pass through an LSTM; batch_first=True because forward() receives
        # (batch, sequence) index tensors and permutes its result as batch-first
        self.lstm = nn.LSTM(embedding_size, hidden_size, batch_first=True)

        # send output through a linear layer
        self.linear = nn.Linear(hidden_size, dict_length)

    def forward(self, x):
        """Map (batch, seq) token indices to (batch, vocab, seq) scores.

        The vocabulary axis is moved to dim 1 so the result can be passed
        directly to ``nn.CrossEntropyLoss`` with (batch, seq) targets.
        """
        out, _ = self.lstm(self.emb(x))
        out = self.linear(out)

        return out.permute((0, 2, 1))

    # method to generate sequence using LSTM module
    def gen_seq(self, start_token, seq_length):
        """Print ``start_token`` followed by ``seq_length`` greedily generated words.

        A start token missing from ``word_dict`` is treated as unknown (index 0).
        Returns nothing; the words are printed one per line.
        """
        print(start_token)
        device = self.emb.weight.device
        start_idx = self.word_dict.get(start_token, 0)

        # generation needs no gradients
        with torch.no_grad():
            # embedding of start token, shaped (batch=1, seq=1, embedding)
            next_emb = self.emb(torch.tensor([[start_idx]], device=device))

            # initial hidden/cell states
            next_state = (torch.zeros((1, 1, self.hidden_size), device=device),
                          torch.zeros((1, 1, self.hidden_size), device=device))

            # generate a sequence!
            for _ in range(seq_length):
                # use the hidden/cell states for input into next pass through LSTM layer
                out, next_state = self.lstm(next_emb, next_state)

                # make prediction (argmax of the raw scores equals argmax of their log-softmax)
                next_idx = torch.argmax(self.linear(out), dim=2)
                print(self.idx2word[next_idx.item()])

                # embed prediction for input into next pass
                next_emb = self.emb(next_idx)
            

# generator over the IMDB vocabulary with 100-dim embeddings and a 100-dim LSTM state
lstm_model = LSTM_Gen(word_dict, embedding_size=100, hidden_size=100)

In [None]:
# generate 10 words from the still-untrained model, starting from 'first'
lstm_model.gen_seq('first', 10)

In [None]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    """Run the model once over ``dataloader`` and return the mean batch loss.

    Args:
        model: module called as ``model(x)``.
        dataloader: iterable of ``(x, y)`` batches that also supports ``len()``.
        optimizer: optimizer stepped after every batch when ``backwards`` is true.
        lossFun: callable ``lossFun(y_pred, y)`` returning a scalar tensor.
        backwards: if true, train (backprop + optimizer step); otherwise only evaluate.
        print_loss: if true, print the average loss before returning it.

    Returns:
        The sum of the batch losses divided by ``len(dataloader)``.
    """
    if backwards:
        model.train()
    else:
        model.eval()

    total_loss = 0.0

    # only track gradients when they are actually used
    with torch.set_grad_enabled(bool(backwards)):
        for x, y in tqdm(dataloader):

            y_pred = model(x)
            loss = lossFun(y_pred, y)
            total_loss += loss.item()

            if backwards:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
    avg_loss = total_loss / len(dataloader)

    if print_loss:
        print(avg_loss)

    return avg_loss

def one_pass_acc(model, dataloader, num_points):
    """Return the fraction of correctly predicted targets over ``dataloader``.

    Args:
        model: module called as ``model(x)``; the class scores must be on dim 1.
        dataloader: iterable of ``(x, y)`` batches.
        num_points: total number of target entries compared. Mismatches are
            counted per element of ``y``, so for sequence targets this is the
            number of tokens, not the number of sequences.

    Returns:
        ``1 - mismatches / num_points``.
    """
    model.eval()
    total_incorrect = 0

    # evaluation only: no gradients needed
    with torch.no_grad():
        for x, y in dataloader:
            # argmax over raw scores equals argmax over their log-softmax
            y_pred = torch.argmax(model(x), dim=1)
            total_incorrect += torch.count_nonzero(y != y_pred).item()

    percent_wrong = total_incorrect / num_points
    return 1 - percent_wrong

In [None]:
total = sum(word_freq.values())

# need to weight the cross entropy loss because of imbalanced dataset
# CrossEntropyLoss expects exactly one weight per class index (dict_length entries);
# index 0 (unknown) keeps weight 0, and words mapped to unknown get no entry of their own
weights = torch.zeros(dict_length)
for word, freq in word_freq.items():
    class_idx = word_dict[word]
    if class_idx != 0:
        weights[class_idx] = total / (dict_length * freq)

# bind the loss to a name so the training loop can use it
lossFun = nn.CrossEntropyLoss(weight=weights)

optimizer = optim.Adam(lstm_model.parameters(), lr = 0.01)

In [None]:
# number of full passes over the IMDB data loader
num_epochs = 2

# NOTE(review): relies on `lossFun` and `optimizer` being bound by an earlier cell — confirm on a fresh kernel
for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    # one training pass; returns the mean batch loss
    loss = one_pass(lstm_model, dl, optimizer, lossFun)
    print('Loss: ', loss)

In [None]:
# generate 10 words from the trained model, starting from 'film'
lstm_model.gen_seq('film', 10)

## Seq2Seq
- Great introduction [here](https://github.com/bentrevett/pytorch-seq2seq)

In [None]:
# 30,000 english-german sentences
# NOTE(review): the return type of Multi30k() depends on the installed torchtext version — confirm it yields three splits
from torchtext.datasets import Multi30k
train_data, valid_data, test_data = Multi30k()

In [None]:
# peek at one example
# NOTE(review): if `train_data` is a one-shot iterator this consumes its first element,
# which would then be missing from the CSV written later — confirm
next(train_data)

In [None]:
# need tokenizers for english and german
import spacy
# uncomment to download the two spaCy models on first use
#!python -m spacy download en_core_web_sm
#!python -m spacy download de_core_news_sm

# language pipelines; only used for tokenisation below
spacy_de = spacy.load('de_core_news_sm')
spacy_en = spacy.load('en_core_web_sm')

In [None]:
def clean_to_csv(iterator, path):
    """Tokenise (german, english) sentence pairs, save them to ``path`` and return the frame.

    Every sentence becomes a list of lower-cased token texts wrapped in
    '<sos>' / '<eos>' markers, with '.' and newline tokens dropped. The frame
    has the columns 'english' and 'german' and is written with ``DataFrame.to_csv``.
    """
    skipped = ['.', '\n']

    def tokenize(nlp, sentence):
        # no lemmatization for translation!
        words = [tok.text.lower() for tok in nlp(sentence) if tok.text not in skipped]
        return ['<sos>'] + words + ['<eos>']

    rows = []
    for sent_de, sent_en in tqdm(iterator):
        german_tokens = tokenize(spacy_de, sent_de)
        english_tokens = tokenize(spacy_en, sent_en)
        rows.append({'english': english_tokens, 'german': german_tokens})

    df = pd.DataFrame(rows)
    df.to_csv(path)
    return df
    
# tokenise each split once and cache it as a CSV under course_data/
df_train = clean_to_csv(train_data, 'course_data/Multi30k_train.csv')
df_val = clean_to_csv(valid_data, 'course_data/Multi30k_val.csv')
df_test = clean_to_csv(test_data, 'course_data/Multi30k_test.csv')
df_train.head()

In [None]:
import ast

def load_tokenized_csv(path):
    """Load a cached Multi30k split and turn each cell back into a list of tokens.

    The CSV stores every token list as its Python ``repr`` (e.g. "['<sos>', 'a', '<eos>']").
    ``ast.literal_eval`` parses that text back into the original list, so tokens
    carry no stray quote characters and tokens that contain ", " stay intact.
    """
    frame = pd.read_csv(path).drop(columns=["Unnamed: 0"])
    return frame.apply(lambda column: column.map(ast.literal_eval))

df_train = load_tokenized_csv('course_data/Multi30k_train.csv')
df_val = load_tokenized_csv('course_data/Multi30k_val.csv')
df_test = load_tokenized_csv('course_data/Multi30k_test.csv')

In [None]:
# build vocab from JUST training data (prevent data leakage)
from collections import Counter

def build_vocab(df, col_name, min_freq):
    """Build vocabulary tables from the token lists in ``df[col_name]``.

    Returns:
        word_freq: token -> count, ordered from most to least frequent
            (newline tokens are not counted).
        word_dict: token -> index. Tokens seen at least ``min_freq`` times get
            consecutive indices starting at 1; all others, and '<unk>', map to 0.
        idx2word: index -> token, with index 0 labelled '<unk>'.
    """
    counts = Counter(
        token
        for sentence in df[col_name]
        for token in sentence
        if token != '\n'
    )
    word_freq = dict(counts.most_common())

    word_dict = {'<unk>': 0}
    next_index = 1
    for word, freq in word_freq.items():
        if freq < min_freq:
            word_dict[word] = 0
            continue
        word_dict[word] = next_index
        next_index += 1

    # invert the mapping; many words share index 0, so name that slot explicitly
    idx2word = {index: word for word, index in word_dict.items()}
    idx2word[0] = '<unk>'

    return word_freq, word_dict, idx2word

# keep tokens that occur at least twice in the training split; the rest map to index 0
word_freq_en, word_dict_en, idx2word_en = build_vocab(df_train, 'english', 2)
word_freq_de, word_dict_de, idx2word_de = build_vocab(df_train, 'german', 2)

# vocabulary sizes (including the unknown slot)
len(idx2word_en), len(idx2word_de)

In [None]:
# longest sentence (in tokens) over both languages of the training split; 0 if there are no rows
max_len = max(
    (len(sentence) for column in ('english', 'german') for sentence in df_train[column]),
    default=0,
)
max_len

In [None]:
class en2deDataset(Dataset):
    """English -> German sentence pairs as fixed-length index tensors.

    Args:
        df: frame with 'english' and 'german' columns holding lists of tokens.
        word_dict_en: English token -> index mapping.
        word_dict_de: German token -> index mapping.
        max_len: length of every returned tensor; longer sentences are truncated.
    """

    def __init__(self, df, word_dict_en, word_dict_de, max_len):
        self.df = df
        self.word_dict_en = word_dict_en
        self.word_dict_de = word_dict_de
        self.max_len = max_len

    def __len__(self):
        return len(self.df)

    def _encode(self, tokens, vocab):
        """Map tokens to indices (unknown words -> 0), keeping at most max_len of them."""
        return [vocab.get(token, 0) for token in tokens[:self.max_len]]

    def __getitem__(self, idx):
        """Return ``(x, y)``: the front-padded English and back-padded German indices."""
        row = self.df.iloc[idx]
        en_ids = self._encode(row['english'], self.word_dict_en)
        de_ids = self._encode(row['german'], self.word_dict_de)

        # embedding likes long tensors; 0 doubles as the padding value
        x = torch.zeros(self.max_len, dtype=torch.long)
        y = torch.zeros(self.max_len, dtype=torch.long)

        # front pad english sentence (the sentence ends at the last position)
        if en_ids:
            x[self.max_len - len(en_ids):] = torch.tensor(en_ids, dtype=torch.long)

        # back pad german sentence (the sentence starts at position 0)
        if de_ids:
            y[:len(de_ids)] = torch.tensor(de_ids, dtype=torch.long)

        return x, y

In [None]:
# build the training dataset and look at its first (english, german) index pair
ds_train = en2deDataset(df_train, word_dict_en, word_dict_de, max_len)
next(iter(ds_train))

In [None]:
# all three splits share the training vocabularies and the training max_len
# only the training loader is shuffled
ds_train = en2deDataset(df_train, word_dict_en, word_dict_de, max_len)
dl_train = DataLoader(ds_train, batch_size=100, shuffle=True)

ds_val = en2deDataset(df_val, word_dict_en, word_dict_de, max_len)
dl_val = DataLoader(ds_val, batch_size=100, shuffle=False)

ds_test = en2deDataset(df_test, word_dict_en, word_dict_de, max_len)
dl_test = DataLoader(ds_test, batch_size=100, shuffle=False)
    
# show one training batch
next(iter(dl_train))

In [None]:
# now we define a simple Encoder with an LSTM
class Encoder(nn.Module):
    """Embeds English token indices and summarises them with a batch-first LSTM.

    Args:
        dict_length_en: number of rows in the English embedding table.
        emb_size: embedding dimension.
        hidden_size: LSTM hidden/cell state dimension.
    """

    def __init__(self, dict_length_en, emb_size, hidden_size):
        super().__init__()
        self.emb_en = nn.Embedding(num_embeddings=dict_length_en, embedding_dim=emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, batch_first=True)

    def forward(self, x):
        """Return the final ``(hidden, cell)`` states; the per-step outputs are discarded."""
        embedded = self.emb_en(x)
        _, final_states = self.rnn(embedded)
        hidden, cell = final_states
        return hidden, cell

In [None]:
# for the decoder, we need the states from the encoder as input as well as the target sentence
# the forward pass represents the prediction of a single German word (the next word in the sentence)
class Decoder(nn.Module):
    """One decoding step: embed German tokens, advance the LSTM, score the next word.

    Args:
        dict_length_de: German vocabulary size (embedding rows and output scores).
        emb_size: embedding dimension.
        hidden_size: LSTM hidden/cell state dimension.
    """

    def __init__(self, dict_length_de, emb_size, hidden_size):
        super().__init__()
        self.emb_de = nn.Embedding(num_embeddings=dict_length_de, embedding_dim=emb_size)
        self.rnn = nn.LSTM(emb_size, hidden_size, batch_first=True)
        # output function: hidden state -> one score per German word
        self.linear = nn.Linear(in_features=hidden_size, out_features=dict_length_de)

    def forward(self, input_word, hidden, cell):
        """Return ``(scores, (hidden, cell))`` after feeding ``input_word`` from the given states."""
        embedded = self.emb_de(input_word)
        rnn_out, next_states = self.rnn(embedded, (hidden, cell))
        scores = self.linear(rnn_out)
        next_hidden, next_cell = next_states
        return scores, (next_hidden, next_cell)

In [None]:
class Seq2Seq(nn.Module):
    """Encoder-decoder translation model that decodes one German token per step.

    Args:
        dict_length_en: English vocabulary size.
        dict_length_de: German vocabulary size (number of output scores).
        emb_size: embedding dimension for both languages.
        hidden_size: LSTM state dimension shared by encoder and decoder.
        max_len: stored on the module as ``self.max_len``.
    """

    def __init__(self, dict_length_en, dict_length_de, emb_size, hidden_size, max_len):
        super().__init__()

        self.encoder = Encoder(dict_length_en, emb_size, hidden_size)
        self.decoder = Decoder(dict_length_de, emb_size, hidden_size)
        self.softmax = nn.LogSoftmax(dim=2)
        self.max_len = max_len
        self.output_size = dict_length_de

    def forward(self, x, y, teacher_forcing_ratio=0.5):
        """Return scores of shape (batch, vocab, target length).

        Args:
            x: (batch, source length) English indices.
            y: (batch, target length) German indices; column 0 is the first decoder input.
            teacher_forcing_ratio: probability of feeding the target word, rather than
                the model's own prediction, as the next decoder input. Pass 0 to decode
                without looking at ``y`` beyond its first column.
        """
        hidden, cell = self.encoder(x)

        batch_size, target_len = y.shape
        next_word = y[:, 0:1]

        # allocate on the same device as the targets so the model also works off-CPU
        prediction = torch.zeros((batch_size, self.output_size, target_len), device=y.device)

        # first token is always <sos> (assumed to be vocabulary index 1)
        prediction[:, 1, 0] = 1

        for i in range(target_len - 1):

            output, (hidden, cell) = self.decoder(next_word, hidden, cell)
            # drop only the length-1 sequence axis, never the batch axis
            prediction[:, :, i+1] = output.squeeze(1)

            # teacher forcing: sometimes use target word rather than predicted word for next token
            if random.random() < teacher_forcing_ratio:
                next_word = y[:, (i+1):(i+2)]
            else:
                # argmax of the raw scores equals argmax of their log-softmax
                next_word = torch.argmax(output, dim=2)

        return prediction

In [None]:
# vocabulary sizes come from the index->word tables; embeddings and LSTM states are 100-dim
model = Seq2Seq(len(idx2word_en), len(idx2word_de), 100, 100, max_len)

In [None]:
# smoke test: push one training batch through the untrained model
# NOTE: this displays the full prediction tensor
x, y = next(iter(dl_train))
model(x, y)

In [None]:
def one_pass(model, dataloader, optimizer, lossFun, backwards=True, print_loss=False):
    """Run a sequence-to-sequence model once over ``dataloader``; return the mean batch loss.

    Args:
        model: module called as ``model(x, y)`` (the target is also a model input).
        dataloader: iterable of ``(x, y)`` batches that also supports ``len()``.
        optimizer: optimizer stepped after every batch when ``backwards`` is true.
        lossFun: callable ``lossFun(y_pred, y)`` returning a scalar tensor.
        backwards: if true, train (backprop + optimizer step); otherwise only evaluate.
        print_loss: if true, print the average loss before returning it.

    Returns:
        The sum of the batch losses divided by ``len(dataloader)``.
    """
    if backwards:
        model.train()
    else:
        model.eval()

    total_loss = 0.0

    # only track gradients when they are actually used
    with torch.set_grad_enabled(bool(backwards)):
        for x, y in tqdm(dataloader):

            y_pred = model(x, y)
            loss = lossFun(y_pred, y)
            total_loss += loss.item()

            if backwards:
                optimizer.zero_grad()
                loss.backward()
                optimizer.step()
    avg_loss = total_loss / len(dataloader)

    if print_loss:
        print(avg_loss)

    return avg_loss

def one_pass_acc(model, dataloader, num_points):
    """Return the fraction of correctly predicted target tokens over ``dataloader``.

    Args:
        model: module called as ``model(x, y)``; the class scores must be on dim 1.
        dataloader: iterable of ``(x, y)`` batches.
        num_points: total number of target entries compared. Mismatches are
            counted per element of ``y`` (per token, padding included), so this
            is the number of tokens, not the number of sentences.

    Returns:
        ``1 - mismatches / num_points``.
    """
    model.eval()
    total_incorrect = 0

    # evaluation only: no gradients needed
    with torch.no_grad():
        for x, y in dataloader:
            # argmax over raw scores equals argmax over their log-softmax
            y_pred = torch.argmax(model(x, y), dim=1)
            total_incorrect += torch.count_nonzero(y != y_pred).item()

    percent_wrong = total_incorrect / num_points
    return 1 - percent_wrong

In [None]:
# unweighted cross entropy over the German vocabulary
# NOTE(review): no ignore_index is set, so zero-padded target positions also contribute to the loss
lossFun = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr = 0.01)

In [None]:
# number of full passes over the training loader
num_epochs = 2

for epoch in tqdm(range(num_epochs)):
    print('Epoch: ', epoch)
    
    # one training pass; returns the mean batch loss
    loss_train = one_pass(model, dl_train, optimizer, lossFun)
    print('Loss: ', loss_train)
    
    # accuracy reporting is disabled
    #acc_train = one_pass_acc(model, dl_train, len(ds_train))
    #print('Accuracy: ', acc_train)

In [None]:
# test a translation
softmax = nn.LogSoftmax(dim=1)

SPECIAL_TOKENS = ('<sos>', '<eos>', '<unk>')

def indices_to_words(indices, idx2word):
    """Map a 1-D tensor of indices to words, dropping the special/padding tokens."""
    words = []
    for index in indices:
        next_word = idx2word[index.item()].strip("''")
        if next_word not in SPECIAL_TOKENS:
            words.append(next_word)
    return words

# one batch
x, y = next(iter(dl_train))
y_pred = model(x, y)

# english sentence
sent_en = indices_to_words(x[0], idx2word_en)
print(' '.join(sent_en))

# predicted german sentence: y_pred is (batch, vocab, sequence), so the most likely
# word per position is the argmax over dim 1. Normalising over dim 2 first would
# rescale each word's scores across positions and change that argmax.
sent_de = indices_to_words(torch.argmax(y_pred, dim=1)[0], idx2word_de)
print(' '.join(sent_de))

## Custom Loss Function

In [None]:
class some_loss(nn.Module):
    """Example custom loss: norm of the prediction error divided by the number of entries.

    Args:
        hyperparam: stored on the module as ``self.hyperparam``; not used by ``forward``.
    """

    def __init__(self, hyperparam):
        super(some_loss, self).__init__()
        self.hyperparam = hyperparam

    def forward(self, y_pred, y):
        """Return ``torch.norm(y_pred - y) / numel`` as a scalar tensor."""
        diff = y_pred - y

        # average over each entry and batch size
        return torch.norm(diff) / torch.numel(diff)