# Starter Project
## Creating a Basic LSTM Classifier using Pytorch

## Section 1.1 Initialization

In [1]:
#importing libraries
import pandas as pd
import nltk
from nltk.corpus import stopwords
import re
from wordsegment import segment, load
from nltk.tokenize import TweetTokenizer

# English stop words collected once into a set, so the membership test in
# text_preprocess is a constant-time lookup.
# NOTE(review): stopwords.words() presumably needs the NLTK "stopwords" corpus
# to be available locally (nltk.download('stopwords')) - confirm on a fresh setup.
STOPWORDS = set(stopwords.words('english'))

### Section 1.1.1 Initializing the Tokenizer. In case you don't know what a tokenizer is, please visit: https://nlp.stanford.edu/IR-book/html/htmledition/tokenization-1.html

In [2]:
# In our model we will be using NLTK's TweetTokenizer.
# reduce_len=True asks the tokenizer to shorten exaggerated character repetitions,
# preserve_case=False turns all characters to lower case,
# and strip_handles=False keeps @handles in the text.
tknzr = TweetTokenizer(reduce_len=True, preserve_case=False, strip_handles=False)

### Section 1.1.2 Creating the Pre-Processing Function. Here we remove unwanted symbols and numbers from our text and change known emoji symbols to their text value.

In [3]:
def text_preprocess(text):
    """Normalise one raw text into a lower-cased, space-separated token string.

    The steps run in this order:

    1. URLs, @mentions, emoticons, ``<3`` and numbers are replaced with
       placeholder tokens (``<url>``, ``<user>``, ``<smile>``, ``<lolface>``,
       ``<sadface>``, ``<neutralface>``, ``<heart>``, ``<number>``).
    2. Repeated punctuation is marked with ``<repeat>`` and elongated words
       with ``<elong>``.
    3. English stop words are dropped (case-insensitively).
    4. The result is lower-cased, tokenised with the module-level ``tknzr``
       and re-joined with single spaces.

    Args:
        text: The raw text; any value is accepted and converted with ``str``.

    Returns:
        The cleaned text as a single string of space-separated tokens.
    """
    text = str(text)
    FLAGS = re.MULTILINE | re.DOTALL
    eyes = r"[8:=;]"
    nose = r"['`\-]?"

    def re_sub(pattern, repl):
        # Reads the *current* value of the enclosing ``text`` on every call,
        # so successive substitutions build on each other.
        return re.sub(pattern, repl, text, flags=FLAGS)

    # The order matters: URLs are replaced before "/" is padded with spaces,
    # and emoticons are replaced before the generic number pattern runs.
    text = re_sub(r"https?:\/\/\S+\b|www\.(\w+\.)+\S*", "<url>")
    text = re_sub(r"/", " / ")
    text = re_sub(r"@\w+", "<user>")
    text = re_sub(r"{}{}[)dD]+|[)dD]+{}{}".format(eyes, nose, nose, eyes), "<smile>")
    text = re_sub(r"{}{}p+".format(eyes, nose), "<lolface>")
    text = re_sub(r"{}{}\(+|\)+{}{}".format(eyes, nose, nose, eyes), "<sadface>")
    text = re_sub(r"{}{}[\/|l*]".format(eyes, nose), "<neutralface>")
    text = re_sub(r"<3", "<heart>")
    text = re_sub(r"[-+]?[.\d]*[\d]+[:,.\d]*", "<number>")
    text = re_sub(r"([!?.]){2,}", r"\1 <repeat>")
    text = re_sub(r"\b(\S*?)(.)\2{2,}\b", r"\1\2 <elong>")

    # Compare in lower case: the text is only lower-cased further down, so a
    # case-sensitive test would let capitalised stop words ("The", "I") through.
    text = " ".join(word for word in text.split() if word.lower() not in STOPWORDS)

    tokens = tknzr.tokenize(text.lower())
    return " ".join(tokens)

### Section 1.1.3 Importing the dataset and pre-processing it
#### We turn the sentiment labels to numerical values for the machine to understand
#### You can download the dataset from [here](https://www.kaggle.com/lakshmi25npathi/imdb-dataset-of-50k-movie-reviews)

In [None]:
# We import the dataset using the read_csv function from the pandas library.
# Just replace INPUTPATH with your input path.
# The cells below read the 'review' and 'sentiment' columns of this frame.
df = pd.read_csv("INPUTPATH.csv")

In [None]:
# Visualizing the first 5 rows of our dataset
df.head()

In [None]:
# Encoding sentiment labels to numerical values: 'negative' -> 0, 'positive' -> 1.
# NOTE(review): Series.map presumably yields NaN for any label that is not a key
# of the dict - confirm the column only holds these two values.
encode_label = {'negative' : 0, 'positive' : 1}
df['sentiment'] = df['sentiment'].map(encode_label)

In [None]:
# Visualizing the first 5 rows of our dataset again, now with numeric labels
df.head()

In [None]:
# Pre-processing the dataset: every review is replaced by its cleaned version
# as returned by text_preprocess.
df['review'] = df['review'].apply(text_preprocess)

In [None]:
# Visualizing the first 5 rows of our dataset again after pre-processing
df.head()

### Section 1.1.4 Saving the pre-processed dataset

In [None]:
# We use the to_csv function from the pandas library to export our dataframes as csv files.
# Just replace the value of destination_folder with your output path.
import os

from sklearn.model_selection import train_test_split

destination_folder = "destination_folder"
os.makedirs(destination_folder, exist_ok=True)

# NOTE(review): these ratios are passed as train_size, so only 10% of every
# class ends up in train+valid and the remaining 90% in the test set - confirm
# this is intended.
train_test_ratio = 0.10
train_valid_ratio = 0.80

first_n_words = 500


def trim_string(x):
    """Keep only the first ``first_n_words`` whitespace-separated words of ``x``."""
    return ' '.join(x.split(maxsplit=first_n_words)[:first_n_words])


# Work on a copy with a fixed column order: the loading cell declares its
# fields as (sentiment, review) and the csv header is skipped there, so the
# order written here has to match.
df_raw = df[['sentiment', 'review']].copy()

# Trim each review to its first first_n_words words
df_raw['review'] = df_raw['review'].apply(trim_string)

# Splits dataset according to label (0 = negative, 1 = positive, see encode_label)
df_real = df_raw[df_raw['sentiment'] == 0]
df_fake = df_raw[df_raw['sentiment'] == 1]

# Train-test split, done per label so both splits keep the label balance
df_real_full_train, df_real_test = train_test_split(df_real, train_size = train_test_ratio, random_state = 1)
df_fake_full_train, df_fake_test = train_test_split(df_fake, train_size = train_test_ratio, random_state = 1)

# Train-valid split
df_real_train, df_real_valid = train_test_split(df_real_full_train, train_size = train_valid_ratio, random_state = 1)
df_fake_train, df_fake_valid = train_test_split(df_fake_full_train, train_size = train_valid_ratio, random_state = 1)

# Concatenate splits of different labels
df_train = pd.concat([df_real_train, df_fake_train], ignore_index=True, sort=False)
df_valid = pd.concat([df_real_valid, df_fake_valid], ignore_index=True, sort=False)
df_test = pd.concat([df_real_test, df_fake_test], ignore_index=True, sort=False)

# Write preprocessed data for train, test and validation
df_train.to_csv(destination_folder + '/train.csv', index=False)
df_valid.to_csv(destination_folder + '/valid.csv', index=False)
df_test.to_csv(destination_folder + '/test.csv', index=False)

## Section 2.1 Training the Starter Model

In [None]:
# importing important libraries needed for this section

import matplotlib.pyplot as plt
import pandas as pd
import torch

# Preliminaries

from torchtext.data import Field, TabularDataset, BucketIterator

# Models

import torch.nn as nn
from torch.nn.utils.rnn import pack_padded_sequence, pad_packed_sequence

# Training

import torch.optim as optim


# Evaluation

from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import seaborn as sns

# Sets the device used for training: the first CUDA GPU when torch reports one
# as available, otherwise the CPU.

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')

### Section 2.1.1 Loading the pre-processed dataset and creating Training, Testing and Validation Sets
We import the csv file we have saved above and create the iterators for Train set, Validation set and Test set.

In [None]:
# Columns to read from the csv files.
# text_field holds the review text; include_lengths=True makes every batch
# carry the pair (token ids, lengths) that the training loop unpacks.
text_field = Field(lower=True, include_lengths=True, batch_first=True)
# label_field holds the already numeric label of each review, stored as a float.
label_field = Field(sequential=False, use_vocab=False, batch_first=True, dtype=torch.float)
# Final list pairing each csv column name with its field.
fields = [('sentiment', label_field), ('review', text_field)]

# TabularDataset builds one dataset per csv file found under destination_folder;
# the header row of every file is skipped.
train, valid, test = TabularDataset.splits(
    path=destination_folder,
    train='train.csv',
    validation='valid.csv',
    test='test.csv',
    format='CSV',
    fields=fields,
    skip_header=True,
)

# BucketIterator walks over a complete set in batches of the given size.
# The batch size is the number of examples after which the weights of the network are updated:
# a smaller batch size means more frequent updates, a larger one means fewer updates.
# All three sets use the same settings and are sorted by review length.
train_iter, valid_iter, test_iter = (
    BucketIterator(split, batch_size=64, sort_key=lambda x: len(x.review),
                   device=device, sort=True, sort_within_batch=True)
    for split in (train, valid, test)
)

# Vocabulary
# Built from the training set only; min_freq=3 is meant to keep words that
# occur at least three times.
text_field.build_vocab(train, min_freq=3)

### Section 2.1.2 Creating the model architecture
We have defined a simple bidirectional LSTM model with 128 hidden units per direction. This is followed by a Dropout layer, which randomly zeroes a fraction of the LSTM output features during training. In the end we have a linear layer whose number of input units is 2 * the number of hidden units of the LSTM layer (one set per direction).

In [None]:
class LSTM(nn.Module):
    """Single-layer bidirectional LSTM classifier producing one probability per example.

    Pipeline: embedding -> bidirectional LSTM -> dropout -> linear -> sigmoid.

    Args:
        dimension: Hidden size of the LSTM in each direction.
        vocab_size: Number of rows of the embedding table. When ``None``
            (the default) it is taken from the module-level
            ``text_field.vocab``, as before.
        embedding_dim: Width of the word embeddings (and LSTM input size).
    """

    def __init__(self, dimension=128, vocab_size=None, embedding_dim=500):
        super().__init__()

        if vocab_size is None:
            # Fall back to the notebook's global vocabulary.
            vocab_size = len(text_field.vocab)

        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.dimension = dimension
        self.lstm = nn.LSTM(input_size=embedding_dim,
                            hidden_size=dimension,
                            num_layers=1,
                            batch_first=True,
                            bidirectional=True)
        self.drop = nn.Dropout(p=0.3)

        # The classifier sees the forward and the backward state side by side.
        self.fc = nn.Linear(2 * dimension, 1)

    def forward(self, text, text_len):
        """Return the predicted probability for every sequence in the batch.

        Args:
            text: Batch-first integer tensor of token ids, padded on the right.
            text_len: Integer tensor holding the unpadded length of each row.

        Returns:
            Float tensor with one sigmoid output per row of ``text``.
        """
        text_emb = self.embedding(text)

        # pack_padded_sequence wants the lengths on the CPU, while callers
        # move them to the training device, hence the explicit .cpu().
        packed_input = pack_padded_sequence(text_emb, text_len.cpu(),
                                            batch_first=True, enforce_sorted=False)
        packed_output, _ = self.lstm(packed_input)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)

        # Forward direction: state at the last real token of each sequence.
        # Backward direction: state at position 0, where that pass finishes.
        batch_idx = torch.arange(output.size(0), device=output.device)
        last_idx = (text_len - 1).to(output.device)
        out_forward = output[batch_idx, last_idx, :self.dimension]
        out_reverse = output[:, 0, self.dimension:]
        out_reduced = torch.cat((out_forward, out_reverse), 1)
        text_fea = self.drop(out_reduced)

        text_fea = self.fc(text_fea)
        text_fea = torch.squeeze(text_fea, 1)
        text_out = torch.sigmoid(text_fea)

        return text_out

### Section 2.1.3 Saving the model
We use the save state dictionary method to save a model, in this case, a dictionary is created where the keys are the layer names and the value pairs contain the weight matrix for the layer.  
While using this method, each time we want to load a model, we need to define a copy of the original model which will hold the loaded weights.

In [None]:
def save_checkpoint(save_path, model, optimizer, valid_loss):
    """Save the model and optimizer state together with a validation loss.

    Args:
        save_path: Destination file; when ``None`` nothing is saved.
        model: Object whose ``state_dict()`` is stored under 'model_state_dict'.
        optimizer: Object whose ``state_dict()`` is stored under
            'optimizer_state_dict'.
        valid_loss: Value stored under 'valid_loss'.
    """
    if save_path is None:
        return

    state_dict = {'model_state_dict': model.state_dict(),
                  'optimizer_state_dict': optimizer.state_dict(),
                  'valid_loss': valid_loss}

    torch.save(state_dict, save_path)
    print(f'Model saved to ==> {save_path}')


def load_checkpoint(load_path, model, optimizer):
    """Restore model and optimizer state from a file written by save_checkpoint.

    Args:
        load_path: Checkpoint file; when ``None`` nothing is loaded and
            ``None`` is returned.
        model: Receives the stored 'model_state_dict'.
        optimizer: Receives the stored 'optimizer_state_dict'.

    Returns:
        The 'valid_loss' value stored in the checkpoint.
    """
    if load_path is None:
        return

    # map_location sends the stored tensors to the notebook's global device.
    state_dict = torch.load(load_path, map_location=device)
    print(f'Model loaded from <== {load_path}')

    model.load_state_dict(state_dict['model_state_dict'])
    optimizer.load_state_dict(state_dict['optimizer_state_dict'])

    return state_dict['valid_loss']


def save_metrics(save_path, train_loss_list, valid_loss_list, global_steps_list):
    """Save the recorded loss curves and the steps at which they were taken.

    Args:
        save_path: Destination file; when ``None`` nothing is saved.
        train_loss_list: Stored under 'train_loss_list'.
        valid_loss_list: Stored under 'valid_loss_list'.
        global_steps_list: Stored under 'global_steps_list'.
    """
    if save_path is None:
        return

    state_dict = {'train_loss_list': train_loss_list,
                  'valid_loss_list': valid_loss_list,
                  'global_steps_list': global_steps_list}

    torch.save(state_dict, save_path)
    # This file holds metrics, not a model, so the message says so.
    print(f'Metrics saved to ==> {save_path}')


def load_metrics(load_path):
    """Load the loss curves from a file written by save_metrics.

    Args:
        load_path: Metrics file; when ``None`` nothing is loaded and ``None``
            is returned.

    Returns:
        The tuple (train_loss_list, valid_loss_list, global_steps_list).
    """
    if load_path is None:
        return

    state_dict = torch.load(load_path, map_location=device)
    # This file holds metrics, not a model, so the message says so.
    print(f'Metrics loaded from <== {load_path}')

    return state_dict['train_loss_list'], state_dict['valid_loss_list'], state_dict['global_steps_list']

### Section 2.1.4 Training the Model
In this section we add the training part of our model, that trains it on the training set

In [None]:
def train(model,
          optimizer,
          criterion = nn.BCELoss(),
          train_loader = train_iter,
          valid_loader = valid_iter,
          num_epochs = 5,
          eval_every = len(train_iter) // 2,
          file_path = destination_folder,
          best_valid_loss = float("Inf")):
    """Train ``model`` and checkpoint it whenever the validation loss improves.

    Every ``eval_every`` optimisation steps the model is run over the whole
    validation loader; the average training loss since the last evaluation and
    the average validation loss are recorded and printed. When the validation
    loss beats ``best_valid_loss`` the model and the metrics are saved.

    Args:
        model: Called as ``model(text, text_len)``; its output is passed to
            ``criterion`` together with the labels.
        optimizer: Optimizer stepping the parameters of ``model``.
        criterion: Loss function called as ``criterion(output, labels)``.
        train_loader: Iterable of training batches.
        valid_loader: Iterable of validation batches; must support ``len``.
        num_epochs: Number of passes over ``train_loader``.
        eval_every: Number of steps between two evaluations; values below 1
            are treated as 1.
        file_path: Prefix to which the model and metrics file names are appended.
        best_valid_loss: Validation loss a checkpoint has to beat to be saved.
    """
    # A loader with a single batch makes the default ``len(...) // 2`` equal
    # to 0, which would raise ZeroDivisionError in the modulo below.
    eval_every = max(1, eval_every)

    # The backslash is escaped but kept, so the resulting string is the same
    # one the rest of the notebook uses when it loads the checkpoint.
    # NOTE(review): the other output files are addressed with '/'; on
    # non-Windows systems the backslash becomes part of the file name.
    model_path = file_path + '\\model.pt'
    metrics_path = file_path + '\\metrics.pt'

    # initialize running values
    running_loss = 0.0
    valid_running_loss = 0.0
    global_step = 0
    train_loss_list = []
    valid_loss_list = []
    global_steps_list = []

    # training loop
    model.train()
    for epoch in range(num_epochs):
        for (labels, (title, title_len)), _ in train_loader:
            labels = labels.to(device)
            title = title.to(device)
            title_len = title_len.to(device)
            output = model(title, title_len)

            loss = criterion(output, labels)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            # update running values
            running_loss += loss.item()
            global_step += 1

            # evaluation step
            if global_step % eval_every == 0:
                model.eval()
                with torch.no_grad():
                    # validation loop
                    for (labels, (title, title_len)), _ in valid_loader:
                        labels = labels.to(device)
                        title = title.to(device)
                        title_len = title_len.to(device)
                        output = model(title, title_len)

                        loss = criterion(output, labels)
                        valid_running_loss += loss.item()

                # evaluation
                average_train_loss = running_loss / eval_every
                average_valid_loss = valid_running_loss / len(valid_loader)
                train_loss_list.append(average_train_loss)
                valid_loss_list.append(average_valid_loss)
                global_steps_list.append(global_step)

                # resetting running values
                running_loss = 0.0
                valid_running_loss = 0.0
                model.train()

                # print progress
                print('Epoch [{}/{}], Step [{}/{}], Train Loss: {:.4f}, Valid Loss: {:.4f}'
                      .format(epoch+1, num_epochs, global_step, num_epochs*len(train_loader),
                              average_train_loss, average_valid_loss))

                # checkpoint
                if best_valid_loss > average_valid_loss:
                    best_valid_loss = average_valid_loss
                    save_checkpoint(model_path, model, optimizer, best_valid_loss)
                    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)

    # Always store the complete curves at the end, even if the last
    # evaluation did not improve on the best validation loss.
    save_metrics(metrics_path, train_loss_list, valid_loss_list, global_steps_list)
    print('Finished Training!')


model = LSTM().to(device)
# 'modelFinal.pth' holds a previously saved state dict. map_location sends its
# tensors to the current device, like the checkpoint helpers above do, so a
# file saved on a GPU machine also loads on a CPU-only one.
pretrainedPath = torch.load('modelFinal.pth', map_location=device)
# NOTE(review): strict=False presumably lets missing or unexpected keys pass
# without an error - check the returned key lists if the weights look wrong.
model.load_state_dict(pretrainedPath, strict=False)

# Small learning rate for continuing training from the loaded weights.
optimizer = optim.Adam(model.parameters(), lr=0.00001)

train(model=model, optimizer=optimizer, num_epochs=12)

### Section 2.1.5 Testing the Model
In this section we add the testing part of our model, that tests it on the testing set

In [None]:
def evaluate(model, test_loader, version='title', threshold=0.5):
    """Print a classification report and draw a confusion matrix for ``model``.

    Args:
        model: Called as ``model(text, text_len)``; its outputs are compared
            against ``threshold``.
        test_loader: Iterable of test batches.
        version: Not used by this function; kept so existing calls keep working.
        threshold: Outputs strictly above this value are predicted as 1,
            all others as 0.
    """
    y_pred = []
    y_true = []

    model.eval()
    with torch.no_grad():
        for (labels, (title, title_len)), _ in test_loader:
            labels = labels.to(device)
            title = title.to(device)
            title_len = title_len.to(device)
            output = model(title, title_len)

            output = (output > threshold).int()
            y_pred.extend(output.tolist())
            y_true.extend(labels.tolist())

    print('Classification Report:')
    print(classification_report(y_true, y_pred, labels=[1,0], digits=4))

    cm = confusion_matrix(y_true, y_pred, labels=[1,0])
    ax = plt.subplot()
    sns.heatmap(cm, annot=True, ax = ax, cmap='Blues', fmt="d")

    ax.set_title('Confusion Matrix')

    ax.set_xlabel('Predicted Labels')
    ax.set_ylabel('True Labels')

    # The matrix is built with labels=[1, 0], so the first row/column is
    # label 1 (positive in this notebook's encoding) and the second is label 0.
    ax.xaxis.set_ticklabels(['POSITIVE', 'NEGATIVE'])
    ax.yaxis.set_ticklabels(['POSITIVE', 'NEGATIVE'])
    
    
# A fresh model and optimizer are needed to receive the stored state.
# Note that this rebinds the global name `optimizer`.
best_model = LSTM().to(device)
optimizer = optim.Adam(best_model.parameters(), lr=0.001)

# The backslash is escaped so the literal no longer relies on an invalid
# escape sequence; the resulting string is unchanged and still matches the
# file name used when the checkpoint is written during training.
load_checkpoint(destination_folder + '\\model.pt', best_model, optimizer)
evaluate(best_model, test_iter)

In [None]:
# Final save for the model: writes the state dict of `model` to disk.
# NOTE(review): this saves `model` (the object passed to train), not
# `best_model` (the restored best checkpoint) - confirm this is the intended one.
torch.save(model.state_dict(), "modelFinalTransfer.pth")

## References

* [PyTorch LSTM Tutorial: Official Documentation](https://pytorch.org/tutorials/beginner/nlp/sequence_models_tutorial.html)
* [LSTMs and their uses](https://d2l.ai/chapter_recurrent-modern/lstm.html)
* [A basic introduction to LSTM: Blog](https://blog.floydhub.com/long-short-term-memory-from-zero-to-hero-with-pytorch/)
* [An introduction to Text Pre-Processing](https://d2l.ai/chapter_recurrent-neural-networks/text-preprocessing.html)
* [Training PyTorch Classifier: Official Documentation](https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html)