In [1]:
import numpy as np
import random
import torch
import os
import spacy
from torchtext.vocab import GloVe, FastText
from torch.utils.data.sampler import SubsetRandomSampler
from torch.utils.data import DataLoader
from sklearn.metrics import classification_report, accuracy_score
import torch.nn as nn
import torch.nn.functional as F
from tqdm import tqdm
import csv

pos_set = []
neg_set = []
with open("./Train dataset.csv", encoding='utf-8') as csvf:
    data = csv.DictReader(csvf)

    for rows in data:

        # Removing punctuations
        chars_to_remove = ['¡', '§', '…','‘', '’', '¿', '«', '»', '¨', '%', '-', '“', '”', '--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '/', '<br />']
        
        review = rows['review'].replace('<br />', " ", -1)
        for char in chars_to_remove:
            review = review.replace(char, " ", -1)

        if rows['sentiment'] == 'positive':
            pos_set.append(review)
        else:
            neg_set.append(review)

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def set_seed(seed = 42):
    '''
        For Reproducibility: Sets the seed of the entire notebook.
    '''

    np.random.seed(seed)
    random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    
    # When running on the CuDNN backend, two further options must be set
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = True
    
    # Sets a fixed value for the hash seed
    os.environ['PYTHONHASHSEED'] = str(seed)

set_seed(1)

In [3]:
from torchtext.data import get_tokenizer

# Downloads GloVe and FastText
global_vectors = GloVe(name='840B', dim=300)

# ----------- Text Preprocessing -----------
nlp = spacy.load("en_core_web_md")

data_set = []
vocab = []
tokenizer = get_tokenizer("basic_english")

for line in pos_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 1))
    # Adds the extracted words to a list
    vocab.extend(tokens)


print("--- Positive Finished ---")

for line in neg_set:

    # Tokenizes the input text into words
    tokens = tokenizer(line)

    data_set.append((tokens, 0))
    # Adds the extracted words to a list
    vocab.extend(tokens)

print("--- Negative Finished ---")

--- Positive Finished ---
--- Negative Finished ---


In [4]:
# Sorting the samples based on their sequence length
def sort_key(s):
    return len(s[0])
    
#data_set = sorted(data_set, key=sort_key)   # Sorting did not gave better result

In [5]:
# Stores all the unique words in the dataset and their frequencies
vocabulary = {}

# Calculates the frequency of each unique word in the vocabulary
for word in vocab:
    if word in vocabulary:
        vocabulary[word] += 1
    else:
        vocabulary[word] = 1

print("Number of unique words in the vocabulary: ", len(vocabulary))

# Stores the integer token for each unique word in the vocabulary
ids_vocab = {}

id = 0

# Assigns words in the vocabulary to integer tokens
for word, v in vocabulary.items():
    ids_vocab[word] = id
    id += 1

Number of unique words in the vocabulary:  94278


In [6]:
# Tokenization function
def tokenize(corpus, ids_vocab):
    """
        Converts words in the dataset to integer tokens
    """

    tokenized_corpus = []
    for line, sentiment in corpus:
        new_line = []
        for i, word in enumerate(line):
            if word in ids_vocab and (i == 0 or word != line[i-1]):
                new_line.append(ids_vocab[word])

        new_line = torch.Tensor(new_line).long()
        tokenized_corpus.append((new_line, sentiment))

    return tokenized_corpus

token_corpus = tokenize(data_set, ids_vocab)

In [7]:
# Loading the embedding matrix
emb_dim = 300

embeds = torch.zeros(len(ids_vocab) + 1, emb_dim)

for token, idx in ids_vocab.items():
    embeds[idx] = global_vectors[token]

In [8]:
# Train-Valid split of 90-10
def split_indices(n, val_pct):

    # Determine size of Validation set
    n_val = int(val_pct * n)

    # Create random permutation of 0 to n-1
    idxs = np.random.permutation(n)
    return np.sort(idxs[n_val:]), np.sort(idxs[:n_val])

train_pos_indices, val_pos_indices = split_indices(len(pos_set), 0.1)
train_neg_indices, val_neg_indices = split_indices(len(neg_set), 0.1)

train_indices = np.concatenate((train_pos_indices, train_neg_indices+len(pos_set)-1))
val_indices = np.concatenate((val_pos_indices, val_neg_indices+len(pos_set)-1))

In [9]:
from torch.nn.utils.rnn import pad_sequence

# ----------- Batching the data -----------
def collate_fn(instn):

    sentence = [x[0] for x in instn]

    # Pre padding
    sen_len = [len(x[0]) for x in instn]
    max_len = max(sen_len)

    padded_sent = torch.zeros(1, max_len)
    sentence_pad = [torch.cat((torch.zeros(max_len-len(x[0])), x[0]), dim=0) for x in instn]
    
    for i in sentence_pad:
        padded_sent = torch.cat((padded_sent, i.unsqueeze(dim=0)), dim=0)
    padded_sent = padded_sent[1:].long()

    # Post padding
    #padded_sent = pad_sequence(sentence, batch_first=True, padding_value=0)

    labels = torch.Tensor([x[1] for x in instn])

    return (padded_sent, labels)


batch_size = 128

train_sampler   = SubsetRandomSampler(train_indices)
train_loader    = DataLoader(token_corpus, batch_size, sampler=train_sampler, collate_fn=collate_fn)

val_sampler     = SubsetRandomSampler(val_indices)
val_loader      = DataLoader(token_corpus, batch_size, sampler=val_sampler, collate_fn=collate_fn)

In [10]:
# ----------- Model -----------
class BILSTM(nn.Module):
    
    def __init__(self, embeds):
        super().__init__()

        self.embeddings = nn.Embedding.from_pretrained(embeds, padding_idx=0)

        self.gru = nn.GRU(input_size = 300, hidden_size = 128, num_layers = 2, batch_first = True, bidirectional = True, dropout=0.5)

        self.lin1 = nn.Linear(256, 64)
        self.lin2 = nn.Linear(64, 1)

        self.lin3 = nn.Linear(256, 1)

    def forward(self, xb):

        xe = self.embeddings(xb)
        out, y = self.gru(xe)
        
        x = self.lin3(out).squeeze(dim=-1)
        x = torch.softmax(x, dim=-1).unsqueeze(dim=1)
        x = torch.bmm(x, out).squeeze(dim=1)              # Weighted average

        #x = torch.cat((x, y[2][ :, :], y[3][ :, :]), dim = 1) # Tried concatenating the representation with hidden units - got similar results
        x = self.lin1(x)
        x = F.relu(x)
        x = self.lin2(x)
        x = torch.sigmoid(x)
        return x

In [11]:
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")


model = BILSTM(embeds)
model.to(device)
opt_c = torch.optim.AdamW(model.parameters(), lr = 0.001) # Same as Adam with weight decay = 0.001
# loss_fn_c = F.cross_entropy #Tried Cross Entropy with log_softmax output function - gave similar results
loss_fn_c = F.binary_cross_entropy

# ----------- Main Training Loop -----------
max_epoch = 10

best_test_acc = 0
for ep in range(max_epoch):

    epoch_loss = 0

    model.train()

    for xb, yb in tqdm(train_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn_c(y_hat.squeeze(), yb)

        loss.backward()

        opt_c.step()

        opt_c.zero_grad()

        nn.utils.clip_grad_norm_(model.parameters(), 5)

        epoch_loss += float(loss)

    print("Epoch: ", ep+1, " Training Loss: ", epoch_loss/len(train_loader))


    #----------- Validation -----------

    val_labels = []
    val_pred = []

    model.eval()

    val_epoch_loss = 0

    with torch.no_grad():
        for xb, yb in tqdm(val_loader):
            xb = xb.to(device)
            yb = yb.to(device)

            y_hat = model(xb)
            loss = loss_fn_c(y_hat.squeeze(), yb)

            val_epoch_loss += float(loss)

            val_labels.extend(torch.round(yb).cpu().detach().numpy())
            val_pred.extend(y_hat.round().cpu().detach().numpy())

    print("Validation loss: ", val_epoch_loss/len(val_loader))
    print("Validation accuracy: ", accuracy_score(val_labels, val_pred)*100)

    if ep > 5 and prev_val_loss - val_epoch_loss > 0.015:
        print("Saving Model")
        torch.save(model.state_dict(), "best_model.pt")
    
    prev_val_loss = val_epoch_loss

100%|██████████| 282/282 [00:44<00:00,  6.31it/s]


Epoch:  1  Training Loss:  0.36699220136547767


100%|██████████| 32/32 [00:01<00:00, 16.24it/s]


Validation loss:  0.25298161758109927
Validation accuracy:  89.52238059514879


100%|██████████| 282/282 [00:42<00:00,  6.69it/s]


Epoch:  2  Training Loss:  0.24559462553960212


100%|██████████| 32/32 [00:02<00:00, 14.64it/s]


Validation loss:  0.22725237463600934
Validation accuracy:  90.47261815453864


100%|██████████| 282/282 [00:43<00:00,  6.50it/s]


Epoch:  3  Training Loss:  0.21485655704605663


100%|██████████| 32/32 [00:02<00:00, 14.17it/s]


Validation loss:  0.21284557785838842
Validation accuracy:  91.22280570142536


100%|██████████| 282/282 [00:41<00:00,  6.79it/s]


Epoch:  4  Training Loss:  0.19186829854833318


100%|██████████| 32/32 [00:02<00:00, 14.39it/s]


Validation loss:  0.20287661429028958
Validation accuracy:  91.87296824206051


100%|██████████| 282/282 [00:45<00:00,  6.26it/s]


Epoch:  5  Training Loss:  0.1665186250833332


100%|██████████| 32/32 [00:01<00:00, 17.66it/s]


Validation loss:  0.23903465596958995
Validation accuracy:  90.67266816704176


100%|██████████| 282/282 [00:45<00:00,  6.24it/s]


Epoch:  6  Training Loss:  0.1424109943026135


100%|██████████| 32/32 [00:02<00:00, 13.95it/s]


Validation loss:  0.22297459421679378
Validation accuracy:  90.94773693423356


100%|██████████| 282/282 [00:43<00:00,  6.44it/s]


Epoch:  7  Training Loss:  0.12036320782458106


100%|██████████| 32/32 [00:02<00:00, 14.26it/s]


Validation loss:  0.21119020320475101
Validation accuracy:  92.1730432608152
Saving Model


100%|██████████| 282/282 [00:43<00:00,  6.47it/s]


Epoch:  8  Training Loss:  0.09301362891437102


100%|██████████| 32/32 [00:01<00:00, 16.92it/s]


Validation loss:  0.25896485010161996
Validation accuracy:  91.14778694673669


100%|██████████| 282/282 [00:43<00:00,  6.54it/s]


Epoch:  9  Training Loss:  0.07259131483205244


100%|██████████| 32/32 [00:01<00:00, 17.34it/s]


Validation loss:  0.27155631058849394
Validation accuracy:  91.57289322330583


100%|██████████| 282/282 [00:42<00:00,  6.62it/s]


Epoch:  10  Training Loss:  0.053940180878993785


100%|██████████| 32/32 [00:02<00:00, 15.63it/s]

Validation loss:  0.31143582286313176
Validation accuracy:  91.27281820455114





In [14]:
# -------- Text Preprocessing ----------

test_set = []
with open("./E0334 Assignment2 Test Dataset.csv", encoding='utf-8') as csvf:
    data = csv.DictReader(csvf)

    for rows in data:

        # Removing punctuations
        chars_to_remove = ['¡', '§', '…','‘', '’', '¿', '«', '»', '¨', '%', '-', '“', '”', '--', '`', '~', '<', '>', '*', '{', '}', '^', '=', '_', '[', ']', '|', '- ', '/', '<br />']
        
        review = rows['review'].replace('<br />', " ", -1)
        for char in chars_to_remove:
            review = review.replace(char, " ", -1)
        tokens = tokenizer(review)

        if rows['sentiment'] == 'positive':
            test_set.append((tokens, 1))
        else:
            test_set.append((tokens, 0))

test_set = sorted(test_set, key=sort_key)

token_corpus_test = tokenize(test_set, ids_vocab)

test_loader      = DataLoader(token_corpus_test, batch_size, collate_fn=collate_fn)

In [15]:
model = BILSTM(embeds)
model.load_state_dict(torch.load("best_model.pt"))
model.to(device)

test_labels = []
test_pred = []

model.eval()

test_epoch_loss = 0

# ---------- Testing ----------
with torch.no_grad():
    for xb, yb in tqdm(test_loader):
        xb = xb.to(device)
        yb = yb.to(device)

        y_hat = model(xb)
        loss = loss_fn_c(y_hat.squeeze(), yb)

        test_epoch_loss += float(loss)

        test_labels.extend(torch.round(yb).cpu().detach().numpy())
        test_pred.extend(y_hat.round().cpu().detach().numpy())

print("Test loss: ", test_epoch_loss/len(test_loader))
print("Test accuracy: ", accuracy_score(test_labels, test_pred)*100)

100%|██████████| 79/79 [00:01<00:00, 55.92it/s] 

Test loss:  0.2429095369916928
Test accuracy:  92.0





In [None]:
# Seed doesn't work in Jupyter notebook, to replicate my results, kindly, run it as .py file