In [1]:
import torch
from torchtext.datasets import IMDB, AG_NEWS, YahooAnswers
from torchtext.vocab import GloVe
from torchtext.data import to_map_style_dataset
from torchtext.data.utils import get_tokenizer
from torch.nn import LSTM, GRU, Linear, Softmax, CrossEntropyLoss
from torch.nn.utils.rnn import pad_sequence
from torch.utils.data import DataLoader, random_split, Dataset
from torch.optim import Adam
from tqdm import tqdm
import nltk
from nltk.corpus import wordnet as wn
import numpy as np

In [2]:
DATASET = 'IMDB'
MODEL = 'LSTM'
VALIDATION_SPLIT = 0.5 # of test data
BATCH_SIZE = 64
SHUFFLE = True
NUM_EPOCHS = 5

device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [3]:
def build_thesaurus(all_words):
    # Expects torch.vocab.itos dictionary to extract thesaurus
    thesaurus = {}
    syns = []
    length_thesaurus = len(all_words)

    for i in range(length_thesaurus):
        if i % 10000 == 0:
            print("At index {} of the vocabulary".format(i))
        # Extract the word
        token = all_words[i]

        # Find the synsets for the token
        synsets = wn.synsets(token)

        if len(synsets) == 0:
            thesaurus[token] = ""
        
        else:
            # Iterate through all synset
            for synset in synsets:
                lemma_names = synset.lemma_names()
                for lemma in lemma_names:
                    # Check if lemma has an underscore indicating a two word token
                    if not("_" in lemma):
                        lemma = lemma.lower()
                        if (lemma != token and lemma not in syns):
                            syns.append(lemma)
                    
            
            thesaurus[token] = syns
            syns = []
        
    return thesaurus

In [4]:
class BidirectionalLSTMClassifier(torch.nn.Module):
    def __init__(self, num_classes, hidden_size, num_layers):
        super().__init__()
        self.num_layers = num_layers
        self.LSTM = LSTM(50, hidden_size, num_layers=num_layers, batch_first=True, bidirectional=True)
        self.linear = Linear(2 * hidden_size, num_classes)
        self.softmax = Softmax(dim=1)
        
    def forward(self, x):
        _, (h_n, _) = self.LSTM(x)
        h_forward = h_n[2 * self.num_layers - 2]
        h_backward = h_n[2 * self.num_layers - 1]
        y = self.linear(torch.cat((h_forward, h_backward), 1))
        return self.softmax(y)

In [5]:
class ClassificationDataset(Dataset):
    def __init__(self, dataset, num_classes, tokenizer, model):
        self.num_classes = num_classes
        self.dataset = dataset
        self.tokenizer = tokenizer
        self.model = model

    def __len__(self):
        return self.dataset.__len__()

    def __getitem__(self, idx):
        label, text = self.dataset.__getitem__(idx)
        if type(label) == str:
            if label == 'neg':
                label = 0
            else:
                label = 1
        else:
            label = int(label) - 1

        if self.model == 'BERT':
            return label, self.tokenizer(text, padding="max_length", return_tensors='pt', max_length=512, truncation=True)
        else:
            return label, self.tokenizer(text)

In [6]:
train_set = IMDB(split='train')
test_set = IMDB(split='test')

100%|██████████| 84.1M/84.1M [00:02<00:00, 39.4MB/s]


In [7]:
MODEL = 'LSTM'
num_classes = 2
train_set = to_map_style_dataset(train_set)
test_set = to_map_style_dataset(test_set)
tokenizer = get_tokenizer('basic_english')

train_set = ClassificationDataset(train_set, num_classes, tokenizer, MODEL)
test_set = ClassificationDataset(test_set, num_classes, tokenizer, MODEL)

In [8]:
embedding = GloVe(name='6B', dim=50)

.vector_cache/glove.6B.zip: 862MB [02:40, 5.37MB/s]                           
100%|█████████▉| 399999/400000 [00:14<00:00, 27079.38it/s]


In [10]:
nltk.download('wordnet')
example_thes = build_thesaurus(embedding.itos)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.
At index 0 of the vocabulary
At index 10000 of the vocabulary
At index 20000 of the vocabulary
At index 30000 of the vocabulary
At index 40000 of the vocabulary
At index 50000 of the vocabulary
At index 60000 of the vocabulary
At index 70000 of the vocabulary
At index 80000 of the vocabulary
At index 90000 of the vocabulary
At index 100000 of the vocabulary
At index 110000 of the vocabulary
At index 120000 of the vocabulary
At index 130000 of the vocabulary
At index 140000 of the vocabulary
At index 150000 of the vocabulary
At index 160000 of the vocabulary
At index 170000 of the vocabulary
At index 180000 of the vocabulary
At index 190000 of the vocabulary
At index 200000 of the vocabulary
At index 210000 of the vocabulary
At index 220000 of the vocabulary
At index 230000 of the vocabulary
At index 240000 of the vocabulary
At index 250000 of the vocabulary
At index 260000 of the 

In [11]:
def mask_replace_with_syns_add_noise(sentence, thesaurus, embedding, mask_probability=0.1, synonym_probability=0.25, pos_noise=0.1):
    tokens_to_ret = []
    for word in sentence:
        mask_flag = np.random.choice([0, 1], replace=False, p=[1-mask_probability, mask_probability])
        # Not masked
        if mask_flag == 0:
            syn_flag = np.random.choice([0, 1], replace=False, p=[1-synonym_probability, synonym_probability])
            # Not masked & replaced with synonym
            if syn_flag == 1:
                # Check if word exists in thesaurus
                synonyms = thesaurus.get(word)
                if synonyms != None and len(synonyms) != 0:
                    # randomly sample a synonym word
                    indx = np.random.randint(low=0, high=len(synonyms))
                    tokens_to_ret.append(synonyms[indx])
                # Synonym doesn't exist
                else:
                    tokens_to_ret.append(word)
            # Not masked & not replaced with synonym
            else:
                tokens_to_ret.append(word)
        # Masked
        else:
            tokens_to_ret.append("")
        
    
    # We have masked and replaced with synonyms randomly, now obtain embeddings
    embed = embedding.get_vecs_by_tokens(tokens_to_ret)
    '''pos_encoding = np.zeros(embed.shape)
    # Positional encoding introduced in Vaswani et. al.
    for i in range(embed.shape[0]):
        if i%2 == 0:
            pos_param = pos_noise*np.sin(i / (10000 ** ((2*(i//2) / embed.shape[1]))))
        else:
            pos_param = pos_noise*np.cos(i / (10000 ** ((2*(i//2) / embed.shape[1]))))'''
    return embed

In [12]:
def collate_defence_batch(batch):
    label_list, text_list = [], []
    for (_label, _tokens) in batch:
        label_list.append(_label)
        #embed = embedding.get_vecs_by_tokens(_tokens)
        embed = mask_replace_with_syns_add_noise(_tokens, example_thes, embedding)
        text_list.append(embed)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    text_list = pad_sequence(text_list, batch_first=True)
    return label_list.to(device), text_list.to(device)

In [13]:
def collate_batch(batch):
    label_list, text_list = [], []
    for (_label, _tokens) in batch:
        label_list.append(_label)
        embed = embedding.get_vecs_by_tokens(_tokens)
        text_list.append(embed)
    text_list = pad_sequence(text_list, batch_first=True)
    label_list = torch.tensor(label_list, dtype=torch.int64)
    return label_list.to(device), text_list.to(device)

In [14]:
train_loader = DataLoader(train_set, batch_size=BATCH_SIZE, collate_fn=collate_defence_batch, shuffle=SHUFFLE)
test_loader = DataLoader(test_set, batch_size=BATCH_SIZE, collate_fn=collate_batch, shuffle=SHUFFLE)

In [15]:
def evaluate(model, data_loader, loss=CrossEntropyLoss()):
    model.eval()
    total_acc, total_count = 0, 0
    
    with torch.no_grad():
        if MODEL == "BERT":
            for idx, (labels, input_ids, token_type_ids, attention_mask) in enumerate(data_loader):
                predicted_label = model(input_ids, token_type_ids, attention_mask)
                loss_ = loss(predicted_label, labels)
                total_acc += (predicted_label.argmax(1) == labels).sum().item()
                total_count += labels.size(0)
        else:
            for idx, (labels, text) in enumerate(data_loader):
                predicted_label = model(text)
                loss_ = loss(predicted_label, labels)
                total_acc += (predicted_label.argmax(1) == labels).sum().item()
                total_count += labels.size(0)
    
    return total_acc / total_count


def train(model, optimizer, train_loader, loss=CrossEntropyLoss(), log_interval=50):
    model.train()
    total_acc, total_count = 0, 0
    pbar = tqdm(total=len(train_loader), desc=f'Epoch [{epoch + 1}/{NUM_EPOCHS}]')

    if MODEL == 'BERT':
        for idx, (labels, input_ids, token_type_ids, attention_mask) in enumerate(train_loader):
            output = model(input_ids, token_type_ids, attention_mask)
            loss_ = loss(output, labels)
            optimizer.zero_grad()
            loss_.backward()
            nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
            optimizer.step()
            total_acc += (output.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
            pbar.update()
            if idx % log_interval == 0 and idx > 0:
                pbar.set_postfix(loss=loss_, accuracy=total_acc / total_count)
                total_acc, total_count = 0, 0
        
        pbar.close()
    else:
        for idx, (labels, text) in enumerate(train_loader):
            output = model(text)
            loss_ = loss(output, labels)
            optimizer.zero_grad()
            loss_.backward()
            optimizer.step()
            total_acc += (output.argmax(1) == labels).sum().item()
            total_count += labels.size(0)
            pbar.update()
            if idx % log_interval == 0 and idx > 0:
                pbar.set_postfix(loss=loss_, accuracy=total_acc / total_count)
                total_acc, total_count = 0, 0
        
        pbar.close()

In [16]:
from pathlib import Path
from IPython import get_ipython
on_colab = 'google.colab' in str(get_ipython())

if on_colab:
  from google.colab import drive
  drive.mount("/content/gdrive")

PATH =  "/content/gdrive/My Drive/DeepLearning/MODELS/" if on_colab else "./"

Mounted at /content/gdrive


In [17]:
model = BidirectionalLSTMClassifier(num_classes, 64, 1).to(device)
optim = Adam(model.parameters())
for epoch in range(NUM_EPOCHS):
    train(model, optim, train_loader)
    torch.save({'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optim.state_dict()}, PATH + MODEL + '_' + DATASET + '_' + 'WLADL' + '.pt')

Epoch [1/5]: 100%|██████████| 391/391 [18:21<00:00,  2.82s/it, accuracy=0.624, loss=tensor(0.6465, device='cuda:0', grad_fn=<NllLossBackward0>)]
Epoch [2/5]: 100%|██████████| 391/391 [18:18<00:00,  2.81s/it, accuracy=0.512, loss=tensor(0.6899, device='cuda:0', grad_fn=<NllLossBackward0>)]
Epoch [3/5]: 100%|██████████| 391/391 [18:26<00:00,  2.83s/it, accuracy=0.583, loss=tensor(0.6783, device='cuda:0', grad_fn=<NllLossBackward0>)]
Epoch [4/5]: 100%|██████████| 391/391 [18:32<00:00,  2.85s/it, accuracy=0.768, loss=tensor(0.5120, device='cuda:0', grad_fn=<NllLossBackward0>)]
Epoch [5/5]: 100%|██████████| 391/391 [18:27<00:00,  2.83s/it, accuracy=0.783, loss=tensor(0.5285, device='cuda:0', grad_fn=<NllLossBackward0>)]


In [18]:
test_accuracy = evaluate(model, test_loader)
print(f'Test accuracy: {test_accuracy}')

Test accuracy: 0.76912
