In [30]:
import torch
import torch.nn as nn
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from torch.utils.data import Dataset, random_split
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
import torch.optim as optim
import codecs
from textblob import TextBlob
from tqdm import tqdm
from transformers import BertTokenizer
from transformers import BertModel

In [11]:
!wget http://nlp.stanford.edu/data/glove.6B.zip
!unzip glove.6B.zip

--2021-02-12 19:24:45--  http://nlp.stanford.edu/data/glove.6B.zip
Resolving nlp.stanford.edu (nlp.stanford.edu)... 171.64.67.140
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:80... connected.
HTTP request sent, awaiting response... 302 Found
Location: https://nlp.stanford.edu/data/glove.6B.zip [following]
--2021-02-12 19:24:45--  https://nlp.stanford.edu/data/glove.6B.zip
Connecting to nlp.stanford.edu (nlp.stanford.edu)|171.64.67.140|:443... connected.
HTTP request sent, awaiting response... 301 Moved Permanently
Location: http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip [following]
--2021-02-12 19:24:45--  http://downloads.cs.stanford.edu/nlp/data/glove.6B.zip
Resolving downloads.cs.stanford.edu (downloads.cs.stanford.edu)... 171.64.64.22
Connecting to downloads.cs.stanford.edu (downloads.cs.stanford.edu)|171.64.64.22|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 862182613 (822M) [application/zip]
Saving to: ‘glove.6B.zip’


2021-0

In [2]:
train_df = pd.read_csv('data/task-2/train.csv')
test_df = pd.read_csv('data/task-2/dev.csv')

In [67]:
def preprocess(series):
    tokenized = []
    for sentence in tqdm(series):
        blob = TextBlob(sentence)
        words = blob.words.lower()
        lemmas = words.lemmatize()
        tokenized.append(lemmas)

    return pd.Series(tokenized)

def create_edited_sentences(data, edits):
    edited_sentences = []
    for i, headline in enumerate(data):
        start_loc = headline.find('<')
        end_loc = headline.find('/>')
        edited_sentences.append(headline[:start_loc] + edits[i] + headline[end_loc+2:])
    return pd.Series(edited_sentences)

def remove_tags_sentences(data):
    clean_sentences = []
    for _, headline in enumerate(data):
        clean_sentences.append(headline.replace('<','').replace('/>',''))
    return pd.Series(clean_sentences)

def get_max_len_sentence(data):
    return data.map(len).max()
        

In [68]:
train_original_1 = remove_tags_sentences(train_df['original1'])
train_edits_1 = train_df['edit1']
train_replaced_1 = create_edited_sentences(train_df['original1'], train_edits_1)
train_tokenized_original_1 = preprocess(train_original_1)
train_tokenized_replaced_1 = preprocess(train_replaced_1)

train_original_2 = remove_tags_sentences(train_df['original2'])
train_edits_2 = train_df['edit2']
train_replaced_2 = create_edited_sentences(train_df['original2'], train_edits_2)
train_tokenized_original_2 = preprocess(train_original_2)
train_tokenized_replaced_2 = preprocess(train_replaced_2)

test_original_1 = remove_tags_sentences(test_df['original1'])
test_edits_1 = test_df['edit1']
test_replaced_1 = create_edited_sentences(test_df['original1'], test_edits_1)
test_tokenized_original_1 = preprocess(test_original_1)
test_tokenized_replaced_1 = preprocess(test_replaced_1)

test_original_2 = remove_tags_sentences(test_df['original2'])
test_edits_2 = test_df['edit2']
test_replaced_2 = create_edited_sentences(test_df['original2'], test_edits_2)
test_tokenized_original_2 = preprocess(test_original_2)
test_tokenized_replaced_2 = preprocess(test_replaced_2)

max_sentence_length = get_max_len_sentence(train_tokenized_original_1)


100%|██████████| 9381/9381 [00:02<00:00, 4060.07it/s]
100%|██████████| 9381/9381 [00:02<00:00, 3302.36it/s]
100%|██████████| 9381/9381 [00:02<00:00, 3544.77it/s]
100%|██████████| 9381/9381 [00:02<00:00, 4153.88it/s]
100%|██████████| 2355/2355 [00:00<00:00, 3436.27it/s]
100%|██████████| 2355/2355 [00:00<00:00, 3396.46it/s]
100%|██████████| 2355/2355 [00:00<00:00, 3471.46it/s]
100%|██████████| 2355/2355 [00:00<00:00, 3667.03it/s]


In [69]:
max_sentence_length

26

In [70]:
train_tokenized = pd.concat([train_tokenized_original_1, train_tokenized_replaced_1, train_tokenized_replaced_2]).sort_index().reset_index(drop=True)
print(train_tokenized)

train_sentences = pd.concat([train_original_1, train_replaced_1, train_replaced_2]).sort_index().reset_index(drop=True)
print(train_sentences)

0        [gene, cernan, last, astronaut, on, the, moon,...
1        [gene, cernan, last, astronaut, on, the, moon,...
2        [gene, cernan, last, dancer, on, the, moon, dy...
3        [i, 'm, done, fed, up, with, california, some,...
4        [i, 'm, done, fed, up, with, pancake, some, co...
                               ...                        
28138    [“, kompromat, ”, medium, ethic, and, the, law...
28139    [“, kompromat, ”, medium, ethic, and, the, law...
28140    [“, son, of, a, bitch, ”, trump, ’, s, nfl, ta...
28141    [“, son, of, a, bitch, ”, trump, ’, s, nfl, ou...
28142    [“, son, of, a, father, ”, trump, ’, s, nfl, o...
Length: 28143, dtype: object
0        " Gene Cernan , Last Astronaut on the Moon , D...
1        " Gene Cernan , Last Astronaut on the Moon , i...
2        " Gene Cernan , Last Dancer on the Moon , Dies...
3        " I 'm done " : Fed up with California , some ...
4        " I 'm done " : Fed up with pancakes , some co...
                           

In [57]:
def add_to_vocab(vocab, series):
    for sentence in tqdm(series):
        for word in sentence:
            if(word == "dropuppiesarkets"):
                print('what the heck')
            vocab.add(word)

vocab = set()
add_to_vocab(vocab, train_tokenized_original_1)
add_to_vocab(vocab, train_tokenized_replaced_1)
add_to_vocab(vocab, train_tokenized_replaced_2)

add_to_vocab(vocab, test_tokenized_original_1)
add_to_vocab(vocab, test_tokenized_replaced_1)
add_to_vocab(vocab, test_tokenized_replaced_2)

print(len(vocab))

100%|██████████| 9381/9381 [00:00<00:00, 388138.52it/s]
100%|██████████| 9381/9381 [00:00<00:00, 246032.61it/s]
100%|██████████| 9381/9381 [00:00<00:00, 366770.44it/s]
100%|██████████| 2355/2355 [00:00<00:00, 191958.06it/s]
100%|██████████| 2355/2355 [00:00<00:00, 197255.83it/s]
100%|██████████| 2355/2355 [00:00<00:00, 312670.88it/s]

10460





In [58]:
"obamacare" in vocab

True

In [59]:
# We create representations for our tokens
wvecs = [] # word vectors
word2idx = [] # word2index
idx2word = []

# This is a large file, it will take a while to load in the memory!
with codecs.open('glove.6B.300d.txt', 'r','utf-8') as f:
  index = 1
  for line in tqdm(f.readlines()):
    # Ignore the first line - first line typically contains vocab, dimensionality
    if len(line.strip().split()) > 3:
      word = line.strip().split()[0]
      if word in vocab:
          word, vec = (word,
                     list(map(float,line.strip().split()[1:])))
          wvecs.append(vec)
          word2idx.append((word, index))
          idx2word.append((index, word))
          index += 1


wvecs = np.array(wvecs)
word2idx = dict(word2idx)
idx2word = dict(idx2word)

100%|██████████| 400000/400000 [00:11<00:00, 36065.07it/s]


In [60]:
# We define our training loop
def train(train_iter, dev_iter, model, number_epoch):
    """
    Training loop for the model, which calls on eval to evaluate after each epoch
    """

    print("Training model.")

    for epoch in range(1, number_epoch+1):
        
        model.train()
        
        epoch_loss = 0
        epoch_correct = 0
        no_observations = 0  # Observations used for training so far

        for batch in train_iter:
            feature, target = batch
            feature, target = feature.to(device), target.to(device)

            # for RNN:
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            optimizer.zero_grad()
            loss = loss_fn(predictions, target)

            correct, __ = model_performance(np.argmax(predictions.detach().cpu().numpy(), axis=1), target.detach().cpu().numpy())

            loss.backward()
            optimizer.step()

            epoch_loss += loss.item()*target.shape[0]
            epoch_correct += correct

        valid_loss, valid_acc, __, __ = eval(dev_iter, model)

        epoch_loss, epoch_acc = epoch_loss / no_observations, epoch_correct / no_observations
        print(f'| Epoch: {epoch:02} | Train Loss: {epoch_loss:.2f} | Train Accuracy: {epoch_acc:.2f} | \
        Val. Loss: {valid_loss:.2f} | Val. Accuracy: {valid_acc:.2f} |')

In [61]:
# We evaluate performance on our dev set
def eval(data_iter, model):
    """
    Evaluating model performance on the dev set
    """
    model.eval()
    epoch_loss = 0
    epoch_correct = 0
    pred_all = []
    trg_all = []
    no_observations = 0

    with torch.no_grad():
        for batch in data_iter:
            feature, target = batch

            feature, target = feature.to(device), target.to(device)

            # for RNN:
            model.batch_size = target.shape[0]
            no_observations = no_observations + target.shape[0]
            model.hidden = model.init_hidden()

            predictions = model(feature).squeeze(1)
            loss = loss_fn(predictions, target)

            # We get the mse
            pred, trg = predictions.detach().cpu().numpy(), target.detach().cpu().numpy()
            correct, __ = model_performance(np.argmax(pred, axis=1), trg)

            epoch_loss += loss.item()*target.shape[0]
            epoch_correct += correct
            pred_all.extend(pred)
            trg_all.extend(trg)

    return epoch_loss/no_observations, epoch_correct/no_observations, np.array(pred_all), np.array(trg_all)

In [62]:
# How we print the model performance
def model_performance(output, target, print_output=False):
    """
    Returns accuracy per batch, i.e. if you get 8/10 right, this returns 0.8, NOT 8
    """

    correct_answers = (output == target)
    correct = sum(correct_answers)
    acc = np.true_divide(correct,len(output))

    if print_output:
        print(f'| Acc: {acc:.2f} ')

    return correct, acc

In [63]:
# Used for collating our observations into minibatches:
def collate_fn_padd(batch):
    '''
    We add padding to our minibatches and create tensors for our model
    '''

    batch_labels = [l for f, l in batch]
    batch_features = [f for f, l in batch]

    batch_features_len = [len(f) for f, l in batch]

    seq_tensor = torch.zeros((len(batch), max(batch_features_len))).long()

    for idx, (seq, seqlen) in enumerate(zip(batch_features, batch_features_len)):
        seq_tensor[idx, :seqlen] = torch.LongTensor(seq)

    batch_labels = torch.LongTensor(batch_labels)

    return seq_tensor, batch_labels

# We create a Dataset so we can create minibatches
class Task2Dataset(Dataset):

    def __init__(self, train_data, labels):
        self.x_train = train_data
        self.y_train = labels

    def __len__(self):
        return len(self.y_train)

    def __getitem__(self, item):
        return self.x_train[item], self.y_train[item]

In [41]:
class BiLSTM_classification(nn.Module):

    def __init__(self, embedding_dim, hidden_dim, vocab_size, batch_size, device):
        super(BiLSTM_classification, self).__init__()
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.device = device
        self.batch_size = batch_size
        self.embedding = nn.Embedding(vocab_size, embedding_dim, padding_idx=0)

        # The LSTM takes word embeddings as inputs, and outputs hidden states
        # with dimensionality hidden_dim.
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, bidirectional=True)

        # The linear layer that maps from hidden state space to tag space
        self.hidden2label = nn.Linear(hidden_dim * 2, 3)
        self.hidden = self.init_hidden()

    def init_hidden(self):
        # Before we've done anything, we dont have any hidden state.
        # Refer to the Pytorch documentation to see exactly why they have this dimensionality.
        # The axes semantics are (num_layers * num_directions, minibatch_size, hidden_dim)
        return torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device), \
               torch.zeros(2, self.batch_size, self.hidden_dim).to(self.device)

    def forward(self, sentence):
        embedded = self.embedding(sentence)
        embedded = embedded.permute(1, 0, 2)

        lstm_out, self.hidden = self.lstm(
            embedded.view(len(embedded), self.batch_size, self.embedding_dim), self.hidden)

        out = self.hidden2label(lstm_out[-1])
        return out

In [42]:
# Number of epochs
epochs = 10

# Proportion of training data for train compared to dev
train_proportion = 0.8

vectorized_seqs = [[word2idx[tok] for tok in seq if tok in word2idx] for seq in train_tokenized_original_1]

INPUT_DIM = len(word2idx)
EMBEDDING_DIM = 300
BATCH_SIZE = 32

device = "cpu"
model = BiLSTM_classification(EMBEDDING_DIM, 50, INPUT_DIM, BATCH_SIZE, device)
print("Model initialised.")

model.to(device)
# We provide the model with our embeddings
model.embedding.weight.data.copy_(torch.from_numpy(wvecs))

feature = vectorized_seqs

# 'feature' is a list of lists, each containing embedding IDs for word tokens
train_and_dev = Task2Dataset(feature, train_df['label'])

train_examples = round(len(train_and_dev)*train_proportion)
dev_examples = len(train_and_dev) - train_examples

train_dataset, dev_dataset = random_split(train_and_dev,
                                           (train_examples,
                                            dev_examples))


train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)
dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE, collate_fn=collate_fn_padd)

print("Dataloaders created.")

loss_fn = nn.CrossEntropyLoss()
loss_fn = loss_fn.to(device)

optimizer = torch.optim.Adam(model.parameters())

train(train_loader, dev_loader, model, epochs)

Model initialised.
Dataloaders created.
Training model.
| Epoch: 01 | Train Loss: 0.96 | Train Accuracy: 0.45 |         Val. Loss: 0.98 | Val. Accuracy: 0.44 |
| Epoch: 02 | Train Loss: 0.96 | Train Accuracy: 0.44 |         Val. Loss: 0.97 | Val. Accuracy: 0.44 |
| Epoch: 03 | Train Loss: 0.95 | Train Accuracy: 0.48 |         Val. Loss: 0.97 | Val. Accuracy: 0.47 |
| Epoch: 04 | Train Loss: 0.88 | Train Accuracy: 0.60 |         Val. Loss: 1.00 | Val. Accuracy: 0.48 |
| Epoch: 05 | Train Loss: 0.81 | Train Accuracy: 0.66 |         Val. Loss: 1.05 | Val. Accuracy: 0.48 |
| Epoch: 06 | Train Loss: 0.74 | Train Accuracy: 0.68 |         Val. Loss: 1.12 | Val. Accuracy: 0.48 |
| Epoch: 07 | Train Loss: 0.66 | Train Accuracy: 0.69 |         Val. Loss: 1.28 | Val. Accuracy: 0.48 |
| Epoch: 08 | Train Loss: 0.59 | Train Accuracy: 0.70 |         Val. Loss: 1.40 | Val. Accuracy: 0.47 |
| Epoch: 09 | Train Loss: 0.55 | Train Accuracy: 0.70 |         Val. Loss: 1.63 | Val. Accuracy: 0.47 |
| Epoch:

In [89]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

def tokenize_sentence(data):
    tokenized = []
    attention_masks = []
    for sentence in tqdm(train_original_1):
        encoded_sent = tokenizer.encode_plus(
            text=sentence,
            add_special_tokens=True,
            max_length=max_sentence_length,
            pad_to_max_length=True
        )

        tokenized.append(encoded_sent.get('input_ids'))
        attention_masks.append(encoded_sent.get('attention_mask'))
        
    return torch.tensor(tokenized), torch.tensor(attention_masks)
        
original_tokenized, original_attention_masks = tokenize_sentence(train_original_1)
replaced_1_tokenized, replaced_1_attention_masks = tokenize_sentence(train_replaced_1)
replaced_2_tokenized, replaced_2_attention_masks = tokenize_sentence(train_replaced_2)

train_tokenized = torch.cat((original_tokenized, replaced_1_tokenized, replaced_2_tokenized), axis=1)

  0%|          | 0/9381 [00:00<?, ?it/s]Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
100%|██████████| 9381/9381 [00:03<00:00, 3078.72it/s]
100%|██████████| 9381/9381 [00:03<00:00, 2894.83it/s]
100%|██████████| 9381/9381 [00:02<00:00, 3517.23it/s]


In [90]:
train_loader = torch.utils.data.DataLoader(train_tokenized, shuffle=True, batch_size=BATCH_SIZE)
# dev_loader = torch.utils.data.DataLoader(dev_dataset, batch_size=BATCH_SIZE)

In [91]:
import torch
import torch.nn as nn
from transformers import BertModel

# Create the BertClassfier class
class BertClassifier(nn.Module):
    """Bert Model for Classification Tasks.
    """
    def __init__(self, freeze_bert=False):
        """
        @param    bert: a BertModel object
        @param    classifier: a torch.nn.Module classifier
        @param    freeze_bert (bool): Set `False` to fine-tune the BERT model
        """
        super(BertClassifier, self).__init__()
        # Specify hidden size of BERT, hidden size of our classifier, and number of labels
        D_in, H, D_out = 768, 50, 2

        # Instantiate BERT model
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Instantiate an one-layer feed-forward classifier
        self.classifier = nn.Sequential(
            nn.Linear(D_in, H),
            nn.ReLU(),
            #nn.Dropout(0.5),
            nn.Linear(H, D_out)
        )

        # Freeze the BERT model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False
        
    def forward(self, input_ids):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """
        # Feed input to BERT
        outputs = self.bert(input_ids=input_ids)
        
        # Extract the last hidden state of the token `[CLS]` for classification task
        last_hidden_state_cls = outputs[0][:, 0, :]

        # Feed input to classifier to compute logits
        logits = self.classifier(last_hidden_state_cls)

        return logits