In [1]:
!pip install transformers
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
from transformers import BertTokenizer, BertModel, BertConfig

from torch import cuda

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/50/0c/7d5950fcd80b029be0a8891727ba21e0cd27692c407c51261c3c921f6da3/transformers-4.1.1-py3-none-any.whl (1.5MB)
[K     |████████████████████████████████| 1.5MB 22.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 54.3MB/s 
Collecting tokenizers==0.9.4
[?25l  Downloading https://files.pythonhosted.org/packages/0f/1c/e789a8b12e28be5bc1ce2156cf87cb522b379be9cadc7ad8091a4cc107c4/tokenizers-0.9.4-cp36-cp36m-manylinux2010_x86_64.whl (2.9MB)
[K     |████████████████████████████████| 2.9MB 53.0MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893261 sha256=d88feed43d0e630597

In [2]:
device = "cuda" if cuda.is_available() else "cpu"


df = pd.read_csv("data.csv", index_col=False)
df_copy = df[["text", "super_strategy_label"]].copy()
df_copy.head()

Unnamed: 0,text,super_strategy_label
0,"The PressTV references in Wikipedia's ""Turkey-...",1
1,"If you have time, can you correct the titles?",3
2,I had really hoped to get some more opinions o...,6
3,"I modified it a bit, possibly vote to reopen?",4
4,Well the only plausible explanation was that y...,1


In [3]:
# Clean the text 
def normalise_text(text):
    text = text.strip()
    text = text.lower() # lowercase
    text = text.replace(r"\#","") # replaces hashtags
    text = text.replace(r"http\S+","URL")  # remove URL addresses
    text = text.replace(r"@","")
    text = text.replace(r"[^A-Za-z0-9()!?\'\`\"]", " ")
    text = text.replace("\s{2,}", " ")
    text = text.replace(r"\#","")
    return text
df_copy.text = df_copy.text.apply(normalise_text)
df_copy.columns = ["text", "label"]
df_copy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5094 entries, 0 to 5093
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    5094 non-null   object
 1   label   5094 non-null   int64 
dtypes: int64(1), object(1)
memory usage: 79.7+ KB


In [4]:
# keep 1/4 0 label
df_1 = df_copy.loc[df_copy.label > 0] 
df_0 = df_copy.loc[df_copy.label == 0]
df_0 = df_0.reset_index()
df_0 = df_0.sample(frac=0.25, replace=True, random_state=1)
train_df = pd.concat([df_1, df_0], ignore_index=True)
train_df = train_df.sample(frac=1)
train_df = train_df[["text", "label"]]
train_df.to_csv("clean_dataset.csv", index=False)
print(train_df)

                                                   text  label
652   so then "good luck with your upcoming defense"...      1
334                             sorry, i thought i had.      6
524   try the electronic gadgets se:  http://area51....      4
1197    ok i've created the following test page: <url>.      6
530           have you asked any other iranian members?      4
...                                                 ...    ...
1783  i do not know what you mean by "important link...      1
2077               thanks for adding that ref to <url>.      7
902   also you talk about "similarity" - but are you...      1
817   before you get into the run-time of the algori...      4
2336  why don't use settings > application settings ...      2

[3562 rows x 2 columns]


In [5]:
# find max length
def get_text_len(text):
    return len(text.split(" "))
max_len = max(train_df.text.apply(get_text_len))
max_len

89

In [6]:
MAX_LEN = 100
TRAIN_BATCH_SIZE = 16
VALID_BATCH_SIZE = 8
EPOCHS = 50
LEARNING_RATE = 1e-05
CLASS_SIZE = 8

tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




In [7]:
class PDataset(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        text = str(self.data.text[index])
        text = " ".join(text.split())
        inputs = self.tokenizer.encode_plus(
            text,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.label[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [8]:
train_size = 0.8
train_dataset = train_df.sample(frac=train_size, random_state=200)
test_dataset= train_df.drop(train_dataset.index).reset_index(drop=True)
train_dataset = train_dataset.reset_index(drop=True)


print("FULL Dataset: {}".format(train_df.shape))
print("TRAIN Dataset: {}".format(train_dataset.shape))
print("TEST Dataset: {}".format(test_dataset.shape))

training_set = PDataset(train_dataset, tokenizer, MAX_LEN)
testing_set = PDataset(test_dataset, tokenizer, MAX_LEN)

FULL Dataset: (3562, 2)
TRAIN Dataset: (2850, 2)
TEST Dataset: (712, 2)


In [9]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(training_set, **train_params)
testing_loader = DataLoader(testing_set, **test_params)

In [10]:
class BERTClass(torch.nn.Module):
    def __init__(self):
        super(BERTClass, self).__init__()
        self.l1 = BertModel.from_pretrained("bert-base-uncased",return_dict=False)
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, CLASS_SIZE)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

model = BERTClass()
model.to(device)

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=433.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=440473133.0, style=ProgressStyle(descri…




BERTClass(
  (l1): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [13]:
loss_function = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [14]:
# Function to calcuate the accuracy of the model

def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct

In [15]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _,data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.long)

        outputs = model(ids, mask)
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(big_idx, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _ % 1000 == 0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct * 100)/nb_tr_examples 
            print(f"Training Loss per 1000 steps: {loss_step}")
            print(f"Training Accuracy per 1000 steps: {accu_step}%")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct * 100)/nb_tr_examples}%')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct * 100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}%")
    print("===========================")

    return epoch_loss, epoch_accu


In [16]:
from pandas import DataFrame
epochs_loss_history = []
epochs_accu_history = []
for epoch in range(EPOCHS):
    epoch_loss, epoch_accu = train(epoch)
    epochs_loss_history.append(epoch_loss)
    epochs_accu_history.append(epoch_accu)



Training Loss per 1000 steps: 2.0852317810058594
Training Accuracy per 1000 steps: 25.0%
The Total Accuracy for Epoch 0: 47.6140350877193%
Training Loss Epoch: 1.5141387188900783
Training Accuracy Epoch: 47.6140350877193%
Training Loss per 1000 steps: 1.0849411487579346
Training Accuracy per 1000 steps: 68.75%
The Total Accuracy for Epoch 1: 69.6140350877193%
Training Loss Epoch: 0.9098246903392856
Training Accuracy Epoch: 69.6140350877193%
Training Loss per 1000 steps: 0.42389774322509766
Training Accuracy per 1000 steps: 87.5%
The Total Accuracy for Epoch 2: 81.05263157894737%
Training Loss Epoch: 0.6097130330914226
Training Accuracy Epoch: 81.05263157894737%
Training Loss per 1000 steps: 0.732241690158844
Training Accuracy per 1000 steps: 68.75%
The Total Accuracy for Epoch 3: 87.54385964912281%
Training Loss Epoch: 0.4331059909982388
Training Accuracy Epoch: 87.54385964912281%
Training Loss per 1000 steps: 0.3732973337173462
Training Accuracy per 1000 steps: 87.5%
The Total Accurac

In [17]:
df = DataFrame(epochs_loss_history, columns=['bert_epochs_loss_history'])
df["bert_epochs_accu_history"] = epochs_accu_history
df.to_csv("bert_history.csv", index=False)

In [18]:
from sklearn.metrics import f1_score

def valid(model, testing_loader):
    model.eval()
    tr_loss = 0
    n_correct = 0 
    n_wrong = 0
    total = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    f1 = 0.0
    targets_list = []
    big_idx_list = []
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.long)
            targets_list.extend(targets.tolist())
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs, targets)
            tr_loss += loss.item()
            big_val, big_idx = torch.max(outputs.data, dim=1)
            big_idx_list.extend(big_idx.tolist())
            n_correct += calcuate_accu(big_idx, targets)

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%100 == 0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct * 100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}%")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct * 100)/nb_tr_examples
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}%")
    f1 = f1_score(targets_list, big_idx_list, average='macro')

    return epoch_accu, f1


In [19]:
acc, f1 = valid(model, testing_loader)
print(f"Accuracy on test data: {acc}")
print(f"Marco F1 score: {f1}")



Validation Loss per 100 steps: 1.9695684909820557
Validation Accuracy per 100 steps: 62.5%
Validation Loss Epoch: 0.765538587715291
Validation Accuracy Epoch: 87.92134831460675%
Accuracy on test data: 87.92134831460675
Marco F1 score: 0.8293951435093849


In [20]:
output_model_file = 'pytorch_bert_model.bin'
output_vocab_file = 'vocab_bert_model.bin'

model_to_save = model
torch.save(model_to_save, output_model_file)
tokenizer.save_vocabulary(output_vocab_file)

print('All files saved')

All files saved


In [22]:
# human annotated testset
df = pd.read_csv("testset_human.csv", index_col=False)
df.head()

Unnamed: 0,text,label
0,"if you have time, can you correct the titles?",3.0
1,i had really hoped to get some more opinions o...,6.0
2,"i modified it a bit, possibly vote to reopen?",6.0
3,well the only plausible explanation was that y...,1.0
4,is that what indeed occurred?,1.0


In [23]:
testing_set = PDataset(df, tokenizer, MAX_LEN)
testing_loader = DataLoader(testing_set, **test_params)

In [24]:
acc, f1 = valid(model, testing_loader)
print(f"Accuracy on test data: {acc}")
print(f"Marco F1 score: {f1}")



Validation Loss per 100 steps: 7.092556476593018
Validation Accuracy per 100 steps: 37.5%
Validation Loss Epoch: 3.966321740831648
Validation Accuracy Epoch: 59.61538461538461%
Accuracy on test data: 59.61538461538461
Marco F1 score: 0.5790985974809504
