In [26]:
import pandas as pd
from datasets import Dataset
from transformers import DistilBertModel, DistilBertTokenizer
import torch
from torch.utils.data import DataLoader
from torch import cuda


In [27]:
# load data
train = Dataset.load_from_disk("data/train")
val = Dataset.load_from_disk("data/val")
dev = Dataset.load_from_disk("data/dev")

In [28]:
data = {}
data['train'] = pd.DataFrame(train)
data['val'] = pd.DataFrame(val)
data['dev'] = pd.DataFrame(dev)

In [29]:
train_val_df = pd.concat([data['train'], data['val']])
train_val_df

Unnamed: 0,par_id,art_id,keyword,country_code,text,label_score,label,label_category_vector,input_ids,attention_mask
0,99,@@25281193,in-need,my,The departures from London will barely put a d...,0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 1996, 6712, 2015, 2013, 2414, 2097, 4510...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1,3162,@@20664287,migrant,nz,""" Singh is the most common surname for our dri...",0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 1000, 5960, 2003, 1996, 2087, 2691, 1198...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
2,246,@@20000497,homeless,au,What causes someone to become homeless ? Brain...,1,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 2054, 5320, 2619, 2000, 2468, 11573, 102...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
3,7613,@@15679266,vulnerable,za,- An extended-hour 's clinic pilot project was...,0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 1011, 2019, 3668, 1011, 3178, 1005, 1055...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
4,98,@@15438640,immigrant,pk,The Golden State is home to the country 's lar...,0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 1996, 3585, 2110, 2003, 2188, 2000, 1996...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
...,...,...,...,...,...,...,...,...,...,...
1670,1029,@@16905169,homeless,za,Muizenberg residents have lambasted the Muizen...,0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 14163, 4697, 11144, 3901, 2031, 12559, 1...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1671,2767,@@23925,disabled,in,Christian missionaries care for the destitute ...,4,1,"[1, 1, 0, 0, 0, 0, 0]","[101, 3017, 11743, 2729, 2005, 1996, 4078, 377...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1672,5532,@@10345914,women,gh,I bid you fraternal greetings Hon. Alex Bapula...,0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 1045, 7226, 2017, 25312, 16451, 2389, 14...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."
1673,3641,@@70027514,hopeless,jm,""" He had one good Ashes series and what he del...",0,0,"[0, 0, 0, 0, 0, 0, 0]","[101, 1000, 2002, 2018, 2028, 2204, 11289, 218...","[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ..."


# Text only model

In [30]:
train_text_only_df = train_val_df[['text', 'label']]
test_text_only_df = train_val_df[['text', 'label']]

In [31]:
MAX_LEN = 512
TRAIN_BATCH_SIZE = 4
VALID_BATCH_SIZE = 2
EPOCHS = 1
LEARNING_RATE = 1e-05
tokenizer = DistilBertTokenizer.from_pretrained('distilbert-base-cased')

In [32]:
from torch.utils.data import Dataset

class Triage(Dataset):
    def __init__(self, dataframe, tokenizer, max_len):
        self.len = len(dataframe)
        self.data = dataframe
        self.tokenizer = tokenizer
        self.max_len = max_len
        
    def __getitem__(self, index):
        title = str(self.data.TITLE[index])
        title = " ".join(title.split())
        inputs = self.tokenizer.encode_plus(
            title,
            None,
            add_special_tokens=True,
            max_length=self.max_len,
            pad_to_max_length=True,
            return_token_type_ids=True,
            truncation=True
        )
        ids = inputs['input_ids']
        mask = inputs['attention_mask']

        return {
            'ids': torch.tensor(ids, dtype=torch.long),
            'mask': torch.tensor(mask, dtype=torch.long),
            'targets': torch.tensor(self.data.ENCODE_CAT[index], dtype=torch.long)
        } 
    
    def __len__(self):
        return self.len

In [43]:
encode_dict = {} # dictionary of all unique output labels

def encode_cat(x):
    if x not in encode_dict.keys():
        encode_dict[x]=len(encode_dict)
    return encode_dict[x]

In [44]:
train_data = data['train'][['text', 'label']].rename(columns={'text': "TITLE"})
val_data = data['val'][['text', 'label']].rename(columns={'text': "TITLE"})
dev_data = data['dev'][['text', 'label']].rename(columns={'text': "TITLE"})

train_data['ENCODE_CAT'] = train_data['label'].apply(lambda x: encode_cat(x))
val_data['ENCODE_CAT'] = val_data['label'].apply(lambda x: encode_cat(x))
dev_data['ENCODE_CAT'] = dev_data['label'].apply(lambda x: encode_cat(x))


In [46]:
train_set = Triage(dataframe=train_data, tokenizer=tokenizer, max_len=MAX_LEN)
val_set = Triage(dataframe=val_data, tokenizer=tokenizer, max_len=MAX_LEN)
dev_set = Triage(dataframe=dev_data, tokenizer=tokenizer, max_len=MAX_LEN)

In [47]:
train_params = {'batch_size': TRAIN_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

test_params = {'batch_size': VALID_BATCH_SIZE,
                'shuffle': True,
                'num_workers': 0
                }

training_loader = DataLoader(train_set, **train_params)
testing_loader = DataLoader(dev_set, **test_params)

In [71]:
class DistillBERTClass(torch.nn.Module):
    def __init__(self):
        super(DistillBERTClass, self).__init__()
        self.l1 = DistilBertModel.from_pretrained("distilbert-base-uncased")
        self.pre_classifier = torch.nn.Linear(768, 768)
        self.dropout = torch.nn.Dropout(0.3)
        self.classifier = torch.nn.Linear(768, 1)

    def forward(self, input_ids, attention_mask):
        output_1 = self.l1(input_ids=input_ids, attention_mask=attention_mask)
        hidden_state = output_1[0]
        pooler = hidden_state[:, 0]
        pooler = self.pre_classifier(pooler)
        pooler = torch.nn.ReLU()(pooler)
        pooler = self.dropout(pooler)
        output = self.classifier(pooler)
        return output

In [72]:
DEVICE = device = 'cuda' if cuda.is_available() else 'cpu'
model = DistillBERTClass()
model.to(DEVICE)

DistillBERTClass(
  (l1): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0-5): 6 x TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
            (lin1): Linear(i

In [50]:
loss_function = torch.nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(params =  model.parameters(), lr=LEARNING_RATE)

In [51]:
def calcuate_accu(big_idx, targets):
    n_correct = (big_idx==targets).sum().item()
    return n_correct


In [77]:
def train(epoch):
    tr_loss = 0
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    model.train()
    for _, data in enumerate(training_loader, 0):
        ids = data['ids'].to(device, dtype = torch.long)
        mask = data['mask'].to(device, dtype = torch.long)
        targets = data['targets'].to(device, dtype = torch.float64)

        outputs = model(ids, mask).flatten()
        loss = loss_function(outputs, targets)
        tr_loss += loss.item()
        pred_val = torch.where(torch.nn.functional.sigmoid(outputs.data) >= 0.5, 1, 0)
        # big_val, big_idx = torch.max(outputs.data, dim=1)
        n_correct += calcuate_accu(pred_val, targets)

        nb_tr_steps += 1
        nb_tr_examples+=targets.size(0)
        
        if _%5000==0:
            loss_step = tr_loss/nb_tr_steps
            accu_step = (n_correct*100)/nb_tr_examples 
            print(f"Training Loss per 5000 steps: {loss_step}")
            print(f"Training Accuracy per 5000 steps: {accu_step}")

        optimizer.zero_grad()
        loss.backward()
        # # When using GPU
        optimizer.step()

    print(f'The Total Accuracy for Epoch {epoch}: {(n_correct*100)/nb_tr_examples}')
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    print(f"Training Loss Epoch: {epoch_loss}")
    print(f"Training Accuracy Epoch: {epoch_accu}")

    return 

In [78]:
for epoch in range(EPOCHS):
    train(epoch)



Training Loss per 5000 steps: 0.6576368678361177
Training Accuracy per 5000 steps: 75.0
The Total Accuracy for Epoch 0: 84.41791044776119
Training Loss Epoch: 0.6573612182276017
Training Accuracy Epoch: 84.41791044776119


In [108]:
def valid(model, testing_loader):
    tr_loss =0 
    n_correct = 0
    nb_tr_steps = 0
    nb_tr_examples = 0
    tp = 0
    fp = 0
    fn = 0
    model.eval()
    n_correct = 0; n_wrong = 0; total = 0
    with torch.no_grad():
        for _, data in enumerate(testing_loader, 0):
            ids = data['ids'].to(device, dtype = torch.long)
            mask = data['mask'].to(device, dtype = torch.long)
            targets = data['targets'].to(device, dtype = torch.float64)
            outputs = model(ids, mask).squeeze()
            loss = loss_function(outputs.flatten(), targets)
            tr_loss += loss.item()
            pred_val = torch.where(torch.nn.functional.sigmoid(outputs.data) >= 0.5, 1, 0)
            # big_val, big_idx = torch.max(outputs.data, dim=1)
            if 0 in pred_val:
                print(_)
            n_correct += calcuate_accu(pred_val, targets)
            tp += torch.sum(torch.where(targets == 1, pred_val == targets, 0)).item()
            fp += torch.sum(torch.where(targets == 0, pred_val != targets, 0)).item()
            fn += torch.sum(torch.where(targets == 1, pred_val != targets, 0)).item()

            nb_tr_steps += 1
            nb_tr_examples+=targets.size(0)
            
            if _%5000==0:
                loss_step = tr_loss/nb_tr_steps
                accu_step = (n_correct*100)/nb_tr_examples
                print(f"Validation Loss per 100 steps: {loss_step}")
                print(f"Validation Accuracy per 100 steps: {accu_step}")
    epoch_loss = tr_loss/nb_tr_steps
    epoch_accu = (n_correct*100)/nb_tr_examples
    epoch_f1 = tp / (tp + 0.5 * (fp + fn))
    print(f"Validation Loss Epoch: {epoch_loss}")
    print(f"Validation Accuracy Epoch: {epoch_accu}")
    print(f"Validation F1 Epoch: {epoch_f1}")

    
    return epoch_accu, epoch_f1

In [109]:
print('This is the validation section to print the accuracy and see how it performs')
print('Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch')

acc, f1 = valid(model, testing_loader)
print("Accuracy on test data = %0.2f%%" % acc)
print("F1 on test data = %0.2f%%" % f1)

This is the validation section to print the accuracy and see how it performs
Here we are leveraging on the dataloader crearted for the validation dataset, the approcah is using more of pytorch
0
Validation Loss per 100 steps: 0.6398966610431671
Validation Accuracy per 100 steps: 100.0
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27


KeyboardInterrupt: 