# Classify text with BERT

# Setup


We use the AdamW optimizer from [tensorflow/models](https://github.com/tensorflow/models).

In [28]:
# First install all necessary packages

!pip install torch
!pip install transformers

Looking in indexes: https://pypi.python.org/simple/
Looking in indexes: https://pypi.python.org/simple/


In [29]:
import os
import shutil

from utils import load_data

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

from sklearn import metrics, model_selection, preprocessing

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset
import transformers
from transformers import AdamW, get_linear_schedule_with_warmup
from official.nlp import optimization  # to create AdamW optimizer


In [30]:
import sys
sys.path.insert(1, '/home/jovyan/workbench-shared-folder/workbench-shared-folder/canary-project/Paula_internship/')

### Load the dataset


In [31]:
# DATA_PATH = "../data/"
DATA_PATH = "/home/jovyan/workbench-shared-folder/canary-project/Paula_internship/data/"

# Load data from Set A, B and EX
train_A, test_A, val_A = load_data(DATA_PATH, version="A", suffix="_kw", reduce_memory=True)
train_B, test_B, val_B = load_data(DATA_PATH, version="B", suffix="_kw", reduce_memory=True)
train_EX, test_EX, val_EX = load_data(DATA_PATH, version="EX", suffix="_kw", reduce_memory=True)

Mem. usage decreased to  4.83 Mb (83.6% reduction)
Mem. usage decreased to  2.42 Mb (83.6% reduction)
Mem. usage decreased to  0.81 Mb (83.6% reduction)
Set A with suffix '_kw' was loaded successfully.
Mem. usage decreased to  5.18 Mb (83.6% reduction)
Mem. usage decreased to  2.59 Mb (83.6% reduction)
Mem. usage decreased to  0.86 Mb (83.6% reduction)
Set B with suffix '_kw' was loaded successfully.




  train = pd.read_csv(f"{path}set_{version}_train{suffix}.csv", engine='python', error_bad_lines=False)
Skipping line 157634: unexpected end of data


  test = pd.read_csv(f"{path}set_{version}_test{suffix}.csv", engine='python', error_bad_lines=False)


  val = pd.read_csv(f"{path}set_{version}_val{suffix}.csv", engine='python', error_bad_lines=False)


Mem. usage decreased to 10.82 Mb (83.6% reduction)
Mem. usage decreased to  5.42 Mb (83.6% reduction)
Mem. usage decreased to  1.81 Mb (83.6% reduction)
Set EX with suffix '_kw' was loaded successfully.


In [32]:
#We concatenate the 3 different sets (A, B, EX):

train = pd.concat([train_A, train_B, train_EX])
test = pd.concat([test_A, test_B, test_EX])
val = pd.concat([val_A, val_B, val_EX])

# BERT Classifier using PyTorch

### Define useful classes

In [43]:
class Dataset:
    '''
    Dataset class to map indices/keys of data samples. Implemented __getitem__() and __len__() protocols.
    Using the tokenizer, the inputs are mapped to BERT ids/mask.
    '''
    
    def __init__(self, texts, labels, tokenizer, max_len, truncate):
        self.texts = texts
        self.labels = labels

        self.tokenizer = tokenizer
        self.max_len = max_len
        self.truncation = truncate
    
    def __len__(self):
        return len(self.texts)

    def __getitem__(self, index):
        text = self.texts[index]
        label = self.labels[index]

        inputs = self.tokenizer.__call__(text,
                                        None,
                                        add_special_tokens=True,
                                        max_length=self.max_len,
                                        padding="max_length",
                                        truncation=self.truncation,
                                        )
        ids = inputs["input_ids"]
        mask = inputs["attention_mask"]

        return {
            "ids": torch.tensor(ids, dtype=torch.long),
            "mask": torch.tensor(mask, dtype=torch.long),
            "labels": torch.tensor(label, dtype=torch.long)
        }

In [44]:
class Classifier(nn.Module):
    '''
    The actual NN used for classification
    '''
    def __init__(self, n_train_steps, n_classes, do_prob, bert_model):
        super(Classifier, self).__init__()
        self.bert = bert_model
        self.dropout = nn.Dropout(do_prob)
        self.out = nn.Linear(768, n_classes)
        self.n_train_steps = n_train_steps
        self.step_scheduler_after = "batch"

    def forward(self, ids, mask):
        output_1 = self.bert(ids, attention_mask=mask)["pooler_output"]
        output_2 = self.dropout(output_1)
        output = self.out(output_2)
        return output

In [45]:
n_labels = train.iloc[:, 3:].shape[1]
tokenizer = transformers.SqueezeBertTokenizer.from_pretrained("squeezebert/squeezebert-uncased", do_lower_case=True)

def build_dataset(tokenizer_max_len, truncate):
    '''
    Tokenize and map the training and validation sets
    '''
    train_dataset = Dataset(train.input.tolist(), train.iloc[:, 3:].values.tolist(), tokenizer, tokenizer_max_len, truncate)
    valid_dataset = Dataset(val.input.tolist(), val.iloc[:, 3:].values.tolist(), tokenizer, tokenizer_max_len, truncate)
    
    return train_dataset, valid_dataset

def build_dataloader(train_dataset, valid_dataset, batch_size):
    '''
    Create the torch dataloaders
    '''
    train_data_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=2)
    valid_data_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=True, num_workers=1)

    return train_data_loader, valid_data_loader

def ret_model(n_train_steps, do_prob):
    '''
    Retrieve the model
    '''
    model = Classifier(n_train_steps, n_labels, do_prob, bert_model=bert_model)
    return model


In [46]:
bert_model = transformers.SqueezeBertModel.from_pretrained("squeezebert/squeezebert-uncased")

Some weights of the model checkpoint at squeezebert/squeezebert-uncased were not used when initializing SqueezeBertModel: ['cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.seq_relationship.bias']
- This IS expected if you are initializing SqueezeBertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing SqueezeBertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [47]:
def ret_optimizer(model):
    '''
    Taken from Abhishek Thakur's Tez library example: 
    https://github.com/abhishekkrthakur/tez/blob/main/examples/text_classification/binary.py
    '''
    param_optimizer = list(model.named_parameters())
    no_decay = ["bias", "LayerNorm.bias"]
    optimizer_parameters = [
        {
            "params": [
                p for n, p in param_optimizer if not any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.001,
        },
        {
            "params": [
                p for n, p in param_optimizer if any(nd in n for nd in no_decay)
            ],
            "weight_decay": 0.0,
        },
    ]
#     opt = AdamW(optimizer_parameters, lr=config['learning_rate'])
    opt = torch.optim.AdamW(optimizer_parameters, lr=config['learning_rate'])
    return opt

def ret_scheduler(optimizer, num_train_steps):
    sch = get_linear_schedule_with_warmup(
        optimizer, num_warmup_steps=0, num_training_steps=num_train_steps)
    return sch

def loss_function(outputs, labels, loss='BCE'):
    if labels is None:
        return None
    if loss == 'BCE':
        # BinaryCross Entropy loss
        loss_fn = nn.BCEWithLogitsLoss()
    else:
        # BinaryCross Entropy loss
        loss_fn = nn.BCEWithLogitsLoss()
    
    return loss_fn(outputs, labels.float())

def log_metrics(preds, labels):
    preds = torch.stack(preds)
    preds = preds.cpu().detach().numpy()
    labels = torch.stack(labels)
    labels = labels.cpu().detach().numpy()
    
    '''
    auc_micro_list = []
    for i in range(n_labels):
      current_pred = preds.T[i]
      current_label = labels.T[i]
      fpr_micro, tpr_micro, _ = metrics.roc_curve(current_label.T, current_pred.T)
      auc_micro = metrics.auc(fpr_micro, tpr_micro)
      auc_micro_list.append(auc_micro)
    
    return {"auc": np.array(auc_micro).mean()}
    '''
    # https://scikit-learn.org/stable/modules/generated/sklearn.metrics.roc_curve.html#sklearn.metrics.roc_curve
    fpr_micro, tpr_micro, _ = metrics.roc_curve(labels.ravel(), preds.ravel())
    
    auc_micro = metrics.auc(fpr_micro, tpr_micro)
    return {"auc_micro": auc_micro}

### Define the training and evaluation functions

In [48]:
from tqdm.notebook import tqdm

def train_fn(data_loader, model, loss_fn, optimizer, device, scheduler):
    '''
        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''

    train_loss = 0.0
    model.train()
    for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
        ids = d["ids"]
        mask = d["mask"]
        targets = d["labels"]

        ids = ids.to(device, dtype=torch.long)
        mask = mask.to(device, dtype=torch.long)
        targets = targets.to(device, dtype=torch.float)

        optimizer.zero_grad()
        outputs = model(ids=ids, mask=mask)

        loss = loss_function(outputs, targets, loss_fn)
        loss.backward()
        train_loss += loss.item()
        optimizer.step()
        scheduler.step()
    return train_loss
    

def eval_fn(data_loader, model, device):
    '''
        Modified from Abhishek Thakur's BERT example: 
        https://github.com/abhishekkrthakur/bert-sentiment/blob/master/src/engine.py
    '''
    eval_loss = 0.0
    model.eval()
    fin_targets = []
    fin_outputs = []
    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            targets = d["labels"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            targets = targets.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            loss = loss_function(outputs, targets)
            eval_loss += loss.item()
            fin_targets.extend(targets)
            fin_outputs.extend(torch.sigmoid(outputs))
    return eval_loss, fin_outputs, fin_targets

## Train the network

In [49]:
def trainer(config):

    train_dataset, valid_dataset = build_dataset(config['tokenizer_max_len'], config['truncate'])
    train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, config['batch_size'])
    print("Length of Train Dataloader: ", len(train_data_loader))
    print("Length of Valid Dataloader: ", len(valid_data_loader))

    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_train_steps = int(len(train_dataset) / config['batch_size'] * 10)

    model = ret_model(n_train_steps, config['dropout'])
    optimizer = ret_optimizer(model)
    scheduler = ret_scheduler(optimizer, n_train_steps)
    model.to(device)
    model = nn.DataParallel(model)
    
    n_epochs = config['epochs']
    loss_fn = config['loss']

    best_val_loss = 100
    for epoch in tqdm(range(n_epochs)):
        print('Train EPOCH: ', epoch)
        train_loss = train_fn(train_data_loader, model, loss_fn, optimizer, device, scheduler)
        eval_loss, preds, labels = eval_fn(valid_data_loader, model, device)
        
        metrics_eval = log_metrics(preds, labels)
        try:
            auc_score  = metrics_eval["auc_micro"]
#             print("AUC score: ", auc_score)
        except:
            pass
        avg_train_loss, avg_val_loss = train_loss / len(train_data_loader), eval_loss / len(valid_data_loader)

        print("Average Train loss: ", avg_train_loss)
        print("Average Valid loss: ", avg_val_loss)
        torch.save(model.state_dict(), "./model_current.pt")  

        if avg_val_loss < best_val_loss:
            best_val_loss = avg_val_loss
            torch.save(model.state_dict(), "./model_best.pt")  
            print("Model saved as current val_loss is: ", best_val_loss)    

Set some configuration parameters (to be fine-tuned)

In [50]:
config = {
    'learning_rate': 1e-3,
    'batch_size': 32,
    'epochs': 10,
    'dropout': 0.3,
    'tokenizer_max_len': 200,
    'truncate': True,
    'loss': 'BCE',
}

In [None]:
# Train the model:

trainer(config) 

Length of Train Dataloader:  9486
Length of Valid Dataloader:  1583


  0%|          | 0/10 [00:00<?, ?it/s]

Train EPOCH:  0


  0%|          | 0/9486 [00:00<?, ?it/s]

## Define some functions for the inference

In [15]:
def load_model():
    '''
    Load a saved model
    '''
    train_dataset, valid_dataset = build_dataset(config['tokenizer_max_len'], config['truncate'])
    train_data_loader, valid_data_loader = build_dataloader(train_dataset, valid_dataset, config['batch_size'])
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    n_train_steps = int(len(train_dataset) / config['batch_size'] * 10)

    model = ret_model(n_train_steps, config['dropout'])
    optimizer = ret_optimizer(model)
    scheduler = ret_scheduler(optimizer, n_train_steps)
    model.to(device)
    model = nn.DataParallel(model)

    Models_PATH = "/home/jovyan/workbench-shared-folder/canary-project/Paula_internship/"
    model.load_state_dict(torch.load(Models_PATH + "model_best.pt", map_location=device))
    
    return model

In [16]:
def inference_batches(test, model):
    '''
    Predict outputs for inference phase
    '''
    test_targets = []
    test_outputs = []
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    
    test_dataset = Dataset(test.input.tolist(), test.iloc[:, 3:].values.tolist(), tokenizer, config['tokenizer_max_len'], config['truncate'])
    data_loader = DataLoader(test_dataset, batch_size=1024, shuffle=True, num_workers=2)

    with torch.no_grad():
        for bi, d in tqdm(enumerate(data_loader), total=len(data_loader)):
            ids = d["ids"]
            mask = d["mask"]
            labels = d["labels"]

            ids = ids.to(device, dtype=torch.long)
            mask = mask.to(device, dtype=torch.long)
            labels = labels.to(device, dtype=torch.float)

            outputs = model(ids=ids, mask=mask)
            test_targets.extend(labels.cpu().numpy())
            test_outputs.extend(torch.sigmoid(outputs).cpu().numpy())


    return test_outputs, test_targets

In [17]:
def remove_duplicates():
    '''
    Remove duplicates of train/val datasets present in the test set
    '''
    # Get the training duplicates:
    duplicates_train = set(test.pui) & set(train.pui) 
    test_clean = test[~test['pui'].isin(duplicates_train)]
    
    # Get the validation duplicates:
    duplicates_val = set(test.pui) & set(val.pui) 
    test_clean = test_clean[~test_clean['pui'].isin(duplicates_val)]
    
    assert test_clean.shape[0] == test.shape[0] - len(duplicates_train) - len(duplicates_val)
    
    return test_clean

In [18]:
def get_metrics(preds, labels):
    '''
    Create some metrics: precison, recall, F1...
    '''
    # Convert the lists to dataframes
    lab_df = pd.DataFrame(labels)
    pred_df = pd.DataFrame(preds).round(0).astype(int)
    
    # Calculate tp/fp/fn/tn per class:
    tp = (pred_df + lab_df).eq(2).sum()
    fp = (pred_df - lab_df).eq(1).sum()
    fn = (pred_df - lab_df).eq(-1).sum()
    tn = (pred_df + lab_df).eq(0).sum()
    
    # Calculate precision and recall:
    prec = [tp[i] / (tp[i] + fp[i]) * 100.0 if tp[i] + fp[i] != 0 else 0.0 for i in range(len(tp))]
    rec = [tp[i] / (tp[i] + fn[i]) * 100.0 if tp[i] + fn[i] != 0 else 0.0 for i in range(len(tp))]
    
    # Calculate F1 score:
    f1_score = [2 * prec[i] * rec[i] / (prec[i] + rec[i]) if tp[i] > 0 else 0.0 for i in range(len(tp))]
    
    # Weighted F1 score:
    weight = lab_df.sum() / sum(lab_df.sum())
    f1_wght = [weight[i] * 2 * prec[i] * rec[i] / (prec[i] + rec[i]) if tp[i] > 0 else 0.0 for i in range(len(tp))]
    
    # Macro average:
    prec_avg = sum(prec) / len(prec)
    rec_avg = sum(rec) / len(rec)
    f1_avg = sum(f1_score) / len(f1_score)
    f1wgt_avg = sum(f1_wght) / len(f1_wght)
    
    return {
        'Precision': prec,
        'Recall': rec,
        'F1 score': f1_score,
        'Weighted F1 score': f1_wght,
        'Average precision': prec_avg.round(2),
        'Average recall': rec_avg.round(2),
        'Average F1 score': f1_avg.round(2),
        'Average weighted F1 score': f1wgt_avg.round(2),
    }

# all_metrics = get_metrics(preds, labels)

## Inference:

In [20]:
# Predict outputs:
test_clean = remove_duplicates()
model = load_model()
preds, labels = inference_batches(test_clean, model)
all_metrics = get_metrics(preds, labels)

  0%|          | 0/124 [00:00<?, ?it/s]

In [21]:
all_metrics = get_metrics(preds, labels)

for metr, val in all_metrics.items():
    if 'Average' in metr:
        print(metr, val)

Average precision 5.89
Average recall 9.62
Average F1 score 7.27
Average weighted F1 score 0.71
