<a href="https://colab.research.google.com/github/mantasbandonis/CHEERS-challenge/blob/main/round1_undersampling_task2.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [2]:
!pip install transformers
!pip install imbalanced-learn

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 6.7MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 22.7MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 34.2MB/s 
Installing collected packages: tokenizers, sacremoses, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1


In [3]:
from transformers import BertTokenizer, BertForSequenceClassification, DistilBertForSequenceClassification, DistilBertTokenizer
import torch
from torch import nn
import pandas as pd
import pickle
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import f1_score, accuracy_score, roc_curve, auc
from imblearn.under_sampling import RandomUnderSampler
import torch.nn.functional as F
import transformers
from sklearn.preprocessing import MultiLabelBinarizer

from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")



In [4]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [5]:
documents_train = pd.read_csv("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/documents_en_train.csv")
sentences_train = pd.read_csv("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/sentences_en_train.csv")

documents_val = pd.read_csv("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/documents_en_val.csv")
sentences_val = pd.read_csv("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/sentences_en_val.csv")

documents_test = pd.read_csv("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/documents_en_test.csv")
sentences_test = pd.read_csv("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/sentences_en_test.csv")

with open("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/word_embedding.txt", "rb") as fp: 
    embedding_distilbert = pickle.load(fp)

with open("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/attention_masks.txt", "rb") as fp: 
    attention_masks = pickle.load(fp)

with open("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/word_embedding_val.txt", "rb") as fp: 
    embedding_distilbert_val = pickle.load(fp)

with open("/content/drive/MyDrive/cheers_challenge/round1/data_round_1/attention_masks_val.txt", "rb") as fp: 
    attention_masks_val = pickle.load(fp)

#immap_sector_name_to_id.json

In [6]:
def process_sector_ids(sentences):
    mlb = MultiLabelBinarizer()
    sectors = pd.DataFrame(mlb.fit_transform(sentences["sector_ids"]),columns=mlb.classes_)
    sectors = sectors.drop([",", "[", "]", " "], axis = 1)
    sectors["-1"] = 0
    sectors["-1"][sectors.sum(axis=1) == 0] = 1

    return sectors.values

In [7]:
sectors_train = process_sector_ids(sentences_train)
sectors_val = process_sector_ids(sentences_val)

In [8]:
y = sectors_train

In [9]:
def undersampler(embedding, attention_mask, y):
    '''
    Given the embedding, attention masks and y returns undersampled versions 
    of the input variables
    '''

    nr_y1 = len(pd.DataFrame(y)[y==1])
    y_0_idx = np.asarray(pd.DataFrame(y)[y==0].index).flatten()

    print("Nr samples class 0:", len(y_0_idx))
    print("Nr samples class 1:", nr_y1)

    idx_list = np.random.choice(y_0_idx, size = nr_y1)

    embedding_0 = embedding_distilbert[idx_list]
    masks_0 = np.array(attention_masks)[idx_list]
    y_0 = y[idx_list].values

    if (len(embedding_0) == len(masks_0) & len(embedding_0) == len(y_0)):
        print("Nr undersampled samples class 0:", len(embedding_0))

    y_1_idx = np.asarray(pd.DataFrame(y)[y==1].index).flatten()
    
    embedding_1 = embedding_distilbert[y_1_idx]
    masks_1 = np.array(attention_masks)[y_1_idx]
    y_1 = y[y_1_idx].values


    embedding_undersampled = np.concatenate((embedding_0, embedding_1))
    masks_undersampled = np.concatenate((masks_0, masks_1))
    y_undersampled = np.concatenate((y_0, y_1))

    if (len(embedding_undersampled) == len(masks_undersampled) & len(embedding_undersampled) == len(y_undersampled)):
        print("\nSize undersampled dataset:", len(embedding_undersampled))

    print("Done!")
    return embedding_undersampled, masks_undersampled, y_undersampled

In [10]:
y[:30]

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 1, 0, 0, 1, 0, 0, 0, 0, 0],
       [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0],
       [0, 0, 0, 1, 0, 0, 0, 0, 0,

In [52]:
print(sectors_train.shape)
print(sectors_val.shape)

(261981, 11)
(37109, 11)


In [9]:
embedding_distilbert, attention_masks, y = undersampler(embedding_distilbert, attention_masks, y)

Nr samples class 0: 2581283
Nr samples class 1: 300508


KeyboardInterrupt: ignored

In [13]:
nr_classes = y.shape[1]
print("nr_classes:", nr_classes)

nr_classes: 11


In [41]:
tokenizer = DistilBertTokenizer.from_pretrained('bert-base-uncased')
input_ids = torch.tensor(embedding_distilbert)
attention_mask = torch.tensor(attention_masks)
labels = torch.tensor(y)
data_size = len(labels)

In [64]:
print(labels.shape)
print(input_ids.shape)

torch.Size([261981, 11])
torch.Size([261981, 512])


In [42]:
batchsize = 8
def get_batches(input_ids, attention_mask, labels, batch_size=batchsize):

    tensor_dataset = torch.utils.data.TensorDataset(input_ids, attention_mask, labels)    
    tensor_randomsampler = torch.utils.data.RandomSampler(tensor_dataset)    
    tensor_dataloader = torch.utils.data.DataLoader(tensor_dataset, sampler=tensor_randomsampler, batch_size=batch_size)    
    return tensor_dataloader

In [43]:
batch_train = get_batches(input_ids, attention_mask, labels)

In [44]:
#validation dataloader
y_val = sectors_val
y_val = torch.tensor(y_val)#.to(device)
input_ids_val = torch.tensor(embedding_distilbert_val)#.to(device)
attention_mask_val = torch.tensor(attention_masks_val)#.to(device)

batch_size = 8
#test_dataset = TensorDataset(input_ids_val, attention_mask_val)
#test_sampler = SequentialSampler(test_dataset)
#test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=8)

val_data = TensorDataset(input_ids_val, attention_mask_val, y_val)
val_sampler = SequentialSampler(val_data)
val_dataloader = DataLoader(val_data, 
                            sampler=val_sampler, 
                            batch_size=batch_size,
                            num_workers= 0)

In [45]:
def eval_bert(model, validation_dataloader):
    preds = torch.tensor([]).to("cpu")
    with torch.no_grad():
        i = 0
        for b_input_ids, b_input_mask, b_labels in validation_dataloader:
            if i % 1000 == 0:
                print("Batches validated:", i)
            i+=1
            b_input_ids = b_input_ids.to(device)
            b_input_mask = b_input_mask.to(device)
            b_labels = b_labels.to(device)
            model.eval()

            output = model(b_input_ids, b_input_mask)
            y_hat = output.logits.to("cpu")
            del b_input_ids, b_input_mask, b_labels, output

            preds = torch.cat((preds, y_hat))
            del y_hat
            
        #probs = F.softmax(all_logits, dim=1).cpu().numpy()
    print("Validation done!")
    return preds

In [46]:
from sklearn.metrics import confusion_matrix
import seaborn as sns
def evaluate_roc(probs, y_true):
    """
    - Print AUC and accuracy on the test set
    - Plot ROC
    @params    probs (np.array): an array of predicted probabilities with shape (len(y_true), 2)
    @params    y_true (np.array): an array of the true values with shape (len(y_true),)
    """
    preds = probs[:, 1]
    fpr, tpr, threshold = roc_curve(y_true, preds)
    roc_auc = auc(fpr, tpr)
    print(f'AUC: {roc_auc:.4f}')
       
    # Get accuracy over the test set
    y_pred = np.where(preds >= 0.5, 1, 0)
    accuracy = accuracy_score(y_true, y_pred)
    print(f'Accuracy: {accuracy*100:.2f}%')
    f1 = f1_score(y_true, y_pred)
    print('F1 score:', f1)

    cm=confusion_matrix(y_true,y_pred)
    print(cm)
    cm = pd.DataFrame(cm, index = [0,1], columns = [0,1])
    sns.heatmap(cm, annot=True, fmt = '.0f')
    plt.show()
    # Plot ROC AUC
    plt.title('Receiver Operating Characteristic')
    plt.plot(fpr, tpr, 'b', label = 'AUC = %0.2f' % roc_auc)
    plt.legend(loc = 'lower right')
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0, 1])
    plt.ylim([0, 1])
    plt.ylabel('True Positive Rate')
    plt.xlabel('False Positive Rate')
    plt.show()

In [47]:
def train_model(batch, model, optimizer, scheduler, epochs, device, batchsize):    
    model.train()  # Set the mode to training    
    for e in range(epochs):
        epochs_loss = []        
        for i, batch_tuple in enumerate(batch):            
            batch_tuple = (t.to(device) for t in batch_tuple)            
            input_ids, attention_mask, labels = batch_tuple            
            outputs = model(input_ids=input_ids, attention_mask=attention_mask, labels=labels)            
            loss = outputs[0]
            epochs_loss.append(loss.item())
            logits = outputs[1]
            hidden_states_output = outputs[2]
            attention_mask_output = outputs[3]
            steps = 1000
            if i % steps == 0:
                if i == 0:
                    avg_loss = loss.item()
                else:
                    avg_loss = np.mean(epochs_loss[-steps:])
                
                depleted = round(100*i*batchsize/data_size, 2)
                print("Avg. Train Loss: {0}, Step: {1}, depleted: {2}%".format(round(avg_loss,3), i, depleted))            
            
            model.zero_grad()            
            optimizer.zero_grad()         
            loss.backward()            
            torch.nn.utils.clip_grad_norm_(model.parameters(), parameters['max_grad_norm'])            
            optimizer.step()            
            scheduler.step()
        
        model.save_pretrained('/content/drive/MyDrive/cheers_challenge/round1/models/distilbert_task2_epoch_'+str(e))

In [71]:
def train_eval(batch, model, optimizer, scheduler, epochs, device, batchsize, validation_dataloader):    
    for e in range(epochs):
        model.train()
        epochs_loss = []
        print("Training: epoch", e+1)        
        for i, batch_tuple in enumerate(batch):          
            batch_tuple = (t.to(device) for t in batch_tuple)    
            
            input_ids, attention_mask, labels = batch_tuple            
            print("input_ids:", input_ids.shape)
            print("attention_mask:", attention_mask.shape)
            print("labels:", labels.shape)

            outputs = model(input_ids, attention_mask=attention_mask)
                        
            loss = outputs[0]
            epochs_loss.append(loss.item())
            logits = outputs[1]
            hidden_states_output = outputs[2]
            attention_mask_output = outputs[3]
            steps = 1000
            if i % steps == 0:
                if i == 0:
                    avg_loss = loss.item()
                else:
                    avg_loss = np.mean(epochs_loss[-steps:])
                
                depleted = round(100*i*batchsize/data_size, 2)
                print("Avg. Train Loss: {0}, Step: {1}, depleted: {2}%".format(round(avg_loss,3), i, depleted))            
            
            model.zero_grad()            
            optimizer.zero_grad()         
            loss.backward()            
            torch.nn.utils.clip_grad_norm_(model.parameters(), parameters['max_grad_norm'])            
            optimizer.step()            
            scheduler.step()
        
        #save model after epoch
        model.save_pretrained('/content/drive/MyDrive/cheers_challenge/round1/models/distilbert_task2_epoch_'+str(e))
        print("Model distilbert_under_1_epoch_"+str(e)+ " saved!")
        print("\n***************************************************\n")
        print("Evaluation")
        #evaluate after epoch
        probs = eval_bert(model, validation_dataloader)
        print("\n")
        softm_probs = F.softmax(probs).cpu().numpy()
        evaluate_roc(softm_probs, y_val)
        print("\n---------------------------------------------------\n")

In [None]:
self.criterion = nn.BCEWithLogitsLoss()

model = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased', 
                                                      num_labels=nr_classes, 
                                                      output_hidden_states=True, 
                                                      output_attentions=True)
model.to(device)

In [78]:
class BERT_MODEL(nn.Module):

    def __init__(self):
      
      super(BERT_MODEL, self).__init__()

      self.bert = DistilBertForSequenceClassification.from_pretrained('distilbert-base-uncased') 
      
      # dense layer 1
      self.fc1 = nn.Linear(768,11)

      #softmax activation function
      self.sigmoid = torch.sigmoid

    #define the forward pass
    def forward(self, input_ids, attention_mask):

      #pass the inputs to the model  
      _, cls_hs = self.bert(input_ids, attention_mask)
      
      x = self.fc1(cls_hs)
      x = self.sigmoid(x)

      return x

In [73]:
epochs=3
parameters = {
    'learning_rate': 1e-5,
    'num_warmup_steps': 1500,
    'num_training_steps': len(batch_train) * epochs,
    'max_grad_norm': 1
    }

optimizer = transformers.AdamW(model.parameters(), lr=parameters['learning_rate'], correct_bias=False)
scheduler = transformers.get_linear_schedule_with_warmup(optimizer,
                                                         num_warmup_steps=parameters['num_warmup_steps'],
                                                         num_training_steps=parameters['num_training_steps'])

In [74]:
print("batch_train:", batch_train)
print("batchsize:", batchsize)
print("val_dataloader:", val_dataloader)

batch_train: <torch.utils.data.dataloader.DataLoader object at 0x7f785e44ff10>
batchsize: 8
val_dataloader: <torch.utils.data.dataloader.DataLoader object at 0x7f785e3d3450>


In [81]:
model = BERT_MODEL
model = model.to(device)
train_eval(batch_train, model(), optimizer, scheduler, epochs, device, batchsize, val_dataloader)

AttributeError: ignored

In [None]:
attention_mask#In case you want to load the own trained BERT on the dataset:

#model = DistilBertForSequenceClassification.from_pretrained('/content/drive/MyDrive/cheers_challenge/round1/models/distilbert_1')
#model.to(device)

DistilBertForSequenceClassification(
  (distilbert): DistilBertModel(
    (embeddings): Embeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (transformer): Transformer(
      (layer): ModuleList(
        (0): TransformerBlock(
          (attention): MultiHeadSelfAttention(
            (dropout): Dropout(p=0.1, inplace=False)
            (q_lin): Linear(in_features=768, out_features=768, bias=True)
            (k_lin): Linear(in_features=768, out_features=768, bias=True)
            (v_lin): Linear(in_features=768, out_features=768, bias=True)
            (out_lin): Linear(in_features=768, out_features=768, bias=True)
          )
          (sa_layer_norm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
          (ffn): FFN(
            (dropout): Dropout(p=0.1, inplace=False)
       