In [1]:
import os
import numpy as np
from tqdm import tqdm
import pandas as pd

from scipy.sparse import load_npz

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from transformers import BertModel, BertForSequenceClassification, BertTokenizerFast
from transformers import get_linear_schedule_with_warmup, AdamW

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics import Accuracy, F1Score

2022-12-19 23:11:46.205153: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


### Loading Dataset

In [2]:
data_dir = "Data/"# "drive/MyDrive/COMP 5505 O/Project/Data/"

x_train = pd.read_csv(data_dir + "x_train_final.csv")["question"].to_numpy()
x_test = pd.read_csv(data_dir + "x_val_final.csv")["question"].to_numpy()

y_train = load_npz(data_dir + "y_train_final.npz").toarray()
y_test = load_npz(data_dir + "y_val_final.npz").toarray()

In [3]:
final_train_size = x_train.shape[0] // 5
final_test_size = x_test.shape[0] // 5

x_train_final = x_train[:final_train_size]
y_train_final = y_train[:final_train_size]

x_test_final = x_test[:final_test_size]
y_test_final = y_test[:final_test_size]

print(x_train_final.shape, y_train_final.shape, x_test_final.shape, y_test_final.shape)

(77733,) (77733, 500) (19433,) (19433, 500)


### Models and DataLoader functions

In [4]:
class QTagDataset(Dataset):
    def __init__(self,quest,tags, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = quest
        self.labels = tags
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    
    def __getitem__(self, item_idx):
        text = self.text[item_idx]
        inputs = self.tokenizer.encode_plus(text, None, add_special_tokens=True, max_length= self.max_len, 
                                            padding = "max_length", return_token_type_ids= False, 
                                            return_attention_mask= True, truncation=True, return_tensors = "pt")
        
        input_ids = inputs["input_ids"].flatten()
        attn_mask = inputs["attention_mask"].flatten()
               
        return {"input_ids": input_ids, "attention_mask": attn_mask, "labels": torch.tensor(self.labels[item_idx], dtype=torch.float)}





class QTagDataModule (pl.LightningDataModule):
    
    def __init__(self, x_train, y_train, x_val, y_val, x_test, y_test, tokenizer, batch_size=16, max_token_len=200):
            super().__init__()
            self.tr_text = x_train
            self.tr_label = y_train
            self.val_text = x_val
            self.val_label = y_val
            self.test_text = x_test
            self.test_label = y_test
            self.tokenizer = tokenizer
            self.batch_size = batch_size
            self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = QTagDataset(quest=self.tr_text,  tags=self.tr_label, tokenizer=self.tokenizer, max_len= self.max_token_len)
        self.val_dataset= QTagDataset(quest=self.val_text, tags=self.val_label, tokenizer=self.tokenizer, max_len = self.max_token_len)
        self.test_dataset =QTagDataset(quest=self.test_text, tags=self.test_label, tokenizer=self.tokenizer, max_len = self.max_token_len)


    def train_dataloader(self):
        return DataLoader(self.train_dataset,batch_size= self.batch_size, shuffle = True , num_workers=4)

    def val_dataloader(self):
        return DataLoader(self.val_dataset,batch_size= 16)

    def test_dataloader(self):
        return DataLoader(self.test_dataset,batch_size= 16)




class QTagBert(pl.LightningModule):
    def __init__(self, model_path_or_name, n_classes = 500, steps_per_epoch = None, n_epochs = 3, lr = 2e-5, n_training_steps=None, n_warmup_steps=None):
        super().__init__()

        self.bert = BertModel.from_pretrained(model_path_or_name, return_dict=True)
        self.classifier= nn.Linear(self.bert.config.hidden_size, n_classes)
        # self.sigmoid = nn.Sigmoid()
        
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.acc_fn = Accuracy(task="multilabel", num_labels=500)
        self.sig_fn = nn.Sigmoid()
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self,input_ids, attn_mask, labels=None):
        output = self.bert(input_ids=input_ids,attention_mask=attn_mask)
        output = self.classifier(output.pooler_output)
        loss = 0
        if labels is not None:
            loss = self.criterion(output, labels)
        return loss, output
    
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}
    
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
#     def training_epoch_end(self, outputs):
#         labels = []
#         predictions = []
#         for output in outputs:
#             for out_labels in output["labels"].detach().cpu():
#                 labels.append(out_labels)
#             for out_predictions in output["predictions"].detach().cpu():
#                 predictions.append(out_predictions)
        
#         labels = torch.stack(labels).int()
#         predictions = self.sig_fn(torch.stack(predictions))
#         for i in range(500):
#             class_accuracy = self.acc_fn(predictions[:, i], labels[:, i])
#             self.logger.experiment.add_scalar("Tag (" + str(i) + ")_accuracy/Train", class_accuracy, self.current_epoch)
    
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = self.n_warmup_steps, 
                                                    num_training_steps=self.n_training_steps)

        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))

### Initializing BERT, Tokenizer, and Data

In [9]:
bert_save_path = "results/trained_models/Bert/"# "drive/MyDrive/COMP 5505 O/Project/results/trained_models/Bert/"

if not os.path.exists(bert_save_path):
    os.makedirs(bert_save_path)

In [13]:
checkpoint_callback = ModelCheckpoint(dirpath = bert_save_path, monitor="val_loss", filename="QTag-{epoch:02d}-{val_loss:.2f}", save_top_k=3,
                                      mode="min")

In [5]:
n_epochs = 2
batch_size = 4
max_len = 512
lr = 2e-05

steps_per_epoch = x_train.shape[0] // batch_size
total_training_steps = steps_per_epoch * n_epochs
warmup_steps = total_training_steps // 10

In [6]:
Bert_tokenizer = BertTokenizerFast.from_pretrained("bert-base-uncased")
bert_pretrained = "bert-base-uncased"

In [7]:
model = QTagBert(model_path_or_name = "bert-base-uncased", n_classes = 500, steps_per_epoch = steps_per_epoch, n_epochs = n_epochs, lr = lr, 
                 n_training_steps = total_training_steps, n_warmup_steps = warmup_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.dense.bias']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [8]:
QT_data_module = QTagDataModule(x_train = x_train_final, y_train = y_train_final, x_val = x_test_final, y_val = y_test_final, x_test = x_test_final, y_test = y_test_final, 
                                tokenizer = Bert_tokenizer, batch_size = batch_size, max_token_len = max_len)
QT_data_module.setup()

### Training

In [9]:
trainer = pl.Trainer(gpus = 1, max_epochs = 2, callbacks=[checkpoint_callback])

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [14]:
trainer.fit(model = model, datamodule = QT_data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name       | Type              | Params
-------------------------------------------------
0 | bert       | BertModel         | 109 M 
1 | classifier | Linear            | 384 K 
2 | criterion  | BCEWithLogitsLoss | 0     
-------------------------------------------------
109 M     Trainable params
0         Non-trainable params
109 M     Total params
439.467   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

Training: 19434it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 0it [00:00, ?it/s]

	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


ValueError: Expected argument `task` to either be `'binary'`, `'multiclass'` or `'multilabel'` but got tensor([-1.5942, -1.6668, -1.7241,  ..., -4.9376, -4.9409, -4.6679])

In [21]:
torch.save(model.state_dict(), bert_save_path + "bert_model")

### Evaluation

In [50]:
fine_tuned_model = QTagBert(model_path_or_name = "bert-base-uncased", n_classes = 500, steps_per_epoch = steps_per_epoch, n_epochs = n_epochs, lr = lr, 
                 n_training_steps = total_training_steps, n_warmup_steps = warmup_steps)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [51]:
fine_tuned_model.load_state_dict(torch.load(bert_save_path + "bert_model"))
fine_tuned_model.eval()

QTagBert(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
   

In [11]:
x_test_main = x_test[-final_test_size:]
y_test_main = y_test[-final_test_size:]

main_QT_data_module = QTagDataModule(x_train = x_train_final, y_train = y_train_final, x_val = x_test_final, y_val = y_test_final, 
                                     x_test = x_test_main, y_test = y_test_main, tokenizer = Bert_tokenizer, 
                                     batch_size = batch_size, max_token_len = max_len)
main_QT_data_module.setup()

##### On Validation set

In [24]:
test_loss = trainer.test(new_model, datamodule = QT_data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss          0.015214087441563606
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


##### on test set

In [28]:
test_loss = trainer.test(new_model, datamodule = main_QT_data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss           0.01500459760427475
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [24]:
accuracy_fn = Accuracy(task = "multilabel", num_labels=500)

In [44]:
f1_score_fn = F1Score(task="multilabel", num_labels=500)

In [54]:
def get_predictions(test_loader, model):
    actual_labels = []
    predictions = []
    
    with tqdm(total=final_test_size, desc="Progress") as pbar:
    
        with torch.no_grad():
            for batch_idx, batch in enumerate(main_QT_data_module.test_dataloader()):
                preds = model(input_ids = batch["input_ids"], attn_mask = batch["attention_mask"])
                predictions.extend(preds[1].detach().cpu().numpy().tolist())
                actual_labels.extend(batch["labels"].detach().cpu().numpy().tolist())
                pbar.update(preds[1].detach().cpu().size()[0])

    return torch.tensor(actual_labels), torch.tensor(predictions)

In [55]:
y_test_labels, y_pred_test = get_predictions(main_QT_data_module.test_dataloader(), fine_tuned_model)

Progress: 100%|█████████████████████████| 19433/19433 [1:47:37<00:00,  3.01it/s]


In [56]:
bert_accuracy = accuracy_fn(y_pred_test, y_test_labels)
bert_f1 = f1_score_fn(y_pred_test, y_test_labels)

In [58]:
print("Evaluation results:\nAccuracy: ", bert_accuracy.numpy(), "\nF1-Score: ", bert_f1.numpy())

Evaluation results:
Accuracy:  0.9969146 
F1-Score:  0.24564052
