In [1]:
import os
import numpy as np
from tqdm import tqdm
import pandas as pd

from scipy.sparse import load_npz

import torch
from torch.utils.data import Dataset, DataLoader
from torch import nn

from transformers import DebertaModel, DebertaForSequenceClassification, DebertaTokenizerFast
from transformers import get_linear_schedule_with_warmup, AdamW

import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint
from torchmetrics import Accuracy, F1Score

2022-12-21 12:23:22.682940: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.


In [2]:
data_dir = "Data/" # "drive/MyDrive/COMP 5505 O/Project/Data/"

x_train = pd.read_csv(data_dir + "x_train_final.csv")["question"].to_numpy()
x_test = pd.read_csv(data_dir + "x_val_final.csv")["question"].to_numpy()


y_train = load_npz(data_dir + "y_train_final.npz").toarray()
y_test = load_npz(data_dir + "y_val_final.npz").toarray()

In [3]:
final_train_size = x_train.shape[0] // 5
final_test_size = x_test.shape[0] // 5

x_train_final = x_train[:final_train_size]
y_train_final = y_train[:final_train_size]

x_val_final = x_test[:final_test_size]
y_val_final = y_test[:final_test_size]

x_test_final = x_test[-final_test_size:]
y_test_final = y_test[-final_test_size:]

print(x_train_final.shape, y_train_final.shape, x_val_final.shape, y_val_final.shape, x_test_final.shape, y_test_final.shape)

(77733,) (77733, 500) (19433,) (19433, 500) (19433,) (19433, 500)


### Models and DataLoader functions

In [4]:
class QTagDataset(Dataset):
    def __init__(self,quest,tags, tokenizer, max_len):
        self.tokenizer = tokenizer
        self.text = quest
        self.labels = tags
        self.max_len = max_len
        
    def __len__(self):
        return len(self.text)
    
    
    def __getitem__(self, item_idx):
        text = self.text[item_idx]
        inputs = self.tokenizer.encode_plus(text, None, add_special_tokens=True, max_length= self.max_len, 
                                            padding = "max_length", return_token_type_ids= False, 
                                            return_attention_mask= True, truncation=True, return_tensors = "pt")
        
        input_ids = inputs["input_ids"].flatten()
        attn_mask = inputs["attention_mask"].flatten()
               
        return {"input_ids": input_ids, "attention_mask": attn_mask, "labels": torch.tensor(self.labels[item_idx], dtype=torch.float)}





class QTagDataModule (pl.LightningDataModule):
    
    def __init__(self, x_train, y_train, x_val, y_val, x_test, y_test, tokenizer, batch_size=16, max_token_len=200):
            super().__init__()
            self.tr_text = x_train
            self.tr_label = y_train
            self.val_text = x_val
            self.val_label = y_val
            self.test_text = x_test
            self.test_label = y_test
            self.tokenizer = tokenizer
            self.batch_size = batch_size
            self.max_token_len = max_token_len

    def setup(self, stage=None):
        self.train_dataset = QTagDataset(quest=self.tr_text,  tags=self.tr_label, tokenizer=self.tokenizer, max_len= self.max_token_len)
        self.val_dataset= QTagDataset(quest=self.val_text, tags=self.val_label, tokenizer=self.tokenizer, max_len = self.max_token_len)
        self.test_dataset =QTagDataset(quest=self.test_text, tags=self.test_label, tokenizer=self.tokenizer, max_len = self.max_token_len)


    def train_dataloader(self):
        return DataLoader(self.train_dataset,batch_size= self.batch_size, shuffle = True , num_workers=4)

    def val_dataloader(self):
        return DataLoader (self.val_dataset,batch_size= 16)

    def test_dataloader(self):
        return DataLoader (self.test_dataset,batch_size= 16)




class QTagDeBerta(pl.LightningModule):
    def __init__(self, model_path_or_name, n_classes = 500, steps_per_epoch = None, n_epochs = 3, lr = 2e-5, n_training_steps=None, n_warmup_steps=None):
        super().__init__()

        self.deberta = DebertaForSequenceClassification.from_pretrained(model_path_or_name, num_labels=n_classes)
        #self.classifier= nn.Linear(self.distilbert.config.hidden_size, n_classes)
        # self.sigmoid = nn.Sigmoid()
        
        self.steps_per_epoch = steps_per_epoch
        self.n_epochs = n_epochs
        self.lr = lr
        self.n_training_steps = n_training_steps
        self.n_warmup_steps = n_warmup_steps
        self.criterion = nn.BCEWithLogitsLoss()

    def forward(self,input_ids, attn_mask, labels=None):
        output = self.deberta(input_ids=input_ids,attention_mask=attn_mask, labels=labels)
        #output = self.classifier(output.last_hidden_state)
        return output["loss"], output["logits"]
    
    
    def training_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return {"loss": loss, "predictions": outputs, "labels": labels}
    
    
    def validation_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("val_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
    def test_step(self, batch, batch_idx):
        input_ids = batch["input_ids"]
        attention_mask = batch["attention_mask"]
        labels = batch["labels"]
        loss, outputs = self(input_ids, attention_mask, labels)
        self.log("test_loss", loss, prog_bar=True, logger=True)
        return loss
    
    
    # def training_epoch_end(self, outputs):
    #     labels = []
    #     predictions = []
    #     for output in outputs:
    #         for out_labels in output["labels"].detach().cpu():
    #             labels.append(out_labels)
    #         for out_predictions in output["predictions"].detach().cpu():
    #             predictions.append(out_predictions)
        
    #     labels = torch.stack(labels).int()
    #     predictions = torch.stack(predictions)
    #     for i in range(500):
    #         class_accuracy = Accuracy(predictions[:, i], labels[:, i])
    #         self.logger.experiment.add_scalar("Tag (" + str(i) + ")_accuracy/Train", class_accuracy, self.current_epoch)
    
    
    
    def configure_optimizers(self):
        optimizer = AdamW(self.parameters(), lr=2e-5)
        scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps = self.n_warmup_steps, 
                                                    num_training_steps=self.n_training_steps)

        return dict(optimizer=optimizer, lr_scheduler=dict(scheduler=scheduler, interval='step'))

### Initializing DistilBERT, Tokenizer, and Data

In [5]:
deberta_save_path = "results/trained_models/deberta/"# "drive/MyDrive/COMP 5505 O/Project/results/trained_models/DistilBert/"

if not os.path.exists(deberta_save_path):
    os.makedirs(deberta_save_path)

checkpoint_callback = ModelCheckpoint(dirpath = deberta_save_path, monitor="val_loss", filename="QTag-{epoch:02d}-{val_loss:.2f}", save_top_k=3,
                                      mode="min")

In [6]:
n_epochs = 1
batch_size = 4
max_len = 512
lr = 2e-05

steps_per_epoch = x_train.shape[0] // batch_size
total_training_steps = steps_per_epoch * n_epochs
warmup_steps = total_training_steps // 10

In [7]:
deberta_tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
deberta_pretrained = "microsoft/deberta-base"

model = QTagDeBerta(model_path_or_name = deberta_pretrained, n_classes = 500, steps_per_epoch = steps_per_epoch, n_epochs = n_epochs, lr = lr, 
                 n_training_steps = total_training_steps, n_warmup_steps = warmup_steps)

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.LayerNorm.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.dense.bias']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.weight', 'classifi

In [8]:
QT_data_module = QTagDataModule(x_train = x_train_final, y_train = y_train_final, x_val = x_val_final, y_val = y_val_final, x_test = x_test_final, y_test = y_test_final, 
                                tokenizer = deberta_tokenizer, batch_size = batch_size, max_token_len = max_len)
QT_data_module.setup()

### Training

In [9]:
trainer = pl.Trainer(gpus = 1, max_epochs = 1, callbacks=[checkpoint_callback])

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs


In [10]:
trainer.fit(model = model, datamodule = QT_data_module)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

  | Name      | Type                             | Params
---------------------------------------------------------------
0 | deberta   | DebertaForSequenceClassification | 139 M 
1 | criterion | BCEWithLogitsLoss                | 0     
---------------------------------------------------------------
139 M     Trainable params
0         Non-trainable params
139 M     Total params
558.307   Total estimated model params size (MB)


Sanity Checking: 0it [00:00, ?it/s]

  rank_zero_warn(


Training: 0it [00:00, ?it/s]

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Validation: 0it [00:00, ?it/s]

`Trainer.fit` stopped: `max_epochs=1` reached.


	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [11]:
try:
    torch.save(model, deberta_save_path + "deberta_model")
    print("Model saved!")
except:
    torch.save(model.state_dict(), deberta_save_path + "deberta_model")
    print("Model weights saved!")

Model weights saved!


### Evaluation

In [7]:
deberta_save_path = "results/trained_models/deberta/" # "drive/MyDrive/COMP 5505 O/Project/results/trained_models/DistilBert/"

deberta_tokenizer = DebertaTokenizerFast.from_pretrained("microsoft/deberta-base")
deberta_pretrained = "microsoft/deberta-base"

fine_tuned_model = QTagDeBerta(model_path_or_name = deberta_pretrained, n_classes = 500, steps_per_epoch = steps_per_epoch, n_epochs = n_epochs, lr = lr, 
                                  n_training_steps = total_training_steps, n_warmup_steps = warmup_steps)


fine_tuned_model.load_state_dict(torch.load(deberta_save_path + "deberta_model"))

Some weights of the model checkpoint at microsoft/deberta-base were not used when initializing DebertaForSequenceClassification: ['lm_predictions.lm_head.dense.weight', 'lm_predictions.lm_head.bias', 'lm_predictions.lm_head.dense.bias', 'lm_predictions.lm_head.LayerNorm.bias', 'lm_predictions.lm_head.LayerNorm.weight']
- This IS expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing DebertaForSequenceClassification from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of DebertaForSequenceClassification were not initialized from the model checkpoint at microsoft/deberta-base and are newly initialized: ['classifier.bias', 'classifier

<All keys matched successfully>

In [8]:
fine_tuned_model.eval()

QT_data_module = QTagDataModule(x_train = x_train_final, y_train = y_train_final, x_val = x_val_final, y_val = y_val_final, x_test = x_test_final, y_test = y_test_final, 
                                tokenizer = deberta_tokenizer, batch_size = batch_size, max_token_len = max_len)
QT_data_module.setup()

In [10]:
trainer = pl.Trainer(gpus = 1, max_epochs = 1, callbacks=[checkpoint_callback])
test_loss = trainer.test(fine_tuned_model, datamodule = QT_data_module)

  rank_zero_deprecation(
GPU available: True (cuda), used: True
TPU available: False, using: 0 TPU cores
IPU available: False, using: 0 IPUs
HPU available: False, using: 0 HPUs
LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]
  rank_zero_warn(


Testing: 0it [00:00, ?it/s]

────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
       Test metric             DataLoader 0
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
        test_loss            5.129960060119629
────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────


In [11]:
accuracy_fn = Accuracy(task = "multilabel", num_labels = 500)
f1_score_fn = F1Score(task = "multilabel", num_labels = 500)

In [12]:
def get_predictions(test_loader, model):
    actual_labels = []
    predictions = []

    with tqdm(total=final_test_size, desc="Progress") as pbar:

        with torch.no_grad():
            for batch_idx, batch in enumerate(test_loader):

                preds = model(input_ids = batch["input_ids"], attn_mask = batch["attention_mask"], labels=batch["labels"])
                predictions.extend(preds[1].detach().cpu().numpy().tolist())
                actual_labels.extend(batch["labels"].detach().cpu().numpy().tolist())
                pbar.update(batch["labels"].detach().cpu().size()[0])

    return torch.tensor(actual_labels), torch.tensor(predictions)

In [13]:
y_test_labels, y_pred_test = get_predictions(QT_data_module.test_dataloader(), fine_tuned_model)

Progress: 100%|█████████████████████████| 19433/19433 [6:00:41<00:00,  1.11s/it]


In [21]:
deberta_accuracy = accuracy_fn(y_pred_test, y_test_labels)
deberta_f1_score = f1_score_fn(y_pred_test, y_test_labels)

In [22]:
print("Evaluation results:\nAccuracy: ", deberta_accuracy.numpy(), "\nF1-Score: ", deberta_f1_score.numpy())

Evaluation results:
Accuracy:  0.96895325 
F1-Score:  0.4507246494293213
