In [1]:
# Mount Google Drive; every model and dataset path used in this notebook
# lives under /content/drive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install datasets numpy
!pip install SupCL_Seq
!pip install transformers[torch]

Collecting datasets
  Downloading datasets-2.19.1-py3-none-any.whl (542 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.0/542.0 kB[0m [31m6.0 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m17.5 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from datasets)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m23.4 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from datasets)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m134.8/134.8 kB[0m [31m20.8 MB/s[0m eta [36m0:00:00[0m
Collecting huggingface-hub>=0.21.2 (from datasets)
  Downloading huggingface_hub-0.23.0-py3-none-a

In [3]:
# Show the GPU attached to this runtime before training.
!nvidia-smi

Wed May 15 12:08:33 2024       
+---------------------------------------------------------------------------------------+
| NVIDIA-SMI 535.104.05             Driver Version: 535.104.05   CUDA Version: 12.2     |
|-----------------------------------------+----------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |         Memory-Usage | GPU-Util  Compute M. |
|                                         |                      |               MIG M. |
|   0  Tesla T4                       Off | 00000000:00:04.0 Off |                    0 |
| N/A   45C    P8               9W /  70W |      0MiB / 15360MiB |      0%      Default |
|                                         |                      |                  N/A |
+-----------------------------------------+----------------------+----------------------+
                                                                    

In [4]:
from datasets import load_metric, DatasetDict, Dataset, ClassLabel, load_from_disk, concatenate_datasets
from transformers import TrainingArguments, Trainer, AutoTokenizer, AutoModel, AutoModelForSequenceClassification
import warnings
import numpy as np
import os
import re

from SupCL_Seq import SupCsTrainer

# Silences every warning category for the rest of the session, which also
# hides deprecation warnings raised by the libraries imported above.
warnings.filterwarnings('ignore')

## Model Setup

In [9]:
# Load the tokenizer and the bare encoder (AutoModel, i.e. no task head) from
# the same checkpoint directory on Drive.
model_path = '/content/drive/MyDrive/FYP/models-final/KanoonBert'

tokenizer = AutoTokenizer.from_pretrained(model_path, use_fast=True)
model = AutoModel.from_pretrained(model_path)

## Data Preparation


### Training Dataset

In [10]:
# Original 'train' split; used below only as the source of the 'label' column.
old_train_data = load_from_disk('/content/drive/MyDrive/FYP/Dataset/dataset_final')['train']

In [11]:
# Rebuild the labelled training set. Each entry in `directory` is a saved
# chunk of encoded examples; the number embedded in the entry's name is used
# as the starting row in `old_train_data` from which its labels are taken.
train_dataset = []
directory = '/content/drive/MyDrive/FYP/Dataset/encoded_dataset'
rows_per_chunk = 4  # presumably the chunk size used when encoding -- TODO confirm

# Parse the offset from the entry name only (not the joined path, whose parent
# directories could contain digits) and fail with a clear message on entries
# that carry no number instead of an AttributeError on `.group()`.
chunk_offsets = []
for file in os.listdir(directory):
    offset_found = re.search(r'\d+', file)
    if offset_found is None:
        raise ValueError(f'Cannot find a row offset in chunk name {file!r}')
    chunk_offsets.append((int(offset_found.group()), file))

# os.listdir returns entries in arbitrary order; iterating by ascending offset
# makes the row order of the concatenated dataset reproducible across runs.
for match, file in sorted(chunk_offsets):
    file_name = os.path.join(directory, file)
    e_data = load_from_disk(file_name)

    o_data = old_train_data[match:match + rows_per_chunk]
    label_col = o_data['label']
    # Refuse to attach labels when the chunk and the label slice disagree in
    # size; that would mean the offset/chunk-size assumption is wrong.
    if len(label_col) != len(e_data):
        raise ValueError(
            f'Chunk {file!r} has {len(e_data)} rows but {len(label_col)} '
            f'labels were found starting at row {match}'
        )

    new_train_data = e_data.add_column('label', label_col)
    train_dataset.append(new_train_data)

In [12]:
# Merge the per-chunk datasets into a single training Dataset.
final_train_dataset = concatenate_datasets(train_dataset)

In [13]:
# Sanity check: column names and row count of the merged training set.
print(final_train_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 3702
})


### Validation Dataset

In [14]:
# Original 'validation' split; used below only as the source of the 'label' column.
old_val_data = load_from_disk('/content/drive/MyDrive/FYP/Dataset/dataset_final')['validation']

In [15]:
# Rebuild the labelled validation set. Each entry in `directory` is a saved
# chunk of encoded examples; the number embedded in the entry's name is used
# as the starting row in `old_val_data` from which its labels are taken.
val_dataset = []
directory = '/content/drive/MyDrive/FYP/Dataset/encoded_dataset_validation'
rows_per_chunk = 4  # presumably the chunk size used when encoding -- TODO confirm

# Parse the offset from the entry name only (not the joined path, whose parent
# directories could contain digits) and fail with a clear message on entries
# that carry no number instead of an AttributeError on `.group()`.
chunk_offsets = []
for file in os.listdir(directory):
    offset_found = re.search(r'\d+', file)
    if offset_found is None:
        raise ValueError(f'Cannot find a row offset in chunk name {file!r}')
    chunk_offsets.append((int(offset_found.group()), file))

# os.listdir returns entries in arbitrary order; iterating by ascending offset
# makes the row order of the concatenated dataset reproducible across runs.
for match, file in sorted(chunk_offsets):
    file_name = os.path.join(directory, file)
    e_val_data = load_from_disk(file_name)

    o_val_data = old_val_data[match:match + rows_per_chunk]
    label_col = o_val_data['label']
    # Refuse to attach labels when the chunk and the label slice disagree in
    # size; that would mean the offset/chunk-size assumption is wrong.
    if len(label_col) != len(e_val_data):
        raise ValueError(
            f'Chunk {file!r} has {len(e_val_data)} rows but {len(label_col)} '
            f'labels were found starting at row {match}'
        )

    new_val_data = e_val_data.add_column('label', label_col)
    val_dataset.append(new_val_data)

In [16]:
# Merge the per-chunk datasets into a single validation Dataset.
final_val_dataset = concatenate_datasets(val_dataset)

In [17]:
# Sanity check: column names and row count of the merged validation set.
print(final_val_dataset)

Dataset({
    features: ['input_ids', 'token_type_ids', 'attention_mask', 'label'],
    num_rows: 924
})


## Task Setup

In [18]:
# GLUE/MRPC metric object, shared by both trainers through compute_metrics.
metric = load_metric('glue', 'mrpc')


def compute_metrics(eval_pred):
    """Score one evaluation pass.

    Args:
        eval_pred: a pair ``(predictions, labels)``; the predictions are
            reduced to class ids by taking the argmax along axis 1.

    Returns:
        Whatever ``metric.compute`` returns for the predicted class ids
        against the reference labels.
    """
    scores, references = eval_pred
    predicted_ids = np.asarray(scores).argmax(axis=1)
    return metric.compute(predictions=predicted_ids, references=references)

Downloading builder script:   0%|          | 0.00/1.84k [00:00<?, ?B/s]

## Training Setup

In [None]:
# Training arguments handed to the SupCsTrainer below; evaluation is disabled
# for this stage.
CL_args = TrainingArguments(
    # Where checkpoints and TensorBoard logs are written (Google Drive).
    output_dir='/content/drive/MyDrive/FYP/fine-tuning/models/output',
    logging_dir='/content/drive/MyDrive/FYP/fine-tuning/models/logs',
    report_to='tensorboard',
    logging_steps=50,
    # Schedule.
    num_train_epochs=5,
    per_device_train_batch_size=2,
    evaluation_strategy='no',
    # Optimisation.
    learning_rate=5e-05,
    warmup_steps=50,
    weight_decay=0.01,
    # NOTE(review): save_total_limit is deliberately left unset here (it was
    # commented out), so every intermediate checkpoint is kept in output_dir.
)

In [None]:
# Options specific to SupCsTrainer, kept apart from the usual Trainer wiring.
# NOTE(review): exact semantics of these options are defined by SupCL_Seq --
# consult its documentation before tuning them.
supcl_settings = dict(
    w_drop_out=[0.0, 0.1, 0.2],
    temperature=0.05,
    def_drop_out=0.1,
    pooling_strategy='pooler',
)

SupCL_trainer = SupCsTrainer.SupCsTrainer(
    model=model,
    args=CL_args,
    train_dataset=final_train_dataset,
    eval_dataset=final_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
    **supcl_settings,
)

# Employing pooler ([CLS]) output.


In [None]:
# Run the SupCsTrainer stage; the returned TrainOutput is shown as the cell result.
SupCL_trainer.train()

Step,Training Loss
50,2.0841
100,1.2362
150,1.1994
200,1.0153
250,1.1807
300,1.0337
350,1.1997
400,1.2379
450,1.2067
500,1.1003


Step,Training Loss
50,2.0841
100,1.2362
150,1.1994
200,1.0153
250,1.1807
300,1.0337
350,1.1997
400,1.2379
450,1.2067
500,1.1003


TrainOutput(global_step=9255, training_loss=1.102652887522627, metrics={'train_runtime': 7827.8731, 'train_samples_per_second': 2.365, 'train_steps_per_second': 1.182, 'total_flos': 4870098179850240.0, 'train_loss': 1.102652887522627, 'epoch': 5.0})

In [None]:
# Persist the trained encoder; the single-layer training section reloads it
# from this exact path.
SupCL_trainer.save_model('/content/drive/MyDrive/FYP/fine-tuning/models/fine-tuned-model')

## Single Layer Training

In [19]:
# Reload the saved encoder with a sequence-classification head on top; the
# head's weights are not in the checkpoint and are newly initialised.
finetuned_model_path = '/content/drive/MyDrive/FYP/fine-tuning/models/fine-tuned-model'
num_labels = 2

fine_tuned_model = AutoModelForSequenceClassification.from_pretrained(
    finetuned_model_path,
    num_labels=num_labels,
)

# Freeze every parameter of the base model so that only parameters outside it
# (the classification head) receive gradient updates.
# The trailing ';' stops the returned module from being echoed as cell output.
fine_tuned_model.base_model.requires_grad_(False);

Some weights of BertForSequenceClassification were not initialized from the model checkpoint at /content/drive/MyDrive/FYP/fine-tuning/models/fine-tuned-model and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [20]:
# Training arguments for the classification-head stage (base model frozen).
args = TrainingArguments(
        output_dir = '/content/drive/MyDrive/FYP/fine-tuning/models/results',
        save_total_limit = 1,
        num_train_epochs=5,
        per_device_train_batch_size=28,
        per_device_eval_batch_size=64,
        # Evaluate once per epoch. The former `eval_steps = 200` was dropped:
        # it only applies when the strategy is 'steps', so it had no effect
        # and suggested a step-based schedule that never ran.
        evaluation_strategy = 'epoch',
        logging_steps = 200,
        learning_rate = 1e-03,
        warmup_steps=50,
        report_to ='tensorboard',
        weight_decay=0.01,
        logging_dir='/content/drive/MyDrive/FYP/fine-tuning/models/logs_2',
    )

In [21]:
# Standard Trainer for the classification-head stage; the model and arguments
# are passed by keyword so the call does not depend on positional order.
trainer = Trainer(
    model=fine_tuned_model,
    args=args,
    train_dataset=final_train_dataset,
    eval_dataset=final_val_dataset,
    tokenizer=tokenizer,
    compute_metrics=compute_metrics,
)

In [22]:
# Train the classification head; per-epoch validation metrics are reported
# and the returned TrainOutput is shown as the cell result.
trainer.train()

Epoch,Training Loss,Validation Loss,Accuracy,F1
1,No log,0.002373,1.0,1.0
2,0.057600,0.000696,1.0,1.0
3,0.057600,0.006326,1.0,1.0
4,0.015200,0.000732,1.0,1.0
5,0.015600,0.000884,1.0,1.0


TrainOutput(global_step=665, training_loss=0.02801147717282288, metrics={'train_runtime': 879.9271, 'train_samples_per_second': 21.036, 'train_steps_per_second': 0.756, 'total_flos': 4870185634713600.0, 'train_loss': 0.02801147717282288, 'epoch': 5.0})

In [23]:
# Persist the model together with its trained classification head.
trainer.save_model('/content/drive/MyDrive/FYP/fine-tuning/models/fine-tuned-model-single-layer')