In [2]:
import os
# Making sure it downloads models on my D drive, as no space in defualt file location
os.environ['HF_HOME'] = 'D:\\Download\\UCSD\\cache'
from tqdm.notebook import tqdm
import pandas as pd
import os
import csv
import sys
import numpy as np
import time
import random
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
import textwrap
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DebertaV2TokenizerFast
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from adapters import AdapterConfig

In [3]:
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f'Using Device: {device}')

Using Device: cuda


In [4]:
# Define model directories
BASE_MODEL_DIR = "./base_model"
LORA_MODEL_DIR = "./lora_model"
ADAPTER_MODEL_DIR = "./adapter_model"

In [5]:
# Load Llama 1B and tokenizer
model_name = "meta-llama/Llama-3.2-1B"  # Using LLama 1B as base model

# Couldn't train Llama because of lower mem GPUs so shifting to roberta
model_name = "FacebookAI/roberta-large"
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Ensure tokenizer has a padding token
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token  # Use EOS token as PAD token
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
base_model.config.pad_token_id = base_model.config.eos_token_id

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [6]:
ds = load_dataset("stanfordnlp/imdb")

def preprocess_function(examples):
    return tokenizer(examples["text"], truncation=True, padding=True, max_length=128)

# Tokenize datasets
tokenized_datasets = ds.map(preprocess_function, batched=True)

# Prepare train and test datasets
train_dataset = tokenized_datasets["train"].shuffle(seed=42)  # Use full training dataset
test_dataset = tokenized_datasets["test"].shuffle(seed=42)    # Use full testing dataset

# Veyr big dataset
# Load a sentiment dataset (example: SST2)
# ds = load_dataset("facebook/xnli", "all_languages")
# train_data = ds['train']
# val_data = ds['validation']

In [7]:
def print_trainable_params(model, stage_name="Model"):
    print(f"\nTrainable Parameters in {stage_name}:")
    total_params = sum(p.numel() for p in model.parameters())
    trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)
    print(f"Total Parameters: {total_params}")
    print(f"Trainable Parameters: {trainable_params}")
    for name, param in model.named_parameters():
        if param.requires_grad:
            print(f"  - {name}: {param.numel()} params")


In [6]:
# Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    evaluation_strategy="epoch",  # Evaluate periodically during training
    #eval_steps=100,               # Frequency of evaluation (adjust as needed)
    save_strategy="epoch",
    learning_rate=2e-5,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    fp16=True,  # Enable mixed precision training for GPU
    report_to="none",  # Disable reporting to avoid unnecessary overhead
)

# Train base model
trainer_base = Trainer(
    model=base_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)



In [7]:
base_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [None]:
print_trainable_params(base_model, stage_name="Base Model")

In [9]:
If 
print("\nTraining Base Model...")
# Resize model embeddings after adding new special tokens
base_model.resize_token_embeddings(len(tokenizer))
trainer_base.train()


Training Base Model...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save base model
tokenizer.save_pretrained("./base_model")
base_model.save_pretrained("./base_model")

In [None]:
# Evaluate base model
print("\nEvaluating Base Model...")
base_results = trainer_base.evaluate()
print("Base Model Results:", base_results)

In [None]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="SEQ_CLS", 
    inference_mode=False,
)

# Apply LoRA to model
lora_model = get_peft_model(base_model, lora_config).to(device)

# Print trainable parameters
lora_model.print_trainable_parameters()

trainer_lora = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)
start_time = time.time()
print("\nTraining LoRA Model...")
trainer_lora.train()
print(f"LoRa trained in: {time.time() - start_time}s")



In [None]:
# Save LoRA model
tokenizer.save_pretrained("./lora_model")
lora_model.save_pretrained("./lora_model")

In [None]:
# Evaluate LoRA model
print("\nEvaluating LoRA Model...")
lora_results = trainer_lora.evaluate()
print("LoRA Model Results:", lora_results)

In [16]:
import torch
from torch import nn
from transformers.models.roberta.modeling_roberta import RobertaModel, RobertaLayer, RobertaAttention, RobertaIntermediate, RobertaOutput

# class CustomRobertaLayer(RobertaModel):
#     def __init__(self, config):
#         super().__init__(config)
#         self.attention = RobertaAttention(config)  # Multi-head attention
#         self.intermediate = RobertaIntermediate(config)  # Feed-forward network
#         self.output = RobertaOutput(config)  # Projection back
#         self.down_layer = nn.Linear(config.hidden_size, config.hidden_size // 2)
#         self.up_layer = nn.Linear(config.hidden_size // 2, config.hidden_size)
#         self.activation = nn.ReLU()
#         self.up_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

#     def forward(
#         self,
#         hidden_states,
#         attention_mask=None,
#         head_mask=None,
#         output_attentions=None,
#         output_hidden_states=None, input_ids=None, token_type_ids=None, position_ids=None, 
#         inputs_embeds=None, labels=None, 
#     ):
#         # 1. Self-attention
#         attention_output = self.attention(
#             hidden_states, attention_mask=attention_mask, head_mask=head_mask
#         )
        
#         # 2. Downsample → Activation → Upsample
#         downsampled = self.activation(self.down_layer(attention_output))
#         upsampled = self.up_layer(downsampled)
#         normalized = self.up_norm(upsampled + attention_output)  # Add residual connection

#         # 3. Intermediate feed-forward network
#         intermediate_output = self.intermediate(normalized)

#         # 4. Final output projection and residual connection
#         layer_output = self.output(intermediate_output, normalized)
#         return layer_output

# class CustomRobertaLayer(nn.Module):
#     def __init__(self, config):
#         super().__init__()
#         self.attention = RobertaAttention(config)
#         self.intermediate = RobertaIntermediate(config)
#         self.output = RobertaOutput(config)
#         self.down_layer = nn.Linear(config.hidden_size, 512)
#         self.up_layer = nn.Linear(512, config.hidden_size)
#         self.activation = nn.ReLU()
#         self.up_norm = nn.LayerNorm(config.hidden_size, eps=config.layer_norm_eps)

#     def forward(self, hidden_states, attention_mask=None, head_mask=None):
#         hidden_states = self.attention(hidden_states, attention_mask, head_mask)
#         intermediate_output = self.intermediate(hidden_states)
#         output = self.output(intermediate_output, hidden_states)
#         output = self.down_layer(output)
#         output = self.activation(output)
#         output = self.up_layer(output)
#         return self.up_norm(output)

class CustomRobertaLayer(RobertaLayer):
    def __init__(self, config):
        super().__init__(config)
        self.down_layer = nn.Linear(config.hidden_size, config.hidden_size // 2)  # Down-project
        self.up_layer = nn.Linear(config.hidden_size // 2, config.hidden_size)  # Up-project
        self.activation = nn.ReLU()  # You can use other activations like GELU
        self.up_norm = nn.LayerNorm(config.hidden_size)  #Normalization layer 

    def forward(
    self,
    hidden_states,
    attention_mask=None,
    head_mask=None,
    encoder_hidden_states=None,
    encoder_attention_mask=None,
    past_key_value=None,
    use_cache=False,
    output_attentions=False,
    **kwargs,
    ):
    # Ensure the attention mask is in the correct dtype
        if attention_mask is not None:
            attention_mask = attention_mask.to(dtype=hidden_states.dtype)  # Match precision (e.g., float16)
        
        # Original attention operation
        attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
    
        attention_output = attention_outputs[0]
        print(f"Attention Output Shape: {attention_output.shape}")
        
        # Down-projection
        down_projected = self.activation(self.down_layer(attention_output))
        print(f"Down Projected Shape: {down_projected.shape}")
        # Up-projection
        up_projected = self.activation(self.up_layer(down_projected))
        print(f"Up Projected Shape: {up_projected.shape}")
        #normalization
        normalized_output = self.up_norm(up_projected)
        print(f"Normalization Shape: {normalized_output.shape}")
        
        # Add & Norm after FF layers
        layer_output = self.output(hidden_states=normalized_output, input_tensor=attention_output)
        return (layer_output,) + attention_outputs[1:]


In [17]:
from transformers.models.roberta.modeling_roberta import RobertaEncoder, RobertaConfig, RobertaEmbeddings


class CustomRobertaModel(RobertaModel):
    def __init__(self, config):
        super().__init__(config)

        # Replace the encoder with the custom encoder
        self.embeddings = RobertaEmbeddings(config)
        self.encoder = CustomRobertaEncoder(config)

        # Add the classification head at the end
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 1024),
            nn.ReLU(),
            nn.Linear(1024, config.num_labels),
        )

        # Freeze existing layers and enable gradients only for the new layers
        self.freeze_pretrained_layers()

    def freeze_pretrained_layers(self):
        # Freeze all layers except the new classifier and encoder's added layers
        for name, param in self.named_parameters():
            if "classifier" in name or "down_layer" in name or "up_layer" in name or "up_norm" in name:
                param.requires_grad = True  # Enable gradients for new layers
            else:
                param.requires_grad = False  # Freeze existing layers

    def forward(
        self, input_ids=None, attention_mask=None, token_type_ids=None, position_ids=None, 
        head_mask=None, inputs_embeds=None, output_attentions=None, output_hidden_states=None, labels=None, 
    ):

        print("Calling super()")
        print("What are labels?: ", labels)
        # Embeddings layer
        embeddings_output = self.embeddings(input_ids, token_type_ids, position_ids)

        # Encoder layer
        encoder_outputs = self.encoder(embeddings_output, attention_mask)

        # Extract [CLS] token for classification
        cls_token_output = encoder_outputs[:, 0, :]
        
        # outputs = super().forward(
        #     input_ids=input_ids, attention_mask=attention_mask, token_type_ids=token_type_ids, position_ids=position_ids, 
        #     head_mask=head_mask, inputs_embeds=inputs_embeds, output_attentions=output_attentions, output_hidden_states=output_hidden_states
        # )
        # print("Super Done")
        # pooled_output = outputs[0][:, 0, :]        
        print(f"Encoder Output Shape: {cls_token_output.shape}")

        # Pass through the classification head
        logits = self.classifier(pooled_output)
        print(f"Logits Shape: {logits.shape}") 

        return (logits,) + outputs[2:]  # Return logits along with other outputs
        
class CustomRobertaEncoder(RobertaEncoder):
    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.ModuleList([CustomRobertaLayer(config) for _ in range(config.num_hidden_layers)])


In [18]:
from transformers import RobertaConfig

# Load the configuration
model_name = "FacebookAI/roberta-large"
config = RobertaConfig.from_pretrained(model_name, num_labels=2)

# Create the custom model
custom_model = CustomRobertaModel(config)

# Load pretrained weights
pretrained_model = RobertaModel.from_pretrained(model_name, num_labels=2)
custom_model.load_state_dict(pretrained_model.state_dict(), strict=False)
def initialize_weights(module):
    if isinstance(module, nn.Linear):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

custom_model.apply(initialize_weights)


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


CustomRobertaModel(
  (embeddings): RobertaEmbeddings(
    (word_embeddings): Embedding(50265, 1024, padding_idx=1)
    (position_embeddings): Embedding(514, 1024, padding_idx=1)
    (token_type_embeddings): Embedding(1, 1024)
    (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): CustomRobertaEncoder(
    (layer): ModuleList(
      (0-23): 24 x CustomRobertaLayer(
        (attention): RobertaAttention(
          (self): RobertaSdpaSelfAttention(
            (query): Linear(in_features=1024, out_features=1024, bias=True)
            (key): Linear(in_features=1024, out_features=1024, bias=True)
            (value): Linear(in_features=1024, out_features=1024, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): RobertaSelfOutput(
            (dense): Linear(in_features=1024, out_features=1024, bias=True)
            (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwi

In [13]:
import torch
import torch.nn as nn
import torch.nn.functional as F
from transformers.models.roberta.modeling_roberta import (
    RobertaModel,
    RobertaEncoder,
    RobertaLayer,
    RobertaEmbeddings,
    RobertaConfig,
)

# Custom Layer that adds Down-Up Projection and LayerNorm after FF layer
class CustomRobertaLayer(RobertaLayer):
    def __init__(self, config):
        super().__init__(config)
        self.down_layer = nn.Linear(config.hidden_size, config.hidden_size // 2)  # Down-projection
        self.up_layer = nn.Linear(config.hidden_size // 2, config.hidden_size)    # Up-projection
        self.activation = nn.ReLU()                                              # Activation function
        self.layer_norm_new = nn.LayerNorm(config.hidden_size)                       # LayerNorm after Up-projection
        # intializing all as new layers
        self.down_layer._is_new = True
        self.up_layer._is_new = True
        self.activation._is_new = True
        self.layer_norm_new._is_new = True

    def forward(
        self,
        hidden_states,
        attention_mask=None,
        head_mask=None,
        encoder_hidden_states=None,
        encoder_attention_mask=None,
        past_key_value=None,
        use_cache=False,
        output_attentions=False,
        **kwargs,
    ):
        # Ensure the attention mask matches the required dimensions
        if attention_mask is not None:
            # Expand dimensions for multi-head attention
            attention_mask = attention_mask[:, None, None, :]  # Shape: [batch_size, 1, 1, seq_len]
            attention_mask = attention_mask.to(dtype=hidden_states.dtype)  # Match precision (e.g., float16)


        # Attention sub-layer
        attention_outputs = self.attention(
            hidden_states,
            attention_mask=attention_mask,
            head_mask=head_mask,
            encoder_hidden_states=encoder_hidden_states,
            encoder_attention_mask=encoder_attention_mask,
            past_key_value=past_key_value,
            output_attentions=output_attentions,
        )
        attention_output = attention_outputs[0]

        # Feed-forward sub-layer
        intermediate_output = self.intermediate(attention_output)
        layer_output = self.output(hidden_states=intermediate_output, input_tensor=attention_output)

        # Down-projection, activation, up-projection, and LayerNorm
        down_projected = self.activation(self.down_layer(layer_output))
        up_projected = self.activation(self.up_layer(down_projected))
        norm_output = self.layer_norm_new(up_projected + layer_output)  # Residual connection

        return (norm_output,) + attention_outputs[1:]  # Return outputs

# Custom Encoder
class CustomRobertaEncoder(RobertaEncoder):
    def __init__(self, config):
        super().__init__(config)
        self.layer = nn.ModuleList([CustomRobertaLayer(config) for _ in range(config.num_hidden_layers)])

# Custom Model
class CustomRobertaModel(RobertaModel):
    def __init__(self, config):
        super().__init__(config)

        # Replace the encoder with the custom encoder
        self.embeddings = RobertaEmbeddings(config)
        self.encoder = CustomRobertaEncoder(config)

        # Add the classification head at the end
        self.classifier = nn.Sequential(
            nn.Linear(config.hidden_size, 1024),
            nn.ReLU(),
            nn.Linear(1024, config.num_labels),
        )
        self.classifier._is_new = True

        # Freeze existing layers if needed
        self.freeze_pretrained_layers()

    def freeze_pretrained_layers(self):
        # Freeze all layers except the classifier and custom layers
        for name, param in self.named_parameters():
            if "classifier" in name or "down_layer" in name or "up_layer" in name or "layer_norm" in name:
                param.requires_grad = True
            else:
                param.requires_grad = False

    def forward(
        self,
        input_ids=None,
        attention_mask=None,
        token_type_ids=None,
        position_ids=None,
        head_mask=None,
        inputs_embeds=None,
        output_attentions=None,
        output_hidden_states=None,
        labels=None,
    ):
        # Embedding layer
        embedding_output = self.embeddings(input_ids, token_type_ids=token_type_ids, position_ids=position_ids)

        # Encoder layer
        encoder_outputs = self.encoder(
            embedding_output,
            attention_mask=attention_mask,
            head_mask=head_mask,
            output_attentions=output_attentions,
            output_hidden_states=output_hidden_states,
        )
        sequence_output = encoder_outputs[0]

        # Extract the [CLS] token representation
        cls_token_output = sequence_output[:, 0, :]

       # Classification head
        logits = self.classifier(cls_token_output)

        # Compute loss if labels are provided
        loss = None
        if labels is not None:
            loss = F.cross_entropy(logits, labels)

        # Return loss if available, otherwise logits
        return (loss, logits) if loss is not None else logits


In [14]:
# Instantiate the model
from transformers import RobertaConfig

# Load configuration
model_name = "FacebookAI/roberta-large"
config = RobertaConfig.from_pretrained(model_name, num_labels=2)

# Create the custom model
custom_model = CustomRobertaModel(config)

# Load pretrained weights
pretrained_model = RobertaModel.from_pretrained(model_name)
original_weights = pretrained_model.state_dict()

custom_model.load_state_dict(pretrained_model.state_dict(), strict=False)

def initialize_weights(module):
    if isinstance(module, nn.Linear) and getattr(module, "_is_new", False):
        nn.init.xavier_uniform_(module.weight)
        if module.bias is not None:
            nn.init.zeros_(module.bias)

custom_model.apply(initialize_weights)

# Compare weights
for name, param in custom_model.named_parameters():
    if any(keyword in name for keyword in ["classifier", "down_layer", "up_layer", "layer_norm",]):
        print("Found:", name, param)
    # if name in original_weights:
    #     if not torch.equal(param, original_weights[name]):
    #         print(f"Layer {name} weights were modified.")
    #     else:
    #         print(f"Layer {name, param} weights are unchanged.")
    # else:
    #     print("Newly Added:", name, param)


Some weights of RobertaModel were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Found: encoder.layer.0.down_layer.weight Parameter containing:
tensor([[ 0.0320,  0.0090, -0.0493,  ...,  0.0186,  0.0459, -0.0127],
        [-0.0592,  0.0212,  0.0170,  ..., -0.0181, -0.0251, -0.0375],
        [-0.0496, -0.0348,  0.0044,  ..., -0.0325, -0.0543, -0.0345],
        ...,
        [-0.0485, -0.0443,  0.0314,  ..., -0.0545, -0.0387,  0.0126],
        [-0.0586,  0.0558, -0.0169,  ..., -0.0381,  0.0244,  0.0500],
        [ 0.0610, -0.0195,  0.0125,  ...,  0.0501,  0.0268,  0.0381]],
       requires_grad=True)
Found: encoder.layer.0.down_layer.bias Parameter containing:
tensor([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,

In [16]:
for name, param in custom_model.named_parameters():
    if any(keyword in name for keyword in ["classifier", "down_layer", "up_layer", "layer_norm_new",]):
        print("New:", name)
    if all(keyword not in name for keyword in ["classifier", "down_layer", "up_layer", "layer_norm_new",]):
        print("Old:", name)

Old: embeddings.word_embeddings.weight
Old: embeddings.position_embeddings.weight
Old: embeddings.token_type_embeddings.weight
Old: embeddings.LayerNorm.weight
Old: embeddings.LayerNorm.bias
Old: encoder.layer.0.attention.self.query.weight
Old: encoder.layer.0.attention.self.query.bias
Old: encoder.layer.0.attention.self.key.weight
Old: encoder.layer.0.attention.self.key.bias
Old: encoder.layer.0.attention.self.value.weight
Old: encoder.layer.0.attention.self.value.bias
Old: encoder.layer.0.attention.output.dense.weight
Old: encoder.layer.0.attention.output.dense.bias
Old: encoder.layer.0.attention.output.LayerNorm.weight
Old: encoder.layer.0.attention.output.LayerNorm.bias
Old: encoder.layer.0.intermediate.dense.weight
Old: encoder.layer.0.intermediate.dense.bias
Old: encoder.layer.0.output.dense.weight
Old: encoder.layer.0.output.dense.bias
Old: encoder.layer.0.output.LayerNorm.weight
Old: encoder.layer.0.output.LayerNorm.bias
New: encoder.layer.0.down_layer.weight
New: encoder.layer

In [10]:
# Prepare training arguments
training_args = TrainingArguments(
    output_dir="./results",
    eval_strategy="epoch",  # Evaluate periodically during training
    #eval_steps=100,               # Frequency of evaluation (adjust as needed)
    save_strategy="epoch",
    learning_rate=1e-4,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    num_train_epochs=3,
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    load_best_model_at_end=True,
    fp16=True,  # Enable mixed precision training for GPU
    report_to="none",  # Disable reporting to avoid unnecessary overhead
)

# Train base model
trainer_base = Trainer(
    model=custom_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

In [11]:
torch.cuda.empty_cache()
custom_model.to(device)
print("\nTraining Custom Adapter Model...")
# Resize model embeddings after adding new special tokens
# custom_model.resize_token_embeddings(len(tokenizer))
trainer_base.train()


Training Custom Adapter Model...


Epoch,Training Loss,Validation Loss
1,0.6957,0.694092


KeyboardInterrupt: 

In [28]:
26303490/381663234

0.06891806088924982

In [27]:
print_trainable_params(custom_model, stage_name="Base Model")


Trainable Parameters in Base Model:
Total Parameters: 381663234
Trainable Parameters: 26303490
  - encoder.layer.0.down_layer.weight: 524288 params
  - encoder.layer.0.down_layer.bias: 512 params
  - encoder.layer.0.up_layer.weight: 524288 params
  - encoder.layer.0.up_layer.bias: 1024 params
  - encoder.layer.0.layer_norm.weight: 1024 params
  - encoder.layer.0.layer_norm.bias: 1024 params
  - encoder.layer.1.down_layer.weight: 524288 params
  - encoder.layer.1.down_layer.bias: 512 params
  - encoder.layer.1.up_layer.weight: 524288 params
  - encoder.layer.1.up_layer.bias: 1024 params
  - encoder.layer.1.layer_norm.weight: 1024 params
  - encoder.layer.1.layer_norm.bias: 1024 params
  - encoder.layer.2.down_layer.weight: 524288 params
  - encoder.layer.2.down_layer.bias: 512 params
  - encoder.layer.2.up_layer.weight: 524288 params
  - encoder.layer.2.up_layer.bias: 1024 params
  - encoder.layer.2.layer_norm.weight: 1024 params
  - encoder.layer.2.layer_norm.bias: 1024 params
  - enc