In [1]:
import os
# Download models to the D: drive because the default cache location has no free space
os.environ['HF_HOME'] = 'D:\\Download\\UCSD\\cache'
from tqdm.notebook import tqdm
import pandas as pd
import os
import csv
import sys
import numpy as np
import time
import random
from typing import Optional, List, Tuple
import matplotlib.pyplot as plt
import textwrap
import torch

from transformers import AutoTokenizer, AutoModelForSequenceClassification, Trainer, TrainingArguments, DebertaV2TokenizerFast
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from adapters import AdapterConfig

In [2]:
# Prefer the GPU when CUDA is usable; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f'Using Device: {device}')

Using Device: cuda


In [None]:
# Output directories, one per model variant trained in this notebook.
BASE_MODEL_DIR = "./base_model"        # fully fine-tuned base model
LORA_MODEL_DIR = "./lora_model"        # LoRA-wrapped model
ADAPTER_MODEL_DIR = "./adapter_model"  # presumably for an adapter variant; not used in the visible cells

In [3]:
# Checkpoint to fine-tune. Llama 3.2 1B ("meta-llama/Llama-3.2-1B") was the
# original choice, but it could not be trained on the available lower-memory
# GPUs, so RoBERTa-large is used instead.
model_name = "FacebookAI/roberta-large"

tokenizer = AutoTokenizer.from_pretrained(model_name)
# Fall back to EOS as the padding token only for tokenizers that ship without one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Binary classification head (num_labels=2) on top of the pretrained encoder.
base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
# Keep the model's padding id in sync with the tokenizer. Copying the EOS id
# unconditionally would mislabel padding for tokenizers that already define
# their own pad token (the model printout for this checkpoint shows padding_idx=1).
base_model.config.pad_token_id = tokenizer.pad_token_id

tokenizer_config.json:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/482 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/899k [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/1.42G [00:00<?, ?B/s]

Some weights of RobertaForSequenceClassification were not initialized from the model checkpoint at FacebookAI/roberta-large and are newly initialized: ['classifier.dense.bias', 'classifier.dense.weight', 'classifier.out_proj.bias', 'classifier.out_proj.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [4]:
# Load the IMDB dataset by its Hugging Face Hub id.
IMDB_DATASET_ID = "stanfordnlp/imdb"
ds = load_dataset(IMDB_DATASET_ID)

def preprocess_function(examples):
    """Tokenize the ``"text"`` field of a batch of examples.

    The module-level ``tokenizer`` is called with truncation enabled,
    ``padding=True`` and ``max_length=128``; its result is returned unchanged.
    """
    tokenizer_options = {"truncation": True, "padding": True, "max_length": 128}
    review_texts = examples["text"]
    return tokenizer(review_texts, **tokenizer_options)

# Run the tokenizer over every split, one batch of examples at a time.
tokenized_datasets = ds.map(preprocess_function, batched=True)

# Use the full train and test splits, each shuffled with the same fixed seed.
train_dataset, test_dataset = (
    tokenized_datasets[split_name].shuffle(seed=42)
    for split_name in ("train", "test")
)

# Very big dataset
# Load a sentiment dataset (example: SST2)
# ds = load_dataset("facebook/xnli", "all_languages")
# train_data = ds['train']
# val_data = ds['validation']

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/25000 [00:00<?, ? examples/s]

Map:   0%|          | 0/50000 [00:00<?, ? examples/s]

In [5]:
def print_trainable_params(model, stage_name="Model"):
    """Print a parameter-count report for ``model``.

    The report contains a header naming ``stage_name``, the total number of
    parameter elements, the number of elements with ``requires_grad`` set, and
    one line per trainable parameter giving its name and element count.

    Args:
        model: Object exposing ``parameters()`` and ``named_parameters()``
            (e.g. a ``torch.nn.Module``).
        stage_name: Label used in the report header.
    """
    print(f"\nTrainable Parameters in {stage_name}:")

    # Tally both counts in a single pass over the parameters.
    total_count = 0
    trainable_count = 0
    for tensor in model.parameters():
        size = tensor.numel()
        total_count += size
        if tensor.requires_grad:
            trainable_count += size

    print(f"Total Parameters: {total_count}")
    print(f"Trainable Parameters: {trainable_count}")

    # Frozen parameters are skipped in the per-parameter listing.
    for label, tensor in model.named_parameters():
        if not tensor.requires_grad:
            continue
        print(f"  - {label}: {tensor.numel()} params")


In [6]:
# Training configuration; the same object is passed to every Trainer below.
training_args = TrainingArguments(
    # --- where checkpoints and logs are written ---
    output_dir="./results",
    logging_dir="./logs",
    # --- optimisation ---
    num_train_epochs=3,
    learning_rate=2e-5,
    weight_decay=0.01,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    fp16=True,  # Enable mixed precision training for GPU
    # --- evaluate and checkpoint once per epoch, then reload the best one ---
    evaluation_strategy="epoch",
    save_strategy="epoch",
    load_best_model_at_end=True,
    # --- logging ---
    logging_steps=10,
    report_to="none",  # Disable reporting to avoid unnecessary overhead
)

# Trainer for full fine-tuning of the base model; evaluation runs on the test split.
trainer_base = Trainer(
    args=training_args,
    model=base_model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
)



In [7]:
# Move the base model onto the selected device. As the last expression in the
# cell, the call's return value (the model) is also displayed.
base_model.to(device)

RobertaForSequenceClassification(
  (roberta): RobertaModel(
    (embeddings): RobertaEmbeddings(
      (word_embeddings): Embedding(50265, 1024, padding_idx=1)
      (position_embeddings): Embedding(514, 1024, padding_idx=1)
      (token_type_embeddings): Embedding(1, 1024)
      (LayerNorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): RobertaEncoder(
      (layer): ModuleList(
        (0-23): 24 x RobertaLayer(
          (attention): RobertaAttention(
            (self): RobertaSdpaSelfAttention(
              (query): Linear(in_features=1024, out_features=1024, bias=True)
              (key): Linear(in_features=1024, out_features=1024, bias=True)
              (value): Linear(in_features=1024, out_features=1024, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): RobertaSelfOutput(
              (dense): Linear(in_features=1024, out_features=1024, bias=Tru

In [8]:
# Report the parameter counts of the base model before any training.
print_trainable_params(model=base_model, stage_name="Base Model")


Trainable Parameters in Base Model:
Total Parameters: 355361794
Trainable Parameters: 355361794
  - roberta.embeddings.word_embeddings.weight: 51471360 params
  - roberta.embeddings.position_embeddings.weight: 526336 params
  - roberta.embeddings.token_type_embeddings.weight: 1024 params
  - roberta.embeddings.LayerNorm.weight: 1024 params
  - roberta.embeddings.LayerNorm.bias: 1024 params
  - roberta.encoder.layer.0.attention.self.query.weight: 1048576 params
  - roberta.encoder.layer.0.attention.self.query.bias: 1024 params
  - roberta.encoder.layer.0.attention.self.key.weight: 1048576 params
  - roberta.encoder.layer.0.attention.self.key.bias: 1024 params
  - roberta.encoder.layer.0.attention.self.value.weight: 1048576 params
  - roberta.encoder.layer.0.attention.self.value.bias: 1024 params
  - roberta.encoder.layer.0.attention.output.dense.weight: 1048576 params
  - roberta.encoder.layer.0.attention.output.dense.bias: 1024 params
  - roberta.encoder.layer.0.attention.output.Layer

In [9]:
print("\nTraining Base Model...")
# Resize the embedding matrix to the tokenizer's vocabulary size
# (needed if special tokens were added to the tokenizer).
base_model.resize_token_embeddings(len(tokenizer))
# Full fine-tuning run; as the last expression, the training result is displayed.
trainer_base.train()


Training Base Model...


Epoch,Training Loss,Validation Loss


KeyboardInterrupt: 

In [None]:
# Save the tokenizer and the base model side by side in BASE_MODEL_DIR,
# the directory constant defined near the top of the notebook.
tokenizer.save_pretrained(BASE_MODEL_DIR)
base_model.save_pretrained(BASE_MODEL_DIR)

In [None]:
# Evaluate base model
# Runs the base trainer's evaluation (its eval_dataset is the test split) and
# prints whatever results dictionary/object evaluate() returns.
print("\nEvaluating Base Model...")
base_results = trainer_base.evaluate()
print("Base Model Results:", base_results)

In [None]:
# Define LoRA configuration
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    # RoBERTa names its attention projections "query" and "value" (see the
    # model printout earlier in this notebook). "q_proj"/"v_proj" are
    # Llama-style names and match no module in this model.
    target_modules=["query", "value"],
    lora_dropout=0.1,
    bias="none",
    task_type=TaskType.SEQ_CLS,
    inference_mode=False,
)

# get_peft_model injects the LoRA layers into the model it is given, so wrap a
# freshly loaded pretrained checkpoint rather than base_model: that keeps the
# fully fine-tuned base model intact and makes the LoRA run start from the same
# pretrained weights the base run started from.
lora_base_model = AutoModelForSequenceClassification.from_pretrained(model_name, num_labels=2)
lora_base_model.config.pad_token_id = tokenizer.pad_token_id

# Apply LoRA to model
lora_model = get_peft_model(lora_base_model, lora_config).to(device)

# Print trainable parameters
lora_model.print_trainable_parameters()

# NOTE(review): training_args is shared with the base run, so LoRA checkpoints
# are written to the same output_dir ("./results") — confirm this is intended.
trainer_lora = Trainer(
    model=lora_model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
)

# perf_counter is a monotonic clock, so the elapsed time cannot be skewed by
# system clock adjustments during a long training run.
start_time = time.perf_counter()
print("\nTraining LoRA Model...")
trainer_lora.train()
print(f"LoRa trained in: {time.perf_counter() - start_time}s")



In [None]:
# Save the tokenizer and the LoRA model side by side in LORA_MODEL_DIR,
# the directory constant defined near the top of the notebook.
tokenizer.save_pretrained(LORA_MODEL_DIR)
lora_model.save_pretrained(LORA_MODEL_DIR)

In [None]:
# Evaluate LoRA model
# Runs the LoRA trainer's evaluation (its eval_dataset is the test split) and
# prints whatever results dictionary/object evaluate() returns.
print("\nEvaluating LoRA Model...")
lora_results = trainer_lora.evaluate()
print("LoRA Model Results:", lora_results)