We just take a small sample model (set up with the help of GPT) and have a quick look at whether it behaves correctly, since my main focus is checking whether my own implementation is correct.



In [None]:
import torch
import torch.nn as nn
from transformers import AutoModelForCausalLM, AutoTokenizer
import qlora_and_lora as Q
# NOTE: `from Q import ...` does not work -- the `from` form looks up a module
# by its real name, not by the local alias bound on the line above.
from qlora_and_lora import quantize_model, Qlora_4_bit

# Smoke-test script: load a small model, swap the targeted layers with the
# custom implementation, confirm the swap happened, then run one forward pass.

# 1. Load a small, real model from Hugging Face
model_id = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
print(f"Loading {model_id}...")
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    device_map="cpu",  # Load to CPU first for safety
    torch_dtype=torch.bfloat16,
)

# 2. Inspect the original structure to find target layer names
print("\n--- Original Layer Structure (First Block) ---")
print(model.model.layers[0].self_attn)
# You will see names like: 'q_proj', 'k_proj', 'v_proj', 'o_proj'

# 3. Define the target layers you want to convert
# Check the available names with `for name, _ in model.named_modules()`.
# NOTE(review): "up_gate" / "down_gate" do not look like names printed by the
# model above -- presumably they simply never match; confirm and drop if so.
target_layers = [
    "q_proj", "v_proj", "o_proj", "gate_proj", "up_gate", "down_gate",
    "k_proj", "up_proj", "down_proj",
]

# 4. Apply YOUR quantize_model function (imported from qlora_and_lora)
print(f"\nApplying QLoRA to layers: {target_layers}...")

# NOTE(review): the model is loaded on CPU, yet use_qlora=True is documented as
# the GPU + BitsAndBytes setting -- confirm which value is intended here.
quantize_model(
    module=model,
    mlp_layers=target_layers,
    alpha=4,
    r=1,
    require_lora=True,
    use_qlora=True,  # Set False for CPU testing, True if you have GPU+BitsAndBytes setup
)

# 5. VERIFICATION 1: Check if layers changed
print("\n--- Modified Layer Structure (First Block) ---")
# Access the first layer to see if it is now Qlora_4_bit
print(model.model.layers[0].self_attn.q_proj)

if isinstance(model.model.layers[0].self_attn.q_proj, Qlora_4_bit):
    print("\n‚úÖ SUCCESS: Layer successfully replaced with custom QLoRA class!")
else:
    print("\n‚ùå FAILURE: Layer is still the original nn.Linear.")

# 6. VERIFICATION 2: The "Smoke Test" (Forward Pass)
print("\n--- Running Forward Pass (Dimension Check) ---")
dummy_input = torch.randint(0, 31000, (1, 20), device='cpu').long()
# Only shapes matter for this check, so skip building the autograd graph.
with torch.no_grad():
    output = model(dummy_input)
# Actually report the dimensions this section is meant to check.
print(f"Input shape: {tuple(dummy_input.shape)} -> logits shape: {tuple(output.logits.shape)}")

In [None]:
def print_trainable_parameters(model):
    """
    Prints the total and trainable parameter counts of the model.

    Args:
        model: Any object whose ``named_parameters()`` yields ``(name, param)``
            pairs, where each ``param`` provides ``numel()`` and
            ``requires_grad``.

    Prints three lines: total element count, trainable element count, and the
    trainable share as a percentage. A model without any parameters reports
    0.0000% instead of raising ``ZeroDivisionError``.
    """
    trainable_params = 0
    all_param = 0

    for _, param in model.named_parameters():
        count = param.numel()
        all_param += count
        if param.requires_grad:
            trainable_params += count

    # Guard the ratio: an empty model has nothing to divide by.
    trainable_pct = 100 * trainable_params / all_param if all_param else 0.0

    print(f"all params: {all_param:,d}")
    print(f"trainable params: {trainable_params:,d}")
    print(f"trainable%: {trainable_pct:.4f}%")

# Run it on whatever is currently bound to `model` (defined in an earlier cell)
print_trainable_parameters(model)

In [None]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

# --- 1. CONFIGURATION ---
MODEL_ID = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
# Small dataset so the run stays fast
DATASET_ID = "Abirate/english_quotes"
# A short run is enough to see whether the loss drops
MAX_STEPS = 60
# Deliberately high learning rate so the effect is visible quickly
LEARNING_RATE = 2e-4

# --- 2. LOAD MODEL & TOKENIZER ---
print(f"Loading {MODEL_ID}...")
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Fix for Llama/Mistral tokenizers: reuse the EOS token as the padding token
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map="auto", torch_dtype=torch.float16)

print("Injecting Custom Adapters...")
# alpha: keep it high (alpha / r around 2), otherwise training shows no visible effect.
# use_qlora: True uses the 4-bit module; switch it to check against a plain nn.Linear.
quantize_model(module=model, mlp_layers=target_layers, alpha=4, r=2, require_lora=True, use_qlora=True)

print("Locking weights...")
# Freeze the token-embedding matrix
model.model.embed_tokens.weight.requires_grad = False

# Verify correct parameter count
def print_trainable(m):
    """Print the trainable parameter count of ``m`` and its share of the total.

    Args:
        m: Any object whose ``parameters()`` yields items providing
            ``numel()`` and ``requires_grad``.

    The parameters are walked once. A module without parameters reports
    0.000% instead of raising ``ZeroDivisionError``.
    """
    trainable = 0
    total = 0
    for p in m.parameters():
        count = p.numel()
        total += count
        if p.requires_grad:
            trainable += count

    # Guard the ratio: an empty module has nothing to divide by.
    pct = 100 * trainable / total if total else 0.0
    print(f"‚úÖ Final Trainable Params: {trainable:,} ({pct:.3f}%)")

# Report how many parameters are still trainable after the adapter injection above
print_trainable(model)

# --- 5. PREPARE DATASET ---
print(f"Loading dataset: {DATASET_ID}...")
# The loaded object is indexed by split name below; only "train" is used
data = load_dataset(DATASET_ID)

def tokenize(element):
    """Tokenize the ``"quote"`` field of ``element`` with the module-level tokenizer.

    Inputs are truncated and padded to a fixed length of 64 tokens; the
    tokenizer's result is returned unchanged.
    """
    quotes = element["quote"]
    encoded = tokenizer(quotes, max_length=64, padding="max_length", truncation=True)
    return encoded

# Run `tokenize` over the dataset in batches and drop the original columns
# (as listed on the train split), leaving only what the tokenizer produced.
tokenized_dataset = data.map(tokenize, batched=True, remove_columns=data["train"].column_names)

# --- 6. DEFINE TRAINER ---
print("Setting up Trainer...")
# These settings all came from GPT; the only goal is to watch the loss.
args = TrainingArguments(
    output_dir="custom_qlora_outputs",
    # -- optimisation --
    optim="adamw_torch",             # standard optimizer
    learning_rate=LEARNING_RATE,
    max_steps=MAX_STEPS,             # stop after MAX_STEPS optimizer steps
    per_device_train_batch_size=4,
    gradient_accumulation_steps=4,
    # -- logging / checkpoints --
    logging_steps=5,                 # show the loss every 5 steps
    report_to="none",                # no WandB for now
    save_strategy="no",              # no checkpoints during this test
    # -- data handling --
    remove_unused_columns=False,     # important for custom models
)

trainer = Trainer(
    args=args,
    model=model,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False),
    train_dataset=tokenized_dataset["train"],
)

# --- 7. START TRAINING ---
print("\nüöÄ Starting Training! Watch the 'Loss' column...\n")
trainer.train()