<a href="https://colab.research.google.com/github/matteraggi/FineTuningAI/blob/main/vsProject.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [3]:
!pip install --user transformers torch datasets peft tf-keras accelerate bitsandbytes trl

!pip install datasets --upgrade --force-reinstall

Collecting datasets
  Downloading datasets-3.2.0-py3-none-any.whl.metadata (20 kB)
Collecting bitsandbytes
  Downloading bitsandbytes-0.45.1-py3-none-manylinux_2_24_x86_64.whl.metadata (5.8 kB)
Collecting trl
  Downloading trl-0.14.0-py3-none-any.whl.metadata (12 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_

In [4]:
# Importing stock ml libraries
import numpy as np
import pandas as pd
from sklearn import metrics
import transformers
import torch
from torch.utils.data import Dataset, DataLoader, RandomSampler, SequentialSampler
# from transformers import BertTokenizer, BertModel, BertConfig
from datasets import load_dataset
from trl import SFTTrainer
from transformers import AutoTokenizer, AutoModelForCausalLM, Trainer, TrainingArguments,TrainingArguments,pipeline, logging,BitsAndBytesConfig
from peft import LoraConfig, PeftModel

In [5]:
# The model that you want to train from the Hugging Face hub
model_name = "NousResearch/Llama-2-7b-hf"  # Standard.  Good choice for a powerful and widely used LLM.

# The instruction dataset to use
dataset_name = "mlabonne/guanaco-llama2-1k"  # Standard. A popular dataset for instruction tuning.

# Fine-tuned model name
new_model = "llama-2-7b-miniguanaco"  # Standard.  Clear and descriptive.

################################################################################
# QLoRA parameters
################################################################################

# LoRA attention dimension. It determines the size and parameter count of the low-rank adaptation
lora_r = 16  # Medium-high.  This is a reasonable value.  You could experiment with lower values (e.g., 32) for even more memory efficiency or higher values (e.g., 128) for potentially better performance (but more memory usage).

# Alpha parameter for LoRA scaling factor that determines the impact of the low-rank matrices on the original model's output.
# Controls the overall strength of the low-rank adaptation.
lora_alpha = 2  # Standard.  Often set to 2-4 times `lora_r`.  This is a common setting.

# Dropout probability for LoRA layers
lora_dropout = 0.05  # Standard.  A common dropout rate.  You could try slightly lower values (e.g., 0.05) if you're not seeing overfitting.

################################################################################
# bitsandbytes parameters
################################################################################

# Activate 4-bit precision base model loading
use_4bit = True  # Standard. Essential for fitting large models on consumer GPUs.

# Compute dtype for 4-bit base models
bnb_4bit_compute_dtype = "float16"  # Standard.  Using float16 can speed up computation if your GPU supports it.  If you encounter issues, try "bfloat16" (if your GPU supports it) or "float32".

# Quantization type (fp4 or nf4)
bnb_4bit_quant_type = "nf4"  # Standard.  nf4 (NormalFloat4) is generally preferred.

# Activate nested quantization for 4-bit base models (double quantization)
use_nested_quant = False  # Standard.  Double quantization can sometimes improve performance but increases complexity.  It's often left disabled initially.

################################################################################
# TrainingArguments parameters
################################################################################

# Output directory where the model predictions and checkpoints will be stored
output_dir = "./results"  # Standard.

# Number of training epochs
num_train_epochs = 1  # Low.  One epoch is very low.  You'll almost certainly need more epochs (e.g., 3-5 or even more) to get good results.

# Enable fp16/bf16 training (set bf16 to True with an A100)
fp16 = True  # Standard.  You can enable this if your GPU supports it and `bnb_4bit_compute_dtype` is also set to a compatible value (float16).
bf16 = False  # Standard.  bf16 is generally preferred over fp16 if your GPU supports it.  Not available on a T4.

# Batch size per GPU for training
per_device_train_batch_size = 2  # Low.  You could try increasing this to 8 or even 16 if your GPU memory allows.  Higher batch sizes can often lead to faster training.

# Batch size per GPU for evaluation
per_device_eval_batch_size = 2  # Low.  Same as training batch size.

# Number of update steps to accumulate the gradients for
gradient_accumulation_steps = 4  # Standard.  If you run out of memory, you can increase this to 2, 4, or higher.  This effectively increases the batch size.

# Enable gradient checkpointing
gradient_checkpointing = True  # Standard.  Essential for reducing memory usage, especially with larger models.

# Maximum gradient normal (gradient clipping)
max_grad_norm = 0.3  # Standard.  A common value for gradient clipping.

# Initial learning rate (AdamW optimizer)
learning_rate = 2e-4  # Standard.  A good starting point.  You might want to experiment with values between 1e-4 and 5e-4.

# Weight decay to apply to all layers except bias/LayerNorm weights
weight_decay = 0.001  # Standard.

# Optimizer to use
optim = "paged_adamw_32bit"  # Standard.  A good optimizer.

# Learning rate schedule
lr_scheduler_type = "cosine"  # Standard.  A common and effective learning rate scheduler.

# Number of training steps (overrides num_train_epochs)
max_steps = 5000  # Standard.  Using `num_train_epochs` is usually sufficient.

# Ratio of steps for a linear warmup (from 0 to learning rate)
warmup_ratio = 0.03  # Standard.

# Group sequences into batches with same length
# Saves memory and speeds up training considerably
group_by_length = True  # Standard.  Highly recommended for efficiency.

# Save checkpoint every X updates steps
save_steps = 500  # Low.  You should set this to a value greater than 0 (e.g., every 500 or 1000 steps) to save checkpoints during training.

# Log every X updates steps
logging_steps = 25  # Standard.

################################################################################
# SFT parameters
################################################################################

# Maximum sequence length to use
max_seq_length = 1024  # Important.  You *must* set this.  A common value is 2048 or 4096, but it depends on your data and GPU memory.  Start with a lower value and increase it if possible.

# Pack multiple short examples in the same input sequence to increase efficiency
packing = False  # Standard.  Can improve efficiency if your dataset has many short sequences.

# Load the entire model on the GPU 0
device_map = {"": 0}  # Standard.  This maps the model to the first GPU.

In [9]:
# Load dataset (you can process it here)
dataset = load_dataset(dataset_name, split="train")

# Load tokenizer and model with QLoRA configuration
compute_dtype = getattr(torch, bnb_4bit_compute_dtype)

bnb_config = BitsAndBytesConfig(
    load_in_4bit=use_4bit,
    bnb_4bit_quant_type=bnb_4bit_quant_type,
    bnb_4bit_compute_dtype=compute_dtype,
    bnb_4bit_use_double_quant=use_nested_quant,
)

# Check GPU compatibility with bfloat16
if compute_dtype == torch.float16 and use_4bit:
    major, _ = torch.cuda.get_device_capability()
    if major >= 8:
        print("=" * 80)
        print("Your GPU supports bfloat16: accelerate training with bf16=True")
        print("=" * 80)

# Load base model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    device_map=device_map
)



model.config.use_cache = False
model.config.pretraining_tp = 1

# Load LLaMA tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right" # Fix weird overflow issue with fp16 training

# Load LoRA configuration
peft_config = LoraConfig(
    lora_alpha=lora_alpha,
    lora_dropout=lora_dropout,
    r=lora_r,
    bias="none",
    task_type="CAUSAL_LM",
)

# Set training parameters
training_arguments = TrainingArguments(
    output_dir=output_dir,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    optim=optim,
    save_steps=save_steps,
    logging_steps=logging_steps,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    fp16=fp16,
    bf16=bf16,
    max_grad_norm=max_grad_norm,
    max_steps=max_steps,
    warmup_ratio=warmup_ratio,
    group_by_length=group_by_length,
    lr_scheduler_type=lr_scheduler_type,
    report_to="tensorboard"
)

# Set supervised fine-tuning parameters
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    peft_config=peft_config,
    args=training_arguments,
)

# Train model
trainer.train()

# Save trained model
trainer.model.save_pretrained(new_model)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]



OutOfMemoryError: CUDA out of memory. Tried to allocate 172.00 MiB. GPU 0 has a total capacity of 14.74 GiB of which 74.12 MiB is free. Process 18797 has 14.67 GiB memory in use. Of the allocated memory 14.09 GiB is allocated by PyTorch, and 461.27 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
# Ignore warnings
logging.set_verbosity(logging.CRITICAL)

# Run text generation pipeline with our next model
prompt = "What is a large language model?"
pipe = pipeline(task="text-generation", model=model, tokenizer=tokenizer, max_length=200)
result = pipe(f"<s>[INST] {prompt} [/INST]")
print(result[0]['generated_text'])