# Download the model and dataset from Hugging Face and fine-tune it using LoRA.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "tiiuae/Falcon3-1B-Base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")

tokenizer_config.json:   0%|          | 0.00/362k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/91.0 [00:00<?, ?B/s]

In [3]:
from peft import get_peft_model, LoraConfig

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank for LoRA
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA layers
    task_type="CAUSAL_LM"
)

# Apply LoRA to the base model
model = get_peft_model(model, lora_config)

In [4]:
from datasets import DatasetDict
from datasets import load_dataset

dataset = load_dataset("yahma/alpaca-cleaned")

# Assuming 'dataset' has a 'train' split
train_dataset = dataset["train"]

# Split the 'train' dataset into train (80%) and eval (20%)
train_split = train_dataset.train_test_split(test_size=0.2)

# Now you have train_split['train'] and train_split['test']
train_dataset = train_split['train']  # 80% for training
eval_dataset = train_split['test']   # 20% for evaluation

# Function to merge instruction and input into a single string
def merge_instruction_input(example):
    # Concatenate instruction and input, you can add a separator if needed
    example['merged_input'] = example['instruction'] + " " + example['input']
    return example

# Apply the merge function to both train and eval datasets
train_dataset = train_dataset.map(merge_instruction_input)
eval_dataset = eval_dataset.map(merge_instruction_input)

# Function to tokenize the merged input and output
def tokenize_function(example):
    # Tokenize the 'merged_input' and 'output'
    input_encoding = tokenizer(example['merged_input'], padding="max_length", truncation=True, max_length=256)
    target_encoding = tokenizer(example['output'], padding="max_length", truncation=True, max_length=256)

    # Set input_ids and labels
    input_encoding['labels'] = target_encoding['input_ids']  # Use output as labels
    return input_encoding

# Apply the tokenization to both train and eval datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# List of columns to remove
columns_to_remove = ['output', 'input', 'instruction', 'merged_input']

# Remove the columns
train_dataset = train_dataset.remove_columns(columns_to_remove)
eval_dataset = eval_dataset.remove_columns(columns_to_remove)

# Print the modified train and eval datasets
print("Train Dataset:")
print(train_dataset)

print("\nEval Dataset:")
print(eval_dataset)

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/41408 [00:00<?, ? examples/s]

Map:   0%|          | 0/10352 [00:00<?, ? examples/s]

Map:   0%|          | 0/41408 [00:00<?, ? examples/s]

Map:   0%|          | 0/10352 [00:00<?, ? examples/s]

Train Dataset:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 41408
})

Eval Dataset:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10352
})


# Use wandb to log and save the model

In [7]:
import wandb
from transformers import Trainer, TrainingArguments
from huggingface_hub import login
login("hf_token")

wandb.login(key="wandb_token")
# Example fine-tuning parameters
training_args = TrainingArguments(
    output_dir="./results",              # Directory where model checkpoints and logs will be saved
    per_device_train_batch_size=8,       # Batch size per GPU (if multiple GPUs, total batch size = batch_size * num_gpus)
    per_device_eval_batch_size=8,        # Batch size per GPU for evaluation
    evaluation_strategy="epoch",         # Evaluate at the end of every epoch
    num_train_epochs=3,                  # Train for 3 full passes through the dataset
    logging_dir="./logs",                # Directory for logs (useful for TensorBoard)
    save_strategy="epoch",               # Save checkpoints at the end of each epoch
    save_total_limit=2,                  # Keep only the last 2 checkpoints, deleting older ones
    report_to="wandb",                   # Report training metrics to Weights & Biases
    push_to_hub=True,                    # Push model checkpoints to Hugging Face Hub
    fp16=True,                           # Enable mixed precision (use bf16=True for newer GPUs)
    torch_compile=True,                  # Enable PyTorch 2.0 compilation
    ddp_find_unused_parameters=False,    # Optimize DDP (if using multiple GPUs)
    gradient_accumulation_steps=2,       # Simulates larger batch size without extra GPU memory
    save_steps=500,                      # Save every 500 steps instead of just every epoch
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your custom dataset
    eval_dataset=eval_dataset,  # Your validation dataset
)

trainer.train()


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,7.4839,7.477202
2,7.3911,7.422033
3,7.3718,7.405322


No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=15528, training_loss=7.509777732389235, metrics={'train_runtime': 38296.9512, 'train_samples_per_second': 3.244, 'train_steps_per_second': 0.405, 'total_flos': 2.6751395593558426e+17, 'train_loss': 7.509777732389235, 'epoch': 3.0})

In [11]:
import wandb

artifact = wandb.Artifact("model_checkpoint", type="model")
artifact.add_dir("./results/")  # Upload all checkpoint files
wandb.log_artifact(artifact)


[34m[1mwandb[0m: Adding directory to artifact (./results)... Done. 0.0s


<Artifact model_checkpoint>

# Import the model artifact from wandb and compare its performance to the base model.

In [11]:
from transformers import  AutoModelForCausalLM

import wandb
wandb.init()
artifact = wandb.use_artifact("1257979-konkuk-university/huggingface/model_checkpoint:latest", type="model")
artifact_dir = artifact.download()

# Load the model
from transformers import AutoModel
model = AutoModelForCausalLM.from_pretrained(artifact_dir)


[34m[1mwandb[0m:   19 of 19 files downloaded.  


In [22]:
from transformers import AutoTokenizer, GenerationConfig

model_name = "tiiuae/Falcon3-1B-Base"
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "What is the capital of South Korea?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=10)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

What is the capital of South Korea? Seoul

.

.





In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "tiiuae/Falcon3-1B-Base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")
model2.generation_config = GenerationConfig.from_pretrained(model_name)
model2.generation_config.pad_token_id = model2.generation_config.eos_token_id

text = "What is the capital of South Korea?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model2.generate(**inputs.to(model2.device), max_new_tokens=50)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)


What is the capital of South Korea?


What is the capital of South Korea?


What is the capital of South Korea?


What is the capital of South Korea?


What is the capital of South Korea?


What is the


# I was able to 10x train the model faster using mixed precision and ddp

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, get_linear_schedule_with_warmup
from peft import get_peft_model, LoraConfig
from datasets import load_dataset
from torch.utils.data import DataLoader
from tqdm.auto import tqdm  # For progress bars
import time

# --- Model and Tokenizer Setup ---
model_name = "tiiuae/Falcon3-1B-Base"  # Or any other Falcon model
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# --- Device Setup ---
device = "cuda" if torch.cuda.is_available() else "cpu"
num_gpus = torch.cuda.device_count()
print(f"Using {num_gpus} {device} GPUs")

torch.set_float32_matmul_precision('high') #sets to tf32 during matmul operations

model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
model = model.to(device)  # Move to device *before* DataParallel
# model = torch.compile(model)

# --- LoRA Configuration ---
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    lora_dropout=0.1,
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)

# --- DataParallel (AFTER moving to device and applying LoRA) ---
if num_gpus > 1:
    model = torch.nn.DataParallel(model)

# --- Dataset Loading and Preprocessing ---
dataset = load_dataset("yahma/alpaca-cleaned")

train_dataset = dataset["train"]
train_split = train_dataset.train_test_split(test_size=0.2)
train_dataset = train_split['train']
eval_dataset = train_split['test']

def merge_instruction_input(example):
    example['merged_input'] = example['instruction'] + " " + example['input']
    return example

train_dataset = train_dataset.map(merge_instruction_input)
eval_dataset = eval_dataset.map(merge_instruction_input)

def tokenize_function(example):
    input_encoding = tokenizer(example['merged_input'], padding="max_length", truncation=True, max_length=256)
    target_encoding = tokenizer(example['output'], padding="max_length", truncation=True, max_length=256)
    input_encoding['labels'] = target_encoding['input_ids']
    return input_encoding

train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

columns_to_remove = ['output', 'input', 'instruction', 'merged_input']
train_dataset = train_dataset.remove_columns(columns_to_remove)
eval_dataset = eval_dataset.remove_columns(columns_to_remove)

train_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])
eval_dataset.set_format(type='torch', columns=['input_ids', 'attention_mask', 'labels'])

In [None]:
# --- Training Loop ---
num_epochs = 2
learning_rate = 2e-4
batch_size_per_gpu = 8  # Batch size *per GPU*
batch_size = batch_size_per_gpu * num_gpus  # *Effective* batch size
gradient_accumulation_steps = 4  # Adjust if needed
warmup_steps = 50
output_dir = "falcon-lora-alpaca"
eval_steps = 200
save_steps = 200

# DataLoaders
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
eval_dataloader = DataLoader(eval_dataset, batch_size=batch_size) # eval batch size can often be larger

# Optimizer and Scheduler
optimizer = torch.optim.AdamW(model.parameters(), lr=learning_rate)
total_steps = len(train_dataloader) * num_epochs // gradient_accumulation_steps
scheduler = get_linear_schedule_with_warmup(optimizer, num_warmup_steps=warmup_steps, num_training_steps=total_steps)

# Training Loop
model.train()
for epoch in range(num_epochs):
    total_loss = 0
    t0 = time.time()
    for step, batch in enumerate(tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")):
        batch = {k: v.to(device) for k, v in batch.items()}

        with torch.autocast(device_type=device, dtype=torch.float16):
            outputs = model(**batch)
        loss = outputs.loss  # This is now a tensor, e.g., shape (num_gpus,)

        # --- KEY CHANGE: Reduce the loss to a scalar ---
        loss = loss.mean()  # Take the mean across the GPU losses

        loss = loss / gradient_accumulation_steps
        total_loss += loss.detach().float()
        loss.backward()

        if (step + 1) % gradient_accumulation_steps == 0 or step == len(train_dataloader) - 1:
            optimizer.step()
            scheduler.step()
            optimizer.zero_grad()
            torch.cuda.synchronize()
            t1 = time.time()
            dt = (t1 - t0)
            t0 = t1
            tokens_processed = train_dataloader.batch_size * 256 * gradient_accumulation_steps #256 is the max_length for the tokenizer
            tokens_per_sec = tokens_processed / dt
            print(f"dt: {dt:.2f}sec, tok/sec: {tokens_per_sec:.2f}")

        if (step + 1) % eval_steps == 0:
            model.eval()
            eval_loss = 0
            with torch.no_grad():
                for eval_batch in tqdm(eval_dataloader, desc="Evaluating"):
                    eval_batch = {k: v.to(device) for k, v in eval_batch.items()}
                    eval_outputs = model(**eval_batch)
                    eval_loss += eval_outputs.loss.detach().float()

            avg_eval_loss = eval_loss / len(eval_dataloader)
            print(f"Step {step+1}: Eval Loss: {avg_eval_loss:.4f}")
            model.train()  # Switch back to train mode

        if (step + 1) % save_steps == 0:
            checkpoint_dir = os.path.join(output_dir, f"checkpoint-{step + 1}")
            # Save the *underlying* model (important for DataParallel)
            if num_gpus > 1:
                model.module.save_pretrained(checkpoint_dir)
            else:
                model.save_pretrained(checkpoint_dir)
            tokenizer.save_pretrained(checkpoint_dir)

    avg_train_loss = total_loss / len(train_dataloader) * gradient_accumulation_steps
    print(f"Epoch {epoch + 1}: Train Loss: {avg_train_loss:.4f}")

# Save final model (again, handle DataParallel)
if num_gpus > 1:
    model.module.save_pretrained(output_dir)
else:
    model.save_pretrained(output_dir)

tokenizer.save_pretrained(output_dir)
print("Training complete!")