# Download the model and dataset from Hugging Face and fine-tune it using LoRA.


In [2]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "tiiuae/Falcon3-1B-Base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")

tokenizer_config.json:   0%|          | 0.00/362k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/9.78M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/826 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/653 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/3.34G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/91.0 [00:00<?, ?B/s]

In [None]:
import torch
print(torch.cuda.device_count())  # Check number of GPUs available
print(training_args.local_rank)  # Should be set for multi-GPU runs


In [3]:
from peft import get_peft_model, LoraConfig

# Define LoRA configuration
lora_config = LoraConfig(
    r=8,  # Rank for LoRA
    lora_alpha=32,  # Scaling factor
    lora_dropout=0.1,  # Dropout rate for LoRA layers
    task_type="CAUSAL_LM"
)

# Apply LoRA to the base model
model = get_peft_model(model, lora_config)

In [4]:
from datasets import DatasetDict
from datasets import load_dataset

dataset = load_dataset("yahma/alpaca-cleaned")

# Assuming 'dataset' has a 'train' split
train_dataset = dataset["train"]

# Split the 'train' dataset into train (80%) and eval (20%)
train_split = train_dataset.train_test_split(test_size=0.2)

# Now you have train_split['train'] and train_split['test']
train_dataset = train_split['train']  # 80% for training
eval_dataset = train_split['test']   # 20% for evaluation

# Function to merge instruction and input into a single string
def merge_instruction_input(example):
    # Concatenate instruction and input, you can add a separator if needed
    example['merged_input'] = example['instruction'] + " " + example['input']
    return example

# Apply the merge function to both train and eval datasets
train_dataset = train_dataset.map(merge_instruction_input)
eval_dataset = eval_dataset.map(merge_instruction_input)

# Function to tokenize the merged input and output
def tokenize_function(example):
    # Tokenize the 'merged_input' and 'output'
    input_encoding = tokenizer(example['merged_input'], padding="max_length", truncation=True, max_length=256)
    target_encoding = tokenizer(example['output'], padding="max_length", truncation=True, max_length=256)

    # Set input_ids and labels
    input_encoding['labels'] = target_encoding['input_ids']  # Use output as labels
    return input_encoding

# Apply the tokenization to both train and eval datasets
train_dataset = train_dataset.map(tokenize_function, batched=True)
eval_dataset = eval_dataset.map(tokenize_function, batched=True)

# List of columns to remove
columns_to_remove = ['output', 'input', 'instruction', 'merged_input']

# Remove the columns
train_dataset = train_dataset.remove_columns(columns_to_remove)
eval_dataset = eval_dataset.remove_columns(columns_to_remove)

# Print the modified train and eval datasets
print("Train Dataset:")
print(train_dataset)

print("\nEval Dataset:")
print(eval_dataset)

README.md:   0%|          | 0.00/11.6k [00:00<?, ?B/s]

alpaca_data_cleaned.json:   0%|          | 0.00/44.3M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/51760 [00:00<?, ? examples/s]

Map:   0%|          | 0/41408 [00:00<?, ? examples/s]

Map:   0%|          | 0/10352 [00:00<?, ? examples/s]

Map:   0%|          | 0/41408 [00:00<?, ? examples/s]

Map:   0%|          | 0/10352 [00:00<?, ? examples/s]

Train Dataset:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 41408
})

Eval Dataset:
Dataset({
    features: ['input_ids', 'attention_mask', 'labels'],
    num_rows: 10352
})


# Use wandb to log and save the model

In [7]:
import wandb
from transformers import Trainer, TrainingArguments
from huggingface_hub import login
login("hf_token")

wandb.login(key="wandb_token")
# Example fine-tuning parameters
training_args = TrainingArguments(
    output_dir="./results",              # Directory where model checkpoints and logs will be saved
    per_device_train_batch_size=8,       # Batch size per GPU (if multiple GPUs, total batch size = batch_size * num_gpus)
    per_device_eval_batch_size=8,        # Batch size per GPU for evaluation
    evaluation_strategy="epoch",         # Evaluate at the end of every epoch
    num_train_epochs=3,                  # Train for 3 full passes through the dataset
    logging_dir="./logs",                # Directory for logs (useful for TensorBoard)
    save_strategy="epoch",               # Save checkpoints at the end of each epoch
    save_total_limit=2,                  # Keep only the last 2 checkpoints, deleting older ones
    report_to="wandb",                   # Report training metrics to Weights & Biases
    push_to_hub=True,                    # Push model checkpoints to Hugging Face Hub
    fp16=True,                           # Enable mixed precision (use bf16=True for newer GPUs)
    torch_compile=True,                  # Enable PyTorch 2.0 compilation
    ddp_find_unused_parameters=False,    # Optimize DDP (if using multiple GPUs)
    gradient_accumulation_steps=2,       # Simulates larger batch size without extra GPU memory
    save_steps=500,                      # Save every 500 steps instead of just every epoch
)


trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,  # Your custom dataset
    eval_dataset=eval_dataset,  # Your validation dataset
)

trainer.train()


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


Epoch,Training Loss,Validation Loss
1,7.4839,7.477202
2,7.3911,7.422033
3,7.3718,7.405322


No files have been modified since last commit. Skipping to prevent empty commit.


TrainOutput(global_step=15528, training_loss=7.509777732389235, metrics={'train_runtime': 38296.9512, 'train_samples_per_second': 3.244, 'train_steps_per_second': 0.405, 'total_flos': 2.6751395593558426e+17, 'train_loss': 7.509777732389235, 'epoch': 3.0})

In [11]:
import wandb

artifact = wandb.Artifact("model_checkpoint", type="model")
artifact.add_dir("./results/")  # Upload all checkpoint files
wandb.log_artifact(artifact)


[34m[1mwandb[0m: Adding directory to artifact (./results)... Done. 0.0s


<Artifact model_checkpoint>

# Import the model artifact from wandb and compare its performance to the base model.

In [11]:
from transformers import  AutoModelForCausalLM

import wandb
wandb.init()
artifact = wandb.use_artifact("1257979-konkuk-university/huggingface/model_checkpoint:latest", type="model")
artifact_dir = artifact.download()

# Load the model
from transformers import AutoModel
model = AutoModelForCausalLM.from_pretrained(artifact_dir)


[34m[1mwandb[0m:   19 of 19 files downloaded.  


In [22]:
from transformers import AutoTokenizer, GenerationConfig

model_name = "tiiuae/Falcon3-1B-Base"
model.generation_config = GenerationConfig.from_pretrained(model_name)
model.generation_config.pad_token_id = model.generation_config.eos_token_id

tokenizer = AutoTokenizer.from_pretrained(model_name)
text = "What is the capital of South Korea?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model.generate(**inputs.to(model.device), max_new_tokens=10)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)

What is the capital of South Korea? Seoul

.

.





In [21]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, GenerationConfig

model_name = "tiiuae/Falcon3-1B-Base"

tokenizer = AutoTokenizer.from_pretrained(model_name)
model2 = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.bfloat16, device_map="cuda")
model2.generation_config = GenerationConfig.from_pretrained(model_name)
model2.generation_config.pad_token_id = model2.generation_config.eos_token_id

text = "What is the capital of South Korea?"
inputs = tokenizer(text, return_tensors="pt")
outputs = model2.generate(**inputs.to(model2.device), max_new_tokens=50)

result = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(result)


What is the capital of South Korea?


What is the capital of South Korea?


What is the capital of South Korea?


What is the capital of South Korea?


What is the capital of South Korea?


What is the
