In [None]:
# Prints a fixed string; presumably a smoke test that the kernel is running.
print("helloworld")

# Create and activate your venv
# Run `pip install -r requirements.txt`

In [None]:
#import dependencies
import gc

import pandas as pd
import psutil
import torch
from datasets import load_dataset, Dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments
from trl import SFTTrainer, DataCollatorForCompletionOnlyLM, SFTConfig

In [None]:
# Report, one value per line, whether PyTorch can see a CUDA device and the
# CUDA version string exposed by torch.version.cuda.
print(torch.cuda.is_available(), torch.version.cuda, sep="\n")


# Define Helper Functions

In [None]:
def generate_responses(model, tokenizer, user_message, system_message=None, max_new_tokens=100):
    """Generate the model's reply to a single-turn conversation.

    Args:
        model: Object exposing ``generate(...)`` and a ``device`` attribute.
        tokenizer: Tokenizer exposing ``apply_chat_template``, ``__call__``,
            ``decode`` and ``eos_token_id``.
        user_message: Content of the single user turn.
        system_message: Optional system prompt; when truthy it is placed
            before the user turn.
        max_new_tokens: Upper bound on the number of generated tokens.

    Returns:
        The decoded completion (prompt tokens excluded, special tokens
        skipped) with surrounding whitespace stripped.
    """
    # The data is assumed to be single-turn: an optional system message
    # followed by exactly one user message.
    user_turn = {"role": "user", "content": user_message}
    if system_message:
        conversation = [{"role": "system", "content": system_message}, user_turn]
    else:
        conversation = [user_turn]

    # Render the conversation to text with the tokenizer's chat template.
    chat_text = tokenizer.apply_chat_template(
        conversation,
        tokenize=False,
        add_generation_prompt=True,
        enable_thinking=False,
    )

    # Tokenize and move the tensors to the model's device (in case it is on GPU).
    encoded = tokenizer(chat_text, return_tensors="pt").to(model.device)

    # For serious workloads a dedicated inference engine (vLLM, SGLang or
    # TensorRT) is recommended over plain `generate`.
    with torch.no_grad():
        sequences = model.generate(
            **encoded,
            max_new_tokens=max_new_tokens,
            do_sample=False,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # The returned sequence starts with the prompt; keep only what follows it.
    prompt_length = encoded["input_ids"].shape[1]
    completion_ids = sequences[0][prompt_length:]
    return tokenizer.decode(completion_ids, skip_special_tokens=True).strip()


In [None]:
def test_model_with_questions(model, tokenizer, questions,
                             system_message=None, title="Model Output"):
    """Ask the model every question and show prompts and replies as a table.

    Args:
        model: Model forwarded to ``generate_responses``.
        tokenizer: Tokenizer forwarded to ``generate_responses``.
        questions: Iterable of user prompts, each answered independently.
        system_message: Optional system prompt applied to every question.
        title: Heading printed above the table.

    Returns:
        None. The table is rendered with ``display``.
    """
    print(f"\n==== {title} ====\n")
    rows = [
        {
            "User Prompt": question,
            "Assistant Response": generate_responses(model, tokenizer, question, system_message),
        }
        for question in questions
    ]
    # Lift the column-width cap so long text is not truncated. Note that this
    # changes the pandas option globally, not just for this table.
    pd.set_option('display.max_colwidth', None)
    display(pd.DataFrame(rows))


In [None]:
def load_model_and_tokenizer(model_name, use_gpu=True):
    """Load a causal language model and its tokenizer by name.

    Args:
        model_name: Identifier passed to ``from_pretrained`` for both the
            tokenizer and the model.
        use_gpu: When true, the model is moved to the ``"cuda"`` device.

    Returns:
        A ``(model, tokenizer)`` tuple. The tokenizer is guaranteed to have a
        chat template (a plain-text fallback is installed if it ships without
        one) and a pad token (the EOS token is reused if none is set).
    """
    tokenizer = AutoTokenizer.from_pretrained(model_name)
    model = AutoModelForCausalLM.from_pretrained(model_name)
    if use_gpu:
        model.to("cuda")

    # If the tokenizer doesn't ship a chat template, install a minimal one.
    # It is built from adjacent string literals so that no source indentation
    # leaks into the rendered prompt, and every block tag is closed with `%}`
    # (a space between `%` and `}` is a Jinja syntax error).
    if not tokenizer.chat_template:
        tokenizer.chat_template = (
            "{% for message in messages %}"
            "{% if message['role'] == 'system' %}System: {{ message['content'] }}\n"
            "{% elif message['role'] == 'user' %}User: {{ message['content'] }}\n"
            "{% elif message['role'] == 'assistant' %}Assistant: {{ message['content'] }} <|endoftext|>\n"
            "{% endif %}"
            "{% endfor %}"
            # Cue the model to answer when the caller asks for a generation prompt.
            "{% if add_generation_prompt %}Assistant:{% endif %}"
        )

    # Padding needs a pad token; fall back to the EOS token when none is defined.
    if not tokenizer.pad_token:
        tokenizer.pad_token = tokenizer.eos_token

    return model, tokenizer


In [None]:
def display_dataset(dataset):
    """Print the dataset size and render its conversations as a table.

    Args:
        dataset: Sized iterable of examples. Each example is a mapping with a
            ``'messages'`` list whose items have ``'role'`` and ``'content'``
            keys.

    Returns:
        None. One row per example is shown via ``display``, holding the first
        user message and the first assistant message. A missing turn is shown
        as an empty (null) cell instead of aborting the whole preview.
    """
    print(f"Length of the dataset: {len(dataset)}")

    def first_content(messages, role):
        # Default to None so a conversation without this role does not raise
        # a bare StopIteration out of the helper.
        return next((m['content'] for m in messages if m['role'] == role), None)

    rows = [
        {
            "User Prompt": first_content(example['messages'], 'user'),
            "Assistant Response": first_content(example['messages'], 'assistant'),
        }
        for example in dataset
    ]

    # Naming the columns explicitly keeps the headers even for an empty dataset.
    df = pd.DataFrame(rows, columns=["User Prompt", "Assistant Response"])
    pd.set_option('display.max_colwidth', None) # avoid truncating long text
    display(df)



# Load base model & test on simple questions

In [None]:
# Use the GPU whenever PyTorch reports that CUDA is available.
USE_GPU = torch.cuda.is_available()
print(f"Using GPU: {USE_GPU}")

# Probe prompts used to compare the models before and after SFT.
questions = [
    "Give me a 1-sentence introduction to the topic of AI",
    "Calculate the sum of 1+1-1",
    "What is the difference between a thread and a process?"
]

In [None]:
# Baseline: load the base model (before SFT) and show its answers to the probe questions.
model, tokenizer = load_model_and_tokenizer("Qwen/Qwen3-0.6B-Base", use_gpu=USE_GPU)

test_model_with_questions(model, tokenizer, questions, title="Base Model (Before SFT) Output")

# Dropping the names alone does not hand GPU memory back: force a garbage
# collection and release PyTorch's cached CUDA blocks so the next model
# loaded can actually use that memory.
del model, tokenizer
gc.collect()
if USE_GPU:
    torch.cuda.empty_cache()

In [None]:
# Load an SFT model and ask it the same probe questions for comparison.
model, tokenizer = load_model_and_tokenizer(
    model_name="banghua/Qwen3-0.6B",
    use_gpu=USE_GPU,
)
test_model_with_questions(
    model=model,
    tokenizer=tokenizer,
    questions=questions,
    title="Base Model (After SFT) Output",
)

# Doing SFT on a small model

In [None]:
# Load the base checkpoint again; this instance is the one handed to the trainer.
model_name = "Qwen/Qwen3-0.6B-Base"
model, tokenizer = load_model_and_tokenizer(model_name=model_name, use_gpu=USE_GPU)

In [None]:
train_dataset = load_dataset("banghua/DL-SFT-Dataset")["train"]
if not USE_GPU:
    # Without a GPU, keep at most the first 100 examples.
    train_dataset = train_dataset.select(range(min(100, len(train_dataset))))
# Preview at most 200 rows. The bound is clamped to the dataset size because
# the CPU subset above holds only 100 rows and select() rejects indices that
# are out of range.
display_dataset(train_dataset.select(range(min(200, len(train_dataset)))))

In [None]:
# Hyperparameters for the SFT run.
sft_config = SFTConfig(
    # Learning rate for training.
    learning_rate=8e-5,
    # Number of passes over the training set.
    num_train_epochs=1,
    # Batch size for each device (e.g. GPU) during training.
    per_device_train_batch_size=1,
    # Number of batches whose gradients are accumulated before each update.
    gradient_accumulation_steps=8,
    # Gradient checkpointing trades speed for lower memory use; disabled here.
    gradient_checkpointing=False,
    # Log training progress every 2 steps.
    logging_steps=2,
)

## Show current memory stats before training


In [None]:
#@title Show current memory stats
# Divisor used to report byte counts in gigabytes.
BYTES_PER_GB = 1024 ** 3

if USE_GPU:
    gpu_stats = torch.cuda.get_device_properties(0)
    # Reserved-memory baseline before training; the post-training stats cell
    # subtracts it to estimate what training itself needed.
    start_gpu_memory = round(torch.cuda.max_memory_reserved() / BYTES_PER_GB, 3)
    max_memory = round(gpu_stats.total_memory / BYTES_PER_GB, 3)
    print(f"GPU = {gpu_stats.name}. Max memory = {max_memory} GB.")
    print(f"{start_gpu_memory} GB of memory reserved.")
else:
    cpu_stats = psutil.virtual_memory()
    # interval=1 blocks for one second while CPU utilisation is sampled.
    cpu_usage = psutil.cpu_percent(interval=1)
    print(f"CPU Usage: {cpu_usage}%")
    print(f"CPU Memory: {round(cpu_stats.used / BYTES_PER_GB, 3)} GB used / {round(cpu_stats.total / BYTES_PER_GB, 3)} GB total")
    print(f"CPU Memory Available: {round(cpu_stats.available / BYTES_PER_GB, 3)} GB")

In [None]:
# Build the trainer from the model, tokenizer, dataset and config defined above.
sft_trainer = SFTTrainer(
    model=model,
    processing_class=tokenizer,
    train_dataset=train_dataset,
    args=sft_config,
)

# Run the fine-tuning and keep the value returned by train().
trainer_stats = sft_trainer.train()

## Show final memory and time stats after training (only when running on GPU)

In [None]:
#@title Show final memory and time stats
if USE_GPU:
    # The run's metrics are on the value returned by `sft_trainer.train()`
    # (`trainer_stats`); the trainer object itself has no `metrics` attribute.
    train_runtime = trainer_stats.metrics['train_runtime']

    used_memory = round(torch.cuda.max_memory_reserved() / 1024 / 1024 / 1024, 3)
    # Peak reserved memory minus the pre-training baseline approximates what
    # training itself required.
    used_memory_for_training = round(used_memory - start_gpu_memory, 3)
    used_percentage = round(used_memory / max_memory * 100, 3)
    training_percentage = round(used_memory_for_training / max_memory * 100, 3)

    print(f"{train_runtime} seconds used for training.")
    print(f"{round(train_runtime/60, 2)} minutes used for training.")
    print(f"Peak reserved memory = {used_memory} GB.")
    print(f"Peak reserved memory for training = {used_memory_for_training} GB.")
    print(f"Peak reserved memory % of max memory = {used_percentage} %.")
    print(f"Peak reserved memory for training % of max memory = {training_percentage} %.")

# Testing training results on small model and small dataset

In [None]:
# When no GPU is in use, move the trained model to the CPU before generating.
if not USE_GPU:
    sft_trainer.model.to("cpu")

# Ask the fine-tuned model the same probe questions as before.
test_model_with_questions(
    sft_trainer.model,
    sft_trainer.processing_class,
    questions,
    title="SFT Model (After SFT) Output",
)