In [None]:
import os

import pandas as pd
import torch
from datasets import load_dataset
from huggingface_hub import login
from peft import LoraConfig, get_peft_model
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer
from transformers import EarlyStoppingCallback

In [None]:
# Paths of the raw Q&A datasets (CSV files).
MED_DATASET_PATH = "data/medical.csv"
PYTHON_DATASET_PATH = "data/python.csv"

# Paths where the locally trained LoRA adapters are stored.
MED_LORA_PATH = "adapters/med_lora_adapter"
PYTHON_LORA_PATH = "adapters/python_lora_adapter"

# The education LoRA adapter is loaded from the Hugging Face Hub rather than trained locally.
EDU_LORA_PATH = "kaitchup/Llama-3.2-3B-Instruct-educational-chatbot"

# Router artifacts. They are not referenced in this notebook; presumably they
# are consumed by the routing code elsewhere in the project.
ROUTER_SAFETENSORS = "router/router.safetensors"
COUNT_VECTORIZER = "router/count_vectorizer.joblib"

# Hugging Face access token. It is read from the HF_TOKEN environment variable
# so the secret never has to be pasted into (and committed with) the notebook.
# Falls back to an empty string when the variable is not set.
HUGGING_FACE_TOKEN = os.environ.get("HF_TOKEN", "")

# Base checkpoints. Both names currently point to the same model.
DRAFT_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"
TARGET_MODEL_NAME = "meta-llama/Llama-3.2-3B-Instruct"

# Data prep + training

This notebook is an example of how we trained our LoRA adapters. This specific example uses the medical dataset and saves a medical LoRA adapter. We repeated the same process for the adapter trained on the Python dataset.

In [None]:
# Size of the training subset and the seed that makes the draw repeatable.
SAMPLE_SIZE = 5000
SAMPLE_SEED = 42

# Load the raw medical Q&A table. Rows without a question or an answer are
# dropped: concatenating a missing value would turn the whole prompt (or the
# target) into NaN. A missing focus area is replaced by an empty string so the
# rest of the prompt survives.
df = (
    pd.read_csv(MED_DATASET_PATH)
    .dropna(subset=["question", "answer"])
    .assign(focus_area=lambda frame: frame["focus_area"].fillna(""))
)

# Since these datasets are Q&A based, combine question and answer into instruction-output pairs
df["input"] = "Q: " + df["question"] + "\nFocus Area: " + df["focus_area"] + "\nA:"
df["output"] = df["answer"]

# Save the full prepared dataset.
df[["input", "output"]].to_csv("finetuning_data.csv", index=False)

# Save the random subset that the training cell loads. The sample size is
# capped at the number of available rows so a smaller dataset does not make
# DataFrame.sample raise.
sampled_df = df.sample(n=min(SAMPLE_SIZE, len(df)), random_state=SAMPLE_SEED)
sampled_df[["input", "output"]].to_csv("sampled_finetuning_data.csv", index=False)

In [None]:
# Authenticate with the Hugging Face Hub only when a token is configured.
# An empty token cannot authenticate, so in that case the explicit login is
# skipped and the libraries fall back to whatever credentials are already
# available in the environment.
if HUGGING_FACE_TOKEN:
    login(token=HUGGING_FACE_TOKEN)

# Load in the intermediate dataset
dataset = load_dataset("csv", data_files="sampled_finetuning_data.csv", split="train")

# Load the tokenizer and the base model that the LoRA adapter is trained on.
model_name = DRAFT_MODEL_NAME
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)

# Use the end-of-sequence token for padding (presumably because the base
# tokenizer does not define a dedicated pad token -- confirm).
tokenizer.pad_token = tokenizer.eos_token

# Tokenize the dataset
def preprocess_data(examples, max_length=512):
    """Turn a batch of prompt/answer pairs into causal-LM training features.

    A causal language model is trained on a single token sequence whose labels
    line up position by position with ``input_ids``. Each example is therefore
    encoded as ``prompt tokens + answer tokens + EOS``. The labels are a copy
    of that sequence in which every prompt position and every padding position
    is set to -100, so the loss is computed on the answer (and its EOS) only.

    Args:
        examples: Batch mapping with "input" (prompt) and "output" (answer)
            columns, each a list of values. ``None`` entries are treated as
            empty strings.
        max_length: Fixed length every sequence is truncated or right-padded
            to.

    Returns:
        Dict with "input_ids", "attention_mask" and "labels", each a list of
        ``max_length``-long integer lists.

    Note:
        Relies on the module-level ``tokenizer`` and its pad/EOS token ids.
        A prompt that alone fills ``max_length`` leaves no supervised tokens
        for that example.
    """
    # Missing CSV cells arrive as None; treat them as empty text.
    prompts = ["" if item is None else str(item) for item in examples["input"]]
    answers = ["" if item is None else str(item) for item in examples["output"]]

    # Prompt and answer are tokenized separately so the prompt length in
    # tokens is known exactly. The answer gets a leading space so it reads as
    # a continuation of the prompt, and no special tokens so that nothing is
    # inserted in the middle of the sequence.
    prompt_batch = tokenizer(prompts, truncation=True, max_length=max_length)["input_ids"]
    answer_batch = tokenizer(
        [" " + answer for answer in answers], add_special_tokens=False
    )["input_ids"]

    features = {"input_ids": [], "attention_mask": [], "labels": []}
    for prompt_ids, answer_ids in zip(prompt_batch, answer_batch):
        prompt_ids = list(prompt_ids)
        # Close the answer with EOS so the model learns where to stop.
        answer_ids = list(answer_ids) + [tokenizer.eos_token_id]

        token_ids = (prompt_ids + answer_ids)[:max_length]
        labels = ([-100] * len(prompt_ids) + answer_ids)[:max_length]

        # Padding is masked by position rather than by token id: the pad token
        # may be the same id as EOS, and the real EOS must stay supervised.
        pad_count = max_length - len(token_ids)
        features["input_ids"].append(token_ids + [tokenizer.pad_token_id] * pad_count)
        features["attention_mask"].append([1] * len(token_ids) + [0] * pad_count)
        features["labels"].append(labels + [-100] * pad_count)

    return features

# Apply the tokenization function to the whole dataset, one batch of rows at a time.
tokenized_dataset = dataset.map(
    function=preprocess_data,
    batched=True,
)

In [None]:
# LoRA hyperparameters: rank of the low-rank update matrices, their scaling
# factor, and the dropout applied inside the LoRA layers.
LORA_RANK = 4
LORA_ALPHA = 16
LORA_DROPOUT = 0.1

# LoRA configuration for training (not inference) a causal language model.
lora_config = LoraConfig(
    r=LORA_RANK,
    lora_alpha=LORA_ALPHA,
    lora_dropout=LORA_DROPOUT,
    inference_mode=False,
    task_type="CAUSAL_LM",
)

# Wrap the base model with the LoRA adapter described by the configuration above.
model = get_peft_model(model, lora_config)

# Training arguments.
# No evaluation runs during training. The explicit `evaluation_strategy="no"`
# argument was dropped: "no" is already the default, and that keyword is
# deprecated in favour of `eval_strategy` in newer transformers releases, so
# omitting it keeps the cell working across versions.
training_args = TrainingArguments(
    output_dir="./lora_finetuned",
    learning_rate=3e-4,
    per_device_train_batch_size=4,
    num_train_epochs=3,
    # Write a checkpoint after every epoch, keeping only the most recent one.
    save_strategy="epoch",
    save_total_limit=1,
    logging_dir="./logs",
    logging_steps=10,
    # Do not send metrics to any external experiment tracker.
    report_to="none",
)

# Collect everything the Trainer needs: the LoRA-wrapped model, the training
# arguments, the tokenized training set and the tokenizer.
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": tokenized_dataset,
    "tokenizer": tokenizer,
}
trainer = Trainer(**trainer_kwargs)

# Run the fine-tuning loop; the returned training summary is shown as the cell output.
trainer.train()

In [None]:
# Save the LoRA adapter to file.
# Use the configured MED_LORA_PATH ("adapters/med_lora_adapter") instead of a
# separate literal, so the adapter is written to the same location that the
# path constant advertises.
model.save_pretrained(MED_LORA_PATH)