In [None]:
# Step 1: Import the required libraries
import json

import torch
import transformers
from datasets import Dataset, concatenate_datasets, load_dataset
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    pipeline,
)

In [None]:
# Step 2: Load the Base Model and Tokenizer
model_name = "Qwen/Qwen2.5-1.5B-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    # "auto" adopts whatever dtype the checkpoint config declares, which may be
    # a half-precision type; request FP32 explicitly for CPU training.
    torch_dtype=torch.float32,
    device_map="cpu",  # Run on CPU
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# The preprocessing step pads every example to a fixed length, which needs a
# pad token; fall back to EOS if this tokenizer does not define one.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# Step 3: Prepare the Model for LoRA Fine-Tuning
# NOTE(review): the model above is loaded without any 8-bit/4-bit quantization
# settings -- confirm that this k-bit preparation step is actually needed.
model = prepare_model_for_kbit_training(model=model)

In [None]:
# Optional: print the name of every sub-module in the model. Use this list to
# choose `target_modules` for the LoRA configuration when the model's layer
# names are not known in advance.
for module_name, _module in model.named_modules():
    print(module_name)

In [None]:
# LoRA adapter settings
lora_config = LoraConfig(
    task_type="CAUSAL_LM",  # causal language modeling
    # Attention query/key/value projections, picked from the module listing
    # printed by the optional inspection cell.
    target_modules=["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj"],
    r=16,              # adapter rank
    lora_alpha=32,     # scaling factor
    lora_dropout=0.1,  # dropout applied inside the adapters
    bias="none",       # bias terms are not fine-tuned
)

# Wrap the base model with the LoRA adapters
model = get_peft_model(model, lora_config)

In [None]:
# Step 4: Load and Prepare the Datasets
# Load the question-answer dataset. The file is parsed as JSON and each entry
# is read through its "question" and "answer" keys.
# Decode as UTF-8 explicitly rather than relying on the platform's default
# encoding, which differs between operating systems.
with open("cancer_qa.txt", "r", encoding="utf-8") as f:
    qa_data = json.load(f)

qa_dataset = Dataset.from_dict({
    "prompt": [item["question"] for item in qa_data],
    "response": [item["answer"] for item in qa_data]
})

In [None]:
# Load unstructured course notes (each line of the file is treated as one note).
# Decode as UTF-8 explicitly rather than relying on the platform default.
with open("cancer_data.txt", "r", encoding="utf-8") as f:
    course_notes = f.readlines()

# Create synthetic prompt-response pairs from course notes.
# Blank lines are skipped: they would otherwise become training examples with
# an empty "Explain: " prompt and an empty response.
unstructured_data = [
    {"prompt": f"Explain: {text}", "response": text}
    for text in (note.strip() for note in course_notes)
    if text
]

In [None]:
# Combine datasets
# `Dataset` has no `add_batch` method, so wrap the synthetic pairs in their own
# Dataset and concatenate it with the question-answer dataset.
notes_dataset = Dataset.from_dict({
    "prompt": [item["prompt"] for item in unstructured_data],
    "response": [item["response"] for item in unstructured_data],
})
all_data = concatenate_datasets([qa_dataset, notes_dataset])

In [None]:
# Tokenize and preprocess the dataset
def preprocess_function(examples):
    """Tokenize a batch of prompt/response pairs for causal-LM training.

    Args:
        examples: A batch mapping with parallel "prompt" and "response" lists
            of strings.

    Returns:
        The tokenizer output for the concatenated texts, with an added
        "labels" entry. Labels equal the input ids at real-token positions and
        -100 at padded positions.
    """
    # Concatenate prompt and response element-wise
    combined_texts = [p + " " + r for p, r in zip(examples["prompt"], examples["response"])]
    # Truncate or pad every example to exactly 512 tokens
    inputs = tokenizer(combined_texts, max_length=512, truncation=True, padding="max_length")
    # Copying input_ids verbatim would also train the model to predict padding.
    # -100 is the label value the loss ignores, so use it wherever the
    # attention mask marks a position as padding.
    inputs["labels"] = [
        [token_id if is_real else -100 for token_id, is_real in zip(ids, mask)]
        for ids, mask in zip(inputs["input_ids"], inputs["attention_mask"])
    ]
    return inputs

In [None]:
# Run the preprocessing function over the combined dataset in batches
tokenized_dataset = all_data.map(function=preprocess_function, batched=True)

In [None]:
# Step 5: Configure the Training Run
training_args = TrainingArguments(
    output_dir="./qwen2.5-c-tuned",  # Directory to save the fine-tuned model
    per_device_train_batch_size=1,  # Batch size for CPU
    num_train_epochs=3,  # Number of epochs
    save_steps=500,  # Save checkpoint every 500 steps
    logging_dir="./logs",  # Log directory
    logging_steps=10,  # Log every 10 steps
    # Evaluation stays disabled: "no" is already the default strategy, so the
    # setting is omitted. The keyword was renamed from `evaluation_strategy` to
    # `eval_strategy`, and passing the old name fails on recent releases.
    fp16=False,  # Disable mixed precision (CPU only)
    push_to_hub=False  # Do not push to Hugging Face Hub
)


In [None]:
# Step 6: Train the Model
# NOTE(review): newer transformers releases may prefer `processing_class=` over
# `tokenizer=` -- confirm against the installed version.
trainer = Trainer(
    args=training_args,               # run configuration from the previous step
    model=model,                      # LoRA-wrapped model
    tokenizer=tokenizer,              # tokenizer handed to the trainer
    train_dataset=tokenized_dataset,  # tokenized training examples
)

trainer.train()
#TrainOutput(global_step=249, training_loss=1.2162665707998008, metrics={'train_runtime': 7465.7666, 'train_samples_per_second': 0.033, 'train_steps_per_second': 0.033, 'total_flos': 1004597150220288.0, 'train_loss': 1.2162665707998008, 'epoch': 3.0}) on AWS workspace desktop

In [None]:
# Save the trained model and the tokenizer into the same output directory
trainer.save_model(output_dir="./qwen2.5-finetuned")
tokenizer.save_pretrained(save_directory="./qwen2.5-finetuned")