In [1]:
import os
os.environ["CC"] = os.popen("which gcc").read().strip()
os.environ["CXX"] = os.popen("which g++").read().strip()

from datasets import load_dataset

# Load the processed dataset
dataset = load_dataset("json", data_files="nonbooks_poutput_sharegpt.jsonl")
full_dataset = dataset["train"]

# Step 1: Split into training (70%) and temp (30%)
train_temp_split = full_dataset.train_test_split(test_size=0.3, seed=42)
train_dataset = train_temp_split["train"]  # 70% for training
temp_dataset = train_temp_split["test"]    # 30% for validation + test

# Step 2: Split temp into validation (20%) and test (10%)
valid_test_split = temp_dataset.train_test_split(test_size=1/3, seed=42)  # 1/3 of 30% is 10%
eval_dataset = valid_test_split["train"]  # 20% for validation
test_dataset = valid_test_split["test"]   # 10% for testing

# Print dataset sizes
print(f"Training examples: {len(train_dataset)}")
print(f"Validation examples: {len(eval_dataset)}")
print(f"Test examples: {len(test_dataset)}")

Training examples: 5963
Validation examples: 1704
Test examples: 852


In [2]:
def flatten_conversations(example):
    flattened_text = ""
    for turn in example["conversations"]:
        role = turn["role"]
        content = turn["content"].strip()
        if role == "user":
            flattened_text += f"### User:\n{content}\n"
        elif role == "assistant":
            flattened_text += f"### Assistant:\n{content}\n"
        else:
            raise ValueError(f"Invalid role: {role}")
    return {"text": flattened_text}

# Apply the transformation
train_dataset = train_dataset.map(flatten_conversations)
eval_dataset = eval_dataset.map(flatten_conversations)
# Inspect the result
print(train_dataset[0])
print(eval_dataset[0])

{'conversations': [{'role': 'user', 'content': 'The U.S. always portrays itself as the greatest force on the planet for peace, justice, human rights, racial equality, etc. Polls tell us that most other nations actually regard the U.S. as the greatest threat to stability. What in your view is the truth here?'}, {'role': 'assistant', 'content': 'Even during the Obama years, international polls showed that world opinion regarded the US as the greatest threat to world peace, no other country even close. Americans were protected from the news, though one could learn about it from foreign media and dissident sources. Sometimes illustrations are reported. Thus there has been some mention of the recent UN vote condemning the savage Cuba sanctions, virtually a blockade: 180-2 (US-Israel). The NY Times dismissed it as a chance for critics of the US to blow off steam. That’s quite normal. When there are reports of how the world is out of step, the usual framework is curiosity about the psychic ma

In [3]:
from unsloth import FastLanguageModel
import torch

max_seq_length = 512
dtype = None
load_in_4bit = True

model, tokenizer = FastLanguageModel.from_pretrained(
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit",
    max_seq_length = max_seq_length,
    dtype = dtype,
    load_in_4bit = load_in_4bit,
)

🦥 Unsloth: Will patch your computer to enable 2x faster free finetuning.


    PyTorch 2.5.1 with CUDA 1201 (you have 2.6.0+cu124)
    Python  3.11.10 (you have 3.11.11)
  Please reinstall xformers (see https://github.com/facebookresearch/xformers#installing-xformers)
  Memory-efficient attention, SwiGLU, sparse and more won't be available.
  Set XFORMERS_MORE_DETAILS=1 for more details


Unsloth: Failed to patch Gemma3ForConditionalGeneration.
🦥 Unsloth Zoo will now patch everything to make training faster!
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


model.safetensors:   0%|          | 0.00/1.03G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/234 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/54.7k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/17.2M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/454 [00:00<?, ?B/s]

In [4]:
model = FastLanguageModel.get_peft_model(
    model,
    r = 8,
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
    lora_alpha = 16, 
    lora_dropout = 0.1,
    bias = "none",
    use_gradient_checkpointing = "unsloth",
    random_state = 3407,
    use_rslora = True,
    loftq_config = None,
)

Unsloth: Dropout = 0 is supported for fast patching. You are using dropout = 0.1.
Unsloth will patch all other layers, except LoRA matrices, causing a performance hit.
Unsloth 2025.3.19 patched 16 layers with 0 QKV layers, 0 O layers and 0 MLP layers.


In [8]:
from transformers import TrainingArguments
from trl import SFTTrainer
from unsloth import is_bfloat16_supported

training_args = TrainingArguments(
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=8,
    gradient_checkpointing=True,
    warmup_steps=100,
    num_train_epochs=10,
    learning_rate=1e-5,
    fp16=not is_bfloat16_supported(),
    bf16=is_bfloat16_supported(),
    logging_steps=10,
    save_steps=100,
    save_total_limit=2,
    logging_dir="logs",
    report_to="none",
)

trainer = SFTTrainer(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    dataset_text_field="text",
    max_seq_length=512,
    dataset_num_proc=2,
    packing=False,
    args=training_args,
)
# Start training
trainer.train()

# Save the fine-tuned model and tokenizer
model.save_pretrained("model/Llama-3.2-1B-Instruct-bnb-4bit/model/fine-tuned-model")
tokenizer.save_pretrained("model/Llama-3.2-1B-Instruct-bnb-4bit/token/fine-tuned-model")

Unsloth: Tokenizing ["text"] (num_proc=2):   0%|          | 0/5963 [00:00<?, ? examples/s]

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 5,963 | Num Epochs = 10 | Total steps = 7,450
O^O/ \_/ \    Batch size per device = 1 | Gradient accumulation steps = 8
\        /    Data Parallel GPUs = 1 | Total batch size (1 x 8 x 1) = 8
 "-____-"     Trainable parameters = 5,636,096/1,000,000,000 (0.56% trained)


Step,Training Loss
10,0.936
20,0.9648
30,0.8846
40,0.7858
50,0.5268
60,0.2283
70,0.0953
80,0.0561
90,0.0496
100,0.0539


('model/Llama-3.2-1B-Instruct-bnb-4bit/token/fine-tuned-model/tokenizer_config.json',
 'model/Llama-3.2-1B-Instruct-bnb-4bit/token/fine-tuned-model/special_tokens_map.json',
 'model/Llama-3.2-1B-Instruct-bnb-4bit/token/fine-tuned-model/tokenizer.json')