In [2]:
import os
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    pipeline,
    logging,
)
from peft import LoraConfig
from trl import SFTTrainer

In [3]:
from datasets import load_dataset
from transformers import AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from peft import LoraConfig, get_peft_model, TaskType

In [4]:
dataset = load_dataset("json", data_files="train_finetune.jsonl")["train"]


In [5]:
def format_example(example):
    return {
        "text": f"### Instruction:\n{example['input']}\n\n### Response:\n{example['output']}"
    }

dataset = dataset.map(format_example)


In [None]:
from huggingface_hub import login

# Paste your Hugging Face access token here
login("")


In [7]:
model_name = "meta-llama/Llama-2-7b-chat-hf"

# Load tokenizer
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Important: LLaMA 2 tokenizer may not have a pad token
# safest is to set pad_token = eos_token
tokenizer.pad_token = tokenizer.eos_token

# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",  # will put model on GPU(s) if available
    dtype="auto"
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [8]:
split_dataset = dataset.train_test_split(test_size=0.1, seed=42)
train_data = split_dataset["train"]
val_data = split_dataset["test"]

print(train_data[0]["text"])

### Instruction:
Context:
Why does a tree make sound when it crashes to the ground? How does the sound reach peoples ears if they happen to be in the forest? And in general, how do sounds get started, and how do they travel? Keep reading to find out. All sounds begin with vibrating matter. It could be the ground vibrating when a tree comes crashing down. Or it could be guitar strings vibrating when they are plucked. You can see a guitar string vibrating in Figure 20.2. The vibrating string repeatedly pushes against the air particles next to it. The pressure of the vibrating string causes these air particles to vibrate. The air particles alternately push together and spread apart. This starts waves of vibrations that travel through the air in all directions away from the strings. The vibrations pass through the air as longitudinal waves, with individual air particles vibrating back and forth in the same direction that the waves travel. You can see an animation of sound waves moving thro

In [9]:
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

train_tokenized = train_data.map(tokenize, batched=True, remove_columns=train_data.column_names)
val_tokenized = val_data.map(tokenize, batched=True, remove_columns=val_data.column_names)


Map:   0%|          | 0/866 [00:00<?, ? examples/s]

In [10]:
from transformers import AutoModelForCausalLM, BitsAndBytesConfig

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_compute_dtype="float16",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    quantization_config=bnb_config
)


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [11]:
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    r=16,
    lora_alpha=32,
    lora_dropout=0.1
)
model = get_peft_model(model, peft_config)


In [12]:

model.print_trainable_parameters()


trainable params: 8,388,608 || all params: 6,746,804,224 || trainable%: 0.1243


In [13]:
training_args = TrainingArguments(
    output_dir="/content/tinyllama-finetuned",
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    learning_rate=2e-4,
    num_train_epochs=3,
    logging_steps=50,
    save_strategy="epoch",   # still valid
    eval_strategy="epoch",   # <-- fix for older versions
    save_total_limit=2,
    fp16=True,
    report_to="none"
)


In [14]:

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_tokenized,
    eval_dataset=val_tokenized,
    data_collator=DataCollatorForLanguageModeling(tokenizer, mlm=False)
)


In [15]:
trainer.train()


Epoch,Training Loss,Validation Loss
1,0.402,0.405009
2,0.1849,0.174156
3,0.0906,0.101879


TrainOutput(global_step=2922, training_loss=0.3819257597006159, metrics={'train_runtime': 3390.6057, 'train_samples_per_second': 6.89, 'train_steps_per_second': 0.862, 'total_flos': 4.747779701295022e+17, 'train_loss': 0.3819257597006159, 'epoch': 3.0})

In [16]:
trainer.save_model("new_fine_tunned_lama2")


In [1]:
import math
import pandas as pd
from datasets import load_dataset
from tqdm import tqdm
from transformers import AutoTokenizer

# 1. Reload tokenizer (fix padding issue for LLaMA)
tokenizer = AutoTokenizer.from_pretrained("meta-llama/Llama-2-7b-hf")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# 2. Define preprocess function
def preprocess(examples):
    combined = [f"{i}\n{o}" for i, o in zip(examples["input"], examples["output"])]
    return tokenizer(
        combined,
        truncation=True,
        padding="max_length",  # or "longest" if you prefer dynamic padding
        max_length=1024
    )


In [22]:
# 3. Load test dataset
test_dataset = load_dataset("json", data_files={"test": "test_finetune.jsonl"})

# 4. Preprocess test dataset
tokenized_test = test_dataset.map(
    preprocess,
    batched=True,
    remove_columns=test_dataset["test"].column_names
)

Map:   0%|          | 0/2512 [00:00<?, ? examples/s]

In [23]:
# 5. Evaluate with Trainer (loss + perplexity)
metrics = trainer.evaluate(eval_dataset=tokenized_test["test"])
print("Test Metrics:", metrics)
print("Perplexity:", math.exp(metrics["eval_loss"]))


Test Metrics: {'eval_loss': 2.900541067123413, 'eval_runtime': 226.7565, 'eval_samples_per_second': 11.078, 'eval_steps_per_second': 1.385, 'epoch': 3.0}
Perplexity: 18.18398146275199


In [None]:
# 6. Exact-Match Accuracy Evaluation
predictions_data = []
correct, total = 0, 0

for example in tqdm(test_dataset["test"], desc="Evaluating"):
    inputs = tokenizer(example["input"], return_tensors="pt").to(model.device)
    outputs = model.generate(**inputs, max_new_tokens=50)
    prediction = tokenizer.decode(outputs[0], skip_special_tokens=True)

    match = example["output"].strip() in prediction
    if match:
        correct += 1
    total += 1

    predictions_data.append({
        "input": example["input"],
        "expected_output": example["output"],
        "model_prediction": prediction,
        "match": match
    })

accuracy = correct / total if total > 0 else 0
print(f"\nExact-Match Accuracy on Test Set: {accuracy:.2%}")



Evaluating:   4%|▍         | 95/2512 [05:25<1:54:33,  2.84s/it]