In [1]:
# Check if GPU is used
import torch
print(torch.cuda.is_available())

True


In [2]:
import pandas as pd
df = pd.read_csv("../Dataset/dataset_QA.csv")

In [3]:
df.head(2)

Unnamed: 0,question,answer
0,What is the goal of this machine learning tech...,To help you or your team work on a machine lea...
1,What does the guide assume about the reader's ...,It assumes the reader has taken a machine lear...


In [4]:
from transformers import AutoTokenizer, DataCollatorForLanguageModeling, Trainer, TrainingArguments
from datasets import Dataset
import pandas as pd

# Load tokenizer and data collator
tokenizer = AutoTokenizer.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False,
    pad_to_multiple_of=8,  # Helps with efficiency
    return_tensors="pt"
)
# Convert to Dataset
dataset = Dataset.from_pandas(df[["question", "answer"]])

# Format question-answer into a text prompt
def format_qa(example):
    prompt = f"Q: {example['question']}\nA: {example['answer']}"
    return {"text": prompt}

# Tokenize formatted prompt
def tokenize(example):
    return tokenizer(
        example["text"],
        truncation=True,
        padding="max_length",
        max_length=512
    )

# Set labels = input_ids for causal LM
def set_labels(example):
    example["labels"] = example["input_ids"]
    return example

# Apply formatting, tokenization, and labels
formatted_dataset = dataset.map(format_qa)
tokenized_dataset = formatted_dataset.map(tokenize, batched=True)
tokenized_dataset = tokenized_dataset.map(set_labels)
tokenized_dataset = tokenized_dataset.remove_columns(["question", "answer", "text"])
tokenized_dataset.set_format(type="torch", columns=["input_ids", "attention_mask", "labels"])


# Train/test split
split_dataset = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset = split_dataset["train"]
test_dataset = split_dataset["test"]


  from scipy.sparse import csr_matrix, issparse





Map:   0%|          | 0/2592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2592 [00:00<?, ? examples/s]

Map:   0%|          | 0/2592 [00:00<?, ? examples/s]

In [5]:
from transformers import AutoModelForCausalLM
from transformers import BitsAndBytesConfig

from peft import LoraConfig, get_peft_model
# Load model directly
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.float16,
    bnb_4bit_quant_type="nf4"
)

model = AutoModelForCausalLM.from_pretrained("TinyLlama/TinyLlama-1.1B-Chat-v1.0",
                                             quantization_config=bnb_config,
                                             device_map="auto",
                                             trust_remote_code=True)

# Configure LoRA
lora_config = LoraConfig(
    r=256,
    lora_alpha=32,
    target_modules = ["self_attn.q_proj", "self_attn.k_proj", "self_attn.v_proj", "self_attn.o_proj"],
    lora_dropout=0.1,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.gradient_checkpointing_disable()
model.train() 
...

Ellipsis

In [6]:
model.print_trainable_parameters()

trainable params: 72,089,600 || all params: 1,172,137,984 || trainable%: 6.1503


In [8]:
import os
os.environ["CUDA_LAUNCH_BLOCKING"] = "1"
#model.save_pretrained("../Models/Base_Model_TinyLLama-1.1B-Chat-v1.0")

In [10]:
from transformers import TrainingArguments
from transformers import Trainer
import torch
torch.cuda.empty_cache()
training_args = TrainingArguments(
    output_dir="checkpoints",
    num_train_epochs=3,
    per_device_train_batch_size=1,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=1,
    eval_steps=1000,
    save_steps=1000,
    logging_steps=50,
    fp16=True,
    save_total_limit=1,
    eval_strategy="steps",
    save_strategy="steps",
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    dataloader_pin_memory=False,
    remove_unused_columns=False,
    label_names=["labels"]
)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=test_dataset,
    tokenizer=tokenizer,
    compute_metrics=None,
    data_collator = data_collator
)

trainer.train()

  trainer = Trainer(


Step,Training Loss,Validation Loss
1000,1.8842,1.924438
2000,1.8253,1.884871
3000,1.7081,1.865707
4000,1.806,1.84553
5000,1.6425,1.841192
6000,1.522,1.836893


TrainOutput(global_step=6996, training_loss=1.7227391951284523, metrics={'train_runtime': 59911.2765, 'train_samples_per_second': 0.117, 'train_steps_per_second': 0.117, 'total_flos': 2.378277113875661e+16, 'train_loss': 1.7227391951284523, 'epoch': 3.0})

In [13]:
import evaluate
from transformers import TrainerCallback
import pandas as pd

accuracy = evaluate.load("accuracy")
f1 = evaluate.load("f1")
bleu = evaluate.load("bleu")
rouge = evaluate.load("rouge")
bertscore = evaluate.load("bertscore")

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    refs = tokenizer.batch_decode(labels, skip_special_tokens=True)

    # basic string cleaning
    preds = [p.strip() for p in preds]
    refs = [r.strip() for r in refs]

    results = {}
    results.update(accuracy.compute(predictions=preds, references=refs))
    results.update(f1.compute(predictions=preds, references=refs, average="macro"))
    results.update(bleu.compute(predictions=[[p.split()] for p in preds], references=[[r.split()] for r in refs]))
    results.update(rouge.compute(predictions=preds, references=refs, use_stemmer=True))
    results.update(bertscore.compute(predictions=preds, references=refs, lang="en"))

    return {
        "accuracy": results["accuracy"],
        "f1": results["f1"],
        "bleu": results["bleu"],
        "rouge1": results["rouge1"],
        "rougeL": results["rougeL"],
        "bertscore_f1": sum(results["bertscore_f1"]) / len(results["bertscore_f1"])
    }

class CSVLogger(TrainerCallback):
    def __init__(self, path="metrics_log.csv"):
        self.path = path
        self.logs = []

    def on_evaluate(self, args, state, control, metrics=None, **kwargs):
        if metrics:
            metrics["epoch"] = state.epoch
            self.logs.append(metrics)
            pd.DataFrame(self.logs).to_csv(self.path, index=False)


In [15]:
torch.cuda.empty_cache()