In [None]:
from pathlib import Path


# Target device name; not referenced by any of the cells visible here.
DEVICE = "cuda"
# Parent of the current working directory, stored as a plain string so it can
# be interpolated into the dataset paths used further down.
PROJECT_ROOT_DIR = str(Path.cwd().parent)
# Echo the resolved path as the cell output for a quick sanity check.
PROJECT_ROOT_DIR

In [None]:
from huggingface_hub import notebook_login

# Start the interactive Hugging Face Hub login flow inside the notebook.
# NOTE(review): presumably required because the checkpoint loaded later is
# pulled from the Hub — confirm whether it is gated for your account.
notebook_login()

# Loading data

In [3]:
import pandas as pd


# Read the train and eval splits from parquet files under
# <PROJECT_ROOT_DIR>/dataset.
train_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/train.parquet")
eval_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/eval.parquet")

In [None]:
# Row counts of the train and eval frames, shown as the cell output.
len(train_data), len(eval_data)

In [5]:
def format_training_conversations(sample):
    """Render the first two turns of a sample as one training string.

    Args:
        sample: Mapping-like object (e.g. a DataFrame row) whose
            ``"formatted_conversations"`` entry is a sequence of turns; each
            of the first two turns must expose its text under ``"value"``.

    Returns:
        A string of the form ``"<human>: ...\\n\\n<gpt>: ..."`` where the first
        turn fills the ``<human>`` slot and the second fills the ``<gpt>`` slot.
        Any turns after the second are ignored.
    """
    turns = sample["formatted_conversations"]
    human_text = turns[0]["value"]
    gpt_text = turns[1]["value"]

    layout = """<human>: {human_turn}\n\n<gpt>: {gpt_turn}"""
    return layout.format(human_turn=human_text, gpt_turn=gpt_text)

In [6]:
# Build the single-string conversation for every row of each split and store
# it in a new "full_conversation" column on the same frame.
train_data["full_conversation"] = train_data.apply(
    format_training_conversations,
    axis="columns",
)
eval_data["full_conversation"] = eval_data.apply(
    format_training_conversations,
    axis="columns",
)

In [7]:
from datasets import DatasetDict, Dataset

# Wrap both splits in a DatasetDict, keeping only the formatted text column
# and resetting each frame's index before conversion.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(
            frame[["full_conversation"]].reset_index(drop=True)
        )
        for split_name, frame in (("train", train_data), ("eval", eval_data))
    }
)

In [None]:
# Show the DatasetDict summary as the cell output.
dataset

# Fine-tuning the model

## Creating LoRA config

In [9]:
from peft import LoraConfig


# LoRA adapter settings: rank 8, applied to the listed projection modules,
# configured for a causal language-modelling task.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    target_modules=[
        "q_proj",
        "o_proj",
        "k_proj",
        "v_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
)

## Instantiating Gemma 2 2B-it

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Checkpoint identifier shared by the tokenizer and the model.
model_id = "google/gemma-2-2b-it"

# 4-bit NF4 quantization with bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    bnb_4bit_compute_dtype=torch.bfloat16,
    bnb_4bit_quant_type="nf4",
    load_in_4bit=True,
)

tokenizer = AutoTokenizer.from_pretrained(model_id)
# The model is loaded with the quantization settings defined above.
model = AutoModelForCausalLM.from_pretrained(model_id, quantization_config=bnb_config)

## Tokenizing train and eval data

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"full_conversation"`` field of a batch of examples.

    Uses the module-level ``tokenizer``. Every sequence is truncated and
    padded to a fixed length of 3000 tokens.

    Args:
        examples: Mapping with a ``"full_conversation"`` entry holding the
            text(s) to tokenize.

    Returns:
        Whatever the tokenizer call returns for the given texts.
    """
    texts = examples["full_conversation"]
    return tokenizer(texts, max_length=3000, truncation=True, padding="max_length")


# Tokenize both splits in batches with tokenize_function.
tokenized_train_data = dataset["train"].map(tokenize_function, batched=True)
tokenized_eval_data = dataset["eval"].map(tokenize_function, batched=True)

# Expose only the token ids and attention mask, formatted as torch tensors.
tokenized_train_data.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_eval_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [12]:
from transformers import DataCollatorForLanguageModeling

# Language-modelling collator built on the shared tokenizer; mlm=False turns
# off masked-language-model masking.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Training model

In [None]:
import transformers
from trl import SFTTrainer


# Supervised fine-tuning trainer: wraps the quantized model with the LoRA
# config and trains on the pre-tokenized splits.
trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        # Effective batch size is 1 * 4 = 4 sequences per optimizer step.
        gradient_accumulation_steps=4,
        warmup_steps=2,
        # Short smoke-test run; raise max_steps for a real fine-tune.
        max_steps=10,
        learning_rate=2e-4,
        # Mixed precision must agree with the bfloat16 compute dtype the model
        # was quantized with; fp16 is kept only as a fallback for GPUs that
        # do not support bf16.
        bf16=torch.cuda.is_bf16_supported(),
        fp16=not torch.cuda.is_bf16_supported(),
        logging_steps=1,
        output_dir="outputs",
    ),
    peft_config=lora_config,
    data_collator=data_collator,
)

In [None]:
# Run the training loop with the arguments configured on the trainer.
trainer.train()