In [None]:
from pathlib import Path


# Device the model is placed on.
DEVICE = "cuda"

# The project root is the parent of the notebook's working directory, stored
# as a plain string.
_working_dir = Path.cwd()
PROJECT_ROOT_DIR = str(_working_dir.parent)
PROJECT_ROOT_DIR

In [None]:
from huggingface_hub import notebook_login

# Log in to the Hugging Face Hub from inside the notebook.
# NOTE(review): presumably required because the checkpoint loaded later is gated — confirm.
notebook_login()

# Loading data

In [3]:
import pandas as pd


# Locations of the three parquet splits under <project root>/dataset.
train_path = f"{PROJECT_ROOT_DIR}/dataset/train.parquet"
eval_path = f"{PROJECT_ROOT_DIR}/dataset/eval.parquet"
test_path = f"{PROJECT_ROOT_DIR}/dataset/test.parquet"

# Read each split into its own DataFrame, in train / eval / test order.
train_data, eval_data, test_data = map(
    pd.read_parquet, (train_path, eval_path, test_path)
)

In [4]:
def format_training_conversations(sample):
    """Render a two-turn conversation as one training string.

    Reads ``sample["formatted_conversations"]``: entry 0 supplies the human
    turn and entry 1 the model turn, each through its ``"value"`` field.

    Returns:
        The text ``<human>: ...``, a blank line, then ``<gpt>: ...`` with
        both turns filled in.
    """
    turns = sample["formatted_conversations"]
    human_text = turns[0]["value"]
    gpt_text = turns[1]["value"]

    return f"<human>: {human_text}\n\n<gpt>: {gpt_text}"


def format_test_conversations(sample):
    """Render the human turn of a conversation as an open-ended prompt.

    Only entry 0 of ``sample["formatted_conversations"]`` is read (through
    its ``"value"`` field); the model turn is left empty.

    Returns:
        The text ``<human>: ...``, a blank line, then the bare ``<gpt>: `` cue.
    """
    human_text = sample["formatted_conversations"][0]["value"]

    return f"<human>: {human_text}\n\n<gpt>: "

In [5]:
# Add a "full_conversation" text column to every split. Train and eval rows
# go through the training formatter; test rows go through the test formatter.
for split_frame, row_formatter in (
    (train_data, format_training_conversations),
    (eval_data, format_training_conversations),
    (test_data, format_test_conversations),
):
    split_frame["full_conversation"] = split_frame.apply(row_formatter, axis=1)

In [6]:
from datasets import DatasetDict, Dataset

# Source frame for each split, in the order the splits appear in the DatasetDict.
split_frames = {"train": train_data, "eval": eval_data, "test": test_data}

# Keep only the "full_conversation" column and reset the index of each frame
# before converting it to a Dataset.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(
            frame[["full_conversation"]].reset_index(drop=True)
        )
        for split_name, frame in split_frames.items()
    }
)

In [None]:
# Display the assembled DatasetDict (train / eval / test splits).
dataset

# Instantiating Gemma 2 2B-it

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

# Single source of truth for the checkpoint used by both tokenizer and model.
MODEL_ID = "google/gemma-2-2b-it"

# A tokenizer holds no device-resident weights, so no `device_map` is passed.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
# Keep the checkpoint's own padding token when it defines one and only fall
# back to EOS if none is set. Overwriting it unconditionally makes padding and
# end-of-sequence the same id, so a collator that masks pad positions in the
# labels would also mask every genuine EOS token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Pad on the right so real tokens stay at the start of each row.
tokenizer.padding_side = "right"

# `device_map` places the weights on DEVICE at load time.
model = AutoModelForCausalLM.from_pretrained(MODEL_ID, device_map=DEVICE)

In [None]:
# Move the model to DEVICE.
# NOTE(review): the model is loaded with device_map=DEVICE, so this move looks redundant — confirm.
model.to(DEVICE)


In [None]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Enable gradient checkpointing once, on the base model, before the adapter is
# attached. The setting lives on the underlying modules, so it does not need to
# be enabled a second time after wrapping.
model.gradient_checkpointing_enable()
# NOTE(review): the model is loaded without a quantization config, so this
# k-bit preparation step may be unnecessary — confirm before removing.
model = prepare_model_for_kbit_training(model)

# LoRA adapter on the attention query/value projections only; biases are left
# untouched (bias="none").
lora_config = LoraConfig(
    r=4,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM",
)

model = get_peft_model(model, lora_config)

# Report how many parameters remain trainable after wrapping.
model.print_trainable_parameters()

In [None]:
# Upper bound on tokens per conversation; longer samples are truncated.
MAX_SEQ_LENGTH = 3000


def tokenize_function(examples):
    """Tokenize a batch of conversations without padding them.

    Args:
        examples: Batch passed in by ``Dataset.map(..., batched=True)``; its
            ``"full_conversation"`` entry holds the texts to encode.

    Returns:
        The tokenizer output for the batch, with every text truncated to at
        most ``MAX_SEQ_LENGTH`` tokens.
    """
    # No `padding="max_length"` here: padding every row to the full token
    # budget makes each training step pay for the longest possible sequence.
    # Rows keep their natural length and are padded per batch by the data
    # collator handed to the trainer.
    return tokenizer(
        examples["full_conversation"],
        truncation=True,
        max_length=MAX_SEQ_LENGTH,
    )


tokenized_train_data = dataset["train"].map(tokenize_function, batched=True)
tokenized_eval_data = dataset["eval"].map(tokenize_function, batched=True)

# Expose only the model inputs, as torch tensors.
tokenized_train_data.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_eval_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [12]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language modelling with the masked-LM objective turned off (mlm=False).
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [13]:
from transformers import TrainingArguments

# Hyper-parameters are gathered in one mapping so they can be read as a unit
# before the arguments object is built.
training_hparams = {
    # Optimisation.
    "optim": "paged_adamw_8bit",
    "learning_rate": 1e-5,
    "num_train_epochs": 2,
    # NOTE(review): fp16 mixed precision — confirm it is numerically stable for
    # this checkpoint on the target GPU.
    "fp16": True,
    # Batching: one sample per device step, gradients accumulated over 8 steps.
    "per_device_train_batch_size": 1,
    "per_device_eval_batch_size": 1,
    "gradient_accumulation_steps": 8,
    # Evaluate, checkpoint and log once per epoch.
    "eval_strategy": "epoch",
    "save_strategy": "epoch",
    "logging_strategy": "epoch",
    # Checkpoints and logs are written here.
    "output_dir": "./output",
}

training_args = TrainingArguments(**training_hparams)

In [None]:
from trl import SFTTrainer

# `model` already carries the LoRA adapter attached by `get_peft_model`, so no
# `peft_config` is passed. Handing the trainer both a PEFT-wrapped model and a
# config would ask it to set up the adapter a second time.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop with the trainer built in the previous cell.
sft_trainer.train()