In [None]:
from pathlib import Path


# The project root is taken to be the parent of the directory the notebook
# kernel was started in; it is kept as a plain string for f-string paths below.
notebook_dir = Path.cwd()
PROJECT_ROOT_DIR = str(notebook_dir.parent)

# Device identifier handed to the model-loading / `.to()` calls further down.
DEVICE = "cuda"

# Last expression of the cell: echo the resolved root so it can be eyeballed.
PROJECT_ROOT_DIR

In [None]:
from huggingface_hub import notebook_login

# Interactive Hugging Face Hub login for this notebook session.
# NOTE(review): presumably required because the Gemma checkpoint loaded below
# is access-gated on the Hub — confirm.
notebook_login()

# Loading data

In [3]:
import pandas as pd


# Fixed seed so that "Restart Kernel -> Run All" draws the same rows every time;
# without it each run fine-tunes and evaluates on a different random subset.
SAMPLE_SEED = 42

# Deliberately tiny subsets (3 train / 1 eval / 1 test rows) of each parquet split.
train_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/train.parquet").sample(
    n=3, random_state=SAMPLE_SEED
)
eval_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/eval.parquet").sample(
    n=1, random_state=SAMPLE_SEED
)
test_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/test.parquet").sample(
    n=1, random_state=SAMPLE_SEED
)

In [4]:
def format_training_conversations(sample):
    """Render one sample as a complete two-turn training prompt.

    Args:
        sample: Mapping-like row whose ``"formatted_conversations"`` entry is an
            indexable sequence; items 0 and 1 must each expose a ``"value"`` key
            (item 0 is used as the human turn, item 1 as the gpt turn).

    Returns:
        The string ``"<human>: ...\\n\\n<gpt>: ..."`` with both turns filled in.
    """
    turns = sample["formatted_conversations"]
    human_turn = turns[0]["value"]
    gpt_turn = turns[1]["value"]
    return f"<human>: {human_turn}\n\n<gpt>: {gpt_turn}"


def format_test_conversations(sample):
    """Render one sample as an inference prompt with the gpt turn left open.

    Args:
        sample: Mapping-like row whose ``"formatted_conversations"`` entry is an
            indexable sequence; item 0 must expose a ``"value"`` key, which is
            used as the human turn.

    Returns:
        The string ``"<human>: ...\\n\\n<gpt>: "`` — the prompt ends right after
        the gpt marker, with nothing filled in for the reply.
    """
    human_turn = sample["formatted_conversations"][0]["value"]
    return f"<human>: {human_turn}\n\n<gpt>: "

In [5]:
# Add a "full_conversation" text column to every split. Train and eval rows get
# the complete two-turn text; test rows get only the prompt up to the gpt marker.
for frame, formatter in (
    (train_data, format_training_conversations),
    (eval_data, format_training_conversations),
    (test_data, format_test_conversations),
):
    frame["full_conversation"] = frame.apply(formatter, axis=1)

In [6]:
from datasets import DatasetDict, Dataset

# Wrap each split's "full_conversation" column in a `Dataset`. The index is reset
# (and dropped) first so the sampled frames' original row labels are discarded.
split_frames = {"train": train_data, "eval": eval_data, "test": test_data}

dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(
            frame[["full_conversation"]].reset_index(drop=True)
        )
        for split_name, frame in split_frames.items()
    }
)

In [None]:
# Display the DatasetDict so the train / eval / test splits can be sanity-checked.
dataset

# Exploring the Gemma 2 2B-it model

In [None]:
%%time

from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

tokenizer = AutoTokenizer.from_pretrained("google/gemma-2-2b-it", device_map=DEVICE)

# model = AutoModel.from_pretrained("google/gemma-2-2b-it", device_map=DEVICE)
model = AutoModelForCausalLM.from_pretrained("google/gemma-2-2b-it", device_map=DEVICE)

In [None]:
# Move the model to DEVICE. As the cell's last expression, the call's return
# value is also displayed.
# NOTE(review): the model was already loaded with `device_map=DEVICE`, so this
# move looks redundant — confirm before removing.
model.to(DEVICE)

# Fine-tune Gemma on a couple of samples

In [None]:
def tokenize_function(examples):
    """Tokenize the ``"full_conversation"`` field of a batch to a fixed length.

    Uses the notebook-level ``tokenizer``. Texts are truncated to, and padded up
    to, 512 tokens.

    Args:
        examples: Mapping with a ``"full_conversation"`` entry holding the
            text(s) to tokenize.

    Returns:
        Whatever the tokenizer call returns for those texts.
    """
    texts = examples["full_conversation"]
    return tokenizer(texts, max_length=512, truncation=True, padding="max_length")


# Tokenize the train and eval splits in batched mode (train first, then eval).
tokenized_train_data, tokenized_eval_data = (
    dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ("train", "eval")
)

# Have both splits return torch tensors, restricted to the two listed columns.
for tokenized_split in (tokenized_train_data, tokenized_eval_data):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask"])

In [11]:
from transformers import DataCollatorForLanguageModeling

# Batch collator for language-model training built around the notebook's tokenizer.
# `mlm=False` turns off masked-language-modeling, i.e. the causal-LM setting.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [12]:
from transformers import Trainer, TrainingArguments

# Evaluation, checkpoint saving and logging all share the same once-per-epoch cadence.
per_epoch_schedule = dict.fromkeys(
    ("eval_strategy", "save_strategy", "logging_strategy"), "epoch"
)

# Short run: 2 epochs at a 1e-5 learning rate; checkpoints go to ./results, logs to ./logs.
training_args = TrainingArguments(
    output_dir="./results",
    logging_dir="./logs",
    num_train_epochs=2,
    learning_rate=1e-5,
    **per_epoch_schedule,
)

# Wire the model, tokenized splits and collator into a Trainer.
trainer = Trainer(
    args=training_args,
    model=model,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    data_collator=data_collator,
)

In [None]:
# Run the fine-tuning loop with the Trainer configured in the previous cell
# (2 epochs, with evaluation / saving / logging once per epoch per `training_args`).
trainer.train()