In [1]:
from pathlib import Path

PROJECT_ROOT_DIR = str(Path.cwd().parent)
PROJECT_ROOT_DIR

'/root/llm_graph_embedder'

In [2]:
from huggingface_hub import notebook_login


notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Loading data

In [3]:
import pandas as pd

train_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/train.parquet").sample(n=6000)
eval_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/eval.parquet").sample(n=1500)

In [4]:
train_data.shape[0], eval_data.shape[0]

(6000, 1500)

In [5]:
def format_training_conversations(sample):
    prompt_template = """<human>: {human_turn}\n\n<gpt>: {gpt_turn}"""

    return prompt_template.format(
        human_turn=sample["formatted_conversations"][0]["value"],
        gpt_turn=sample["formatted_conversations"][1]["value"],
    )

In [6]:
train_data["full_conversation"] = train_data.apply(
    format_training_conversations, axis=1
)
eval_data["full_conversation"] = eval_data.apply(format_training_conversations, axis=1)

In [7]:
from datasets import DatasetDict, Dataset

dataset = DatasetDict(
    {
        "train": Dataset.from_pandas(
            train_data[["full_conversation"]].reset_index(drop=True)
        ),
        "eval": Dataset.from_pandas(
            eval_data[["full_conversation"]].reset_index(drop=True)
        )
    }
)

In [8]:
dataset

DatasetDict({
    train: Dataset({
        features: ['full_conversation'],
        num_rows: 6000
    })
    eval: Dataset({
        features: ['full_conversation'],
        num_rows: 1500
    })
})

# Fine tuning model

## Creating LoRA config

In [9]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

## Instantiating Gemma 2 2B-it

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

model_id = "google/gemma-2-2b-it"

bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(
    model_id,
)

model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
)

`low_cpu_mem_usage` was None, now default to True since model is quantized.


Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

## Tokenizing train and eval data

In [11]:
def tokenize_function(examples):
    return tokenizer(
        examples["full_conversation"],
        padding="max_length",
        truncation=True,
        max_length=3000,
    )


tokenized_train_data = dataset["train"].map(tokenize_function, batched=True)
tokenized_eval_data = dataset["eval"].map(tokenize_function, batched=True)

tokenized_train_data.set_format(type="torch", columns=["input_ids", "attention_mask"])
tokenized_eval_data.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/6000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1500 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForLanguageModeling

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

## Training model

In [13]:
import transformers
from trl import SFTTrainer

trainer = SFTTrainer(
    model=model,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    args=transformers.TrainingArguments(
        per_device_train_batch_size=1,
        gradient_accumulation_steps=8,
        num_train_epochs=2,
        learning_rate=2e-4,
        fp16=True,
        logging_steps=100,
        output_dir="outputs",
    ),
    peft_config=lora_config,
    data_collator=data_collator,
)



In [14]:
trainer.train()

Step,Training Loss
100,1.1168
200,1.0308
300,1.0231
400,1.0107
500,1.009
600,1.0097
700,1.0102
800,0.9894
900,0.9797
1000,0.9907


TrainOutput(global_step=1500, training_loss=1.0023020680745443, metrics={'train_runtime': 8013.3355, 'train_samples_per_second': 1.498, 'train_steps_per_second': 0.187, 'total_flos': 4.39538669568e+17, 'train_loss': 1.0023020680745443, 'epoch': 2.0})

In [15]:
trainer.save_model(f"{PROJECT_ROOT_DIR}/model/graph_embedder")