In [1]:
from pathlib import Path


# Device string handed to the model loader further down.
DEVICE = "cuda"

# The project root is taken to be the parent of the current working directory.
PROJECT_ROOT_DIR = f"{Path.cwd().parent}"
PROJECT_ROOT_DIR

'/root/llm_graph_embedder'

In [2]:
from huggingface_hub import notebook_login

# Open the Hugging Face Hub login widget inside the notebook.
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

# Loading data

In [3]:
import pandas as pd


# Fixed seed so the same rows are drawn on every "Restart & Run All"; without
# it each run would train and evaluate on a different random subset.
SAMPLING_SEED = 42
N_TRAIN_SAMPLES = 100
N_EVAL_SAMPLES = 10

train_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/train.parquet").sample(
    n=N_TRAIN_SAMPLES, random_state=SAMPLING_SEED
)
eval_data = pd.read_parquet(f"{PROJECT_ROOT_DIR}/dataset/eval.parquet").sample(
    n=N_EVAL_SAMPLES, random_state=SAMPLING_SEED
)

In [4]:
def format_training_conversations(sample):
    """Render one row as a single ``<human>`` / ``<gpt>`` training string.

    Args:
        sample: Mapping-like row whose ``"formatted_conversations"`` entry is an
            indexable sequence of mappings. The ``"value"`` of entry 0 becomes
            the human turn and the ``"value"`` of entry 1 the gpt turn.

    Returns:
        The two turns joined as ``"<human>: ...", blank line, "<gpt>: ..."``.
    """
    turns = sample["formatted_conversations"]
    human_text = turns[0]["value"]
    gpt_text = turns[1]["value"]

    layout = "<human>: {human_turn}\n\n<gpt>: {gpt_turn}"
    return layout.format(human_turn=human_text, gpt_turn=gpt_text)

In [5]:
# Add the rendered conversation text as a new column on both splits.
for split_frame in (train_data, eval_data):
    split_frame["full_conversation"] = split_frame.apply(
        format_training_conversations, axis=1
    )

In [6]:
from datasets import DatasetDict, Dataset

# Build one Dataset per split from the text column only, with a fresh 0..n-1
# index on each frame before conversion.
dataset = DatasetDict(
    {
        split_name: Dataset.from_pandas(
            split_frame[["full_conversation"]].reset_index(drop=True)
        )
        for split_name, split_frame in (("train", train_data), ("eval", eval_data))
    }
)

In [7]:
# Display the assembled splits with their features and row counts.
dataset

DatasetDict({
    train: Dataset({
        features: ['full_conversation'],
        num_rows: 100
    })
    eval: Dataset({
        features: ['full_conversation'],
        num_rows: 10
    })
})

# Instantiating Gemma 2 2B-it

In [8]:
from transformers import AutoTokenizer, AutoModelForCausalLM, AutoModel

MODEL_ID = "google/gemma-2-2b-it"

# Tokenizers are not placed on a device, so no device_map is passed here.
tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)

# Only fall back to EOS when the tokenizer has no pad token of its own.
# Unconditionally aliasing pad to EOS makes the two ids indistinguishable, and
# the language-modeling collator used later ignores pad positions in the loss,
# so genuine EOS targets would be ignored as well.
# NOTE(review): the loaded model reports padding_idx=0 on its embedding, which
# suggests this checkpoint ships a dedicated pad token — confirm.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

model = AutoModelForCausalLM.from_pretrained(
    MODEL_ID,
    device_map=DEVICE,
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
# Move the model to DEVICE. The call is the cell's last expression, so its
# return value (the module tree) is shown as the cell output.
# NOTE(review): the model was already loaded with device_map=DEVICE, so this
# move looks redundant — confirm before removing.
model.to(DEVICE)

Gemma2ForCausalLM(
  (model): Gemma2Model(
    (embed_tokens): Embedding(256000, 2304, padding_idx=0)
    (layers): ModuleList(
      (0-25): 26 x Gemma2DecoderLayer(
        (self_attn): Gemma2Attention(
          (q_proj): Linear(in_features=2304, out_features=2048, bias=False)
          (k_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (v_proj): Linear(in_features=2304, out_features=1024, bias=False)
          (o_proj): Linear(in_features=2048, out_features=2304, bias=False)
          (rotary_emb): Gemma2RotaryEmbedding()
        )
        (mlp): Gemma2MLP(
          (gate_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (up_proj): Linear(in_features=2304, out_features=9216, bias=False)
          (down_proj): Linear(in_features=9216, out_features=2304, bias=False)
          (act_fn): PytorchGELUTanh()
        )
        (input_layernorm): Gemma2RMSNorm((2304,), eps=1e-06)
        (post_attention_layernorm): Gemma2RMSNorm((2304,), eps=

In [10]:
from peft import LoraConfig, prepare_model_for_kbit_training, get_peft_model

# Turn on gradient checkpointing on the base model before it is wrapped.
# NOTE(review): the position of this call relative to get_peft_model may
# matter for how the adapter wrapper sets up input gradients — keep the order
# unless verified otherwise.
model.gradient_checkpointing_enable()

# NOTE(review): the model above is loaded without any quantization config, so
# this k-bit preparation step may be unnecessary here — confirm before removing.
model = prepare_model_for_kbit_training(model)

# LoRA adapters on the attention query and value projections only.
lora_config = LoraConfig(
    r=8,
    lora_alpha=64,
    target_modules=["q_proj", "v_proj"],
    lora_dropout=0.2,
    bias="none",
    task_type="CAUSAL_LM"
)

# Gradient checkpointing is already enabled above, so it is not enabled a
# second time after wrapping.
model = get_peft_model(model, lora_config)
model.config.use_cache = False

model.print_trainable_parameters()

trainable params: 1,597,440 || all params: 2,615,939,328 || trainable%: 0.0611


In [11]:
def tokenize_function(examples, max_length=3000, padding="max_length"):
    """Tokenize the ``full_conversation`` column of a batch.

    Args:
        examples: Batch mapping handed over by ``Dataset.map``; its
            ``"full_conversation"`` entry is forwarded to the tokenizer.
        max_length: Token limit used for truncation, and for padding when
            ``padding="max_length"``. Defaults to the previously hard-coded
            3000.
        padding: Padding strategy forwarded to the tokenizer. The default
            keeps the original behaviour of padding every row to
            ``max_length``.

    Returns:
        The result of calling the module-level ``tokenizer`` on the batch.
    """
    return tokenizer(
        examples["full_conversation"],
        padding=padding,
        truncation=True,
        max_length=max_length,
    )


# Tokenize both splits in batched mode (train first, then eval).
tokenized_train_data, tokenized_eval_data = (
    dataset[split_name].map(tokenize_function, batched=True)
    for split_name in ("train", "eval")
)

# Expose only the token ids and attention mask, formatted as torch tensors.
for tokenized_split in (tokenized_train_data, tokenized_eval_data):
    tokenized_split.set_format(type="torch", columns=["input_ids", "attention_mask"])

Map:   0%|          | 0/100 [00:00<?, ? examples/s]

Map:   0%|          | 0/10 [00:00<?, ? examples/s]

In [12]:
from transformers import DataCollatorForLanguageModeling

# Language-modeling collator with masked-LM mode switched off (mlm=False).
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
)

In [13]:
from transformers import TrainingArguments

training_args = TrainingArguments(
    output_dir="./output",
    # Optimisation settings. Gradient accumulation is not set here (an earlier
    # value of 16 steps is disabled), so the library default applies.
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    fp16=True,
    # Evaluate, checkpoint and log once per epoch.
    eval_strategy="epoch",
    save_strategy="epoch",
    logging_strategy="epoch",
)

In [14]:
from trl import SFTTrainer

# `model` has already been wrapped with the LoRA adapter via get_peft_model,
# so no peft_config is passed here. Supplying both a wrapped model and a
# peft_config is redundant at best.
# NOTE(review): some TRL releases reject that combination outright — confirm
# against the installed version.
sft_trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train_data,
    eval_dataset=tokenized_eval_data,
    data_collator=data_collator,
)



In [15]:
# Run fine-tuning and keep the returned summary; the bare name on the last
# line displays it as the cell output.
train_output = sft_trainer.train()
train_output

Epoch,Training Loss,Validation Loss
1,1.645,1.716598
2,1.6048,1.684021


TrainOutput(global_step=50, training_loss=1.6248886108398437, metrics={'train_runtime': 136.7643, 'train_samples_per_second': 1.462, 'train_steps_per_second': 0.366, 'total_flos': 7294015180800000.0, 'train_loss': 1.6248886108398437, 'epoch': 2.0})