In [None]:
#https://huggingface.co/google/gemma-7b/blob/main/examples/notebook_sft_peft.ipynb

In [1]:
import os
import random
import numpy as np
import copy
import pandas as pd
os.environ["HF_TOKEN"] = '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'

In [2]:
!pip3 install -q -U bitsandbytes==0.43.3
!pip3 install -q -U peft==0.12.0
!pip3 install -q -U trl==0.9.6
!pip3 install -q -U accelerate==0.33.0
!pip3 install -q -U datasets==2.21.0
!pip3 install -q -U transformers==4.44.0

### Create & verify user prompt from input data

In [2]:
def create_prompt(inputs: dict) -> str:
    """
    Function that creates prompt for poetry explanation.
    """
    return """
    You are given the poem "{title}" by "{poet}".
    <poem>
    {content_before}
    {referent}
    {context_after}
    </poem>
    Explain the meaning of the following lines: "{referent}"
    """.format(
        title=inputs['poem_title'],
        poet=inputs['poet'],
        content_before=inputs['content_before'],
        context_after=inputs['context_after'],
        referent=inputs['referent']
    )

In [3]:
example = {
    'content_before': "The battle rent a cobweb diamond-strung\nAnd cut a flower beside a ground bird's nest\nBefore it stained a single human breast.\nThe stricken flower bent double and so hung.\nAnd still the bird revisited her young.\nA butterfly its fall had dispossessed\nA moment sought in air his flower of rest,\nThen lightly stooped to it and fluttering clung.\nOn the bare upland pasture there had spread\nO'ernight 'twixt mullein stalks a wheel of thread\nAnd straining cables wet with silver dew.",
    'referent': 'A sudden passing bullet shook it dry.',
    'context_after': 'The indwelling spider ran to greet the fly,\nBut finding nothing, sullenly withdrew.',
    'annotation': 'The serenity is, as the reader no doubt anticipates, broken by the shot described in this snappy line. The dryness may represent the loss of a source of life that invigorates the natural — and human — worlds.',
    'poet': 'Robert Frost',
    'poem_title': 'Range-finding'
}

In [4]:
create_prompt(example)

'\n    You are given the poem "Range-finding" by "Robert Frost".\n    <poem>\n    The battle rent a cobweb diamond-strung\nAnd cut a flower beside a ground bird\'s nest\nBefore it stained a single human breast.\nThe stricken flower bent double and so hung.\nAnd still the bird revisited her young.\nA butterfly its fall had dispossessed\nA moment sought in air his flower of rest,\nThen lightly stooped to it and fluttering clung.\nOn the bare upland pasture there had spread\nO\'ernight \'twixt mullein stalks a wheel of thread\nAnd straining cables wet with silver dew.\n    A sudden passing bullet shook it dry.\n    The indwelling spider ran to greet the fly,\nBut finding nothing, sullenly withdrew.\n    </poem>\n    Explain the meaning of the following lines: "A sudden passing bullet shook it dry."\n    '

### Split dataset into train/validation/test without intersections between poets

In [159]:
def split_by_author(df, split_ratio=[0.7, 0.1]) -> list[pd.DataFrame]:
    unique_poets_count = dict(df['poet'].value_counts())
    """Function that splits dataset into train/validation/test with no intersection between authors"""
    
    # set target counts for each subset
    total_count = len(df)
    count_deviation = total_count*0.01
    train_count_target = int(total_count * split_ratio[0])
    validation_count_target = int(total_count * split_ratio[1])
    test_count_target = total_count - train_count_target - validation_count_target
    train_poets, train_count = [], 0
    validation_poets, validation_count = [], 0
    
    while abs(train_count-train_count_target) > count_deviation:
        print('Selecting train dataset')
        # define start values
        train_poets, train_count = [], 0
        unique_poets_list = df['poet'].value_counts().index.values.copy()

        while train_count < train_count_target:
            random_index = random.randint(0, len(unique_poets_list)-1)
            train_poets.append(unique_poets_list[random_index])
            train_count += unique_poets_count[unique_poets_list[random_index]]
            unique_poets_list = np.delete(unique_poets_list, random_index)
           
    
    while abs(validation_count-validation_count_target) > count_deviation:
        print('Selecting validation dataset')
        validation_poets, validation_count = [], 0
        val_unique_poets_list = unique_poets_list.copy()
        
        while validation_count < validation_count_target:
            random_index = random.randint(0, len(val_unique_poets_list)-1)
            validation_poets.append(val_unique_poets_list[random_index])
            validation_count += unique_poets_count[val_unique_poets_list[random_index]]
            val_unique_poets_list = np.delete(val_unique_poets_list, random_index)
    
    # all left poets are for testing
    test_poets = val_unique_poets_list
    
    print(train_count, len(df[df['poet'].isin(train_poets)]))
    print(set(train_poets).intersection(validation_poets))
    print(f"Allowed deviation = {count_deviation}")
    print(f"Train count (target={train_count_target}) = {len(df[df['poet'].isin(train_poets)])}")
    print(f"Validation count (target={validation_count_target}) = {len(df[df['poet'].isin(validation_poets)])}")
    print(f"Test count (target={test_count_target}) = {len(df[df['poet'].isin(test_poets)])}")
        
    return df[df['poet'].isin(train_poets)], df[df['poet'].isin(validation_poets)], df[df['poet'].isin(test_poets)]

In [158]:
train_df, validation_df, test_df = split_by_author(df)

Selecting train dataset
Selecting validation dataset
Selecting validation dataset
2576 2576
set()
Allowed deviation = 36.29
Train count (target=2540) = 2576
Validation count (target=362) = 366
Test count (target=727) = 687


In [160]:
train_df.to_csv('./data/annotations_dataset_train.csv', index=False)
validation_df.to_csv('./data/annotations_dataset_validation.csv', index=False)
test_df.to_csv('./data/annotations_dataset_test.csv', index=False)

### Create HF dataset from train/test split

In [5]:
data_files = {"train": ["./data/annotations_dataset_train.csv"],
             "test": ["./data/annotations_dataset_test.csv"]}

In [6]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files=data_files)

  from .autonotebook import tqdm as notebook_tqdm
Generating train split: 2576 examples [00:00, 65997.56 examples/s]
Generating test split: 687 examples [00:00, 21817.71 examples/s]


In [7]:
len(dataset['train']), len(dataset['test'])

(2576, 687)

### Load the  base model from HF hub

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

model_id = 'google/gemma-2-2b-it'
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], add_eos_token=True)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             #quantization_config=bnb_config,
                                             device_map='cuda',
                                             token=os.environ['HF_TOKEN'],
                                             attn_implementation='eager',
                                             torch_dtype=torch.float16,
                                             use_cache=False)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.03it/s]


In [11]:
text = '\n    You are given the poem "Lenox Avenue: Midnight" by "Langston Hughes".\n    <poem>\n    \n    The rhythm of life\nIs a jazz rhythm,\n    Honey.\nThe gods are laughing at us.\nThe broken heart of love,\n    </poem>\n    Explain the meaning of the following lines: "The rhythm of life\nIs a jazz rhythm,"\n    '

In [12]:
tokenizer.chat_template

"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

In [13]:
#text = create_prompt(inputs=dict(df.iloc[1500]))
device = "cuda"

messages = [
    {"role": "user", "content": text},
]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to("cuda")
#inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
You are given the poem "Lenox Avenue: Midnight" by "Langston Hughes".
    <poem>
    
    The rhythm of life
Is a jazz rhythm,
    Honey.
The gods are laughing at us.
The broken heart of love,
    </poem>
    Explain the meaning of the following lines: "The rhythm of life
Is a jazz rhythm,"
* **What is the meaning of the line?**
* **How does this line relate to the overall theme of the poem?**

Here's a breakdown of the poem:

* **"The rhythm of life is a jazz rhythm, honey."** This line sets the tone for the poem, suggesting a sense of improvisation, spontaneity, and a certain unpredictability in life.
* **"The gods are laughing at us."** This line suggests a sense of irony and perhaps even despair.
* **"The broken heart of love."** This line suggests a sense of loss and pain.


Let me know if you'd like to explore any other lines from the poem! 



### Apply & verify chat template to the dataset

In [14]:
dataset["train"].features

{'content_before': Value(dtype='string', id=None),
 'referent': Value(dtype='string', id=None),
 'context_after': Value(dtype='string', id=None),
 'annotation': Value(dtype='string', id=None),
 'poet': Value(dtype='string', id=None),
 'poem_title': Value(dtype='string', id=None)}

In [15]:
def apply_chat_template(example, tokenizer):
    text = create_prompt(example)
    messages = [
        {"role": "user", "content": text},
        {"role": "assistant", "content": example['annotation']}
    ]
    
    example["text"] = tokenizer.apply_chat_template(messages, tokenize=False)

    return example

column_names = list(dataset["train"].features)
dataset = dataset.map(apply_chat_template,
                      fn_kwargs={"tokenizer": tokenizer},
                      remove_columns=column_names,
                      desc="Applying chat template"
                     )

# create the splits
train_dataset = dataset["train"]
eval_dataset = dataset["test"]

Applying chat template: 100%|█████████████████████████████████████████████| 2576/2576 [00:00<00:00, 7536.67 examples/s]
Applying chat template: 100%|███████████████████████████████████████████████| 687/687 [00:00<00:00, 8402.87 examples/s]


In [16]:
for index in random.sample(range(len(dataset["train"])), 3):
    print(f"Sample {index} of the processed training set:\n\n{dataset['train'][index]['text']}")

Sample 230 of the processed training set:

<bos><start_of_turn>user
You are given the poem "Crimson" by "Carl Sandburg".
    <poem>
    None
    Crimson is the slow smolder of the cigar end I hold,
    Gray is the ash that stiffens and covers all silent the fire.
(A great man I know is dead and while he lies in his coffin a gone flame I sit here in cumbering shadows and smoke and watch my thoughts come and go.)
    </poem>
    Explain the meaning of the following lines: "Crimson is the slow smolder of the cigar end I hold,"<end_of_turn>
<start_of_turn>model
Crimson, a deep red (like blood) and the  slow  burning – life is  long , not short; though inevitably it does come to an end 
 Cigars  are the  perfect  tobacco product for this poem: cigarettes burn too quickly,  Black & Milds  bring along the notion of poverty, and  blunts are for rappers<end_of_turn>

Sample 429 of the processed training set:

<bos><start_of_turn>user
You are given the poem "Brass Spittoons" by "Langston Hughes"

### Train the model

In [17]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=32,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

In [18]:
import transformers
from trl import SFTTrainer, SFTConfig 

tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = 'right'
torch.cuda.empty_cache()
sft_config = SFTConfig(
    max_seq_length=512,
    warmup_steps=2,
    output_dir="outputs",
    per_device_train_batch_size=1,
    gradient_accumulation_steps=64,
    learning_rate=2e-4,
    fp16=True,
    gradient_checkpointing=True,
    gradient_checkpointing_kwargs={"use_reentrant": False},
    logging_steps=5,
    logging_strategy="steps",
    log_level="info",
    optim="paged_adamw_8bit",
    dataset_text_field="text",
    do_eval=True,
    eval_strategy="epoch",
    lr_scheduler_type="cosine",
    per_device_eval_batch_size=1,
    seed=42,
    overwrite_output_dir=True,
    max_steps=-1,
    num_train_epochs=1,
)

trainer = SFTTrainer(
    model=model,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    args=sft_config,
    peft_config=lora_config
)


#model.config.use_cache = False
train_result = trainer.train()

Map: 100%|████████████████████████████████████████████████████████████████| 2576/2576 [00:00<00:00, 9347.44 examples/s]
Map: 100%|██████████████████████████████████████████████████████████████████| 687/687 [00:00<00:00, 9932.57 examples/s]
  self.scaler = torch.cuda.amp.GradScaler(**kwargs)
You have loaded a model on multiple GPUs. `is_model_parallel` attribute will be force-set to `True` to avoid any unexpected behavior such as device placement mismatching.
Using auto half precision backend
***** Running training *****
  Num examples = 2,576
  Num Epochs = 1
  Instantaneous batch size per device = 1
  Total train batch size (w. parallel, distributed & accumulation) = 64
  Gradient Accumulation steps = 64
  Total optimization steps = 40
  Number of trainable parameters = 41,533,440
  with device_autocast_ctx, torch.cpu.amp.autocast(**cpu_autocast_kwargs), recompute_context:  # type: ignore[attr-defined]


Epoch,Training Loss,Validation Loss
0,2.4416,2.415272


Saving model checkpoint to outputs\checkpoint-40
loading configuration file config.json from cache at C:\Users\vlad-dev\.cache\huggingface\hub\models--google--gemma-2-2b-it\snapshots\e48216d9004e7fd70bc4fdfdc5b7cc3349f8e619\config.json
Model config Gemma2Config {
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_value_heads": 4,
  "pad_token_id": 0,
  "query_pre_attn_scalar": 256,
  "rms_norm_eps": 1e-06,
  "rope_theta": 10000.0,
  "sliding_window": 4096,
  "torch_dt

In [19]:
metrics = train_result.metrics
max_train_samples = len(train_dataset)
metrics["train_samples"] = min(max_train_samples, len(train_dataset))
trainer.log_metrics("train", metrics)
trainer.save_metrics("train", metrics)
trainer.save_state()

***** train metrics *****
  epoch                    =     0.9938
  total_flos               =  7334179GF
  train_loss               =      2.689
  train_runtime            = 0:22:16.15
  train_samples            =       2576
  train_samples_per_second =      1.928
  train_steps_per_second   =       0.03


### Run inference on fine-tuned model

In [20]:
from transformers import AutoTokenizer, AutoModelForCausalLM

output_dir = "./outputs/checkpoint-40"
tokenizer = AutoTokenizer.from_pretrained(output_dir, add_eos_token=True)
model = AutoModelForCausalLM.from_pretrained(output_dir,
                                             device_map="cuda",
                                             attn_implementation='eager',
                                             torch_dtype=torch.float16,
                                             use_cache=False)

loading file tokenizer.model
loading file tokenizer.json
loading file added_tokens.json
loading file special_tokens_map.json
loading file tokenizer_config.json
loading configuration file config.json from cache at C:\Users\vlad-dev\.cache\huggingface\hub\models--google--gemma-2-2b-it\snapshots\e48216d9004e7fd70bc4fdfdc5b7cc3349f8e619\config.json
Model config Gemma2Config {
  "_name_or_path": "google/gemma-2-2b-it",
  "architectures": [
    "Gemma2ForCausalLM"
  ],
  "attention_bias": false,
  "attention_dropout": 0.0,
  "attn_logit_softcapping": 50.0,
  "bos_token_id": 2,
  "cache_implementation": "hybrid",
  "eos_token_id": [
    1,
    107
  ],
  "final_logit_softcapping": 30.0,
  "head_dim": 256,
  "hidden_act": "gelu_pytorch_tanh",
  "hidden_activation": "gelu_pytorch_tanh",
  "hidden_size": 2304,
  "initializer_range": 0.02,
  "intermediate_size": 9216,
  "max_position_embeddings": 8192,
  "model_type": "gemma2",
  "num_attention_heads": 8,
  "num_hidden_layers": 26,
  "num_key_val

In [37]:
sample = pd.read_csv('./data/annotations_dataset_validation.csv').fillna('').sample(n=1).to_dict(orient='records')[0]

In [38]:
print(create_prompt(sample))


    You are given the poem "On Death" by "John Keats".
    <poem>
    Can death be sleep, when life is but a dream,
    And scenes of bliss pass as a phantom by?
    The transient pleasures as a vision seem,
And yet we think the greatest pain's to die.
How strange it is that man on earth should roam,
    </poem>
    Explain the meaning of the following lines: "And scenes of bliss pass as a phantom by?"
    


In [39]:
device = "cuda"
text = create_prompt(sample)
messages = [
    {"role": "user", "content": text},
]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to("cuda")
#inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs,
                         max_new_tokens=256,
                         pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
You are given the poem "On Death" by "John Keats".
    <poem>
    Can death be sleep, when life is but a dream,
    And scenes of bliss pass as a phantom by?
    The transient pleasures as a vision seem,
And yet we think the greatest pain's to die.
How strange it is that man on earth should roam,
    </poem>
    Explain the meaning of the following lines: "And scenes of bliss pass as a phantom by?"
model
The speaker is comparing death to a dream, and death to a phantom.  The phantom is a ghost, and a ghost is a spirit that is not alive.  The phantom is also a thing that is not real, and a thing that is not real is not alive.  The speaker is saying that death is not real, and that death is not alive.  The speaker is also saying that death is not a thing that is real, and that death is not a thing that is alive.  The speaker is also saying that death is not a thing that is real, and that death is not a thing that is alive.  The speaker is also saying that death is not a thing that i