In [None]:
#https://huggingface.co/google/gemma-7b/blob/main/examples/notebook_sft_peft.ipynb

In [1]:
import os
from sklearn.model_selection import train_test_split
os.environ["HF_TOKEN"] = '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'

In [2]:
!pip3 install -q -U bitsandbytes==0.43.3
!pip3 install -q -U peft==0.12.0
!pip3 install -q -U trl==0.9.6
!pip3 install -q -U accelerate==0.33.0
!pip3 install -q -U datasets==2.21.0
!pip3 install -q -U transformers==4.44.0

In [125]:
import random
import numpy as np
import copy

In [2]:
import pandas as pd
df = pd.read_csv('./data/annotations_dataset.csv').fillna('').sample(frac=1)

In [3]:
def create_prompt(inputs: dict) -> str:
    """
    Function that creates prompt for poetry explanation.
    """
    return """
    You are given the poem "{title}" by "{poet}".
    <poem>
    {content_before}
    {referent}
    {context_after}
    </poem>
    Explain the meaning of the following lines: "{referent}"
    """.format(
        title=inputs['poem_title'],
        poet=inputs['poet'],
        content_before=inputs['content_before'],
        context_after=inputs['context_after'],
        referent=inputs['referent']
    )

print(create_prompt(inputs=dict(df.iloc[2770])))


    You are given the poem "Well So That Is That" by "W. H. Auden".
    <poem>
    Putting the decorations back into their cardboard boxes -
Some have got broken - and carrying them up to the attic.
The holly and the mistletoe must be taken down and burnt,
And the children got ready for school. There are enough
Leftovers to do, warmed up, for the rest of the week -
Not that we have much appetite, having drunk such a lot,
Stayed up so late, attempted - quite unsuccessfully -
To love all of our relatives, and in general
Grossly overestimated our powers. Once again
As in previous years we have seen the actual Vision and failed
To do more than entertain it as an agreeable
Possibility, once again we have sent Him away,
    Begging though to remain His disobedient servant,
The promising child who cannot keep His word for long.
    
    </poem>
    Explain the meaning of the following lines: "Begging though to remain His disobedient servant,
The promising child who cannot keep His word for l

In [26]:
dict(df.iloc[2770])

{'content_before': "The battle rent a cobweb diamond-strung\nAnd cut a flower beside a ground bird's nest\nBefore it stained a single human breast.\nThe stricken flower bent double and so hung.\nAnd still the bird revisited her young.\nA butterfly its fall had dispossessed\nA moment sought in air his flower of rest,\nThen lightly stooped to it and fluttering clung.\nOn the bare upland pasture there had spread\nO'ernight 'twixt mullein stalks a wheel of thread\nAnd straining cables wet with silver dew.",
 'referent': 'A sudden passing bullet shook it dry.',
 'context_after': 'The indwelling spider ran to greet the fly,\nBut finding nothing, sullenly withdrew.',
 'annotation': 'The serenity is, as the reader no doubt anticipates, broken by the shot described in this snappy line. The dryness may represent the loss of a source of life that invigorates the natural — and human — worlds.',
 'poet': 'Robert Frost',
 'poem_title': 'Range-finding'}

#### Split dataset into train/validation/test without intersections between poets

In [159]:
def split_by_author(df, split_ratio=[0.7, 0.1]) -> list[pd.DataFrame]:
    unique_poets_count = dict(df['poet'].value_counts())
    """Function that splits dataset into train/validation/test with no intersection between authors"""
    
    # set target counts for each subset
    total_count = len(df)
    count_deviation = total_count*0.01
    train_count_target = int(total_count * split_ratio[0])
    validation_count_target = int(total_count * split_ratio[1])
    test_count_target = total_count - train_count_target - validation_count_target
    train_poets, train_count = [], 0
    validation_poets, validation_count = [], 0
    
    while abs(train_count-train_count_target) > count_deviation:
        print('Selecting train dataset')
        # define start values
        train_poets, train_count = [], 0
        unique_poets_list = df['poet'].value_counts().index.values.copy()

        while train_count < train_count_target:
            random_index = random.randint(0, len(unique_poets_list)-1)
            train_poets.append(unique_poets_list[random_index])
            train_count += unique_poets_count[unique_poets_list[random_index]]
            unique_poets_list = np.delete(unique_poets_list, random_index)
           
    
    while abs(validation_count-validation_count_target) > count_deviation:
        print('Selecting validation dataset')
        validation_poets, validation_count = [], 0
        val_unique_poets_list = unique_poets_list.copy()
        
        while validation_count < validation_count_target:
            random_index = random.randint(0, len(val_unique_poets_list)-1)
            validation_poets.append(val_unique_poets_list[random_index])
            validation_count += unique_poets_count[val_unique_poets_list[random_index]]
            val_unique_poets_list = np.delete(val_unique_poets_list, random_index)
    
    # all left poets are for testing
    test_poets = val_unique_poets_list
    
    print(train_count, len(df[df['poet'].isin(train_poets)]))
    print(set(train_poets).intersection(validation_poets))
    print(f"Allowed deviation = {count_deviation}")
    print(f"Train count (target={train_count_target}) = {len(df[df['poet'].isin(train_poets)])}")
    print(f"Validation count (target={validation_count_target}) = {len(df[df['poet'].isin(validation_poets)])}")
    print(f"Test count (target={test_count_target}) = {len(df[df['poet'].isin(test_poets)])}")
        
    return df[df['poet'].isin(train_poets)], df[df['poet'].isin(validation_poets)], df[df['poet'].isin(test_poets)]

In [158]:
train_df, validation_df, test_df = split_by_author(df)

Selecting train dataset
Selecting validation dataset
Selecting validation dataset
2576 2576
set()
Allowed deviation = 36.29
Train count (target=2540) = 2576
Validation count (target=362) = 366
Test count (target=727) = 687


In [160]:
train_df.to_csv('./data/annotations_dataset_train.csv', index=False)
validation_df.to_csv('./data/annotations_dataset_validation.csv', index=False)
test_df.to_csv('./data/annotations_dataset_test.csv', index=False)

#### Create HF dataset from train/validation split

In [161]:
data_files = {"train": ["./data/annotations_dataset_train.csv"],
             "test": ["./data/annotations_dataset_validation.csv"]}

In [162]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files=data_files)

Generating train split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


Generating test split: 0 examples [00:00, ? examples/s]

  return pd.read_csv(xopen(filepath_or_buffer, "rb", download_config=download_config), **kwargs)


In [165]:
len(dataset['train']), len(dataset['test'])

(2576, 366)

#### Load the model from HF hub

In [None]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

model_id = "google/gemma-2-2b-it"
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16
)

tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'])
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             #quantization_config=bnb_config,
                                             device_map={"mps":0},
                                             token=os.environ['HF_TOKEN'])

In [None]:
text = create_prompt(inputs=dict(df.iloc[2770]))
device = "cuda:0"
inputs = tokenizer(text, return_tensors="pt").to(device)

outputs = model.generate(**inputs, max_new_tokens=100)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

In [None]:
from peft import LoraConfig

lora_config = LoraConfig(
    r=8,
    target_modules=["q_proj", "o_proj", "k_proj", "v_proj", "gate_proj", "up_proj", "down_proj"],
    task_type="CAUSAL_LM",
)

In [35]:
from datasets import load_dataset

data = load_dataset("Abirate/english_quotes")
#data = data.map(lambda samples: tokenizer(samples["quote"]), batched=True)

In [36]:
data['train'][0]

{'quote': '“Be yourself; everyone else is already taken.”',
 'author': 'Oscar Wilde',
 'tags': ['be-yourself',
  'gilbert-perreira',
  'honesty',
  'inspirational',
  'misattributed-oscar-wilde',
  'quote-investigator']}

In [43]:
dataset = dataset.map(lambda samples: tokenizer(samples['referent']), batched=True)

Map:   0%|          | 0/50 [00:00<?, ? examples/s]

Map:   0%|          | 0/25 [00:00<?, ? examples/s]

In [44]:
dataset['train'][0]

{'content_before': 'About suffering they were never wrong,\nThe Old Masters; how well they understood\nIts human position; how it takes place\nWhile someone else is eating or opening a window or just walking dully along;\nHow, when the aged are reverently, passionately waiting\nFor the miraculous birth, there always must be\nChildren who did not specially want it to happen, skating\nOn a pond at the edge of the wood:\nThey never forgot\nThat even the dreadful martyrdom must run its course',
 'referent': "Anyhow in a corner, some untidy spot\nWhere the dogs go on with their doggy life and the torturer's horse\nScratches its innocent behind on a tree.",
 'context_after': "In Breughel's Icarus, for instance: how everything turns away\nQuite leisurely from the disaster; the ploughman may\nHave heard the splash, the forsaken cry,",
 'annotation': 'After the dramatic climax of the ‘dreadful martyrdom’, the tone changes to conversational, with the words ‘Anyhow’, ‘doggy’ and ‘behind’, a chara

In [42]:
'\n'.join(["1", "2"])

'1\n2'