In [7]:
import os
import random
import numpy as np
import copy
import pandas as pd

os.environ["HF_TOKEN"] = 'your token'

In [2]:
%pip install datasets transformers

### Create & verify user prompt from input data

In [2]:
def create_prompt(inputs: dict) -> str:
    """
    Function that creates prompt for poetry explanation.
    """
    return """
    You are given the poem "{title}" by "{poet}".
    <poem>
    {content_before}
    {referent}
    {context_after}
    </poem>
    Explain the meaning of the following lines: "{referent}"
    """.format(
        title=inputs['poem_title'],
        poet=inputs['poet'],
        content_before=inputs['content_before'],
        context_after=inputs['context_after'],
        referent=inputs['referent']
    )

In [3]:
example = {
    'content_before': "The battle rent a cobweb diamond-strung\nAnd cut a flower beside a ground bird's nest\nBefore it stained a single human breast.\nThe stricken flower bent double and so hung.\nAnd still the bird revisited her young.\nA butterfly its fall had dispossessed\nA moment sought in air his flower of rest,\nThen lightly stooped to it and fluttering clung.\nOn the bare upland pasture there had spread\nO'ernight 'twixt mullein stalks a wheel of thread\nAnd straining cables wet with silver dew.",
    'referent': 'A sudden passing bullet shook it dry.',
    'context_after': 'The indwelling spider ran to greet the fly,\nBut finding nothing, sullenly withdrew.',
    'annotation': 'The serenity is, as the reader no doubt anticipates, broken by the shot described in this snappy line. The dryness may represent the loss of a source of life that invigorates the natural — and human — worlds.',
    'poet': 'Robert Frost',
    'poem_title': 'Range-finding'
}

In [4]:
create_prompt(example)

'\n    You are given the poem "Range-finding" by "Robert Frost".\n    <poem>\n    The battle rent a cobweb diamond-strung\nAnd cut a flower beside a ground bird\'s nest\nBefore it stained a single human breast.\nThe stricken flower bent double and so hung.\nAnd still the bird revisited her young.\nA butterfly its fall had dispossessed\nA moment sought in air his flower of rest,\nThen lightly stooped to it and fluttering clung.\nOn the bare upland pasture there had spread\nO\'ernight \'twixt mullein stalks a wheel of thread\nAnd straining cables wet with silver dew.\n    A sudden passing bullet shook it dry.\n    The indwelling spider ran to greet the fly,\nBut finding nothing, sullenly withdrew.\n    </poem>\n    Explain the meaning of the following lines: "A sudden passing bullet shook it dry."\n    '

### Split dataset into train/validation/test without intersections between poets

In [159]:
def split_by_author(df, split_ratio=[0.7, 0.1]) -> list[pd.DataFrame]:
    unique_poets_count = dict(df['poet'].value_counts())
    """Function that splits dataset into train/validation/test with no intersection between authors"""
    
    # set target counts for each subset
    total_count = len(df)
    count_deviation = total_count*0.01
    train_count_target = int(total_count * split_ratio[0])
    validation_count_target = int(total_count * split_ratio[1])
    test_count_target = total_count - train_count_target - validation_count_target
    train_poets, train_count = [], 0
    validation_poets, validation_count = [], 0
    
    while abs(train_count-train_count_target) > count_deviation:
        print('Selecting train dataset')
        # define start values
        train_poets, train_count = [], 0
        unique_poets_list = df['poet'].value_counts().index.values.copy()

        while train_count < train_count_target:
            random_index = random.randint(0, len(unique_poets_list)-1)
            train_poets.append(unique_poets_list[random_index])
            train_count += unique_poets_count[unique_poets_list[random_index]]
            unique_poets_list = np.delete(unique_poets_list, random_index)
           
    
    while abs(validation_count-validation_count_target) > count_deviation:
        print('Selecting validation dataset')
        validation_poets, validation_count = [], 0
        val_unique_poets_list = unique_poets_list.copy()
        
        while validation_count < validation_count_target:
            random_index = random.randint(0, len(val_unique_poets_list)-1)
            validation_poets.append(val_unique_poets_list[random_index])
            validation_count += unique_poets_count[val_unique_poets_list[random_index]]
            val_unique_poets_list = np.delete(val_unique_poets_list, random_index)
    
    # all left poets are for testing
    test_poets = val_unique_poets_list
    
    print(train_count, len(df[df['poet'].isin(train_poets)]))
    print(set(train_poets).intersection(validation_poets))
    print(f"Allowed deviation = {count_deviation}")
    print(f"Train count (target={train_count_target}) = {len(df[df['poet'].isin(train_poets)])}")
    print(f"Validation count (target={validation_count_target}) = {len(df[df['poet'].isin(validation_poets)])}")
    print(f"Test count (target={test_count_target}) = {len(df[df['poet'].isin(test_poets)])}")
        
    return df[df['poet'].isin(train_poets)], df[df['poet'].isin(validation_poets)], df[df['poet'].isin(test_poets)]

In [158]:
train_df, validation_df, test_df = split_by_author(df)

Selecting train dataset
Selecting validation dataset
Selecting validation dataset
2576 2576
set()
Allowed deviation = 36.29
Train count (target=2540) = 2576
Validation count (target=362) = 366
Test count (target=727) = 687


In [160]:
train_df.to_csv('./data/annotations_dataset_train.csv', index=False)
validation_df.to_csv('./data/annotations_dataset_validation.csv', index=False)
test_df.to_csv('./data/annotations_dataset_test.csv', index=False)

### Create HF dataset from train/test split

In [9]:
data_files = {"train": ["./data/annotations_dataset_train.csv"],
             "test": ["./data/annotations_dataset_test.csv"],
             "validation": ["./data/annotations_dataset_validation.csv"]}

In [10]:
from datasets import load_dataset
dataset = load_dataset("csv", data_files=data_files)
dataset.push_to_hub("prettyvampire/genius_poems_annotations")

Generating train split: 0 examples [00:00, ? examples/s]

Generating test split: 0 examples [00:00, ? examples/s]

Generating validation split: 0 examples [00:00, ? examples/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/3 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1 [00:00<?, ?ba/s]

README.md:   0%|          | 0.00/485 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/prettyvampire/genius_poems_annotations/commit/8e85f16d583a25634b049386c0819ae69d1a7ecb', commit_message='Upload dataset', commit_description='', oid='8e85f16d583a25634b049386c0819ae69d1a7ecb', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/prettyvampire/genius_poems_annotations', endpoint='https://huggingface.co', repo_type='dataset', repo_id='prettyvampire/genius_poems_annotations'), pr_revision=None, pr_num=None)

In [7]:
len(dataset['train']), len(dataset['test'])

(2576, 687)

### Try gemma 2-2b-it on one sample

In [10]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig, GemmaTokenizer

model_id = 'google/gemma-2-2b-it'
tokenizer = AutoTokenizer.from_pretrained(model_id, token=os.environ['HF_TOKEN'], add_eos_token=True)
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             device_map='cuda',
                                             token=os.environ['HF_TOKEN'],
                                             torch_dtype=torch.float16,
                                             use_cache=False)

Loading checkpoint shards: 100%|█████████████████████████████████████████████████████████| 2/2 [00:01<00:00,  1.03it/s]


In [11]:
text = '\n    You are given the poem "Lenox Avenue: Midnight" by "Langston Hughes".\n    <poem>\n    \n    The rhythm of life\nIs a jazz rhythm,\n    Honey.\nThe gods are laughing at us.\nThe broken heart of love,\n    </poem>\n    Explain the meaning of the following lines: "The rhythm of life\nIs a jazz rhythm,"\n    '

In [12]:
tokenizer.chat_template

"{{ bos_token }}{% if messages[0]['role'] == 'system' %}{{ raise_exception('System role not supported') }}{% endif %}{% for message in messages %}{% if (message['role'] == 'user') != (loop.index0 % 2 == 0) %}{{ raise_exception('Conversation roles must alternate user/assistant/user/assistant/...') }}{% endif %}{% if (message['role'] == 'assistant') %}{% set role = 'model' %}{% else %}{% set role = message['role'] %}{% endif %}{{ '<start_of_turn>' + role + '\n' + message['content'] | trim + '<end_of_turn>\n' }}{% endfor %}{% if add_generation_prompt %}{{'<start_of_turn>model\n'}}{% endif %}"

In [13]:
device = "cuda"

messages = [
    {"role": "user", "content": text},
]
inputs = tokenizer.apply_chat_template(messages, return_tensors="pt", return_dict=True).to("cuda")

outputs = model.generate(**inputs, max_new_tokens=150, pad_token_id=tokenizer.eos_token_id)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))

user
You are given the poem "Lenox Avenue: Midnight" by "Langston Hughes".
    <poem>
    
    The rhythm of life
Is a jazz rhythm,
    Honey.
The gods are laughing at us.
The broken heart of love,
    </poem>
    Explain the meaning of the following lines: "The rhythm of life
Is a jazz rhythm,"
* **What is the meaning of the line?**
* **How does this line relate to the overall theme of the poem?**

Here's a breakdown of the poem:

* **"The rhythm of life is a jazz rhythm, honey."** This line sets the tone for the poem, suggesting a sense of improvisation, spontaneity, and a certain unpredictability in life.
* **"The gods are laughing at us."** This line suggests a sense of irony and perhaps even despair.
* **"The broken heart of love."** This line suggests a sense of loss and pain.


Let me know if you'd like to explore any other lines from the poem! 

