#### Install dependencies and mount Google Drive for Colab

In [1]:
# Mount Google Drive at /content/drive so that later cells can read and write
# files under that path (see `drive_location` in the configuration cell).
from google.colab import drive # For Colab usage
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
!pip install -q -U transformers evaluate peft rouge_score bitsandbytes flash-attn bert_score

In [None]:
!pip install git+https://github.com/google-research/bleurt.git

In [4]:
import os
import pandas as pd
import evaluate
import numpy as np

# Never hard-code the Hugging Face token in the notebook. Use the value that is
# already present in the environment and prompt for it (without echoing) only
# when it is missing, so a valid token is never overwritten by a placeholder.
if not os.environ.get("HF_TOKEN"):
    from getpass import getpass
    os.environ["HF_TOKEN"] = getpass("Hugging Face token: ")

# Project folder on the mounted Google Drive; checkpoints and result files
# referenced by later cells live under this path.
drive_location = "/content/drive/MyDrive/Colab Notebooks/Poemma"

In [6]:
def create_prompt(inputs: dict) -> str:
    """
    Build the poetry-explanation prompt for one annotated example.

    Args:
        inputs: mapping with the keys 'poem_title', 'poet', 'content_before',
            'context_after' and 'referent'. Falsy values of 'content_before'
            and 'context_after' (e.g. None) are rendered as empty strings.

    Returns:
        The prompt text: the poem excerpt wrapped in <poem> tags followed by
        the request to explain the referent lines.

    Raises:
        KeyError: if one of the expected keys is missing from `inputs`.
    """
    # The template (including its indentation) is kept verbatim so that the
    # produced text does not change.
    template = """
    You are given the poem "{title}" by {poet}.
    <poem>
    {content_before}
    {referent}
    {context_after}
    </poem>
    Explain the meaning of the following lines: "{referent}"
    """
    fields = {
        'title': inputs['poem_title'],
        'poet': inputs['poet'],
        'content_before': inputs['content_before'] or '',
        'context_after': inputs['context_after'] or '',
        'referent': inputs['referent'],
    }
    return template.format(**fields)

#### Example: how to merge the base model and the adapter

In [None]:
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig
from peft import PeftConfig, PeftModel
import torch

# Load the trained adapter on top of its base model so the two can be merged
# in the next cell.
output_dir = f"{drive_location}/checkpoint-gemma-2-2b-it" # checkpoint of the trained adapter
# The adapter config records which base model the adapter was trained on.
config = PeftConfig.from_pretrained(output_dir)
base_model_name = config.base_model_name_or_path

base_model = AutoModelForCausalLM.from_pretrained(base_model_name, torch_dtype=torch.float16) # base model without quantization
# Wrap the base model with the adapter weights stored in `output_dir`.
model = PeftModel.from_pretrained(base_model, output_dir, torch_dtype=torch.float16)

In [58]:
# Merge the adapter into the base model and write the merged model to a local
# directory.
# NOTE(review): only the model is saved here; the tokenizer is not written to
# this directory.
merged_model = model.merge_and_unload()
merged_model.save_pretrained('./poemma_merged_gemma_fp_16')

### Load model from Hub for batch prediction

In [68]:
import os
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM, BitsAndBytesConfig

# Load the tokenizer and a 4-bit (NF4) quantized copy of the model from the Hub.
model_id = 'prettyvampire/poemma'
# Left padding is requested for batched generation.
# NOTE(review): add_eos_token=True presumably appends EOS to every encoded
# prompt; the '<eos>model\n' marker used by the evaluation code appears to rely
# on that - confirm before changing.
tokenizer = AutoTokenizer.from_pretrained(model_id, add_eos_token=True, padding_side='left')
# The EOS token doubles as the padding token.
tokenizer.pad_token = tokenizer.eos_token
nf4_config = BitsAndBytesConfig(
   load_in_4bit=True,
   bnb_4bit_quant_type="nf4",
   bnb_4bit_use_double_quant=True,
   bnb_4bit_compute_dtype=torch.bfloat16 # torch.float16 if GPU doesn't support bfloat16
)

# The token is read from the HF_TOKEN environment variable set in the
# configuration cell.
model = AutoModelForCausalLM.from_pretrained(model_id,
                                             quantization_config=nf4_config,
                                             token=os.environ['HF_TOKEN'],
                                             attn_implementation="flash_attention_2",
                                             torch_dtype=torch.bfloat16,
                                             device_map="auto",
                                             use_cache=True)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

#### Load dataset and apply chat template without tokenization

In [63]:
from datasets import load_dataset

# NOTE(review): MAX_SEQUENCE_LENGTH is not referenced by any visible cell; the
# inference cell passes the literal 512 instead.
MAX_SEQUENCE_LENGTH = 512

# Load the annotated poems and replace falsy (e.g. missing) surrounding context
# with empty strings; the other prompt fields are passed through unchanged.
dataset = load_dataset("prettyvampire/genius_poems_annotations")
dataset = dataset.map(lambda x: {
    'poem_title': x['poem_title'],
    'poet': x['poet'],
    'content_before': x['content_before'] if x['content_before'] else '',
    'context_after': x['context_after'] if x['context_after'] else '',
    'referent': x['referent']
})
# Keep the test-split annotations, because the original columns are removed
# from `dataset` when the chat template is applied.
labelled = dataset['test']['annotation']

In [65]:
def apply_poemma_template(example, tokenizer):
    """
    Render one dataset example as an untokenized chat-formatted string.

    The prompt built by `create_prompt` is sent as a single user turn (no
    system turn) through the tokenizer's chat template.

    Args:
        example: dataset row holding the fields `create_prompt` expects.
        tokenizer: tokenizer providing `apply_chat_template`.

    Returns:
        The same `example` object with the rendered string stored under the
        "text" key.
    """
    user_turn = {"role": "user", "content": create_prompt(example)}
    example["text"] = tokenizer.apply_chat_template([user_turn], tokenize=False)
    return example

# Apply the chat template to every split. All original columns are removed, so
# afterwards each example only carries the rendered "text" field.
column_names = list(dataset["train"].features)
dataset = dataset.map(apply_poemma_template,
                      fn_kwargs={"tokenizer": tokenizer},
                      remove_columns=column_names,
                      desc="Applying chat template"
                     )


Applying chat template:   0%|          | 0/2576 [00:00<?, ? examples/s]

Applying chat template:   0%|          | 0/687 [00:00<?, ? examples/s]

In [66]:
# Number of examples in the train and test splits.
len(dataset['train']), len(dataset['test'])

(2576, 687)

Useful for inference: https://huggingface.co/docs/transformers/main/llm_optims?spec-decoding=sampling&static-kv=basic+usage%3A+generation_config

#### Run inference on the test dataset

In [None]:
import torch
from tqdm import tqdm
from torch.utils.data import DataLoader

# Optional speed-ups (disabled): static KV cache and a compiled forward pass.
#model.generation_config.cache_implementation = "static"
#model.forward = torch.compile(model.forward, mode="reduce-overhead", fullgraph=True)

# The EOS token doubles as the padding token for batched generation.
tokenizer.pad_token = tokenizer.eos_token

BATCH_SIZE = 8
# shuffle=False keeps the generations aligned with the order of the test split.
eval_dataloader = DataLoader(dataset['test']['text'], batch_size=BATCH_SIZE, shuffle=False)
device='cuda'
# One list of decoded strings per batch; flattened in the next cell.
generated = []

with torch.no_grad():
    for batch in tqdm(eval_dataloader, desc="Evaluating"):
        encoded_batch = tokenizer(batch, padding=True, max_length=512, truncation=True, return_tensors='pt', padding_side='left').to(device)
        # Fix: all generation options are now passed inside a single call. The
        # previous version closed the parenthesis after `pad_token_id`, which
        # left `repetition_penalty` dangling and made the cell a SyntaxError.
        outputs = model.generate(**encoded_batch,
                                 max_new_tokens=128,
                                 use_cache=True,
                                 pad_token_id=tokenizer.eos_token_id,
                                 repetition_penalty=1.25)
        # Special tokens are kept on purpose: the evaluation code splits the
        # decoded text on them to isolate the model's answer.
        output = tokenizer.batch_decode(outputs, skip_special_tokens=False)
        generated.append(output)

In [70]:
# Flatten the per-batch lists of decoded generations into one record per
# example and store them as JSON.
flattenned_list = [
    {'text': decoded}
    for batch_outputs in generated
    for decoded in batch_outputs
]
pd.DataFrame(flattenned_list).to_json('results.json')

### Evaluation: ROUGE, BLEU, BLEURT, BERTScore metrics

In [None]:
import json
import re

def process_text_before_evaluation(t: str):
    """Remove unnecessary symbols from a text before evaluation.

    Newlines become spaces, the special tokens '<eos>' and '<end_of_turn>' are
    removed, runs of spaces are collapsed into one and surrounding whitespace
    is stripped.

    Args:
        t: raw reference or generated text.

    Returns:
        The cleaned, single-line text.
    """
    t = t.replace('\n', ' ')
    t = t.replace('<eos>', '')
    t = t.replace('<end_of_turn>', '')
    t = re.sub(' +', ' ', t)
    # Strip last: removing a special token at either end of the text can expose
    # whitespace that an earlier strip would have missed.
    return t.strip()

def prepare_sampled_text_for_evaluation(file_path: str, is_gemma=False):
  """Read generated texts from a JSON file and extract the assistant responses.

  The file is expected to hold a "text" mapping (index -> full decoded
  generation). Each generation is split on the marker that precedes the
  assistant's answer; the part that follows the first marker is cleaned with
  `process_text_before_evaluation`.

  Args:
    file_path: path of the JSON file with the generations.
    is_gemma: if True, split on '<eos>model\n'; otherwise split on
      'assistant<|end_header_id|>'.

  Returns:
    A list with one cleaned response per generation, in file order. A
    generation that does not contain the marker yields an empty string, so the
    list stays aligned with the references.
  """
  # JSON is UTF-8 by specification; do not depend on the platform default.
  with open(file_path, encoding='utf-8') as f:
    generated_annotations = json.load(f)

  split_line = '<eos>model\n' if is_gemma else 'assistant<|end_header_id|>'

  sampled_text_only = []
  for full_text in generated_annotations['text'].values():
    if split_line in full_text:
      assistant_reponse = full_text.split(split_line)[1]
      sampled_text_only.append(process_text_before_evaluation(assistant_reponse))
    else:
      sampled_text_only.append('')

  return sampled_text_only

In [83]:
# Extract the cleaned model answers from the stored generations, splitting on
# the Gemma marker (is_gemma=True).
assistant_reponses = prepare_sampled_text_for_evaluation(f"{drive_location}/results/gemma_results.json", is_gemma=True)

In [84]:
from datasets import load_dataset

# Reload the original dataset to obtain the human annotations. `data_files` was
# never defined in this notebook, so the previous CSV load failed on a fresh
# kernel; the Hub dataset is the one the test prompts were built from, which
# keeps the references in the same order as the predictions.
# NOTE(review): assumes the CSV used originally matched the Hub test split.
dataset = load_dataset("prettyvampire/genius_poems_annotations")
human_responses = [process_text_before_evaluation(s) for s in dataset['test']['annotation']]

In [87]:
# Spot check: generated explanation for test example 50.
assistant_reponses[50]

'The speaker has been at war for a long time, and is now tired of it.'

In [88]:
# Spot check: human-written annotation for the same test example.
human_responses[50]

'That the speaker opens a sonnet by conveying persisting discontent truly conveys the tone of the piece immediately. Where many sonnets wholly pertain to unrequited love , this particular piece conveys the speaker’s discontents with a cause less overtly stated, that “I love another, and thus I hate myself”. Of course, at the end of a war, one expects peace, but the speaker states that though “all [my] war is done”, “I find no peace”. This contrast is furthered throughout, portraying the speaker’s situation as perhaps contradicting, or contrasting—he is being pulled from different directions, “I fly above the wind, yet can I not arise”.'

In [None]:
# Drop prediction/reference pairs whose prediction is (almost) empty.
# The lists are rebuilt instead of deleting items while iterating: deleting
# during enumeration shifts the remaining items and silently skips the element
# that follows every removed one.
MIN_RESPONSE_CHARS = 5
kept_pairs = []
for i, (ar, hr) in enumerate(zip(assistant_reponses, human_responses)):
  if len(ar) < MIN_RESPONSE_CHARS:
    # Report the index (in the unfiltered list) and text of the dropped pair.
    print(i, ar)
  else:
    kept_pairs.append((ar, hr))
assistant_reponses = [ar for ar, _ in kept_pairs]
human_responses = [hr for _, hr in kept_pairs]

In [91]:
from pprint import pprint

# Load the four metrics used in this evaluation.
rouge = evaluate.load('rouge')
bleu = evaluate.load('bleu')
bleurt = evaluate.load("bleurt", module_type="metric")
bertscore = evaluate.load("bertscore", lang='en')

# ROUGE and BLEU are computed here; BLEURT and BERTScore follow in the next
# cell.
rouge_results = rouge.compute(predictions=assistant_reponses, references=human_responses)
bleu_results = bleu.compute(predictions=assistant_reponses, references=human_responses)

pprint(rouge_results)
pprint(bleu_results)



{'rouge1': 0.20165773409507198,
 'rouge2': 0.03844267696295515,
 'rougeL': 0.1506271018812786,
 'rougeLsum': 0.1506423028226343}
{'bleu': 0.0034658997786793898,
 'brevity_penalty': 0.11384377367696598,
 'length_ratio': 0.31516628939578617,
 'precisions': [0.40552486187845305,
                0.06729445852951829,
                0.011881188118811881,
                0.0026495383380168605],
 'reference_length': 45944,
 'translation_length': 14480}


In [92]:
# Model-based metrics for the same prediction/reference pairs.
bleurt_results = bleurt.compute(predictions=assistant_reponses, references=human_responses)
bertscore_results = bertscore.compute(predictions=assistant_reponses, references=human_responses, lang='en')

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


In [93]:
# Collect the per-example BERTScore precision, recall and F1 in one frame and
# show the positions of the best and the worst F1.
df_bert_score = pd.DataFrame(
    {name: bertscore_results[name] for name in ('precision', 'recall', 'f1')}
)
df_bert_score['f1'].idxmax(), df_bert_score['f1'].idxmin()

(263, 429)

In [96]:
# Summary statistics of the per-example BERTScore values.
df_bert_score.describe()

Unnamed: 0,precision,recall,f1
count,675.0,675.0,675.0
mean,0.873021,0.834167,0.852961
std,0.020204,0.022413,0.017263
min,0.768253,0.768145,0.799913
25%,0.860489,0.819336,0.841332
50%,0.872843,0.8326,0.851435
75%,0.886699,0.84962,0.864117
max,0.933354,0.903087,0.905367


In [97]:
import pandas as pd
import numpy as np

# Summary statistics of the per-example BLEURT scores.
df_describe = pd.DataFrame(bleurt_results['scores'])
df_describe.describe()

Unnamed: 0,0
count,675.0
mean,-1.13425
std,0.335436
min,-1.947912
25%,-1.367462
50%,-1.147086
75%,-0.910865
max,-0.05369


### Pushing results to hub

In [141]:
# Publish the mean BLEURT score on the test split to the model repository's
# metadata on the Hub.
# NOTE(review): the hard-coded value (-0.926) differs from the BLEURT mean
# displayed above (-1.13425); presumably it comes from a different evaluation
# run or model - confirm before re-running.
evaluate.push_to_hub(
    model_id="prettyvampire/poemma",
    metric_value=-0.926,
    metric_type="bleurt",
    dataset_type="prettyvampire/genius_poems_annotations",
    metric_name="BLEURT Mean",
    dataset_name="Genius Poems Annotations",
    dataset_split="test",
    task_type="text-generation",
    task_name="Poems Annotation Generation"
)

README.md:   0%|          | 0.00/5.77k [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/prettyvampire/poemma/commit/3f2a4adf5693ff679df13a1fa38e67b424b6acbe', commit_message='Update metadata with huggingface_hub', commit_description='', oid='3f2a4adf5693ff679df13a1fa38e67b424b6acbe', pr_url=None, repo_url=RepoUrl('https://huggingface.co/prettyvampire/poemma', endpoint='https://huggingface.co', repo_type='model', repo_id='prettyvampire/poemma'), pr_revision=None, pr_num=None)