In [None]:
%pip install openai datasets plotly

In [19]:
import re

def process_text_before_evaluation(t: str) -> str:
    """Normalize a raw annotation string before evaluation.

    Newlines are turned into spaces, the ``<eos>`` and ``<end_of_turn>``
    markers are removed, runs of spaces are collapsed to a single space and
    leading/trailing whitespace is trimmed.

    Args:
        t: Raw human annotation or model output.

    Returns:
        The cleaned, single-line string.
    """
    t = t.replace('\n', ' ')
    t = t.replace('<eos>', '')
    t = t.replace('<end_of_turn>', '')
    t = re.sub(' +', ' ', t)
    # Trim last: removing a marker at either end (e.g. "text <eos>") exposes
    # whitespace that an earlier strip() would not have seen.
    return t.strip()


def prepare_sampled_text_for_evaluation(file_path: str, is_gemma=False):
    """Read generated samples from a JSON file and extract the assistant responses.

    The file must contain a ``"text"`` object mapping sample keys to the full
    generated text. For every sample, the part after the response marker is
    taken and cleaned with ``process_text_before_evaluation``.

    Args:
        file_path: Path to the JSON file with the generated samples.
        is_gemma: Selects the response marker: ``'<eos>model\\n'`` when true,
            ``'assistant<|end_header_id|>'`` otherwise.

    Returns:
        A list with one cleaned response per sample, in file order. Samples
        that do not contain the marker contribute an empty string.
    """
    # Imported here so the function does not depend on an import made in a later cell.
    import json

    # Explicit UTF-8 so decoding does not depend on the platform's default encoding.
    with open(file_path, encoding='utf-8') as f:
        generated_annotations = json.load(f)

    split_line = '<eos>model\n' if is_gemma else 'assistant<|end_header_id|>'

    sampled_text_only = []
    for generated_text in generated_annotations['text'].values():
        if split_line in generated_text:
            assistant_response = generated_text.split(split_line)[1]
            sampled_text_only.append(process_text_before_evaluation(assistant_response))
        else:
            # Keep a placeholder so list positions still line up with the samples.
            sampled_text_only.append('')

    return sampled_text_only

#### Load human responses

In [13]:
from datasets import load_dataset
# Load the poem-annotation dataset; the `annotation` column of its `test` split
# is used below as the human reference text.
dataset = load_dataset("prettyvampire/genius_poems_annotations")

README.md:   0%|          | 0.00/571 [00:00<?, ?B/s]

train-00000-of-00001.parquet:   0%|          | 0.00/1.41M [00:00<?, ?B/s]

test-00000-of-00001.parquet:   0%|          | 0.00/375k [00:00<?, ?B/s]

Generating train split:   0%|          | 0/2576 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/687 [00:00<?, ? examples/s]

In [20]:
# Human reference annotations from the test split, cleaned with the same
# routine that is applied to the model outputs.
human_responses = [
    process_text_before_evaluation(annotation)
    for annotation in dataset['test']['annotation']
]

In [28]:
# Sanity check: one cleaned annotation per example of the test split is expected.
print('Total responses from human = ', len(human_responses))

Total responses from human =  687


In [196]:
# Spot-check a single cleaned human annotation (index 80).
human_responses[80]

'‘Ancient Lord’ – nothing if not respectful. Escalus is the Duke’s right hand man. His name deliberately recalls the scales of justice – the play is obsessed with balancing right and wrong.'

#### Load assistants' responses

In [21]:
import json

# File stems of the sampled generations stored under ./data/, one per model.
files = [
    'llama-3.1-8b-it-4bit',
    'llama-3.1-8b-it-4bit-qlora',
    'gemma-2-2b-it-lora',
]

# Maps each file stem to its list of cleaned assistant responses.
assistants_responses = {}
for fname in files:
    # Files with "gemma" in their name are parsed with the Gemma response marker.
    assistants_responses[fname] = prepare_sampled_text_for_evaluation(
        f"./data/{fname}.json",
        is_gemma='gemma' in fname,
    )

In [197]:
# For every model: report how many responses were loaded and show the one at
# index 80, followed by a blank separator line.
for f in files:
    model_responses = assistants_responses[f]
    print(f'Total responses from assistant {f} = ', len(model_responses))
    print(model_responses[80], end='\n\n')

Total responses from assistant llama-3.1-8b-it-4bit =  687
The line "ESCALUS, an ancient Lord" from the "Measure for Measure Characters of the Play" by William Shakespeare refers to a character in the play named Escalus. In this context, "ancient" is an archaic term used to describe a person of high social status, wealth, and age. It is not necessarily referring to Escalus's physical age but rather his noble position in society. In the Elizabethan era, the term "ancient" was often used to describe a nobleman or a member of the aristocracy, conveying a sense of dignity, respect, and authority.

Total responses from assistant llama-3.1-8b-it-4bit-qlora =  687
Escalus is one of Claudio’s uncles and he appears only at the beginning of Act I Scene II as part of his role as Jailer-to-the-Gates where he reports on two different matters; Claudius’ situation with Juliet and Isabella being brought before him. He also reappears later in Act III when he serves as the messenger that delivers news a

#### Prepare batch file for OpenAI

In [132]:
# Template of the user message sent to the judge model. The four placeholders
# ({human_response}, {reference_annotation_1..3}) are filled with str.format
# when the batch requests are built.
# NOTE(review): the three alternatives are always inserted in the same order,
# so a positional preference of the judge is not controlled for.
prompt = """
You are given the correct annotation to a poem and 3 alternative annotations below.
Vote for the alternative annotation that provides explanation most similar to the correct annotation.
Provide the number of a most similar annotation as a response. Do not include any other details.

<correct annotation>
{human_response}
</correct annotation>

<alternative annotation 1>
{reference_annotation_1}
</alternative annotation 1>

<alternative annotation 2>
{reference_annotation_2}
</alternative annotation 2>

<alternative annotation 3>
{reference_annotation_3}
</alternative annotation 3>
"""

In [133]:
def prepare_request(rid, prompt, model="gpt-4o", max_tokens=64,
                    system_prompt="You are an expert in poetry."):
    """Build one chat-completions request entry for a batch input file.

    Args:
        rid: Request identifier; embedded in ``custom_id`` so a response can be
            matched back to its request.
        prompt: Text of the user message.
        model: Name of the model that should answer the request.
        max_tokens: Upper bound on the length of the generated answer.
        system_prompt: Text of the system message.

    Returns:
        A JSON-serializable dict with the keys ``custom_id``, ``method``,
        ``url`` and ``body``.
    """
    messages = [
        {"role": "system", "content": system_prompt},
        {"role": "user", "content": prompt},
    ]
    return {
        "custom_id": f"poems-annotation-request-{rid}",
        "method": "POST",
        "url": "/v1/chat/completions",
        "body": {
            "model": model,
            "messages": messages,
            "max_tokens": max_tokens,
        },
    }

In [134]:
# Collects the request payloads that are later written to the batch JSONL file.
batch_requests = []

In [135]:
# Start from an empty list so that re-running this cell cannot append a second
# copy of every request (with duplicate custom_ids) to an already filled list.
batch_requests = []

# The judge always sees the models in this fixed order:
#   1 = llama-3.1-8b-it-4bit, 2 = llama-3.1-8b-it-4bit-qlora, 3 = gemma-2-2b-it-lora
annotation_rows = zip(
    human_responses,
    assistants_responses['llama-3.1-8b-it-4bit'],
    assistants_responses['llama-3.1-8b-it-4bit-qlora'],
    assistants_responses['gemma-2-2b-it-lora'],
)

for i, annotations in enumerate(annotation_rows):
    filled_prompt = prompt.format(
        human_response=annotations[0],
        reference_annotation_1=annotations[1],
        reference_annotation_2=annotations[2],
        reference_annotation_3=annotations[3],
    )
    # The example index doubles as the request id.
    batch_requests.append(prepare_request(i, prompt=filled_prompt))

In [136]:
# Preview the fully formatted user message of one request (index 600).
print(batch_requests[600]['body']['messages'][1]['content'])


You are given the correct annotation to a poem and 3 alternative annotations below.
Vote for the alternative annotation that provides explanation most similar to the correct annotation.
Provide the number of a most similar annotation as a response. Do not include any other details.

<correct annotation>
In the beginning of the poem the child absorbs the impressions of passive objects into his being; now the child is absorbing active traits and emotions. This ability to absorb one’s surroundings is one of the crowning feature’s of childhood. Children are perpetually engaged in the process of learning and making connections. The next lines go on to show the unforgettable and uncanny sensory details by which we recall familial relations. Mother has a certain smell. Father is rather exuberant. These are intimate details of family life that a child is sure to remember.
</correct annotation>

<alternative annotation 1>
The lines "They gave him afterward every day, they became part of him" f

In [137]:
# Write the requests as JSON Lines: one serialized request object per line.
with open('./data/batch_requests_3.jsonl', 'w') as f:
    f.writelines(json.dumps(entry) + '\n' for entry in batch_requests)

#### Creating batch request

In [138]:
import os

from openai import OpenAI

# Take the key from the environment so that no credential is stored in the
# notebook; a missing OPENAI_API_KEY fails here with a KeyError.
client = OpenAI(api_key=os.environ["OPENAI_API_KEY"])

# Upload the JSONL request file for batch processing. The context manager
# closes the file handle once the upload call has returned.
with open("./data/batch_requests_3.jsonl", "rb") as requests_file:
    batch_input_file = client.files.create(
        file=requests_file,
        purpose="batch"
    )

In [None]:
# Id of the uploaded JSONL request file.
batch_input_file_id = batch_input_file.id

# Submit the uploaded file as a batch job against the chat-completions endpoint.
# The call is the last expression of the cell, so the created batch object is
# shown as the cell output.
client.batches.create(
    input_file_id=batch_input_file_id,
    endpoint="/v1/chat/completions",
    completion_window="24h",
    metadata={
      "description": "poems annotation job"
    }
)

In [None]:
# Look up the current state of a batch job.
# NOTE(review): the batch id is hard-coded, presumably copied from one specific
# run — a fresh run needs the id of its own batch.
client.batches.retrieve("batch_67505f1e994c81919e79bb6c794ff6a3")

In [151]:
# Download the content of a file by id; its text is saved as the batch responses below.
# NOTE(review): the file id is hard-coded, presumably the output file of one
# specific batch run — confirm and replace it when re-running.
file_response = client.files.content("file-GYidKYvWWn2Jz35mu9gt7x")

In [152]:
# Persist the downloaded response text as a UTF-8 JSON Lines file.
with open('./data/batch_responses_3.jsonl', mode='w', encoding='utf-8') as out_file:
    out_file.write(file_response.text)

In [153]:
# Extract the judge's reply text from every line of the saved batch output.
# The file is read as UTF-8 because that is how it was written; blank lines
# are skipped so that they cannot break JSON parsing.
with open('./data/batch_responses_3.jsonl', encoding='utf-8') as f:
    responses = [
        json.loads(line)['response']['body']['choices'][0]['message']['content']
        for line in f
        if line.strip()
    ]

In [146]:
from collections import Counter

**First batch responses**

In [105]:
# Tally how often each distinct reply occurs in the loaded responses.
Counter(responses)

Counter({'1': 457, '3': 154, '2': 76})

**Second batch responses**

In [124]:
# Same tally of distinct replies, recorded after loading another batch.
Counter(responses)

Counter({'1': 497,
         '3': 108,
         '2': 81,
         "I'm sorry, but I can't choose the most similar annotation without content in the reference annotations. Can you please provide the content for the three reference annotations?": 1})

**Third batch responses**

In [154]:
# Tally of distinct replies; replies that are not a bare number show up as separate keys.
Counter(responses)

Counter({'1': 365,
         '3': 188,
         '2': 110,
         'Alternative annotation 1': 6,
         'Alternative annotation 2': 5,
         'Alternative annotation 3': 2,
         'Annotation 3': 2,
         'Annotation 1': 2,
         'The number of the most similar annotation is 1.': 1,
         'None of the alternative annotations provided are notably similar to the correct annotation, as they do not address the meter, rhythm, caesura, or the potential volta.': 1,
         "None of the alternative annotations seem to provide an explanation similar to the correct annotation. However, if I have to choose the closest one, alternative annotation 2 has some reference to psychological states and complexity, which could loosely relate to the craftsmanship and construction mentioned in the correct annotation, even if it doesn't directly address it. So": 1,
         'None of the alternative annotations provided closely aligns with the explanation given in the correct annotation.': 1,
 

### Charts & Stats

In [169]:
# pandas is needed by the cells below, which build DataFrames via `pd.DataFrame`.
import pandas as pd
import plotly.express as px
import plotly.graph_objects as go
import plotly.io as pio
import plotly.offline as pyo

# Prepare the notebook for rendering plotly figures inline.
pyo.init_notebook_mode()
# NOTE(review): `plotly_template` is not referenced by any visible cell.
plotly_template = pio.templates["plotly_white"]

In [179]:
# Logged losses as (step, train, validation) rows — presumably copied from the
# training run's log output.
loss_log = [
    (50, 1.8328, 1.944836),
    (100, 1.8978, 1.826170),
    (150, 1.7519, 1.780192),
    (200, 1.8609, 1.741657),
    (250, 2.0793, 1.714025),
    (300, 1.9035, 1.692754),
    (350, 1.7201, 1.681152),
    (400, 1.3697, 1.675502),
    (450, 1.9740, 1.671325),
    (500, 1.8229, 1.668295),
    (550, 1.0766, 1.666146),
    (600, 1.1093, 1.664798),
]

# One record per logged step, then a table with the columns step / train / validation.
data = [
    {"step": step, "train": train_loss, "validation": validation_loss}
    for step, train_loss, validation_loss in loss_log
]
df = pd.DataFrame(data)

### Training graph

In [184]:
# Line chart of the validation and training loss over the logged steps.
fig = go.Figure()

# (column, legend label, line colour); validation is added first, as before.
for column, label, colour in (
    ('validation', 'Validation Loss', 'orange'),
    ('train', 'Train Loss', 'black'),
):
    fig.add_trace(go.Scatter(x=df['step'], y=df[column],
                             mode='lines+markers',
                             name=label, line_color=colour))

fig.update_layout(plot_bgcolor='white', title="One epoch training")

# Both axes share the same boxed look; only their titles differ.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(title="Step number", **axis_style)
fig.update_yaxes(title="Cross-Entropy Loss", **axis_style)

fig.show()

In [203]:
# Scores per model, keyed by metric name.
scores_by_model = {
    "llama-3.1-8b-it-4bit": {
        "ROUGE-1": 0.256,
        "BLEU": 0.171,
        "BLEURT mean": -0.929,
        "BERTScore F1 mean": 0.839,
    },
    "llama-3.1-8b-it-4bit-qlora": {
        "ROUGE-1": 0.185,
        "BLEU": 0.005,
        "BLEURT mean": -0.926,
        "BERTScore F1 mean": 0.836,
    },
    "gemma-2-2b-it-lora": {
        "ROUGE-1": 0.201,
        "BLEU": 0.003,
        "BLEURT mean": -1.134,
        "BERTScore F1 mean": 0.852,
    },
}

# Flatten into one record per (model, metric) pair, model by model.
metrics = [
    {"model": model_name, "metric": metric_name, "score": score}
    for model_name, model_scores in scores_by_model.items()
    for metric_name, score in model_scores.items()
]

In [204]:
# Long-format table with one row per model/metric pair (columns: model, metric, score).
metrics_df = pd.DataFrame(metrics)

### ROUGE, BLEU, BLEURT, BERTScore Metrics

In [219]:
# NOTE(review): this rebinds `df`, which earlier held the training-loss table;
# re-running the training-graph cell after this one would use the metrics table.
df = metrics_df

# Fixed colour per model.
model_colors = {
    'llama-3.1-8b-it-4bit': 'orange',
    'llama-3.1-8b-it-4bit-qlora': 'grey',
    'gemma-2-2b-it-lora': 'black',
}

# Grouped bars: one group per metric, one bar per model.
fig = px.histogram(
    df,
    x="metric",
    y="score",
    color='model',
    barmode='group',
    height=400,
    color_discrete_map=model_colors,
)
fig.update_layout(plot_bgcolor='white', title="Standard Evaluation Metrics")

# Both axes share the same boxed look; only their titles differ.
axis_style = dict(
    mirror=True,
    ticks='outside',
    showline=True,
    linecolor='black',
    gridcolor='lightgrey',
)
fig.update_xaxes(title="Metric", **axis_style)
fig.update_yaxes(title="Score", **axis_style)
fig.show()

### LLM as a Judge

In [228]:
# How often each model's annotation was chosen; "none" counts replies that
# picked no alternative. The counts add up to 687.
# NOTE(review): the numbers are hard-coded, presumably tallied by hand from the
# judge replies — confirm against the latest batch output.
llm_as_a_judge = [
    {"model": "llama-3.1-8b-it-4bit", "times_chosen": 374},
    {"model": "llama-3.1-8b-it-4bit-qlora", "times_chosen": 116},
    {"model": "gemma-2-2b-it-lora", "times_chosen": 192},
    {"model": "none", "times_chosen": 5},
]
judge_df = pd.DataFrame(llm_as_a_judge)

# Fixed colour per pie slice.
slice_colors = {
    'llama-3.1-8b-it-4bit': 'orange',
    'llama-3.1-8b-it-4bit-qlora': 'grey',
    'gemma-2-2b-it-lora': 'black',
    'none': 'lightgrey',
}

fig = px.pie(
    judge_df,
    values='times_chosen',
    color='model',
    names='model',
    title='GPT-4o Preference by Similarity to Human Response',
    color_discrete_map=slice_colors,
)
# Label every slice with its share and the model name.
fig.update_traces(textinfo='percent+label')
fig.show()