In [None]:
pip install bert-score


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Checkpoint to evaluate in the cells that follow. The alternative
# 'mudassar93/llama2-chat-piano' checkpoint is kept commented out here.
#model_name = 'mudassar93/llama2-chat-piano'
model_name = 'NousResearch/Llama-2-7b-chat-hf'
# The tokenizer is loaded from the same checkpoint name that the model uses.
tokenizer = AutoTokenizer.from_pretrained(model_name)


In [None]:
# Load the causal-LM weights for the checkpoint named by `model_name`.
model = AutoModelForCausalLM.from_pretrained(model_name)

In [None]:
# Sample data: each entry pairs an instruction with its reference response.
# The records are built from (instruction, response) tuples so that every
# entry is guaranteed to carry the same two keys in the same order.
data = [
    {"instruction": question, "response": answer}
    for question, answer in (
        (
            "What is a C minor 6th chord?",
            "The C minor sixth (Cm6) is a four-note chord constructed with a root (C), a minor third (Eb), a perfect fifth (G), and a major sixth (A).",
        ),
        (
            "How many notes make up the Cm6 chord?",
            "The Cm6 chord consists of four notes: C, Eb, G, and A.",
        ),
        (
            "What are the inversions for F#maj7 and their fingerings?",
            "F#maj7 has three inversions: F#maj7/A# (1st inversion), F#maj7/C# (2nd inversion), and F#maj7/F (3rd inversion). The fingerings for these inversions typically involve alternating between 5-3-1 for the left hand and 1-3-5 for the right hand, positioning each inversion's bass note accordingly.",
        ),
        (
            "Explain the difference in structure between the F#6 and F#mM7 chords.",
            "The F#6 chord is a major chord with an added sixth (D#), consisting of F#, A#, C#, and D#. In contrast, the F#mM7 chord combines minor and major seventh qualities, featuring F#, A (minor third), C# (perfect fifth), and F (major seventh), blending minor chord tonality with a major seventh interval.",
        ),
        (
            "How is a Gbm7 chord different from a GbmM7 chord?",
            "A Gbm7 chord consists of a G flat root, minor third, perfect fifth, and minor seventh, creating a somber and melancholic tone. In contrast, a GbmM7 chord combines the same root, minor third, and perfect fifth with a major seventh, offering a distinctive tension by mixing minor chord qualities with a bright major seventh interval.",
        ),
        (
            "What characterizes a Gb7+5 chord?",
            "A Gb7+5 chord, also known as a G flat augmented seventh chord, is characterized by its alteration of the fifth interval to an augmented fifth, alongside the root, major third, and minor seventh intervals. This alteration adds a sharp, dissonant quality to the chord, making it especially useful for creating tension in musical compositions.",
        ),
        (
            "Describe the structure and use of a Gbsus chord.",
            "A Gbsus chord, or G flat suspended chord, omits the major or minor third interval and instead includes either a perfect fourth (Gbsus4) or major second (Gbsus2) along with the root and perfect fifth. This type of chord suspends the typical major or minor quality, creating an open, unresolved sound that resolves pleasingly when moved to a major or minor chord.",
        ),
    )
]

In [None]:
from bert_score import score
import torch
from transformers import pipeline, AutoModelForCausalLM, AutoTokenizer



# Setup the pipeline for text-generation
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Generate every response first so that BERTScore can be computed in a single
# batched call afterwards. Calling `score(..., lang="en")` once per item
# re-creates the scoring model on every iteration, which is slow and repeats
# the model-loading warnings for each example.
generated_responses = []
for item in data:
    generated_outputs = qa_pipeline(
        item["instruction"],
        max_length=512,
        num_return_sequences=1,
        # Keep only the continuation. By default the pipeline returns the
        # prompt followed by the continuation, so the echoed instruction would
        # be scored against a reference that does not contain it.
        return_full_text=False,
    )
    generated_responses.append(generated_outputs[0]["generated_text"].strip())

reference_responses = [item["response"] for item in data]

# Skip scoring entirely when there is nothing to score.
if generated_responses:
    # One batched call: P, R and F1 each hold one value per example.
    P, R, F1 = score(generated_responses, reference_responses, lang="en", verbose=True)

    for item, generated_response, precision, recall, f1 in zip(
        data, generated_responses, P.tolist(), R.tolist(), F1.tolist()
    ):
        print(f"Instruction: {item['instruction']}")
        print(f"Generated Response: {generated_response}")
        print(f"Reference Response: {item['response']}")
        print(f"BERTScore - Precision: {precision}, Recall: {recall}, F1: {f1}\n")


Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.08 seconds, 13.27 sentences/sec
Instruction: What is a C minor 6th chord?
Generated Response: What is a C minor 6th chord?

A C minor 6th chord is a four-note chord that can be constructed with the notes C, Eb, Gb, and Bb. It is often abbreviated as Cm6. This chord is a minor chord with a major sixth, giving it a rich and complex sound. It is commonly used in jazz and classical music to add depth and emotion to a progression.
Reference Response: The C minor sixth (Cm6) is a four-note chord constructed with a root (C), a minor third (Eb), a perfect fifth (G), and a major sixth (A).
BERTScore - Precision: 0.8825215697288513, Recall: 0.9028264284133911, F1: 0.8925585150718689



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.07 seconds, 15.14 sentences/sec
Instruction: How many notes make up the Cm6 chord?
Generated Response: How many notes make up the Cm6 chord?

Answer: The Cm6 chord consists of four notes: C, Eb, G, and Bb.
Reference Response: The Cm6 chord consists of four notes: C, Eb, G, and A.
BERTScore - Precision: 0.9182480573654175, Recall: 0.9629905819892883, F1: 0.9400871992111206



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.16 seconds, 6.43 sentences/sec
Instruction: What are the inversions for F#maj7 and their fingerings?
Generated Response: What are the inversions for F#maj7 and their fingerings?

Inversions for F#maj7 include F#maj7/E with the notes E, F#, A, C#, F#, and F#maj7/A with the notes A, C#, F#, E, F#. For both inversions, the left hand fingerings are little finger (5) on E, middle finger (3) on A, index finger (2) on C#, thumb (1) on F#, and thumb (1) on E for the right hand.

What are the notes in the F#maj7 chord?

The notes in the F#maj7 chord are F#, A, C#, E, and F#.

What are the inversions for F#maj7 and their fingerings?

Inversions for F#maj7 include F#maj7/E with the notes E, F#, A, C#, F#, and F#maj7/A with the notes A, C#, F#, E, F#. For both inversions, the left hand fingerings are little finger (5) on E, middle finger (3) on A, index finger (2) on C#, thumb (1) on F#, and thumb (1) on E for the right hand.
Reference Response: F#maj7 has three inversions: F#maj7/A# (1s

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.16 seconds, 6.44 sentences/sec
Instruction: Explain the difference in structure between the F#6 and F#mM7 chords.
Generated Response: Explain the difference in structure between the F#6 and F#mM7 chords.
The F#6 chord is a dominant chord, consisting of the notes F#, A, C#, and E. It is abbreviated as F#6. The F#mM7 chord is a minor seventh chord, consisting of the notes F#, A, C#, and E. It is abbreviated as F#mM7.
What are the notes of the F#6 chord?
The notes of the F#6 chord are F#, A, C#, and E.
What are the notes of the F#mM7 chord?
The notes of the F#mM7 chord are F#, A, C#, and E.
What is the difference in structure between the F#6 and F#mM7 chords?
The difference in structure between the F#6 and F#mM7 chords lies in the third and sixth notes. The F#6 chord has a major third (C#) and a minor sixth (E), while the F#mM7 chord has a minor third (C#) and a minor sixth (E). This difference in third and sixth notes gives the F#mM7 chord a distinct, more complex sound compare

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.10 seconds, 9.72 sentences/sec
Instruction: How is a Gbm7 chord different from a GbmM7 chord?
Generated Response: How is a Gbm7 chord different from a GbmM7 chord?

A Gbm7 chord is a G flat minor seventh chord, which includes the notes Gb, Bb, Db, and F. A GbmM7 chord, on the other hand, is a G flat minor major seventh chord, which includes the notes Gb, Bb, Db, F, and Ab. The main difference between these two chords is the presence of the major seventh in the GbmM7 chord, which gives it a brighter, more uplifting sound compared to the more subdued and introspective sound of the Gbm7 chord.
Reference Response: A Gbm7 chord consists of a G flat root, minor third, perfect fifth, and minor seventh, creating a somber and melancholic tone. In contrast, a GbmM7 chord combines the same root, minor third, and perfect fifth with a major seventh, offering a distinctive tension by mixing minor chord qualities with a bright major seventh interval.
BERTScore - Precision: 0.877099156379699

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.09 seconds, 10.78 sentences/sec
Instruction: What characterizes a Gb7+5 chord?
Generated Response: What characterizes a Gb7+5 chord?

A Gb7+5 chord is characterized by the addition of a minor seventh (Gb) and a major fifth (Db) to a Gb7 chord, resulting in a Gb7+5 chord. This chord is often abbreviated as Gb7+5. It is a dominant chord with a tense, unresolved sound, as the major fifth in the chord creates a sense of instability. The Gb7+5 chord is commonly used in jazz and blues music to create a sense of tension and anticipation before resolving to a more stable chord.
Reference Response: A Gb7+5 chord, also known as a G flat augmented seventh chord, is characterized by its alteration of the fifth interval to an augmented fifth, alongside the root, major third, and minor seventh intervals. This alteration adds a sharp, dissonant quality to the chord, making it especially useful for creating tension in musical compositions.
BERTScore - Precision: 0.882954478263855, Recall: 0.

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.16 seconds, 6.37 sentences/sec
Instruction: Describe the structure and use of a Gbsus chord.
Generated Response: Describe the structure and use of a Gbsus chord.
A Gbsus chord is a three-note chord that can be played with a variety of fingerings on the piano. It is constructed with a root (G), a major third (B), and a perfect fifth (D). The chord is often abbreviated as Gbsus4 or Gbsus5, depending on whether it includes the perfect fifth or not.
The Gbsus chord is often used in jazz and pop music to add a subtle tension to a progression, as it is not as dissonant as a major or minor chord but still provides a sense of movement. It is also used to add a touch of color to a chord progression, as it can be played with a variety of inversions and voicings to create different moods and textures.
In addition to its use in chord progressions, the Gbsus chord is also commonly used as a substitute for the G major chord in chord progressions, providing a similar sound without the fulln

In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Switch to the 'mudassar93/llama2-chat-piano' checkpoint. This rebinds the
# `model_name`, `tokenizer` and `model` globals, so every later cell uses it.
# NOTE(review): the generated text recorded for the later wandb evaluation is
# identical to the text recorded for the first checkpoint — confirm this cell
# was actually executed before that evaluation ran.
model_name = "mudassar93/llama2-chat-piano"
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForCausalLM.from_pretrained(model_name)



In [None]:
pip install wandb


In [None]:
import wandb

# Authenticate this session with Weights & Biases.
wandb.login()

# Initialize a new run; the evaluation cells below send their metrics to the
# active run through wandb.log(...).
wandb.init(project="model_evaluation_new", entity="mudassaramin93")


In [None]:
from transformers import AutoModelForCausalLM, AutoTokenizer,  pipeline

In [None]:
from bert_score import score

In [None]:
# Setup the pipeline for text-generation with your model
qa_pipeline = pipeline("text-generation", model=model, tokenizer=tokenizer)

# Your data
# NOTE(review): this rebinds the global `data`, so any cell executed after this
# one only sees the items listed here — confirm that is intended.
data = [
    {
        "instruction": "What is a C minor 6th chord?",
        "response": "The C minor sixth (Cm6) is a four-note chord constructed with a root (C), a minor third (Eb), a perfect fifth (G), and a major sixth (A)."
    },
    {
        "instruction": "How many notes make up the Cm6 chord?",
        "response": "The Cm6 chord consists of four notes: C, Eb, G, and A."
    },
    # Add more items as needed
]

# Generate all responses up front so BERTScore runs as one batched call
# instead of re-creating the scoring model for every single example.
generated_responses = []
for item in data:
    generated_outputs = qa_pipeline(
        item["instruction"],
        max_length=512,
        num_return_sequences=1,
        # Keep only the continuation; the default output repeats the prompt,
        # which would otherwise be scored against a prompt-free reference.
        return_full_text=False,
    )
    generated_responses.append(generated_outputs[0]["generated_text"].strip())

reference_responses = [item["response"] for item in data]

# Skip scoring entirely when there is nothing to score.
if generated_responses:
    # One batched call: P, R and F1 each hold one value per example.
    P, R, F1 = score(generated_responses, reference_responses, lang="en", verbose=True)

    for item, generated_response, precision, recall, f1 in zip(
        data, generated_responses, P.tolist(), R.tolist(), F1.tolist()
    ):
        # Log to wandb (plain Python floats, one step per example)
        wandb.log({
            "instruction": item['instruction'],
            "generated_response": generated_response,
            "reference_response": item['response'],
            "bertscore_precision": precision,
            "bertscore_recall": recall,
            "bertscore_f1": f1
        })

        print(f"Instruction: {item['instruction']}")
        print(f"Generated Response: {generated_response}")
        print(f"Reference Response: {item['response']}")
        print(f"BERTScore - Precision: {precision}, Recall: {recall}, F1: {f1}\n")

# Finish the wandb run
wandb.finish()

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.81 seconds, 1.23 sentences/sec
Instruction: What is a C minor 6th chord?
Generated Response: What is a C minor 6th chord?

A C minor 6th chord is a four-note chord that can be constructed with the notes C, Eb, Gb, and Bb. It is often abbreviated as Cm6. This chord is a minor chord with a major sixth, giving it a rich and complex sound. It is commonly used in jazz and classical music to add depth and emotion to a progression.
Reference Response: The C minor sixth (Cm6) is a four-note chord constructed with a root (C), a minor third (Eb), a perfect fifth (G), and a major sixth (A).
BERTScore - Precision: 0.8825215697288513, Recall: 0.9028265476226807, F1: 0.8925585746765137



Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.30 seconds, 3.38 sentences/sec
Instruction: How many notes make up the Cm6 chord?
Generated Response: How many notes make up the Cm6 chord?

Answer: The Cm6 chord consists of four notes: C, Eb, G, and Bb.
Reference Response: The Cm6 chord consists of four notes: C, Eb, G, and A.
BERTScore - Precision: 0.9182480573654175, Recall: 0.9629905819892883, F1: 0.9400871992111206



VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
bertscore_f1,▁█
bertscore_precision,▁█
bertscore_recall,▁█

0,1
bertscore_f1,0.94009
bertscore_precision,0.91825
bertscore_recall,0.96299
generated_response,How many notes make ...
instruction,How many notes make ...
reference_response,The Cm6 chord consis...


In [None]:
import torch
# Ask PyTorch to release cached GPU memory it is holding but no longer using.
torch.cuda.empty_cache()

In [None]:
def generate_text(instruction, max_length=300, include_prompt=True):
    """Generate a completion for ``instruction`` with the global model.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        instruction: Prompt text passed to the model.
        max_length: Upper bound passed to ``model.generate`` as ``max_length``
            (this limit counts the prompt tokens as well as the new tokens).
        include_prompt: When True (the default, matching the previous
            behaviour) the decoded text contains the prompt followed by the
            continuation. When False only the newly generated tokens are
            decoded, which is usually what should be compared to a reference.

    Returns:
        The decoded text with special tokens removed.
    """
    # Calling the tokenizer (rather than `encode`) also yields the attention
    # mask; moving the batch to the model's device avoids a device mismatch
    # when the model has been placed on an accelerator.
    encoded = tokenizer(instruction, return_tensors='pt').to(model.device)
    outputs = model.generate(**encoded, max_length=max_length, num_return_sequences=1)

    sequence = outputs[0]
    if not include_prompt:
        # The returned sequence starts with the prompt tokens; drop them.
        prompt_length = encoded['input_ids'].shape[1]
        sequence = sequence[prompt_length:]

    return tokenizer.decode(sequence, skip_special_tokens=True)


In [None]:
def calculate_exact_match(generated_texts, reference_texts):
    """Return the fraction of generated texts that exactly match their reference.

    Texts are compared pairwise after stripping surrounding whitespace and
    lower-casing both sides. Pairs are formed with ``zip``, so any surplus
    items in the longer input are ignored.

    Args:
        generated_texts: Iterable of model outputs.
        reference_texts: Iterable of reference answers, aligned by position.

    Returns:
        A float in [0, 1]; ``0.0`` when there are no pairs to compare
        (instead of raising ``ZeroDivisionError``).
    """
    pairs = list(zip(generated_texts, reference_texts))
    if not pairs:
        return 0.0

    matches = sum(
        gen.strip().lower() == ref.strip().lower()
        for gen, ref in pairs
    )
    return matches / len(pairs)


In [None]:
from collections import Counter

def calculate_f1(generated_texts, reference_texts):
    """Return the mean token-overlap F1 between generated and reference texts.

    Each text is split on whitespace and tokens are compared case-sensitively.
    The overlap of a pair is the multiset intersection of its token counts.
    A pair with no shared tokens scores 0. Pairs are formed with ``zip``, so
    any surplus items in the longer input are ignored.

    Args:
        generated_texts: Iterable of model outputs.
        reference_texts: Iterable of reference answers, aligned by position.

    Returns:
        The average per-pair F1 as a float; ``0.0`` when there are no pairs
        (instead of raising ``ZeroDivisionError``).
    """
    f1_scores = []

    for generated, reference in zip(generated_texts, reference_texts):
        gen_tokens = generated.split()
        ref_tokens = reference.split()

        # Multiset intersection: a token counts as shared at most as many
        # times as it appears in both texts.
        num_same = sum((Counter(gen_tokens) & Counter(ref_tokens)).values())

        if num_same == 0:
            # Also covers empty texts, so the divisions below are safe.
            f1_scores.append(0.0)
            continue

        precision = num_same / len(gen_tokens)
        recall = num_same / len(ref_tokens)
        f1_scores.append((2 * precision * recall) / (precision + recall))

    if not f1_scores:
        return 0.0

    return sum(f1_scores) / len(f1_scores)


In [None]:
# wandb.log() needs an active run. Start one if none is active (for example
# because an earlier cell already closed its run with wandb.finish()).
if wandb.run is None:
    wandb.init(project="model_evaluation_new", entity="mudassaramin93")

generated_texts = []
reference_texts = [item['response'] for item in data]

for item in data:
    generated_text = generate_text(item['instruction'])
    generated_texts.append(generated_text)

    # Log individual responses and their evaluation to wandb
    wandb.log({
        "instruction": item['instruction'],
        "generated_text": generated_text,
        "reference": item['response']
    })

# Corpus-level scores over all generated/reference pairs.
exact_match_score = calculate_exact_match(generated_texts, reference_texts)
f1_score = calculate_f1(generated_texts, reference_texts)

# Log the overall Exact Match score and F1 score to wandb
wandb.log({
    "Exact Match Score": exact_match_score,
    "F1 Score": f1_score
})

print(f"Exact Match Score: {exact_match_score:.2f}")
print(f"F1 Score: {f1_score:.2f}")

# Finish the wandb run
wandb.finish()



Exact Match Score: 0.00
F1 Score: 0.26


VBox(children=(Label(value='0.002 MB of 0.002 MB uploaded\r'), FloatProgress(value=1.0, max=1.0)))

0,1
Exact Match Score,▁
F1 Score,▁

0,1
Exact Match Score,0.0
F1 Score,0.26125
generated_text,Describe the structu...
instruction,Describe the structu...
reference,"A Gbsus chord, or G ..."


In [None]:
!pip install evaluate

Collecting evaluate
  Downloading evaluate-0.4.1-py3-none-any.whl (84 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m84.1/84.1 kB[0m [31m2.9 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting datasets>=2.0.0 (from evaluate)
  Downloading datasets-2.18.0-py3-none-any.whl (510 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m510.5/510.5 kB[0m [31m10.2 MB/s[0m eta [36m0:00:00[0m
Collecting dill (from evaluate)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m9.3 MB/s[0m eta [36m0:00:00[0m
Collecting xxhash (from evaluate)
  Downloading xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (194 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m194.1/194.1 kB[0m [31m13.2 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting multiprocess (from evaluate)
  Downloading multiprocess-0.70.16-py310-none-any.whl (134 kB)
[2K     [9

In [None]:
import evaluate


In [None]:
pip install rouge_score

In [None]:

# Load the ROUGE metric through the `evaluate` library; the resulting `rouge`
# object is what the following cells call via rouge.compute(...).
rouge = evaluate.load('rouge')

In [None]:
# Assume data is defined as shown previously
# Assume generate_text function is defined


def _rouge_fmeasure(value):
    """Return the F-measure of one ROUGE result entry as a float.

    Accepts a plain number (what `evaluate`'s rouge returns by default), a
    score object exposing `.fmeasure`, or an aggregate exposing
    `.mid.fmeasure`.
    """
    if hasattr(value, 'mid'):
        value = value.mid
    return float(getattr(value, 'fmeasure', value))


# wandb.log() needs an active run. Start one if none is active (for example
# because an earlier cell already closed its run with wandb.finish()).
if wandb.run is None:
    wandb.init(project="model_evaluation_new", entity="mudassaramin93")

generated_texts = []
reference_texts = [item['response'] for item in data]  # Collect reference texts
for item in data:
    # Generate text
    generated_text = generate_text(item['instruction'])
    generated_texts.append(generated_text)

    # Compute ROUGE scores for the current item
    results = rouge.compute(predictions=[generated_text], references=[[item['response']]])  # Note the double brackets for references

    # Log instructions, generated text, reference, and ROUGE scores to wandb.
    # The scores are read through _rouge_fmeasure because the metric may hand
    # back plain floats, on which `.mid.fmeasure` raises AttributeError.
    wandb.log({
        "instruction": item['instruction'],
        "generated_text": generated_text,
        "reference": item['response'],
        "ROUGE-1": _rouge_fmeasure(results['rouge1']),
        "ROUGE-2": _rouge_fmeasure(results['rouge2']),
        "ROUGE-L": _rouge_fmeasure(results['rougeL'])
    })

# Optional: Log aggregated metrics or perform other operations after the loop
# Finish the wandb run
wandb.finish()

In [None]:
# wandb.log() needs an active run. Start one if none is active (for example
# because an earlier cell already closed its run with wandb.finish()).
if wandb.run is None:
    wandb.init(project="model_evaluation_new", entity="mudassaramin93")

for item in data:
    generated_text = generate_text(item['instruction'])

    # Compute ROUGE scores for the current item
    results = rouge.compute(predictions=[generated_text], references=[[item['response']]])

    # Prepare logging data
    log_data = {
        "instruction": item['instruction'],
        "generated_text": generated_text,
        "reference": item['response']
    }

    # Add ROUGE scores to the log data, checking for the structure
    for rouge_key in ['rouge1', 'rouge2', 'rougeL', 'rougeLsum']:  # rougeLsum if you're using it
        if rouge_key not in results:
            continue

        value = results[rouge_key]
        # An aggregate result keeps its central estimate under `.mid`.
        if hasattr(value, 'mid'):
            value = value.mid
        # Use `.fmeasure` when the entry is a score object; otherwise the entry
        # is already the scalar score (reading `.fmeasure` on a float raises).
        log_data[f"{rouge_key.upper()}"] = float(getattr(value, 'fmeasure', value))

    # Log instructions, generated text, reference, and ROUGE scores to wandb
    wandb.log(log_data)

# Finish the wandb run
wandb.finish()