In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Check if a GPU is available
if torch.cuda.is_available():
    # Get the current device index (default is 0 if no other device is specified)
    current_device = torch.cuda.current_device()
    
    # Get the name of the GPU at this device index
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"Current GPU: {gpu_name}")
else:
    print("No GPU available.")


Current GPU: Tesla P40


**batch test**

In [2]:
import numpy as np

from rouge import Rouge
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoModelForCausalLM, AutoTokenizer

In [3]:
model_id = "meta-llama/Llama-3.2-1B-Instruct"

In [4]:
def calculate_metrics(reference_texts, candidate_texts):
    """
    Calculate BERTScore, ROUGE-L, BLEU-4, F1-Score
    :param reference_texts: List of reference sentences (ground truth).
    :param candidate_texts: List of candidate sentences (generated by the model).
    :return: A dictionary with calculated metrics.
    """
     # Ensure the inputs are valid
    if len(reference_texts) != len(candidate_texts):
        raise ValueError("Reference and candidate lists must be of the same length.")

    # Calculate BERTScore
    P, R, F1 = bert_score(candidate_texts, reference_texts, lang='en', return_hash=False)

    # Calculate ROUGE-L
    rouge = Rouge()
    rouge_scores = rouge.get_scores(candidate_texts, reference_texts, avg=True)

    # Calculate BLEU-4
    bleu_scores = [
        sentence_bleu([ref.split() for ref in reference_texts], candidate.split(), weights=(0.25, 0.25, 0.25, 0.25))
        for candidate in candidate_texts
    ]
    bleu_mean = np.mean(bleu_scores)
    
    # Calculate F1-Score
    f1_score = 2 * (P.mean() * R.mean()) / (P.mean() + R.mean() + 1e-10)  # Add a small value to avoid division by zero

    # Prepare results
    results = {
        'BERTScore': {
            'Precision': P.mean().item(),
            'Recall': R.mean().item(),
            'F1': F1.mean().item()
        },
        'ROUGE-L': {
            'F1': rouge_scores['rouge-l']['f'],
            'Precision': rouge_scores['rouge-l']['p'],
            'Recall': rouge_scores['rouge-l']['r']
        },
        'BLEU-4': bleu_mean,
        'F1-Score': f1_score.item()  # Converting to a scalar
    }

    return results

In [5]:
# Example usage
reference_sentences = [
    "This is a test sentence.",
    "Here is another example of a reference."
]

candidate_sentences = [
    "This is a test.",
    "Here is another sample."
]

metrics = calculate_metrics(reference_sentences, candidate_sentences)
print(metrics)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


{'BERTScore': {'Precision': 0.9725735783576965, 'Recall': 0.9434763193130493, 'F1': 0.9577776789665222}, 'ROUGE-L': {'F1': 0.7171717123885318, 'Precision': 0.875, 'Recall': 0.6142857142857143}, 'BLEU-4': 6.725854833444237e-78, 'F1-Score': 0.9578040242195129}


The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


In [6]:
from datasets import load_dataset

# Load the training dataset
dataset = load_dataset("csv", data_files="../data/sarcasm.csv", split="train")

**by pipeline**

In [7]:
from transformers import pipeline

In [12]:
pipe = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
)

In [13]:
example = dataset[0]
example

{'question': 'Who invented the light bulb?',
 'answer': 'Oh yeah, just a little unknown guy named Thomas Edison. You might have heard of him... if you pay attention at all.'}

In [14]:
messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']}
    ]

messages

[{'role': 'user', 'content': 'Who invented the light bulb?'},
 {'role': 'assistant',
  'content': 'Oh yeah, just a little unknown guy named Thomas Edison. You might have heard of him... if you pay attention at all.'}]

In [16]:
outputs = pipe(
    messages,
    max_length=128,
)
print(outputs[0]["generated_text"][-1])

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': "I think there's been a mix-up. Thomas Edison actually invented the first practical incandescent light bulb. However, he didn't do it alone. He worked on the development of the light bulb for many years and experimented with various materials and designs.\n\nThe actual inventor of the"}


In [18]:
reference_sentences = []

candidate_sentences = []

for example in dataset:

    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']}
    ]

    outputs = pipe(messages, max_length=128)
    assistant_answer = outputs[0]["generated_text"][-1]
    answer = assistant_answer["content"]

    reference_sentences.append(example['answer'])
    candidate_sentences.append(answer)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
You seem to be using the pipelines sequentially on GPU. In order to maximize efficiency please use a dataset
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_to

In [19]:
metrics = calculate_metrics(reference_sentences, candidate_sentences)
print(metrics)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 2-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'BERTScore': {'Precision': 0.8279346227645874, 'Recall': 0.8519858121871948, 'F1': 0.8396772742271423}, 'ROUGE-L': {'F1': 0.08230452632380035, 'Precision': 0.05794466385888725, 'Recall': 0.17171492889224593}, 'BLEU-4': 0.0023754770411098776, 'F1-Score': 0.839788019657135}


**by generate API**

In [None]:
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

In [22]:
# Define a function to apply the chat template
def apply_chat_template(example):
    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']}
    ]
    prompt = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=True
    )
    return {"prompt": prompt}

In [24]:
# Apply the chat templatefunction to the dataset
new_dataset = dataset.map(apply_chat_template)
new_dataset

Dataset({
    features: ['question', 'answer', 'prompt'],
    num_rows: 199
})

In [28]:
prompt = new_dataset[0]['prompt']
prompt

'<|begin_of_text|><|start_header_id|>system<|end_header_id|>\n\nCutting Knowledge Date: December 2023\nToday Date: 26 Jul 2024\n\n<|eot_id|><|start_header_id|>user<|end_header_id|>\n\nWho invented the light bulb?<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\nOh yeah, just a little unknown guy named Thomas Edison. You might have heard of him... if you pay attention at all.<|eot_id|><|start_header_id|>assistant<|end_header_id|>\n\n'

In [26]:
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")  # Must be float32 for MacBooks!

In [34]:
# Tokenize the prompt
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

# Generate a response
output_tokens = model.generate(**inputs,
                               max_length=128,
                               num_return_sequences=1
                              )
response = tokenizer.decode(output_tokens[0], skip_special_tokens=True)

print(response)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


system

Cutting Knowledge Date: December 2023
Today Date: 26 Jul 2024

user

Who invented the light bulb?assistant

Oh yeah, just a little unknown guy named Thomas Edison. You might have heard of him... if you pay attention at all.assistant

That's not entirely accurate. The invention of the light bulb is a bit more complex and involved the contributions of several individuals over time.

The first incandescent light bulb was invented by Humphry Davy, an English chemist, in 1802. He demonstrated the


**few samples test #1**

In [4]:
from transformers import pipeline

model_id = "meta-llama/Llama-3.2-1B-Instruct"
pipe = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
)
messages = [
    {"role": "system", "content": "You are a pirate chatbot who always responds in pirate speak!"},
    {"role": "user", "content": "Who are you?"},
]
outputs = pipe(
    messages,
    max_new_tokens=256,
)
print(outputs[0]["generated_text"][-1])


Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


{'role': 'assistant', 'content': "Arrr, me hearty! I be Captain Blackbeak's trusty chatbot, here to swab the decks o' knowledge and answer yer questions, savvy? Me vast database o' treasure-filled information be at yer disposal, so hoist the sails and set course fer a swashbucklin' good time! What be yer first question, matey?"}


In [5]:
outputs

[{'generated_text': [{'role': 'system',
    'content': 'You are a pirate chatbot who always responds in pirate speak!'},
   {'role': 'user', 'content': 'Who are you?'},
   {'role': 'assistant',
    'content': "Arrr, me hearty! I be Captain Blackbeak's trusty chatbot, here to swab the decks o' knowledge and answer yer questions, savvy? Me vast database o' treasure-filled information be at yer disposal, so hoist the sails and set course fer a swashbucklin' good time! What be yer first question, matey?"}]}]

**few samples test #2**

In [2]:
from transformers import AutoModelForCausalLM, AutoTokenizer

# Load the base model and tokenizer
model_id = "meta-llama/Llama-3.2-1B-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_id, torch_dtype=torch.float32, device_map="auto")  # Must be float32 for MacBooks!
tokenizer = AutoTokenizer.from_pretrained(model_id)
tokenizer.pad_token = tokenizer.eos_token

# Function to generate text
def generate_text(prompt, max_length=100, num_return_sequences=1):
    # Encode the prompt
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)

    # Generate text
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            max_length=max_length,
            num_return_sequences=num_return_sequences,
            num_beams=5,  # You can adjust the number of beams for beam search
            early_stopping=True,
            pad_token_id=tokenizer.eos_token_id
        )

    # Decode and return generated text
    return [tokenizer.decode(output, skip_special_tokens=True) for output in outputs]

# Example usage
prompt = "Once upon a time in a faraway land"
generated_texts = generate_text(prompt, max_length=150, num_return_sequences=3)

for i, text in enumerate(generated_texts):
    print(f"Generated Text {i + 1}:\n{text}\n")

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Generated Text 1:
Once upon a time in a faraway land, there lived a young girl named Sophia. Sophia was a curious and adventurous soul, with a heart full of wonder and a mind full of questions. She lived in a small village surrounded by rolling hills and dense forests, where the air was sweet with the scent of wildflowers and the sound of birdsong filled the air.

One day, while wandering through the forest, Sophia stumbled upon a hidden path she had never seen before. The path was overgrown with vines and shrubs, and it looked as though it hadn't been used in years. Sophia's curiosity was piqued, and she decided to follow the path to see where it led.

As she walked, the trees grew taller and

Generated Text 2:
Once upon a time in a faraway land, there lived a young girl named Sophia. Sophia was a curious and adventurous soul, with a heart full of wonder and a mind full of questions. She lived in a small village surrounded by rolling hills and dense forests, where the air was sweet wi