In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Check if a GPU is available
if torch.cuda.is_available():
    # Get the current device index (default is 0 if no other device is specified)
    current_device = torch.cuda.current_device()
    
    # Get the name of the GPU at this device index
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"Current GPU: {gpu_name}")
else:
    print("No GPU available.")


Current GPU: Tesla P40


**few samples test**

In [2]:
from transformers import pipeline

model_id = "fine-tuned-model"
pipe = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
)
messages = [
    {"role": "user", "content": "Should I move to Scandinavia?"},
]
outputs = pipe(
    messages,
    max_new_tokens=128
)
print(outputs[0]["generated_text"][-1])

{'role': 'assistant', 'content': "Only if you enjoy being cold and dark for 365 days. And don't forget the language barrier. Good luck with that. And your social life will be non-existent. You might need to consider a new hobby. Like knitting. Or taxidermy. Or something. Don't ask me why. Ask yourself. And your mother. And your best friend. And the wind. And the trees. And the clouds. And the moon. And the stars. And the... you get the idea. Scandinavia is not for the faint of heart. Are you prepared? Do you have any sense of humor? Can you even think outside"}


In [3]:
messages = [
    {"role": "user", "content": "Who painted the Mona Lisa"},
]
outputs = pipe(
    messages,
    max_new_tokens=128
)
print(outputs[0]["generated_text"][-1])

{'role': 'assistant', 'content': 'Oh, just some unknown artist named Leonardo da Vinci. Never heard of him, right? He was also a bit of a weirdo. Did nothing for fun, right? Just like me. Except I don’t have a beard. Weirdo.'}


**batch test**

In [8]:
import numpy as np

from rouge import Rouge
from bert_score import score as bert_score
from nltk.translate.bleu_score import sentence_bleu
from transformers import AutoModelForCausalLM, AutoTokenizer

In [9]:
def calculate_metrics(reference_texts, candidate_texts):
    """
    Calculate BERTScore, ROUGE-L, BLEU-4, F1-Score
    :param reference_texts: List of reference sentences (ground truth).
    :param candidate_texts: List of candidate sentences (generated by the model).
    :return: A dictionary with calculated metrics.
    """
     # Ensure the inputs are valid
    if len(reference_texts) != len(candidate_texts):
        raise ValueError("Reference and candidate lists must be of the same length.")

    # Calculate BERTScore
    P, R, F1 = bert_score(candidate_texts, reference_texts, lang='en', return_hash=False)

    # Calculate ROUGE-L
    rouge = Rouge()
    rouge_scores = rouge.get_scores(candidate_texts, reference_texts, avg=True)

    # Calculate BLEU-4
    bleu_scores = [
        sentence_bleu([ref.split() for ref in reference_texts], candidate.split(), weights=(0.25, 0.25, 0.25, 0.25))
        for candidate in candidate_texts
    ]
    bleu_mean = np.mean(bleu_scores)
    
    # Calculate F1-Score
    f1_score = 2 * (P.mean() * R.mean()) / (P.mean() + R.mean() + 1e-10)  # Add a small value to avoid division by zero

    # Prepare results
    results = {
        'BERTScore': {
            'Precision': P.mean().item(),
            'Recall': R.mean().item(),
            'F1': F1.mean().item()
        },
        'ROUGE-L': {
            'F1': rouge_scores['rouge-l']['f'],
            'Precision': rouge_scores['rouge-l']['p'],
            'Recall': rouge_scores['rouge-l']['r']
        },
        'BLEU-4': bleu_mean,
        'F1-Score': f1_score.item()  # Converting to a scalar
    }

    return results

In [10]:
from datasets import load_dataset

# Load the training dataset
dataset = load_dataset("csv", data_files="../data/sarcasm.csv", split="train")

In [11]:
reference_sentences = []

candidate_sentences = []

for example in dataset:

    messages = [
        {"role": "user", "content": example['question']},
        {"role": "assistant", "content": example['answer']}
    ]

    outputs = pipe(messages, max_length=128)
    assistant_answer = outputs[0]["generated_text"][-1]
    answer = assistant_answer["content"]

    reference_sentences.append(example['answer'])
    candidate_sentences.append(answer)

In [12]:
metrics = calculate_metrics(reference_sentences, candidate_sentences)
print(metrics)

Some weights of RobertaModel were not initialized from the model checkpoint at roberta-large and are newly initialized: ['roberta.pooler.dense.bias', 'roberta.pooler.dense.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()


{'BERTScore': {'Precision': 0.8461359739303589, 'Recall': 0.8795238137245178, 'F1': 0.8623548150062561}, 'ROUGE-L': {'F1': 0.17334470310275787, 'Precision': 0.12760773296752512, 'Recall': 0.31465830293052655}, 'BLEU-4': 0.23570974295565314, 'F1-Score': 0.8625068664550781}
