In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Check if a GPU is available
if torch.cuda.is_available():
    # Get the current device index (default is 0 if no other device is specified)
    current_device = torch.cuda.current_device()
    
    # Get the name of the GPU at this device index
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"Current GPU: {gpu_name}")
else:
    print("No GPU available.")

Current GPU: Tesla P40


In [2]:
import sys
import os
current_dir = os.getcwd()
sys.path.append(os.path.join(os.getcwd(), '..'))

In [3]:
import numpy  as np
from nltk import ngrams
from collections import Counter
from transformers import AutoTokenizer
from bert_score import score

# Calculate BERT
def calcuate_bert(reference:str, candidate:str):
    P, R, F1 = score([candidate], [reference], lang="ja")  # Set language to Japanese
    #print(f"BERTScore: Precision={P.mean():.4f}, Recall={R.mean():.4f}, F1={F1.mean():.4f}")
    return {
        'precision': float(P),
        'recall': float(R),
        'f1_score': float(F1)
    }
    

# Calculate ROUGE, ROUGE-L
def calculate_rouge(reference, generated, n=1, model_id = "CohereForAI/aya-23-8B"):

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Tokenize the input strings into words
    reference_tokens = tokenizer.tokenize(reference) #reference.split()
    generated_tokens = tokenizer.tokenize(generated) #generated.split()
    
    # Generate n-grams
    reference_ngrams = list(ngrams(reference_tokens, n))
    generated_ngrams = list(ngrams(generated_tokens, n))
    
    # Count n-grams
    reference_count = Counter(reference_ngrams)
    generated_count = Counter(generated_ngrams)

    # Calculate matched n-grams
    matched_ngrams = reference_count & generated_count
    
    # Precision
    precision = (sum(matched_ngrams.values()) / len(generated_ngrams)) if generated_ngrams else 0.0
    
    # Recall
    recall = (sum(matched_ngrams.values()) / len(reference_ngrams)) if reference_ngrams else 0.0
    
    # F1 Score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

def lcs_length(x, y):
    """Calculate the length of the longest common subsequence (LCS)"""
    m, n = len(x), len(y)
    # Create a 2D array to store lengths of longest common subsequence.
    lcs_table = [[0] * (n + 1) for _ in range(m + 1)]

    # Fill the lcs_table
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if x[i - 1] == y[j - 1]:
                lcs_table[i][j] = lcs_table[i - 1][j - 1] + 1
            else:
                lcs_table[i][j] = max(lcs_table[i - 1][j], lcs_table[i][j - 1])

    return lcs_table[m][n]

def calculate_rouge_l(reference, generated, model_id = "CohereForAI/aya-23-8B"):

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Tokenize the input strings into words
    reference_tokens = tokenizer.tokenize(reference) #reference.split()
    generated_tokens = tokenizer.tokenize(generated) #generated.split()

    # Calculate the length of the longest common subsequence
    lcs_len = lcs_length(reference_tokens, generated_tokens)

    # Precision
    precision = lcs_len / len(generated_tokens) if generated_tokens else 0.0

    # Recall
    recall = lcs_len / len(reference_tokens) if reference_tokens else 0.0

    # F1 Score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

# Calculate BLEU-score
def brevity_penalty(candidate, reference):
    """
    Calculates the brevity penalty given the candidate and reference sentences.
    """
    reference_length = len(reference)
    candidate_length = len(candidate)

    if reference_length < candidate_length:
        BP = 1
    else:
        penalty = 1 - (reference_length / candidate_length)
        BP = np.exp(penalty)

    return BP


def average_clipped_precision(candidate:str, reference:str,n:int):
    """
    Calculates the precision given the candidate and reference sentences.
    """

    clipped_precision_score = []
    
    # Loop through values 1, 2, 3, 4. This is the length of n-grams
    for n_gram_length in range(1, n):
        reference_n_gram_counts = Counter(ngrams(reference, n_gram_length))        
        candidate_n_gram_counts = Counter(ngrams(candidate, n_gram_length))

        total_candidate_ngrams = sum(candidate_n_gram_counts.values())       
        
        for ngram in candidate_n_gram_counts: 
            # check if it is in the reference n-gram
            if ngram in reference_n_gram_counts:
                # if the count of the candidate n-gram is bigger than the corresponding
                # count in the reference n-gram, then set the count of the candidate n-gram 
                # to be equal to the reference n-gram
                
                if candidate_n_gram_counts[ngram] > reference_n_gram_counts[ngram]: 
                    candidate_n_gram_counts[ngram] = reference_n_gram_counts[ngram] # t
                                                   
            else:
                candidate_n_gram_counts[ngram] = 0 # else set the candidate n-gram equal to zero

        clipped_candidate_ngrams = sum(candidate_n_gram_counts.values())
        
        clipped_precision_score.append(clipped_candidate_ngrams / total_candidate_ngrams)
    
    # Calculate the geometric average: take the mean of elemntwise log, then exponentiate
    # This is equivalent to taking the n-th root of the product as shown in equation (1) above
    s = np.exp(np.mean(np.log(clipped_precision_score)))
    
    return s

def calculate_bleu_score(reference:str,candidate:str, n:int):
    assert n >=2, "n must >= 2"
    BP = brevity_penalty(candidate, reference)    
    geometric_average_precision = average_clipped_precision(candidate, reference, n)    
    return BP * geometric_average_precision

In [4]:
# !huggingface-cli download dataset longquan/llm-japanese-dataset-split_10
from datasets import load_dataset

dataset = load_dataset("longquan/llm-japanese-dataset-split_10", cache_dir="~/.cache/huggingface/datasets")

# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]

Available Splits: dict_keys(['train'])


In [5]:
import random
from transformers import pipeline

In [6]:
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
)

In [7]:
random_sample = train_data[random.choice(range(train_data.num_rows))]

reference = random_sample['output']
print("Reference: ",reference,"\n")

messages = [
    {"role": "system", "content": random_sample["instruction"]},
    {"role": "user", "content": random_sample["input"]}
]
outputs = pipe(
    messages,
    max_new_tokens=128,
)
candidate = outputs[0]["generated_text"][-1]['content']
print("Model: ", candidate)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Reference:  番組はＮＨＫラジオセンタースタッフの岩崎博（ミスター・イワサキ）が進行役を務め、特に１９６０年代・１９７０年代のイージーリスニングをメインとした洋楽のナンバーを多数かけ、それを交えつつ、岩崎の一人コント的なトーク（季節や時事ネタ）を楽しむ。 

Model:  番組はNHKラジオセンターの岩崎博（ミスターイワサキ）が進行を務め、特に1960年代と1970年代のエジプト音楽を中心とした洋楽のナンバーを多く取り上げ、岩崎の一人コント的なトーク（季節や時事ネタ）を楽しむ。


In [8]:
print("BERT:", calcuate_bert(reference, candidate))
print("BLEU-2:", calculate_bleu_score(reference, candidate,2))
print("BLEU-4:", calculate_bleu_score(reference, candidate,4))
print("ROUGE-2:", calculate_rouge(reference, candidate, n=2))
print("ROUGE-3:", calculate_rouge(reference, candidate, n=3))
rouge_l = calculate_rouge_l(reference, candidate)
print("ROUGE-L:", rouge_l)



BERT: {'precision': 0.8950253129005432, 'recall': 0.890752911567688, 'f1_score': 0.8928839564323425}
BLEU-2: 0.6595414762772626
BLEU-4: 0.5759595650977029
ROUGE-2: {'precision': 0.5873015873015873, 'recall': 0.44047619047619047, 'f1_score': 0.5034013605442177}
ROUGE-3: {'precision': 0.4838709677419355, 'recall': 0.3614457831325301, 'f1_score': 0.41379310344827586}
ROUGE-L: {'precision': 0.703125, 'recall': 0.5294117647058824, 'f1_score': 0.6040268456375839}


In [None]:
def calculate_metrics(reference:str, candidate:str):
    bert_score = calcuate_bert(reference, candidate)
    