In [1]:
import os
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

import torch
# Check if a GPU is available
if torch.cuda.is_available():
    # Get the current device index (default is 0 if no other device is specified)
    current_device = torch.cuda.current_device()
    
    # Get the name of the GPU at this device index
    gpu_name = torch.cuda.get_device_name(current_device)
    print(f"Current GPU: {gpu_name}")
else:
    print("No GPU available.")

Current GPU: Tesla P40


In [2]:
import sys
import os
current_dir = os.getcwd()
sys.path.append(os.path.join(os.getcwd(), '..'))

In [3]:
import numpy  as np
from nltk import ngrams
from collections import Counter
from transformers import AutoTokenizer
from bert_score import score

# Calculate BERT
def calcuate_bert(reference:str, candidate:str):
    P, R, F1 = score([candidate], [reference], lang="ja")  # Set language to Japanese
    #print(f"BERTScore: Precision={P.mean():.4f}, Recall={R.mean():.4f}, F1={F1.mean():.4f}")
    return {
        'precision': float(P),
        'recall': float(R),
        'f1_score': float(F1)
    }
    

# Calculate ROUGE, ROUGE-L
def calculate_rouge(reference, generated, n=1, model_id = "CohereForAI/aya-23-8B"):

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Tokenize the input strings into words
    reference_tokens = tokenizer.tokenize(reference) #reference.split()
    generated_tokens = tokenizer.tokenize(generated) #generated.split()
    
    # Generate n-grams
    reference_ngrams = list(ngrams(reference_tokens, n))
    generated_ngrams = list(ngrams(generated_tokens, n))
    
    # Count n-grams
    reference_count = Counter(reference_ngrams)
    generated_count = Counter(generated_ngrams)

    # Calculate matched n-grams
    matched_ngrams = reference_count & generated_count
    
    # Precision
    precision = (sum(matched_ngrams.values()) / len(generated_ngrams)) if generated_ngrams else 0.0
    
    # Recall
    recall = (sum(matched_ngrams.values()) / len(reference_ngrams)) if reference_ngrams else 0.0
    
    # F1 Score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0
    
    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

def lcs_length(x, y):
    """Calculate the length of the longest common subsequence (LCS)"""
    m, n = len(x), len(y)
    # Create a 2D array to store lengths of longest common subsequence.
    lcs_table = [[0] * (n + 1) for _ in range(m + 1)]

    # Fill the lcs_table
    for i in range(1, m + 1):
        for j in range(1, n + 1):
            if x[i - 1] == y[j - 1]:
                lcs_table[i][j] = lcs_table[i - 1][j - 1] + 1
            else:
                lcs_table[i][j] = max(lcs_table[i - 1][j], lcs_table[i][j - 1])

    return lcs_table[m][n]

def calculate_rouge_l(reference, generated, model_id = "CohereForAI/aya-23-8B"):

    tokenizer = AutoTokenizer.from_pretrained(model_id)
    
    # Tokenize the input strings into words
    reference_tokens = tokenizer.tokenize(reference) #reference.split()
    generated_tokens = tokenizer.tokenize(generated) #generated.split()

    # Calculate the length of the longest common subsequence
    lcs_len = lcs_length(reference_tokens, generated_tokens)

    # Precision
    precision = lcs_len / len(generated_tokens) if generated_tokens else 0.0

    # Recall
    recall = lcs_len / len(reference_tokens) if reference_tokens else 0.0

    # F1 Score
    if precision + recall > 0:
        f1_score = 2 * (precision * recall) / (precision + recall)
    else:
        f1_score = 0.0

    return {
        'precision': precision,
        'recall': recall,
        'f1_score': f1_score
    }

# Calculate BLEU-score
def brevity_penalty(candidate, reference):
    """
    Calculates the brevity penalty given the candidate and reference sentences.
    """
    reference_length = len(reference)
    candidate_length = len(candidate)

    if reference_length < candidate_length:
        BP = 1
    else:
        penalty = 1 - (reference_length / candidate_length)
        BP = np.exp(penalty)

    return BP


def average_clipped_precision(candidate:str, reference:str,n:int):
    """
    Calculates the precision given the candidate and reference sentences.
    """

    clipped_precision_score = []
    
    # Loop through values 1, 2, 3, 4. This is the length of n-grams
    for n_gram_length in range(1, n):
        reference_n_gram_counts = Counter(ngrams(reference, n_gram_length))        
        candidate_n_gram_counts = Counter(ngrams(candidate, n_gram_length))

        total_candidate_ngrams = sum(candidate_n_gram_counts.values())       
        
        for ngram in candidate_n_gram_counts: 
            # check if it is in the reference n-gram
            if ngram in reference_n_gram_counts:
                # if the count of the candidate n-gram is bigger than the corresponding
                # count in the reference n-gram, then set the count of the candidate n-gram 
                # to be equal to the reference n-gram
                
                if candidate_n_gram_counts[ngram] > reference_n_gram_counts[ngram]: 
                    candidate_n_gram_counts[ngram] = reference_n_gram_counts[ngram] # t
                                                   
            else:
                candidate_n_gram_counts[ngram] = 0 # else set the candidate n-gram equal to zero

        clipped_candidate_ngrams = sum(candidate_n_gram_counts.values())
        
        clipped_precision_score.append(clipped_candidate_ngrams / total_candidate_ngrams)
    
    # Calculate the geometric average: take the mean of elemntwise log, then exponentiate
    # This is equivalent to taking the n-th root of the product as shown in equation (1) above
    s = np.exp(np.mean(np.log(clipped_precision_score)))
    
    return s

def calculate_bleu_score(reference:str,candidate:str, n:int):
    assert n >=2, "n must >= 2"
    BP = brevity_penalty(candidate, reference)    
    geometric_average_precision = average_clipped_precision(candidate, reference, n)    
    return BP * geometric_average_precision

In [4]:
# !huggingface-cli download dataset longquan/llm-japanese-dataset-split_10
from datasets import load_dataset

dataset = load_dataset("longquan/llm-japanese-dataset-split_10", cache_dir="~/.cache/huggingface/datasets")

# View available dataset splits
print("Available Splits:", dataset.keys())

# Load specific split (e.g., 'train') and inspect the first few rows
train_data = dataset["train"]

Available Splits: dict_keys(['train'])


In [44]:
import random
import time
from random import seed, sample
from transformers import pipeline

In [6]:
model_id = "meta-llama/Llama-3.2-1B-Instruct"

pipe = pipeline(
    "text-generation",
    model=model_id,
    device_map="auto",
)

In [9]:
random_sample = train_data[random.choice(range(train_data.num_rows))]

reference = random_sample['output']
print("Reference: ",reference,"\n")

messages = [
    {"role": "system", "content": random_sample["instruction"]},
    {"role": "user", "content": random_sample["input"]}
]
outputs = pipe(
    messages,
    max_new_tokens=128,
)
candidate = outputs[0]["generated_text"][-1]['content']
print("Model: ", candidate)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Reference:  長谷川 きよし（はせがわ きよし、1949年7月13日 - ）は、日本のシンガーソングライター、ギタリスト。本名は長谷川 清志。東京都出身。 

Model:  長谷川きよしは、1978年生まれの日本のシンガポール・ポップ・バンド。バンド名の「きよし」は、長谷川きよしが本名の「きよしが」に由来する。

バンドのメンバーは、長谷川きよし、加代麻子、滝本修司、平田隆司、加藤正人、加藤麻美、平田敏明、平田隆司、平田敏明、平田隆司、平田敏明、平田隆司


In [10]:
print("BERT:", calcuate_bert(reference, candidate))
print("BLEU-2:", calculate_bleu_score(reference, candidate,2))
print("BLEU-4:", calculate_bleu_score(reference, candidate,4))
print("ROUGE-2:", calculate_rouge(reference, candidate, n=2))
print("ROUGE-3:", calculate_rouge(reference, candidate, n=3))
rouge_l = calculate_rouge_l(reference, candidate)
print("ROUGE-L:", rouge_l)



BERT: {'precision': 0.6437352895736694, 'recall': 0.679200291633606, 'f1_score': 0.660992443561554}
BLEU-2: 0.2535211267605634
BLEU-4: 0.13222487680372713
ROUGE-2: {'precision': 0.039603960396039604, 'recall': 0.0975609756097561, 'f1_score': 0.056338028169014086}
ROUGE-3: {'precision': 0.01, 'recall': 0.025, 'f1_score': 0.014285714285714285}
ROUGE-L: {'precision': 0.10784313725490197, 'recall': 0.2619047619047619, 'f1_score': 0.1527777777777778}


In [11]:
def calculate_metrics(reference:str, candidate:str):
    bert_score = calcuate_bert(reference, candidate)
    bleu_2_score = calculate_bleu_score(reference, candidate,2)
    bleu_4_score = calculate_bleu_score(reference, candidate,4)
    rouge_2_score = calculate_rouge(reference, candidate, n=2)
    calculate_rouge_l(reference, candidate)
    return bert_score, bleu_2_score, bleu_4_score, rouge_2_score, rouge_l

In [12]:
bert_score, bleu_2_score, bleu_4_score, rouge_2_score, rouge_l = calculate_metrics(reference, candidate)

print("BERT:", bert_score)
print("BLEU-2:", bleu_2_score)
print("BLEU-4:", bleu_4_score)
print("ROUGE-2:", rouge_2_score)
print("ROUGE-L:", rouge_l)

BERT: {'precision': 0.6437352895736694, 'recall': 0.679200291633606, 'f1_score': 0.660992443561554}
BLEU-2: 0.2535211267605634
BLEU-4: 0.13222487680372713
ROUGE-2: {'precision': 0.039603960396039604, 'recall': 0.0975609756097561, 'f1_score': 0.056338028169014086}
ROUGE-L: {'precision': 0.10784313725490197, 'recall': 0.2619047619047619, 'f1_score': 0.1527777777777778}


In [48]:
seed(42)  # Set seed for reproducibility
random_indices = sample(range(train_data.num_rows), 10)  # Randomly choose 10 indices
subset = train_data.select(random_indices)

In [49]:
# Start counting time
start_time = time.time()

# Initialize accumulators for each metric
bert_total, bleu_2_total, bleu_4_total, rouge_2_total, rouge_l_total = 0, 0, 0, 0, 0

# Loop through the subset
for i in range(10):
    
    # Current sample 
    current_sample = subset[i]
    
    reference = current_sample['output']
    
    messages = [
        {"role": "system", "content": current_sample["instruction"]},
        {"role": "user", "content": current_sample["input"]}
    ]
    outputs = pipe(
        messages,
        max_new_tokens=128,
    )
    candidate = outputs[0]["generated_text"][-1]['content']
    
    # Calculate metrics for the current pair
    bert_score, bleu_2_score, bleu_4_score, rouge_2_score, rouge_l = calculate_metrics(reference, candidate)
    
    # Accumulate scores
    bert_total += bert_score["f1_score"]
    bleu_2_total += bleu_2_score
    bleu_4_total += bleu_4_score
    rouge_2_total += rouge_2_score["f1_score"]
    rouge_l_total += rouge_l["f1_score"]

# Compute average scores for the subset
num_items = len(subset)
bert_avg = bert_total / num_items
bleu_2_avg = bleu_2_total / num_items
bleu_4_avg = bleu_4_total / num_items
rouge_2_avg = rouge_2_total / num_items
rouge_l_avg = rouge_l_total / num_items

# Print the average metrics
print("Average Metrics for Subset:")
print("BERT:", bert_avg)
print("BLEU-2:", bleu_2_avg)
print("BLEU-4:", bleu_4_avg)
print("ROUGE-2:", rouge_2_avg)
print("ROUGE-L:", rouge_l_avg)

end_time = time.time() - start_time
print(f"Consuming time: ",end_time)

Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
  s = np.exp(np.mean(np.log(clipped_precision_score)))
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:128001 for open-end generation.


Average Metrics for Subset:
BERT: 0.6702380239963531
BLEU-2: 0.2746584187704523
BLEU-4: 0.16628310886013314
ROUGE-2: 0.14740961635365704
ROUGE-L: 0.15277777777777776
Consuming time:  44.97154903411865
