In [1]:
from jiwer import wer
import evaluate
import pandas as pd

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
def calculate_wer(ground_truth, hypothesis):
    # Calculate WER
    error_rate = wer(ground_truth, hypothesis)
    return round(error_rate*100,2)

In [3]:
def calculate_bleu(predictions, references):
    # Load the BLEU evaluation metric
    bleu = evaluate.load("bleu")
   
    # Compute the BLEU score
    results = bleu.compute(predictions=[predictions], references=[references])
    
    return results["bleu"]

predictions = "hello please there can general a kenobi"
references = "hello please there can general can you kenobi"

bleu_score = calculate_bleu(predictions, references)

print(bleu_score)

0.5578002860768766


In [1]:
def calculate_rouge(predictions, references):
    # Load the ROUGE evaluation metric
    rouge = evaluate.load('rouge')
    
    # Compute the ROUGE score
    results = rouge.compute(predictions=[predictions], references=[references])
    
    return pd.Series([results["rouge1"],results["rouge2"],results["rougeL"],results["rougeLsum"]])

#predictions = "hello please there can general a kenobi"
#references = "hello please there can general can you kenobi"
#rouge_score = calculate_rouge(predictions, references)

#print(rouge_score)

In [4]:
df = pd.read_csv("Final Captions with evaluation results.csv", encoding='ISO-8859-1')

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 52 entries, 0 to 51
Data columns (total 8 columns):
 #   Column           Non-Null Count  Dtype 
---  ------           --------------  ----- 
 0   File             52 non-null     object
 1   Whisper_Caption  33 non-null     object
 2   Label            52 non-null     int64 
 3   youtube_caption  52 non-null     object
 4   Category         52 non-null     object
 5   Chatgpt_caption  52 non-null     object
 6   Ground_Truth     52 non-null     object
 7   Llama2_caption   52 non-null     object
dtypes: int64(1), object(7)
memory usage: 3.4+ KB


In [5]:
df.head(6)

Unnamed: 0,File,Label,youtube_caption,Category,Chatgpt_caption,Ground_Truth,Llama2_caption,wer_gpt,bleu_gpt,rouge1_gpt,rouge2_gpt,rougeL_gpt,rougeLsum_gpt,wer_llama2,bleu_llama2,rouge1_llama2,rouge2_llama2,rougeL_llama2,rougeLsum_llama2
0,audio\youtube\Cooking\001.mp3,1.0,- Let's make easy baked pears.(upbeat music) F...,Cooking,Let's make easy baked pears. (Upbeat music) Fo...,Let's make easy baked pears (upbeat music). Fo...,"Let's make easy baked pears. For the topping, ...",11.11,0.836766,0.968153,0.955128,0.968153,0.968153,16.99,0.761542,0.90429,0.857143,0.90429,0.90429
1,audio\youtube\Cooking\002.mp3,2.0,homemade pizza dough no special equipment need...,Cooking,"Homemade pizza dough, no special equipment nee...","Homemade pizza dough, no special equipment nee...","Homemade pizza dough, no special equipment nee...",9.44,0.850729,0.992042,0.981333,0.992042,0.992042,32.78,0.585373,0.887006,0.784091,0.858757,0.858757
2,audio\youtube\Cooking\003.mp3,3.0,[Music] reynolds what is the dish so this is o...,Cooking,"[Music] Reynolds, what is the dish? So, this i...","[Music] Reynolds, what is the dish? So, this i...","Reynolds, what is the dish so this is of cours...",9.76,0.856061,0.934911,0.898204,0.934911,0.934911,43.9,0.403562,0.833333,0.779221,0.833333,0.833333
3,audio\youtube\Cooking\004.mp3,4.0,When you're in the mood for something sweet an...,Cooking,When you're in the mood for something sweet an...,When you're in the mood for something sweet an...,When you're in the mood for something sweet an...,0.88,0.982065,0.994786,0.988506,0.994786,0.994786,3.94,0.943766,0.988506,0.982199,0.988506,0.988506
4,audio\youtube\Cooking\005.mp3,5.0,we're down to the last 10 minutes wo start pla...,Cooking,We're down to the last 10 minutes. We start pl...,We're down to the last 10 minutes. Start plati...,"We're down to the last 10 minutes, and it's ti...",1.05,0.98359,0.996764,0.99026,0.996764,0.996764,30.66,0.60663,0.947195,0.913907,0.943894,0.943894
5,audio\youtube\Cooking\006.mp3,6.0,growing up one of my favorite things was a chi...,Cooking,"Growing up, one of my favorite things was a ch...","Growing up, one of my favorite things was a ch...","Growing up, one of my favorite things was a ch...",24.62,0.722989,0.8722,0.804749,0.859025,0.859025,64.45,0.344864,0.700377,0.507937,0.591698,0.591698


In [7]:
df['Ground_Truth'] = df['Ground_Truth'].fillna('').astype(str)
df['Chatgpt_caption'] = df['Chatgpt_caption'].fillna('').astype(str)
df["wer_gpt"] = df.apply(lambda x: calculate_wer(x["Ground_Truth"],x["Chatgpt_caption"]) ,axis=1)
df["bleu_gpt"] = df.apply(lambda x: calculate_bleu(x["Ground_Truth"],x["Chatgpt_caption"]) ,axis=1)

In [8]:
df[["rouge1_gpt","rouge2_gpt","rougeL_gpt","rougeLsum_gpt"]] = df.apply(lambda x: calculate_rouge(x["Ground_Truth"],x["Chatgpt_caption"]) ,axis=1)

In [9]:
empty_records = df[(df['Ground_Truth'] == '') | (df['Chatgpt_caption'] == '')]
empty_records

Unnamed: 0,File,Whisper_Caption,Label,youtube_caption,Category,Chatgpt_caption,Ground_Truth,Llama2_caption,wer,bleu,rouge1,rouge2,rougeL,rougeLsum


In [10]:
df["wer_llama2"] = df.apply(lambda x: calculate_wer(x["Ground_Truth"],x["Llama2_caption"]) ,axis=1)
df["bleu_llama2"] = df.apply(lambda x: calculate_bleu(x["Ground_Truth"],x["Llama2_caption"]) ,axis=1)

In [11]:
df[["rouge1_llama2","rouge2_llama2","rougeL_llama2","rougeLsum_llama2"]] = df.apply(lambda x: calculate_rouge(x["Ground_Truth"],x["Llama2_caption"]) ,axis=1)

In [6]:
df.to_csv('Final Captions with evaluation results.csv',index=False)

In [3]:
reference = "hello world, how are you?"
hypothesis = "hello, how you doing world?"
# Calculate Word Error Rate
error = wer(reference, hypothesis)
# Display results
print(f"Reference Transcript: {reference}")
print(f"Hypothesis Transcript: {hypothesis}")
print(f"Word Error Rate: {error:.2%}")

Reference Transcript: hello world, how are you?
Hypothesis Transcript: hello, how you doing world?
Word Error Rate: 100.00%
