In [1]:
!pip install bert-score

In [1]:
import jiwer
from bert_score import BERTScorer
import pandas as pd
import json

# Text normalisation pipeline handed to jiwer.wer() for both the reference and
# the hypothesis text, so that both sides are normalised the same way.
# The steps run in list order; the last step turns each string into a list of
# words, which is the form the WER computation consumes.
# NOTE(review): contractions are expanded before lower-casing. If jiwer's
# contraction patterns are case-sensitive, capitalised contractions are not
# expanded the same way as lower-case ones — confirm whether ToLowerCase()
# should come first.
normalization_steps = [
    jiwer.ExpandCommonEnglishContractions(),
    jiwer.RemoveEmptyStrings(),
    jiwer.ToLowerCase(),
    jiwer.RemoveMultipleSpaces(),
    jiwer.Strip(),
    jiwer.RemovePunctuation(),
    jiwer.ReduceToListOfListOfWords(),
]
transforms = jiwer.Compose(normalization_steps)

In [2]:
from storage import MongoDBStorage
from service import MongoDBSearchService

# The database host URL is kept out of the notebook: it is read from the local
# "db_host" file, with trailing whitespace (e.g. the final newline) removed.
with open("db_host", "r") as host_file:
    host = host_file.read().rstrip()

# Storage backend configured with that host, and the search service built on it.
storage_parameters = {"host_url": host}
storage = MongoDBStorage(parameters=storage_parameters)
search_service = MongoDBSearchService(storage)

In [3]:
# Ids of the stored records that make up the evaluation set.
test_record_ids = [
    '661931f5d172007aa8b18204',
    '661931fcd172007aa8b18205',
    '661931ffd172007aa8b18206',
]

# Build one find-by-id query per record and submit them in a single call.
# Presumably aggregate_filters_with_or OR-combines the queries so that every
# listed record is returned — confirm against MongoDBSearchService.
cursor = search_service.aggregate_filters_with_or(
    [search_service.get_query_find_by_id(record_id) for record_id in test_record_ids]
)

# Materialise the results into a list so later cells can iterate over the
# fetched records without re-running the query.
test_df_list = list(cursor)

In [4]:
# Load the reference data used for scoring. Later cells index `references` by
# record id and read the 'text_ref' and 'summary_ref' entries.
# The encoding is given explicitly: without it open() falls back to the
# platform default (e.g. cp1252 on Windows), which can mis-decode non-ASCII
# characters in the reference texts. JSON files are expected to be UTF-8.
with open("evaluation_and_results/references.json", encoding="utf-8") as references_file:
    references = json.load(references_file)

In [5]:
# BERTScore scorer using the bert-base-uncased model; created once and reused
# for every record in the loop below.
scorer = BERTScorer(model_type='bert-base-uncased')

# Parallel result lists: position i in every list belongs to the same record.
wer_list = []
precision_list = []
recall_list = []
f1_score_list = []
file_names = []
ids = []

for record in test_df_list:
    current_id = str(record['_id'])
    file_name = record['additional_info']['file_name']
    print(f"Processing {file_name}, id is {current_id}")
    ids.append(current_id)
    file_names.append(file_name)

    # Raises KeyError when the references file has no entry for this record id.
    reference = references[current_id]

    # Transcript quality: word error rate of the extracted text against the
    # reference transcript, with the same normalisation applied to both sides.
    wer = jiwer.wer(
        reference['text_ref'],
        record['extracted_text'],
        truth_transform=transforms,
        hypothesis_transform=transforms,
    )
    print(f"Word Error Rate (WER) is :{wer}")
    wer_list.append(wer)

    # BERTScore calculation: candidate summary first, reference summary second.
    P, R, F1 = scorer.score([record['summary']], [reference['summary_ref']])
    # Only one candidate/reference pair is scored, so mean() merely collapses
    # each result to a scalar. float() stores plain Python numbers in the lists
    # instead of the scorer's own scalar objects, so downstream code gets
    # ordinary numeric values.
    precision = float(P.mean())
    recall = float(R.mean())
    f1 = float(F1.mean())
    print(f"BERTScore Precision: {precision:.4f}, Recall: {recall:.4f}, F1:    {f1:.4f}")
    precision_list.append(precision)
    recall_list.append(recall)
    f1_score_list.append(f1)

Downloading tokenizer_config.json:   0%|          | 0.00/49.0 [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to see activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


Downloading config.json:   0%|          | 0.00/625 [00:00<?, ?B/s]

Downloading vocab.txt:   0%|          | 0.00/996k [00:00<?, ?B/s]

Downloading model.safetensors:   0%|          | 0.00/714M [00:00<?, ?B/s]

Processing probabilistic_modelling.mp4, id is 661931f5d172007aa8b18204
Word Error Rate (WER) is :0.009779951100244499
BERTScore Precision: 0.8963, Recall: 0.8931, F1:    0.8947
Processing system_oriented_analysis_and_informational_modelling.mp4, id is 661931fcd172007aa8b18205
Word Error Rate (WER) is :0.019925280199252802
BERTScore Precision: 0.9161, Recall: 0.8780, F1:    0.8967
Processing factographical_information_analysis.mp4, id is 661931ffd172007aa8b18206
Word Error Rate (WER) is :0.02962962962962963
BERTScore Precision: 0.9227, Recall: 0.8956, F1:    0.9090


In [6]:
# Summary-score columns and the per-record lists that feed them.
bert_score_columns = {
    'summary_precision': precision_list,
    'summary_recall': recall_list,
    'summary_f1': f1_score_list,
}

# One row per evaluated record, assembled from the parallel result lists.
metrics_df = pd.DataFrame()
metrics_df['file_name'] = file_names
metrics_df['id'] = ids
for column_name, column_values in bert_score_columns.items():
    # Cast to a plain float dtype regardless of the scalar type stored in the
    # score lists, so the columns support numeric aggregation.
    metrics_df[column_name] = column_values
    metrics_df[column_name] = metrics_df[column_name].astype('float')
metrics_df['text_wer'] = wer_list
# Accuracy is reported as the complement of the word error rate.
metrics_df['text_accuracy'] = 1.0 - metrics_df['text_wer']
metrics_df

Unnamed: 0,file_name,id,summary_precision,summary_recall,summary_f1,text_wer,text_accuracy
0,probabilistic_modelling.mp4,661931f5d172007aa8b18204,0.896348,0.893104,0.894723,0.00978,0.99022
1,system_oriented_analysis_and_informational_mod...,661931fcd172007aa8b18205,0.916146,0.878035,0.896686,0.019925,0.980075
2,factographical_information_analysis.mp4,661931ffd172007aa8b18206,0.922727,0.89563,0.908976,0.02963,0.97037


In [7]:
# Mean of each headline metric over all rows of metrics_df.
# Maps the label shown in the printed line to the column it is computed from.
average_labels = {
    'summary precision': 'summary_precision',
    'summary recall': 'summary_recall',
    'summary f1': 'summary_f1',
    'text WER': 'text_wer',
}
for label, column_name in average_labels.items():
    print(f"AVG {label}: {metrics_df[column_name].mean()}")

AVG summary precision: 0.9117401043574015
AVG summary recall: 0.8889231085777283
AVG summary f1: 0.9001284241676331
AVG text WER: 0.019778286976375643


In [8]:
# Persist the metrics table as CSV; index=False leaves the row index out of
# the file.
metrics_csv_path = "evaluation_and_results/metrics111.csv"
metrics_df.to_csv(metrics_csv_path, index=False)