<a href="https://colab.research.google.com/github/mikelsegura/fstvsicl/blob/main/normalization/Evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install evaluate -q
!pip install tabulate -q

import pandas as pd
import evaluate
import os
from evaluate import load
from tabulate import tabulate
# Install jiwer only when it cannot be imported in the current environment.
# NOTE(review): jiwer is never referenced by name in the cells shown here;
# presumably it backs the "wer"/"cer" metrics loaded through `evaluate` --
# TODO confirm.
try:
    import jiwer
except ImportError:
    # The package is installed but not imported afterwards, so the name
    # `jiwer` stays undefined in this kernel when this branch runs.
    !pip install jiwer -q

In [None]:
# Fetch the experiment data: clone the repository and make it the working
# directory so the relative paths below resolve against the repository root.
# NOTE(review): the "Open in Colab" badge at the top of the notebook points to
# a different GitHub owner than this URL -- confirm which one is canonical.
repo_url = "https://github.com/MCL-Lab-mx/fstvsicl.git"
!git clone {repo_url}

# NOTE(review): both the clone target and this %cd are relative to the current
# working directory, so re-running the cell in the same kernel operates from
# inside the first checkout -- restart the runtime before re-running.
%cd fstvsicl

# Relative to the repository root: input data and the folder for files
# written by this notebook.
BASE_PATH = "otomi/"
OUTPUT_PATH = "otomi/colab_eval/"
# -p: create parent directories as needed and do not fail if the folder exists.
!mkdir -p {OUTPUT_PATH}

Cloning into 'fstvsicl'...
remote: Enumerating objects: 96, done.[K
remote: Counting objects: 100% (96/96), done.[K
remote: Compressing objects: 100% (41/41), done.[K
remote: Total 96 (delta 62), reused 75 (delta 53), pack-reused 0 (from 0)[K
Receiving objects: 100% (96/96), 723.10 KiB | 10.04 MiB/s, done.
Resolving deltas: 100% (62/62), done.
/content/fstvsicl


# WER & CER

In [None]:
# Corpus-level evaluation.
# For every experiment TSV, score two things against the INALI reference:
#   * the 'Result' column  -> WER / CER of the system output
#   * the 'Input' column   -> WER / CER baseline (input text left as it is)
# and print one table row per experiment file.
reference_file_path = f'{BASE_PATH}test_set.tsv'
reference_df = pd.read_csv(reference_file_path, sep='\t')
# astype(str) guarantees every reference handed to the metric is a str,
# matching how the per-sentence evaluation cell builds its reference list.
inali_reference = reference_df['INALI'].astype(str).str.lower().tolist()

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

experiment_folder_path = f'{BASE_PATH}clean_results/'

results = []

# Iterate through each experiment TSV file (sorted so the run order is stable;
# os.listdir returns entries in arbitrary order).
for file_name in sorted(os.listdir(experiment_folder_path)):
    if not file_name.endswith('.tsv') or file_name == 'test_set.tsv':
        continue

    file_path = os.path.join(experiment_folder_path, file_name)
    experiment_df = pd.read_csv(file_path, sep='\t')

    # Empty cells are read by pandas as NaN. Replace them with "" *before*
    # casting to str so the lists contain only strings and a missing value
    # never turns into the literal text "nan".
    result_column = experiment_df['Result'].fillna("").astype(str).str.lower().tolist()
    input_column = experiment_df['Input'].fillna("").astype(str).str.lower().tolist()

    # Compute WER and CER for the Result column (system output)
    wer_score = wer_metric.compute(predictions=result_column, references=inali_reference)
    cer_score = cer_metric.compute(predictions=result_column, references=inali_reference)

    # Compute WER and CER for the Input column (baseline)
    wer_baseline = wer_metric.compute(predictions=input_column, references=inali_reference)
    cer_baseline = cer_metric.compute(predictions=input_column, references=inali_reference)

    # Convert scores to percentages
    wer_percentage = wer_score * 100
    cer_percentage = cer_score * 100

    wer_baseline_percentage = wer_baseline * 100
    cer_baseline_percentage = cer_baseline * 100

    results.append([
        file_name.lower(),  # lowercase file name, also used as the sort key below
        f"{wer_percentage:.2f}%",
        f"{wer_baseline_percentage:.2f}%",
        f"{cer_percentage:.2f}%",
        f"{cer_baseline_percentage:.2f}%",
    ])

results_sorted = sorted(results, key=lambda x: x[0])

headers = ["Experiment File", "WER Score", "WER Baseline","CER Score", "CER baseline"]
print(tabulate(results_sorted, headers=headers, tablefmt="pretty"))

+---------------------------------------------+-----------+--------------+-----------+--------------+
|               Experiment File               | WER Score | WER Baseline | CER Score | CER baseline |
+---------------------------------------------+-----------+--------------+-----------+--------------+
|             fst_otq_results.tsv             |   4.36%   |    0.10%     |   2.03%   |    0.04%     |
|           fst_ots_otq_results.tsv           |  10.78%   |    7.74%     |   3.37%   |    1.59%     |
|             fst_ots_results.tsv             |  20.27%   |    18.90%    |   5.31%   |    4.09%     |
|   gpt_3_5_turbo_few_shot_otq_results.tsv    |   0.34%   |    0.10%     |   0.10%   |    0.04%     |
| gpt_3_5_turbo_few_shot_ots_otq_results.tsv  |   7.06%   |    7.74%     |   1.61%   |    1.59%     |
|   gpt_3_5_turbo_few_shot_ots_results.tsv    |  14.34%   |    18.90%    |   3.45%   |    4.09%     |
|   gpt_3_5_turbo_zero_shot_otq_results.tsv   |  12.34%   |    0.10%     |   3.13%

In [None]:
# Per-sentence evaluation.
# For every experiment TSV, score each prediction against the INALI reference
# on the same row and write one output row per sentence pair to a TSV file.
reference_file_path = f'{BASE_PATH}test_set.tsv'
reference_df = pd.read_csv(reference_file_path, sep='\t')

inali_reference = reference_df['INALI'].astype(str).str.lower().tolist()

wer_metric = evaluate.load("wer")
cer_metric = evaluate.load("cer")

experiment_folder_path = f'{BASE_PATH}clean_results/'
output_file_path = f'{OUTPUT_PATH}oto_eval_per_sentence.tsv'

results = []

# Iterate through each experiment TSV file (sorted so the row order of the
# output file is reproducible; os.listdir returns entries in arbitrary order).
for file_name in sorted(os.listdir(experiment_folder_path)):
    # Filter first, so the progress message is only printed for files that
    # are actually evaluated.
    if not file_name.endswith('.tsv') or file_name == 'test_set.tsv':
        continue
    print("- evaluating "+file_name)

    file_path = os.path.join(experiment_folder_path, file_name)
    experiment_df = pd.read_csv(file_path, sep='\t')

    if 'Result' not in experiment_df:
        print(f"Skipping {file_name}: 'Result' column not found.")
        continue

    input_column = experiment_df['Input'].tolist()

    # fillna("") must run BEFORE astype(str): casting first turns a missing
    # value into the literal text "nan", which would then be scored as if it
    # were a real prediction and would never reach the empty-prediction skip.
    result_column = experiment_df['Result'].fillna("").astype(str).str.lower().tolist()

    # zip() below stops at the shortest list, so make a row-count mismatch
    # visible instead of silently dropping sentences.
    if len(result_column) != len(inali_reference):
        print(
            f"Warning: {file_name} has {len(result_column)} rows but the reference has "
            f"{len(inali_reference)}; only the first "
            f"{min(len(result_column), len(inali_reference))} pairs are evaluated."
        )

    # Compute WER and CER for each sentence pair.
    # `source_text` (not `input`) keeps the builtin input() usable in later cells.
    for ref, pred, source_text in zip(inali_reference, result_column, input_column):
        if not pred.strip():
            print(f"Skipping empty prediction in {file_name}")
            continue

        wer_score = wer_metric.compute(predictions=[pred], references=[ref])
        cer_score = cer_metric.compute(predictions=[pred], references=[ref])

        results.append([source_text, pred, ref, f"{wer_score * 100:.2f}%", f"{cer_score * 100:.2f}%", file_name.lower()])

output_df = pd.DataFrame(results, columns=["INPUT", "PREDICTION", "INALI_REFERENCE", "WER", "CER", "FILE"])
output_df.to_csv(output_file_path, sep='\t', index=False)
print(f"Results saved to: {output_file_path}")

- evaluating llama_3_1_70b_zero_shot_OTS_results.tsv
- evaluating fst_OTQ_results.tsv
- evaluating llama_3_1_70b_few_shot_OTS_results.tsv
- evaluating gpt_4o_zero_shot_OTS_OTQ_results.tsv
- evaluating llama_3_3_70b_zero_shot_OTS_OTQ_results.tsv
- evaluating gpt_3_5_turbo_few_shot_OTS_results.tsv
- evaluating llama_3_1_70b_zero_shot_OTS_OTQ_results.tsv
- evaluating llama_3_1_70b_few_shot_OTS_OTQ_results.tsv
- evaluating llama_3_1_70b_zero_shot_OTQ_results.tsv
- evaluating llama_3_3_70b_few_shot_OTQ_results.tsv
- evaluating gpt_3_5_turbo_zero_shot_OTQ_results.tsv
- evaluating gpt_4o_few_shot_OTS_results.tsv
- evaluating gpt_3_5_turbo_few_shot_OTQ_results.tsv
- evaluating gpt_4o_few_shot_OTQ_results.tsv
- evaluating llama_3_3_70b_few_shot_OTS_OTQ_results.tsv
- evaluating llama_3_3_70b_few_shot_OTS_results.tsv
- evaluating gpt_3_5_turbo_zero_shot_OTS_results.tsv
- evaluating fst_OTS_OTQ_results.tsv
- evaluating llama_3_3_70b_zero_shot_OTS_results.tsv
- evaluating gpt_4o_zero_shot_OTS_resul