In [1]:
import jiwer
from jiwer import wer
import pandas as pd
import os

In [2]:
# standardize string inputs to make more accurate wer counts
def pre_jiwer_standardize(any_string):
    """Normalize a single transcript value before it is scored with WER.

    The value is coerced with ``str()``, lowercased, has the British spelling
    "neighbourhood" rewritten to "neighborhood", and is then run through a
    jiwer pipeline that lowercases, expands common English contractions,
    removes Kaldi non-words, replaces whitespace characters with spaces,
    collapses repeated spaces and strips both ends.

    Args:
        any_string: Value to normalize. Anything that is not already a string
            is converted with ``str()``.

    Returns:
        The result of the composed jiwer transform applied to the text.
    """
    wer_standardize = jiwer.Compose(
        [
            jiwer.ToLowerCase(),
            jiwer.ExpandCommonEnglishContractions(),
            jiwer.RemoveKaldiNonWords(),
            jiwer.RemoveWhiteSpace(replace_by_space=True),
            jiwer.RemoveMultipleSpaces(),
            jiwer.Strip(),
        ]
    )
    # Lowercase *before* the spelling fix: str.replace is case-sensitive, so
    # "Neighbourhood" / "NEIGHBOURHOOD" would otherwise slip through and be
    # counted as a word error against "neighborhood".
    # NOTE(review): str() turns a missing value into the literal word "nan"
    # (or "none" for None) -- confirm that is acceptable for empty transcripts.
    any_string = str(any_string).lower().replace("neighbourhood", "neighborhood")

    return wer_standardize(any_string)


# applying transformation for preprocessing prompt/transcript columns
def transform_col_text(df, col_name, new_col_name):
    """Return a copy of ``df`` with a standardized version of one text column.

    Args:
        df: Source DataFrame; it is left unmodified.
        col_name: Column whose values are passed to ``pre_jiwer_standardize``.
        new_col_name: Column in the returned copy that receives the results.

    Returns:
        A copy of ``df`` with ``new_col_name`` holding the standardized text.
    """
    standardized = df[col_name].apply(pre_jiwer_standardize)
    result = df.copy()
    result.loc[:, new_col_name] = standardized
    return result


# creating a new wer getting row-wise error rates for transformed strings in col1, col2
def wer_transformed(df, wer_col, col1, col2):
    """Return a copy of ``df`` with a per-row word error rate column added.

    Args:
        df: DataFrame holding the strings to compare; it is left unmodified.
        wer_col: Name of the column that receives the scores.
        col1: Column passed as the first argument to ``wer`` (the reference).
        col2: Column passed as the second argument to ``wer`` (the hypothesis).

    Returns:
        A copy of ``df`` where ``wer_col`` holds ``wer(row[col1], row[col2])``
        for every row, stored as float64.
    """
    # Pair the two columns positionally instead of DataFrame.apply(axis=1):
    # on a zero-row frame apply() hands back an empty DataFrame rather than a
    # Series, which cannot be assigned cleanly to a single column.
    scores = [
        wer(reference, hypothesis)
        for reference, hypothesis in zip(df[col1], df[col2])
    ]
    df_copy = df.copy()
    # Go through a float64 Series so an empty result still has a numeric dtype;
    # to_numpy() keeps the assignment positional (no index alignment).
    df_copy[wer_col] = pd.Series(scores, dtype="float64").to_numpy()
    return df_copy

In [3]:
# Standardize the expected transcription and every model's transcript column.
# Maps each source column to the name of its standardized counterpart; the
# insertion order below is the order the new columns are appended in.
PREPROC_COLUMNS = {
    'Expected Transcription': 'preproc_exp_transc',
    'large': 'large_preproc',
    'base': 'base_preproc',
    'medium': 'medium_preproc',
    'small': 'small_preproc',
    'tiny': 'tiny_preproc',
}

df_orig = pd.read_csv('partic_transcripts.csv')

# transform_col_text returns a fresh copy each time, so df_orig stays
# untouched and re-running this cell always starts from the raw data.
df_preproc_final = df_orig
for source_col, preproc_col in PREPROC_COLUMNS.items():
    df_preproc_final = transform_col_text(df_preproc_final,
                                          source_col,
                                          preproc_col)

df_preproc_final.to_csv('preproc_for_wer.csv')
# Keep the preview as the last expression so the notebook actually renders it.
df_preproc_final.head()

In [4]:
# Adding WER
# One '<model>_wer' column per model, each scored against the standardized
# expected transcription; the tuple order is the order columns are appended.
WER_MODELS = ('large', 'base', 'medium', 'small', 'tiny')

df_wer = df_preproc_final
for model in WER_MODELS:
    df_wer = wer_transformed(df_wer,
                             f'{model}_wer',
                             'preproc_exp_transc',
                             f'{model}_preproc')

df_wer.to_csv('wer_oishani.csv')
# Keep the preview as the last expression so the notebook actually renders it.
df_wer.head()