In [1]:
import jiwer
from jiwer import wer
import pandas as pd
import os

In [2]:
# standardize string inputs to make more accurate wer counts
def pre_jiwer_standardize(any_string):
    wer_standardize = jiwer.Compose(
    [
        jiwer.ToLowerCase(),
        jiwer.ExpandCommonEnglishContractions(),
        jiwer.RemoveKaldiNonWords(),
        jiwer.RemoveWhiteSpace(replace_by_space=True),
        jiwer.RemoveMultipleSpaces(),
        jiwer.Strip(),
    ]
    )
    any_string = str(any_string).replace("neighbourhood", "neighborhood")

    return wer_standardize(any_string)


# applying transformation for preprocessing prompt/transcript columns
def transform_col_text(df, col_name, new_col_name):
    df_copy = df.copy()
    df_copy.loc[:,new_col_name] = df_copy[col_name].apply(pre_jiwer_standardize)
    return df_copy


# creating a new wer getting row-wise error rates for transformed strings in col1, col2
def wer_transformed(df, wer_col, col1, col2):
    df_copy = df.copy()
    df_copy.loc[:, wer_col] = df.apply(lambda x:
                                  wer(x[col1], x[col2]), axis=1)
    return df_copy

In [3]:
# test
df_orig = pd.read_csv('partic_transcripts.csv')
df_expected_preprocessed = transform_col_text(df_orig,
                                              'Expected Transcription',
                                              'preproc_exp_transc')
df_large_preprocessed = transform_col_text(df_expected_preprocessed,
                                           'large',
                                           'large_preproc')
df_base_preprocessed = transform_col_text(df_large_preprocessed,
                                           'base',
                                           'base_preproc')
df_medium_preprocessed = transform_col_text(df_base_preprocessed,
                                           'medium',
                                           'medium_preproc')
df_small_preprocessed = transform_col_text(df_medium_preprocessed,
                                           'small',
                                           'small_preproc')
df_tiny_preprocessed = transform_col_text(df_small_preprocessed,
                                           'tiny',
                                           'tiny_preproc')

df_preproc_final = df_tiny_preprocessed
df_preproc_final.head()
df_preproc_final.to_csv('preproc_for_wer.csv')

In [4]:
# Adding WER
df_wer = wer_transformed(df_preproc_final, 
                         'large_wer', 
                         'preproc_exp_transc', 
                         'large_preproc')
df_wer = wer_transformed(df_wer, 
                         'base_wer', 
                         'preproc_exp_transc', 
                         'base_preproc')

df_wer = wer_transformed(df_wer, 
                         'medium_wer', 
                         'preproc_exp_transc', 
                         'medium_preproc')

df_wer = wer_transformed(df_wer, 
                         'small_wer', 
                         'preproc_exp_transc', 
                         'small_preproc')

df_wer = wer_transformed(df_wer, 
                         'tiny_wer', 
                         'preproc_exp_transc', 
                         'tiny_preproc')
df_wer.head()
df_wer.to_csv('wer_oishani.csv')

## For GitHub upload
Both WER files are too big - preproc_for_wer.csv and wer_oishani.csv, so splitting them into participants 1-35 and 36-77

In [5]:
preproc_for_wer_df = pd.read_csv('preproc_for_wer.csv')

In [6]:
preproc_for_wer_df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,participant_id,Sentence ID,Iteration,probability,Expected Transcription,large,base,medium,small,tiny,preproc_exp_transc,large_preproc,base_preproc,medium_preproc,small_preproc,tiny_preproc
0,0,0,t42,15_1,L1,nonword,My nend likes pepperoni on his money.,Manan likes pepperoni all his money.,Mennaan likes pepperoni or his money.,Manan likes pepperoni on his money.,Manan likes pepperoni all his money.,Menan likes Pavaroni or his money.,my nend likes pepperoni on his money.,manan likes pepperoni all his money.,mennaan likes pepperoni or his money.,manan likes pepperoni on his money.,manan likes pepperoni all his money.,menan likes pavaroni or his money.
1,1,1,t42,100_2,R3P1,low prob,I wish I had a trunk like that rubber ducky.,I wish I had a trunk like that rubber ducky.,I wish I had a trunk like that rubber donkey.,I wish I had a trunk like that rubber ducky.,I wish I had a trunk like that rubber ducky.,I wish I had a trunk like the rubber dot key.,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like that rubber donkey.,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like the rubber dot key.


In [7]:
top_half_ids = []
bottom_half_ids =[]
for i in range(1,78):
    if i < 36:
        top_half_ids.append(f't{i}')
    else:
        bottom_half_ids.append(f't{i}')
print(top_half_ids, bottom_half_ids)

['t1', 't2', 't3', 't4', 't5', 't6', 't7', 't8', 't9', 't10', 't11', 't12', 't13', 't14', 't15', 't16', 't17', 't18', 't19', 't20', 't21', 't22', 't23', 't24', 't25', 't26', 't27', 't28', 't29', 't30', 't31', 't32', 't33', 't34', 't35'] ['t36', 't37', 't38', 't39', 't40', 't41', 't42', 't43', 't44', 't45', 't46', 't47', 't48', 't49', 't50', 't51', 't52', 't53', 't54', 't55', 't56', 't57', 't58', 't59', 't60', 't61', 't62', 't63', 't64', 't65', 't66', 't67', 't68', 't69', 't70', 't71', 't72', 't73', 't74', 't75', 't76', 't77']


In [10]:
top_preproc_df = preproc_for_wer_df[preproc_for_wer_df['participant_id'].isin(top_half_ids)]
top_preproc_df.shape

(26692, 18)

In [11]:
top_preproc_df.to_csv('preproc_for_wer_top.csv')

In [12]:
bottom_preproc_df = preproc_for_wer_df[preproc_for_wer_df['participant_id'].isin(bottom_half_ids)]
bottom_preproc_df.shape

(26437, 18)

In [13]:
bottom_preproc_df.to_csv('preproc_for_wer_bottom.csv')

In [14]:
wer_oishani_df = pd.read_csv('wer_oishani.csv')
wer_oishani_df.head(2)

Unnamed: 0.2,Unnamed: 0.1,Unnamed: 0,participant_id,Sentence ID,Iteration,probability,Expected Transcription,large,base,medium,...,large_preproc,base_preproc,medium_preproc,small_preproc,tiny_preproc,large_wer,base_wer,medium_wer,small_wer,tiny_wer
0,0,0,t42,15_1,L1,nonword,My nend likes pepperoni on his money.,Manan likes pepperoni all his money.,Mennaan likes pepperoni or his money.,Manan likes pepperoni on his money.,...,manan likes pepperoni all his money.,mennaan likes pepperoni or his money.,manan likes pepperoni on his money.,manan likes pepperoni all his money.,menan likes pavaroni or his money.,0.428571,0.428571,0.285714,0.428571,0.571429
1,1,1,t42,100_2,R3P1,low prob,I wish I had a trunk like that rubber ducky.,I wish I had a trunk like that rubber ducky.,I wish I had a trunk like that rubber donkey.,I wish I had a trunk like that rubber ducky.,...,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like that rubber donkey.,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like that rubber ducky.,i wish i had a trunk like the rubber dot key.,0.0,0.1,0.0,0.0,0.3


In [15]:
top_wer_df = wer_oishani_df[wer_oishani_df['participant_id'].isin(top_half_ids)]
top_wer_df.shape

(26692, 23)

In [16]:
top_wer_df.to_csv('wer_oishani_top.csv')

In [17]:
bottom_wer_df = wer_oishani_df[wer_oishani_df['participant_id'].isin(bottom_half_ids)]
bottom_wer_df.shape

(26437, 23)

In [18]:
bottom_wer_df.to_csv('wer_oishani_bottom.csv')