In [1]:
import preprocessing
import pandas as pd
import numpy as np
import importlib
from g2p_en import G2p # https://github.com/Kyubyong/g2p

In [2]:
celex_dict_file = "Data/epw.cd"
filename = "Data/2016_all_words_no_audio.pickle"
hom_filename = "Data/hom.csv"

In [3]:
df = preprocessing.read_dataframe(filename, remove_pauses=True, remove_errors=True, preprocessing=True, drop_error_columns=False)

read dataframe from Data/2016_all_words_no_audio.pickle
Preprocessing: extract pause information...
Remove pauses from data!
Preprocessing: apply word preprocessing...
Preprocessing: calculate word duration...
Preprocessing: calculate word frequency...
Preprocessing: extract context information...
Preprocessing: calculate letter length...
Preprocessing: calculate contextual predictability...
(18864660, 25) RangeIndex(start=0, stop=18864660, step=1)


In [12]:
source_files = ["2016-12-17_1330_US_KCET_Asia_Insight", "2016-10-25_2300_US_KABC_Eyewitness_News_4PM"]

In [1]:
sub_df = pd.read_csv("sub_df.csv", index_col="Unnamed: 0")

NameError: name 'df' is not defined

In [4]:
homophones_in_data, gahls_homophones, gahls_homophones_missing_in_data = preprocessing.read_and_extract_homophones(hom_filename, df)

read Gahls Homophone data from Data/hom.csv
406 out of 412 homophones found in Data:
Homophone Pairs found in Data: 200
Homophones without Pair:  ['flowers', 'holes', 'moose', 'naval', 'pairs', 'taught']
Missing homophones: ['flours' 'mousse' 'navel' 'pears' 'taut' 'wholes']


In [5]:
gahls_homophones.columns

Index(['spell', 'pron', 'lgPronCelFq', 'logCelFq', 'logAvgDur', 'stem',
       'is_complex', 'celexPhon', 'phonNeighCount', 'NearestSemNeighCor',
       'MeanCorTop20', 'AvCor', 'MedianCor', 'MeanCorTop20Unrel',
       'CossinTwinsStem', 'CossinTwinsFull', 'L2Ldiag', 'EuclidDistTwins',
       'SL1norm', 'CorrectLDLpred', 'SumChatWord', 'MinChatWord', 'L1ChatWord',
       'CorPredWord', 'LWLinkRatioWord', 'RankProd'],
      dtype='object')

In [5]:
def get_ARPAbet_phonetic_transcription(word_list):
    g2p = G2p()
    arpabet_word_list = []
    for word in word_list:
        transcription = g2p(word)
        arpabet_word_list.append(transcription)

    return arpabet_word_list


def get_english_phonology_from_celex(filename):
    phonology_dict = {"word":[], "disc":[], "clx":[]}
    with open(filename) as f:
        for line in f:
            line = line.strip().split("\\")
            word = line[1] # the word
            phonology_dict["word"].append(word)
            disc = line[6] # pronunciation in DISC notation, hyphens to mark syllable boundaries, inverted comma for primary stress and double quote for secondary stress (PhonStrsDISC)
            phonology_dict["disc"].append(disc)
            clx = line[8] # pronunciation in CELEX notation, with brackets (PhonSylBCLX)
            phonology_dict["clx"].append(clx)

    celex_phonology_dict = pd.DataFrame.from_dict(phonology_dict).drop_duplicates()
    celex_phonology_dict["disc_no_bound"] = celex_phonology_dict["disc"].apply(
        lambda x: x.replace("'", "").replace("-", ""))
    celex_phonology_dict["clx_no_bound"] = celex_phonology_dict["clx"].apply(
        lambda x: x.replace("[", "").replace("]", ""))
    return celex_phonology_dict



def get_celex_transcription(df, celex_phonology_dict):

    return df.merge(celex_phonology_dict[["word", "disc", "clx", "disc_no_bound", "clx_no_bound"]], how = "left", left_on=["word", "celexPhon"], right_on=["word","disc_no_bound"])


In [6]:
celex_phonology_dict = get_english_phonology_from_celex(celex_dict_file)

In [13]:
celex_phonology_dict

Unnamed: 0,word,disc,clx,disc_no_bound,clx_no_bound
0,a,'1,[eI],1,eI
2,A,'1,[eI],1,eI
4,AA,"""1-'1",[eI][eI],"""11",eIeI
6,AAs,"""1-'1z",[eI][eIz],"""11z",eIeIz
7,abaci,'{-b@-s2,[&][b@][saI],{b@s2,&b@saI
...,...,...,...,...,...
100620,Zouave,zu-'#v,[zu:][A:v],zu#v,zu:A:v
100621,Zouaves,zu-'#vz,[zu:][A:vz],zu#vz,zu:A:vz
100622,z's,'zEdz,[zEdz],zEdz,zEdz
100623,zucchini,zU-'ki-nI,[zU][ki:][nI],zUkinI,zUki:nI


In [7]:
homophones_in_data_celex_mapped = get_celex_transcription(homophones_in_data,celex_phonology_dict)

In [15]:
homophones_in_data_celex_mapped

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,cond_pred_next,has_pair,pron,celexPhon,pron_frequency,is_max,disc,clx,disc_no_bound,clx_no_bound
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.070000,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.017207,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.320000,38.540000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.660000,139.880000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.000264,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.530000,277.750000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.939999,415.159999,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.660000,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.150000,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.980000,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.000002,True,sp1d,sp1d,13,1,'sp1d,[speId],sp1d,speId
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.059999,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.001192,True,plVm,plVm,23,1,'plVm,[plVm],plVm,plVm


In [8]:
berndt_character_code = pd.read_csv("Data/celex_phonetic_character_code_berndt1987.csv", delimiter=";")
berndt_conditional_probs = pd.read_csv("Data/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv",delimiter=";")

In [17]:
berndt_character_code

Unnamed: 0,keyboard_compatible_phonetic_symbol,CELEX,g2p(ARPAbet),DISC,Example,Note
0,ay,eI,"EY1,EY2",1,ale,
1,ae,&,"AE0,AE1,AE2",{,add,
2,ee,i:,"IY0, IY1",i,bee,
3,eh,E,"EH0,EH1,EH2",E,end,
4,er,@r*,"ER0,ER1,ER2",@R,father,
5,ai,aI,"AY0,AY1,AY2",2,high,
6,ih,I,"IH0,IH1,IH2",I,bin,
7,o,@U,"OW0,OW1,OW2",5,boat,
8,ah,O,"AA1,AA2",Q,cot,
9,aw,O,"AO1,AO2",Q,soft,AA1 in soft


In [39]:
homophones_in_data_celex_mapped.word

0          right
1          right
2          right
3          right
4          right
           ...  
530336     franc
530337     franc
530338    spayed
530339     plumb
530340     reeks
Name: word, Length: 530341, dtype: object

In [18]:
#arpabet_encoded_words = get_ARPAbet_phonetic_transcription(homophones_in_data_celex_mapped.word)
#arpabet_used_in_data = set(sum(arpabet_encoded_words,[]))

disc_encoded_words = list(pd.Series(homophones_in_data_celex_mapped.disc.unique()).dropna().str.replace("'","").str.split("-"))
disc_used_in_data = set(sum(disc_encoded_words,[]))

#clx_encoded_words = list(homophones_in_data_celex_mapped.clx[pd.notnull(homophones_in_data_celex_mapped.clx)].str.replace("[","").str.split("]"))
#clx_used_in_data = set(filter(lambda x: x != "",sum(clx_encoded_words,[])))

In [10]:
arpabet_used_in_data - set(berndt_character_code["g2p(ARPAbet)"])

{'AA2', 'AO1', 'EH0', 'ER1', 'EY2', 'IH0', 'IY0', 'OW0', 'OW2'}

In [20]:
disc_characters_used_in_data = set(''.join(list(disc_used_in_data)))
disc_characters_for_berndts_encoding = set(''.join([str(i) for i in list(set(berndt_character_code.DISC))]))

In [24]:
clx_characters_used_in_data = set(''.join(list(clx_used_in_data)))
clx_characters_for_berndts_encoding = set(''.join([str(i) for i in list(set(berndt_character_code.CELEX))]))

NameError: name 'clx_used_in_data' is not defined

In [21]:
disc_characters_used_in_data - disc_characters_for_berndts_encoding

{'#', '$', '3', '7', '8', 'q'}

In [170]:
clx_characters_used_in_data - clx_characters_for_berndts_encoding

{'A'}

In [173]:
for i,word in enumerate(clx_encoded_words):
    for j in word:
        if any(x in j for x in ['A']):
            print("Missing:", j, word, homophones_in_data_celex_mapped.word[pd.notnull(homophones_in_data_celex_mapped.clx)].iloc[i])

Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] passed
Missing: pA:st ['pA:st', ''] passed


In [10]:
berndt_conditional_probs_words = get_ARPAbet_phonetic_transcription(berndt_conditional_probs.Example)
arpabet_used_in_bernd_examples = set(sum(berndt_conditional_probs_words,[]))

In [30]:
for i,word in enumerate(berndt_conditional_probs_words):
    for j in word:
        if j in ["AO1"]:
            print(i,j, word, berndt_conditional_probs.Example.iloc[i])

4 AO1 ['AO1', 'L', 'S', 'OW0'] also
11 AO1 ['AO1', 'R', 'AH0', 'N', 'JH'] orange
13 AO1 ['F', 'AO1', 'L', 'S'] false
27 AO1 ['IH0', 'K', 'S', 'T', 'R', 'AO1', 'R', 'D', 'AH0', 'N', 'EH2', 'R', 'IY0'] extraordinary
28 AO1 ['F', 'AO1', 'S', 'AH0', 'T'] faucet
30 AO1 ['T', 'AO1', 'P'] taupe 
31 AO1 ['IH0', 'P', 'AO1', 'L', 'AH0', 'T'] epaulet
33 AO1 ['M', 'AO1', 'V'] mauve
36 AO1 ['L', 'AO1'] law
73 AO1 ['F', 'AO1', 'R', 'F', 'IH0', 'T'] forfeit
127 AO1 ['AO1', 'F'] off
137 AO1 ['G', 'AO1', 'N'] gone
142 AO1 ['B', 'R', 'AO1', 'D'] broad
143 AO1 ['K', 'AO1', 'R', 'S'] coarse
150 AO1 ['T', 'AO1', 'R', 'T', 'AH0', 'S'] tortoise
153 AO1 ['D', 'AO1', 'R'] door
155 AO1 ['IH0', 'N', 'AO1', 'R', 'M', 'AH0', 'S'] enormous
158 AO1 ['F', 'AO1', 'R'] four
164 AO1 ['K', 'AO1', 'R', 'S'] course
203 AO1 ['S', 'T', 'AO1', 'R', 'IY0'] story
223 AO1 ['K', 'AO1', 'R', 'AH0', 'S'] chorus
237 AO1 ['K', 'AO1', 'R', 'JH', 'AH0', 'L'] cordial
241 AO1 ['SH', 'AO1', 'R', 'T', 'AH0', 'N'] shorten
246 AO1 ['AO1', 'F

In [42]:
arpabet_used_in_data - arpabet_used_in_bernd_examples

{'AA2', 'OW2'}

## Get Keyboard compatible phonetic symbols

In [23]:
homophones_in_data_celex_mapped["clx"].unique()

array(['[raIt]', '[nju:]', '[mO:l]', '[weI][tIN]', '[li:st]', '[si:]',
       '[taIm]', '[hA:t]', '[saIt]', '[p@Ul]', '[h3:d]', '[sEnt]',
       '[wO:n]', '[meId]', '[m&][n@r*]', '[beIl]', '[kru:z]', '[si:n]',
       '[flu:]', '[r@Ud]', '[pA:st]', '[mO:][nIN]', '[hI@r*]', '[daId]',
       '[faInd]', '[weI]', '[hO:l]', '[r@Uz]', '[streIt]', '[naIt]',
       '[k@U][k@U]', '[si:m]', '[tSI][lI]', '[greIt]', '[reIn]', '[fIl]',
       '[pi:k]', '[si:mz]', '[fi:t]', '[seIl]', '[wi:k]', '[taId]',
       '[tSu:z]', '[sVn]', '[@][laUd]', '[dju:]', '[s@Ul]', '[hi:l]',
       '[fr&Nk]', '[rI][NIN]', '[haI][@r*]', '[hO:s]', '[h@Ul]',
       '[flaU][@z]', '[weIt]', '[kO:s]', '[t&ks]', '[daI]', '[li:]',
       '[bO:d]', '[sEnts]', '[pE@r*]', '[Su:t]', '[gEst]', '[rVf]',
       '[reIz]', '[k&][pI][tl,]', '[b@Uld]', '[wO:r*]', '[weIv]',
       '[b&nd]', '[tEk]', '[r@Ut]', '[mIst]', '[seIlz]', '[blu:]',
       '[&dz]', '[m@Ud]', '[rIN]', '[sE][l@z]', '[r&p]', '[r@Um]',
       '[h@Uld]', '[eIdz]', '[pOlz

In [125]:
berndt_character_code = pd.read_csv("Data/celex_phonetic_character_code_berndt1987.csv", delimiter=";")
berndt_conditional_probs = pd.read_csv("Data/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv",delimiter=";")

In [117]:
berndt_conditional_probs_words = get_ARPAbet_phonetic_transcription(berndt_conditional_probs.Example)
arpabet_used_in_bernd_examples = set(sum(berndt_conditional_probs_words,[]))

In [10]:
berndt_character_code

Unnamed: 0,keyboard_compatible_phonetic_symbol,CELEX,g2p(ARPAbet),DISC,Example,Note
0,ay,eI,"EY1,EY2",1,ale,
1,ae,&,"AE0,AE1,AE2",{,add,
2,ee,i:,"IY0, IY1",i,bee,
3,eh,E,"EH0,EH1,EH2",E,end,
4,er,@r*,"ER0,ER1,ER2",@R,father,
5,ai,aI,"AY0,AY1,AY2",2,high,
6,ih,I,"IH0,IH1,IH2",I,bin,
7,o,@U,"OW0,OW1,OW2",5,boat,
8,ah,O,"AA1,AA2",Q,cot,
9,aw,O,"AO1,AO2",Q,soft,AA1 in soft


In [126]:
berndt_arpabbet_dict = {}
for i,p in enumerate(berndt_character_code["g2p(ARPAbet)"]):
    if not p is np.nan:
        p = p.replace(" ","").split(",")       
        if len(p) > 1:
            for p_i in p:
                """
                if p_i in berndt_arpabbet_dict:
                    value_list = berndt_arpabbet_dict[p_i]
                    if isinstance(value_list, list): 
                        value_list += [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                    else:
                        value_list = [value_list] + [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                    
                    berndt_arpabbet_dict[p_i] = value_list
                else:
                """
                berndt_arpabbet_dict[p_i] = berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]
        else:
            """
            if p[0] in berndt_arpabbet_dict:
                value_list = berndt_arpabbet_dict[p[0]]
                if isinstance(value_list, list): 
                    value_list += [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                else:
                    value_list = [value_list] + [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                    
                berndt_arpabbet_dict[p[0]] = value_list
            else:   
            """
            berndt_arpabbet_dict[p[0]] = berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]

In [30]:
berndt_arpabbet_dict

{'EY1': 'ay',
 'EY2': 'ay',
 'AE0': 'ae',
 'AE1': 'ae',
 'AE2': 'ae',
 'IY0': 'ee',
 'IY1': 'ee',
 'EH0': 'eh',
 'EH1': 'eh',
 'EH2': 'eh',
 'ER0': 'er',
 'ER1': 'er',
 'ER2': 'er',
 'AY0': 'ai',
 'AY1': 'ai',
 'AY2': 'ai',
 'IH0': 'ih',
 'IH1': 'ih',
 'IH2': 'ih',
 'OW0': 'o',
 'OW1': 'o',
 'OW2': 'o',
 'AA1': 'ah',
 'AA2': 'ah',
 'AO1': 'aw',
 'AO2': 'aw',
 'UW0': 'oo',
 'UW1': 'oo',
 'UH1': 'u',
 'Y-UW0': 'yu',
 'Y-UW1': 'yu',
 'Y-AH0': 'yu',
 'AH1': 'uh+',
 'OY1': 'oy',
 'OY2': 'oy',
 'AW0': 'au',
 'AW1': 'au',
 'AH0': 'uh-',
 'B': 'b',
 'D': 'd',
 'F': 'f',
 'G': 'g',
 'HH': 'h',
 'JH': 'dj',
 'K': 'k',
 'L': 'l',
 'M': 'm',
 'N': 'n',
 'P': 'p',
 'R': 'r',
 'S': 's',
 'T': 't',
 'V': 'v',
 'W': 'w',
 'Y': 'y',
 'Z': 'z',
 'CH': 'tch',
 'KS': 'ks',
 'GZ': 'gz',
 'KW': 'kw',
 'AH0-L': 'ul',
 'AH0-M': 'um',
 'AH0-N': 'un',
 'NG': 'ng',
 'SH': 'sh',
 'TH': 'th-',
 'DH': 'th+',
 'ZH': 'zh'}

In [127]:
def get_keyboard_phonetic_symbols(arpabet_word, dictionary):
    skip_word = False
    keyboard_encoding = []
    for i,p in enumerate(arpabet_word):
        if skip_word == True:
            skip_word = False
        else:
            if p == 'AH0':
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] == "L":
                        keyboard_encoding.append([dictionary[p + "-" + "L"],[dictionary[p],dictionary["L"]]])
                        skip_word = True
                    elif arpabet_word[i+1] == "M":
                        keyboard_encoding.append([dictionary[p + "-" + "M"],[dictionary[p],dictionary["M"]]])
                        skip_word = True
                    elif arpabet_word[i+1] == "N":
                        keyboard_encoding.append([dictionary[p + "-" + "N"],[dictionary[p],dictionary["N"]]])
                        skip_word = True
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            elif p == "K":
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] == "S":
                        keyboard_encoding.append([dictionary[p]+dictionary["S"],[dictionary[p],dictionary["S"]]])
                        skip_word = True
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            
            elif p=="Y":
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] in ["UW0", "UW1", "AH0"]:
                        keyboard_encoding.append([dictionary[p + "-" + arpabet_word[i+1]],[dictionary[p],dictionary[arpabet_word[i+1]]]])
                        skip_word = True
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            elif p=="AO1":
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] == "R":
                        keyboard_encoding.append([dictionary[p],[dictionary["OW0"]]])
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            
            elif p=="IH1":
                keyboard_encoding.append([dictionary[p],[dictionary["IY1"]]])
            
            elif p in ["ER0", "ER1", "ER2"]:
                keyboard_encoding.append([dictionary[p],dictionary["R"],[dictionary[p],dictionary["R"]]])
                #keyboard_encoding.append([dictionary[p],[dictionary[p],dictionary["R"]]])
                
            else:
                keyboard_encoding.append(dictionary[p])
    return keyboard_encoding

In [128]:
berndt_computer_phonem_graph_prob_dict = {}
for phoneme in berndt_conditional_probs.Phoneme.unique():
    berndt_computer_phonem_graph_prob_dict[phoneme] = []

In [129]:
for i,row in berndt_conditional_probs.iterrows():
    grapheme_prior_cond = (row["Grapheme"], row['Prior_Probability'], row["Conditional_Probability"])
    berndt_computer_phonem_graph_prob_dict[row["Phoneme"]].append(grapheme_prior_cond)

In [130]:
unique_homophones = np.unique(homophones_in_data.word)
hom_phon_words = get_ARPAbet_phonetic_transcription(unique_homophones)

In [131]:
test_words = []
for i,word in enumerate(hom_phon_words):
    print(word,unique_homophones[i])
    print(get_keyboard_phonetic_symbols(word, berndt_arpabbet_dict))
    test_words.append((unique_homophones[i],get_keyboard_phonetic_symbols(word, berndt_arpabbet_dict)))

['AE1', 'D'] ad
['ae', 'd']
['AE1', 'D'] add
['ae', 'd']
['AE1', 'D', 'Z'] adds
['ae', 'd', 'z']
['AE1', 'D', 'Z'] ads
['ae', 'd', 'z']
['EY1', 'D'] aid
['ay', 'd']
['EY1', 'D'] aide
['ay', 'd']
['EY1', 'D', 'Z'] aides
['ay', 'd', 'z']
['EY1', 'D', 'Z'] aids
['ay', 'd', 'z']
['EH1', 'R', 'Z'] airs
['eh', 'r', 'z']
['AH0', 'L', 'AW1', 'D'] allowed
[['ul', ['uh-', 'l']], 'au', 'd']
['AH0', 'L', 'AW1', 'D'] aloud
[['ul', ['uh-', 'l']], 'au', 'd']
['B', 'EY1', 'L'] bail
['b', 'ay', 'l']
['B', 'EY1', 'T', 'S'] baits
['b', 'ay', 't', 's']
['B', 'AO1', 'L', 'D'] bald
['b', 'aw', 'l', 'd']
['B', 'EY1', 'L'] bale
['b', 'ay', 'l']
['B', 'AE1', 'N', 'D'] band
['b', 'ae', 'n', 'd']
['B', 'AE1', 'N', 'D'] banned
['b', 'ae', 'n', 'd']
['B', 'EH1', 'R'] bare
['b', 'eh', 'r']
['B', 'EY1', 'T', 'S'] bates
['b', 'ay', 't', 's']
['B', 'AO1', 'L', 'D'] bawled
['b', 'aw', 'l', 'd']
['B', 'EH1', 'R'] bear
['b', 'eh', 'r']
['B', 'IY1', 'T', 'S'] beats
['b', 'ee', 't', 's']
['B', 'IY1', 'T', 'S'] beets
['b', 

In [132]:
#test_words = [("soft", get_keyboard_phonetic_symbols(get_ARPAbet_phonetic_transcription(["soft"])[0], berndt_arpabbet_dict))]
#test_words = [("flower", get_keyboard_phonetic_symbols(get_ARPAbet_phonetic_transcription(["flower"])[0], berndt_arpabbet_dict))]


possible_grapheme_strings = [] # list for each word
possible_prior_probs = [] # list for each word
possible_cond_probs = [] # list for each word
word_rests = [] # list for each word

for i,word_pron in enumerate(test_words):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    print(word,pron)
    if word[0] == 'h' and pron[0] != 'h': #leading silent h         
        possible_grapheme_strings_i = [['$H']] # for each word a list of possible lists with grapheme given pronunciation strings 
        possible_prior_probs_i = [[0.0003]] # for each word a list of possible lists with corresponding prio probabilites 
        possible_cond_probs_i = [[1.000]] # for each word a list of possible lists possible corresponding conditional probabilities
        word_rests_i = [word.upper()[1:]] # for each word a list of remaining word characters after having splitted it into a list of possible graphemes  

    else:    
        possible_grapheme_strings_i = [[]] # for each word a list of possible lists with grapheme given pronunciation strings 
        possible_prior_probs_i = [[]] # for each word a list of possible lists with corresponding prio probabilites 
        possible_cond_probs_i = [[]] # for each word a list of possible lists possible corresponding conditional probabilities
        word_rests_i = [word.upper()] # for each word a list of remaining word characters after having splitted it into a list of possible graphemes  

    for j,p in enumerate(pron):  
        new_word_rests_i = [] # new rest of the word after looking at the current encoded syllable pronunciation
        new_possible_grapheme_strings_i = [] # new possible grapheme strings given the current syllable pronunciation
        new_possible_prior_probs_i = []
        new_possible_cond_probs_i = []
        if isinstance(p,list):
            for p_i in p:   
                if isinstance(p_i,list):
                    for p_ij in p_i:
                        for possible_grapheme in berndt_computer_phonem_graph_prob_dict[p_ij]: # all possible corresponding graphemes
                            grapheme = possible_grapheme[0].split("-") # account for silent e encoded by e.g. A-E
                            prior = possible_grapheme[1] # prior prob
                            cond = possible_grapheme[2] # cond prop
                            for k,word_rest in enumerate(word_rests_i): # for each possible combination we get a different word rest e.g. APPLE can have [A,P], [A,PP], [A-E,P], [A-E,PP] --> ["PLE", LE, PL, L]
                                if len(grapheme)>1: #silent E 
                                    # if word_rest.startswith(grapheme[0]) and word_rest.endswith(grapheme[1]): # check whether the grapheme fits to the rest of the word
                                     if word_rest.startswith(grapheme[0]) and grapheme[1] in word_rest[len(grapheme[0]):]:
                                        new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]]) # add the grapheme to the grapheme list which corresponds to the word ret we are currently looking at
                                        new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                        new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                        #new_word_rests_i.append(word_rest[len(grapheme[0]):-1]) # new word_rests 
                                        new_word_rests_i.append(word_rest[len(grapheme[0]):].replace(grapheme[1],"",1))
                                else:
                                    if word_rest.startswith(grapheme[0]): #no silent E
                                        new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]])
                                        new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                        new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                        new_word_rests_i.append(word_rest[len(grapheme[0]):])
                            
                        word_rests_i = new_word_rests_i #update word rests 
                        possible_grapheme_strings_i = new_possible_grapheme_strings_i #update possible grapheme strings 
                        possible_prior_probs_i = new_possible_prior_probs_i
                        possible_cond_probs_i = new_possible_cond_probs_i

                    
                else: # proceed like normal but without updating the words_rests
                    for possible_grapheme in berndt_computer_phonem_graph_prob_dict[p_i]: # all possible corresponding graphemes
                        grapheme = possible_grapheme[0].split("-") # account for silent e encoded by e.g. A-E
                        prior = possible_grapheme[1] # prior prob
                        cond = possible_grapheme[2] # cond prop

                        for k,word_rest in enumerate(word_rests_i): # for each possible combination we get a different word rest e.g. APPLE can have [A,P], [A,PP], [A-E,P], [A-E,PP] --> ["PLE", LE, PL, L]
                            if len(grapheme)>1: #silent E 
                                #if word_rest.startswith(grapheme[0]) and word_rest.endswith(grapheme[1]): # check whether the grapheme fits to the rest of the word
                                if word_rest.startswith(grapheme[0]) and grapheme[1] in word_rest[len(grapheme[0]):]:    
                                    new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]]) # add the grapheme to the grapheme list which corresponds to the word ret we are currently looking at
                                    new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                    new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                    #new_word_rests_i.append(word_rest[len(grapheme[0]):-1]) # new word_rests 
                                    new_word_rests_i.append(word_rest[len(grapheme[0]):].replace(grapheme[1],"",1))
                            else:
                                if word_rest.startswith(grapheme[0]): #no silent E
                                    new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]])
                                    new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                    new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                    new_word_rests_i.append(word_rest[len(grapheme[0]):])

                
            
        else:
            for possible_grapheme in berndt_computer_phonem_graph_prob_dict[p]: # all possible corresponding graphemes
                grapheme = possible_grapheme[0].split("-") # account for silent e encoded by e.g. A-E
                prior = possible_grapheme[1] # prior prob
                cond = possible_grapheme[2] # cond prop
                for k,word_rest in enumerate(word_rests_i): # for each possible combination we get a different word rest e.g. APPLE can have [A,P], [A,PP], [A-E,P], [A-E,PP] --> ["PLE", LE, PL, L]
                    if len(grapheme)>1: #silent E 
                        #if word_rest.startswith(grapheme[0]) and word_rest.endswith(grapheme[1]): # check whether the grapheme fits to the rest of the word
                        if word_rest.startswith(grapheme[0]) and grapheme[1] in word_rest[len(grapheme[0]):]:    
                            new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]]) # add the grapheme to the grapheme list which corresponds to the word ret we are currently looking at
                            new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                            new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                            #new_word_rests_i.append(word_rest[len(grapheme[0]):-1]) # new word_rests 
                            new_word_rests_i.append(word_rest[len(grapheme[0]):].replace(grapheme[1],"",1))

                    else:
                        if word_rest.startswith(grapheme[0]): #no silent E
                            new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]])
                            new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                            new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                            new_word_rests_i.append(word_rest[len(grapheme[0]):])


            word_rests_i = new_word_rests_i #update word rests 
            possible_grapheme_strings_i = new_possible_grapheme_strings_i #update possible grapheme strings 
            possible_prior_probs_i = new_possible_prior_probs_i
            possible_cond_probs_i = new_possible_cond_probs_i
    
        #print(possible_grapheme_strings_i,word_rests_i)
    possible_grapheme_strings.append(possible_grapheme_strings_i)
    possible_prior_probs.append(possible_prior_probs_i)
    possible_cond_probs.append(possible_cond_probs_i)
    word_rests.append(word_rests_i)

ad ['ae', 'd']
add ['ae', 'd']
adds ['ae', 'd', 'z']
ads ['ae', 'd', 'z']
aid ['ay', 'd']
aide ['ay', 'd']
aides ['ay', 'd', 'z']
aids ['ay', 'd', 'z']
airs ['eh', 'r', 'z']
allowed [['ul', ['uh-', 'l']], 'au', 'd']
aloud [['ul', ['uh-', 'l']], 'au', 'd']
bail ['b', 'ay', 'l']
baits ['b', 'ay', 't', 's']
bald ['b', 'aw', 'l', 'd']
bale ['b', 'ay', 'l']
band ['b', 'ae', 'n', 'd']
banned ['b', 'ae', 'n', 'd']
bare ['b', 'eh', 'r']
bates ['b', 'ay', 't', 's']
bawled ['b', 'aw', 'l', 'd']
bear ['b', 'eh', 'r']
beats ['b', 'ee', 't', 's']
beets ['b', 'ee', 't', 's']
bell ['b', 'eh', 'l']
belle ['b', 'eh', 'l']
berry ['b', 'eh', 'r', 'ee']
billed ['b', ['ih', ['ee']], 'l', 'd']
blew ['b', 'l', 'oo']
blue ['b', 'l', 'oo']
boar ['b', ['aw', ['o']], 'r']
board ['b', ['aw', ['o']], 'r', 'd']
bold ['b', 'o', 'l', 'd']
bore ['b', ['aw', ['o']], 'r']
bored ['b', ['aw', ['o']], 'r', 'd']
bowled ['b', 'o', 'l', 'd']
brakes ['b', 'r', 'ay', ['ks', ['k', 's']]]
bread ['b', 'r', 'eh', 'd']
breaks ['b', 

weighs ['w', 'ay', 'z']
weight ['w', 'ay', 't']
weighting ['w', 'ay', 't', 'ih', 'ng']
weights ['w', 'ay', 't', 's']
whacks ['w', 'ae', ['ks', ['k', 's']]]
whine ['w', 'ai', 'n']
whit ['w', ['ih', ['ee']], 't']
whole ['h', 'o', 'l']
wine ['w', 'ai', 'n']
wit ['w', ['ih', ['ee']], 't']
wore ['w', ['aw', ['o']], 'r']
worn ['w', ['aw', ['o']], 'r', 'n']
wrack ['r', 'ae', 'k']
wrap ['r', 'ae', 'p']
wreaks ['r', 'ee', ['ks', ['k', 's']]]
wring ['r', ['ih', ['ee']], 'ng']
wringing ['r', ['ih', ['ee']], 'ng', 'ih', 'ng']
write ['r', 'ai', 't']
writes ['r', 'ai', 't', 's']
wrote ['r', 'o', 't']


In [80]:
berndt_computer_phonem_graph_prob_dict

{'ae': [('A', 0.0712, 0.542), ('A-E', 0.0111, 0.121), ('AI', 0.0026, 0.003)],
 'uh-': [('A', 0.0712, 0.18600000000000003),
  ('A-E', 0.0111, 0.002),
  ('AI', 0.0026, 0.031),
  ('AU', 0.0014, 0.006),
  ('E', 0.073, 0.096),
  ('E-E', 0.0032, 0.28600000000000003),
  ('EA', 0.0047, 0.005),
  ('EI', 0.0005, 0.035),
  ('EO', 0.0001, 0.6659999999999999),
  ('EOU', 7e-05, 1.0),
  ('I', 0.0688, 0.18),
  ('I-E', 0.0086, 6.0),
  ('IA', 1e-05, 1.0),
  ('IE', 0.0011, 0.171),
  ('IE-E', 0.0002, 0.129),
  ('O', 0.055, 0.26899999999999996),
  ('O-E', 0.0043, 0.044000000000000004),
  ('OI-E', 9e-05, 0.2),
  ('OU', 0.0064, 0.48),
  ('U', 0.0267, 0.102),
  ('U-E', 0.0033, 0.01),
  ('Y', 0.0193, 0.01)],
 'ay': [('A', 0.0712, 0.129),
  ('A-E', 0.0111, 0.6509999999999999),
  ('AI', 0.0026, 0.7340000000000001),
  ('AI-E', 0.0002, 0.818),
  ('AIGH', 2.9999999999999997e-05, 1.0),
  ('AU-E', 0.0001, 0.083),
  ('AY', 0.0012, 0.97),
  ('AY-E', 9e-06, 1.0),
  ('E', 0.073, 0.002),
  ('E-E', 0.0032, 0.017),
  ('EA',

In [133]:
counter = 0 
for i,word_pron in enumerate(test_words):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(possible_grapheme_strings[i]) == 0:
        counter+=1
        print(word,pron)
        print(possible_grapheme_strings[i])
        print(possible_prior_probs[i])
        print(possible_cond_probs[i])
        print(word_rests[i])

guessed ['g', 'eh', 's', 't']
[]
[]
[]
[]
guest ['g', 'eh', 's', 't']
[]
[]
[]
[]
guise ['g', 'ai', 'z']
[]
[]
[]
[]
thai ['t', 'ai']
[]
[]
[]
[]
thais ['t', 'ai', 'z']
[]
[]
[]
[]
weighed ['w', 'ay', 'd']
[]
[]
[]
[]


In [31]:
print(test_words[399])
print(possible_grapheme_strings[399])
print(word_rests[399])

('wrap', ['r', 'ae', 'p'])
[['WR', 'A', 'P']]
['']


In [64]:
len(test_words)

406

In [19]:
homophones_in_data.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'letter_length', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max'],
      dtype='object')

In [26]:
homophones_in_data.groupby("word").first()["has_pair"].to_dict()

{'ad': True,
 'add': True,
 'adds': True,
 'ads': True,
 'aid': True,
 'aide': True,
 'aides': True,
 'aids': True,
 'airs': True,
 'allowed': True,
 'aloud': True,
 'bail': True,
 'baits': True,
 'bald': True,
 'bale': True,
 'band': True,
 'banned': True,
 'bare': True,
 'bates': True,
 'bawled': True,
 'bear': True,
 'beats': True,
 'beets': True,
 'bell': True,
 'belle': True,
 'berry': True,
 'billed': True,
 'blew': True,
 'blue': True,
 'boar': True,
 'board': True,
 'bold': True,
 'bore': True,
 'bored': True,
 'bowled': True,
 'brakes': True,
 'bread': True,
 'breaks': True,
 'bred': True,
 'build': True,
 'bury': True,
 'callous': True,
 'callus': True,
 'capital': True,
 'capitol': True,
 'ceiling': True,
 'cell': True,
 'cellar': True,
 'cellars': True,
 'cells': True,
 'cent': True,
 'cents': True,
 'cereal': True,
 'chews': True,
 'chile': True,
 'chilly': True,
 'choose': True,
 'chord': True,
 'chords': True,
 'chute': True,
 'chutes': True,
 'coarse': True,
 'coco': Tr

In [32]:
test_words

[('ad', ['ae', 'd']),
 ('add', ['ae', 'd']),
 ('adds', ['ae', 'd', 'z']),
 ('ads', ['ae', 'd', 'z']),
 ('aid', ['ay', 'd']),
 ('aide', ['ay', 'd']),
 ('aides', ['ay', 'd', 'z']),
 ('aids', ['ay', 'd', 'z']),
 ('airs', ['eh', 'r', 'z']),
 ('allowed', [['ul', ['uh-', 'l']], 'au', 'd']),
 ('aloud', [['ul', ['uh-', 'l']], 'au', 'd']),
 ('bail', ['b', 'ay', 'l']),
 ('baits', ['b', 'ay', 't', 's']),
 ('bald', ['b', 'aw', 'l', 'd']),
 ('bale', ['b', 'ay', 'l']),
 ('band', ['b', 'ae', 'n', 'd']),
 ('banned', ['b', 'ae', 'n', 'd']),
 ('bare', ['b', 'eh', 'r']),
 ('bates', ['b', 'ay', 't', 's']),
 ('bawled', ['b', 'aw', 'l', 'd']),
 ('bear', ['b', 'eh', 'r']),
 ('beats', ['b', 'ee', 't', 's']),
 ('beets', ['b', 'ee', 't', 's']),
 ('bell', ['b', 'eh', 'l']),
 ('belle', ['b', 'eh', 'l']),
 ('berry', ['b', 'eh', 'r', 'ee']),
 ('billed', ['b', ['ih', ['ee']], 'l', 'd']),
 ('blew', ['b', 'l', 'oo']),
 ('blue', ['b', 'l', 'oo']),
 ('boar', ['b', ['aw', ['o']], 'r']),
 ('board', ['b', ['aw', ['o']], 'r

In [40]:
possible_grapheme_strings[1]
word_rests[1]

['D', '']

In [64]:
print(len(test_words))
print(len(possible_grapheme_strings))

406
406


In [134]:
berndt_conditional_probs

Unnamed: 0,Grapheme,Prior_Probability,Phoneme,Conditional_Probability,Example,Unnamed: 5
0,A,0.0712,ae,0.542,cab,vowel
1,A,0.0712,uh-,0.186,canal,vowel
2,A,0.0712,ay,0.129,angel,vowel
3,A,0.0712,ah,0.077,wad,vowel
4,A,0.0712,aw,0.021,also,vowel
...,...,...,...,...,...,...
340,Z,0.0021,z,0.996,zoo,consonant
341,Z,0.0021,s,0.025,waltz,consonant
342,Z,0.0021,zh,0.008,azure,consonant
343,ZZ,0.0002,z,1.000,buzz,consonant


In [None]:
homophones_in_data.groupby("word").first()["has_pair"].to_dict()

In [135]:
def get_valid_grapheme_strings(test_words, possible_grapheme_strings, word_rests,possible_prior_probs, possible_cond_probs):
    valid_word_rests = []
    valid_grapheme_strings = []
    valid_prior_probs = []
    valid_cond_probs = []


    for i,word_tuple in enumerate(test_words):
        word = word_tuple[0]
        phon = word_tuple[1]

        word_rests_i = np.asarray(word_rests[i])
        grapheme_strings_i = np.asarray(possible_grapheme_strings[i])
        prio_probs_i = np.asarray(possible_prior_probs[i])
        cond_probs_i = np.asarray(possible_cond_probs[i])

        if len(word_rests_i) == 0:
            not_empty_word_rests_idx = []
        else:
            not_empty_word_rests_idx = np.where(word_rests_i != '')[0]


        if len(not_empty_word_rests_idx)>0:
            valid_word_rests.append(np.delete(word_rests_i,not_empty_word_rests_idx,axis=0))
            valid_grapheme_strings.append(np.delete(grapheme_strings_i,not_empty_word_rests_idx,axis=0))
            valid_prior_probs.append(np.delete(prio_probs_i,not_empty_word_rests_idx,axis=0))
            valid_cond_probs.append(np.delete(cond_probs_i,not_empty_word_rests_idx,axis=0))
        else:
            valid_word_rests.append(word_rests_i)
            valid_grapheme_strings.append(grapheme_strings_i)
            valid_prior_probs.append(prio_probs_i)
            valid_cond_probs.append(cond_probs_i)
    
    return valid_word_rests,valid_grapheme_strings,valid_prior_probs, valid_cond_probs


In [136]:
valid_word_rests,valid_grapheme_strings,valid_prior_probs, valid_cond_probs = get_valid_grapheme_strings(test_words, possible_grapheme_strings, word_rests,possible_prior_probs, possible_cond_probs)

In [137]:
max_cond_prob_for_grapheme = berndt_conditional_probs.groupby("Grapheme").max()["Conditional_Probability"].to_dict()

In [138]:
max_cond_prob_for_grapheme

{'$H': 1.0,
 'A': 0.542,
 'A-E': 0.6509999999999999,
 'AE': 0.833,
 'AH': 1.0,
 'AI': 0.7340000000000001,
 'AI-E': 0.818,
 'AIGH': 1.0,
 'AL': 1.0,
 'AO': 1.0,
 'AU': 0.948,
 'AU-E': 0.75,
 'AUGH': 1.0,
 'AW': 1.0,
 'AW-E': 1.0,
 'AY': 0.97,
 'AY-E': 1.0,
 'B': 1.0,
 'BB': 1.0,
 'BT': 1.0,
 'C': 0.757,
 'CCH': 1.0,
 'CE': 1.0,
 'CH': 0.64,
 'CHT': 1.0,
 'CK': 1.0,
 'CQ': 1.0,
 'CQU': 1.0,
 'CS': 1.0,
 'CT': 1.0,
 'CZ': 1.0,
 'Cl': 1.0,
 'D': 0.991,
 'DD': 1.0,
 'DG': 1.0,
 'DI': 1.0,
 'DJ': 1.0,
 'E': 0.419,
 'E-E': 0.321,
 'EA': 0.5760000000000001,
 'EA-E': 0.882,
 'EAU': 0.545,
 'ED': 1.0,
 'EE': 0.9790000000000001,
 'EE-E': 1.0,
 'EI': 0.315,
 'EI-E': 0.75,
 'EIGH': 0.857,
 'EL': 1.0,
 'EN': 1.0,
 'EO': 0.6659999999999999,
 'EOU': 1.0,
 'ES': 1.0,
 'ET': 1.0,
 'EU': 0.7170000000000001,
 'EU-E': 1.0,
 'EW': 0.603,
 'EW-E': 1.0,
 'EY': 0.741,
 'EY-E': 1.0,
 'F': 0.998,
 'FF': 1.0,
 'FT': 1.0,
 'G': 0.64,
 'GG': 0.971,
 'GH': 0.555,
 'GM': 1.0,
 'GN': 1.0,
 'GUE': 1.0,
 'Gl': 1.0,
 'H'

In [91]:
np.asarray([1,2,3])/np.asarray([1,2,3])

array([1., 1., 1.])

In [139]:
def calculate_m_scores(valid_grapheme_strings,valid_cond_probs):
    m_scores = []
    for i, grapheme_strings_i in enumerate(valid_grapheme_strings):
        m_score_i = []
        for j, possible_string_ij in enumerate(grapheme_strings_i):
            cond_prob = np.asarray(valid_cond_probs[i][j])
            most_prob = np.asarray([max_cond_prob_for_grapheme[g] for g in possible_string_ij])

            m_score_i.append(np.mean(cond_prob/most_prob))

        m_scores.append(m_score_i)
    return m_scores

In [140]:
m_scores = calculate_m_scores(valid_grapheme_strings,valid_cond_probs)

In [141]:
m_scores

[[1.0],
 [1.0],
 [0.7127496159754224],
 [0.7127496159754224],
 [1.0],
 [1.0],
 [1.0, 0.7127496159754224],
 [0.7127496159754224],
 [0.4593436214250137],
 [1.0, 0.8686346863468636, 0.8357933579335793],
 [0.8916666666666666, 0.7545433579335794],
 [1.0],
 [1.0],
 [0.7596863468634687],
 [1.0],
 [1.0],
 [0.7964669738863287],
 [0.6881720430107526],
 [1.0],
 [1.0],
 [0.8391203703703703],
 [1.0],
 [0.9462616822429907, 1.0],
 [1.0],
 [1.0],
 [1.0],
 [0.839, 0.7615000000000001, 0.839, 0.7615000000000001],
 [0.8595909342177999],
 [0.7107633949739213, 0.8644338118022329],
 [0.690246516613076, 1.0],
 [0.767684887459807, 1.0],
 [1.0],
 [0.6853503184713375, 1.0],
 [0.7640127388535032, 1.0],
 [0.875],
 [1.0],
 [0.8793402777777778],
 [0.809375],
 [1.0],
 [1.0],
 [0.7503597122302158],
 [1.0],
 [0.8489208633093526],
 [0.8752327746741155, 0.7992242971112873],
 [1.0, 0.9795268425841674],
 [0.8618229854689563],
 [0.769704975781594],
 [0.6695720629597313, 0.6695720629597313],
 [0.5810181937874873, 0.581018193

In [143]:
for i,m_i in enumerate(m_scores):
    if len(m_i)>1:
        print(test_words[i])
        print(valid_grapheme_strings[i])
        print(valid_cond_probs[i])
        print(m_i)
        print("\n")

('aides', ['ay', 'd', 'z'])
[['AI' 'D' 'ES']
 ['AI-E' 'D' 'S']]
[[0.734 0.991 1.   ]
 [0.818 0.991 0.12 ]]
[1.0, 0.7127496159754224]


('allowed', [['ul', ['uh-', 'l']], 'au', 'd'])
[list(['AL', 'L', 'OW-E', 'D']) list(['A', 'L', 'L', 'OW-E', 'D'])
 list(['A', 'LL', 'OW-E', 'D'])]
[list([1.0, 1.0, 0.6659999999999999, 0.991])
 list([0.18600000000000003, 1.0, 1.0, 0.6659999999999999, 0.991])
 list([0.18600000000000003, 1.0, 0.6659999999999999, 0.991])]
[1.0, 0.8686346863468636, 0.8357933579335793]


('aloud', [['ul', ['uh-', 'l']], 'au', 'd'])
[list(['AL', 'OU', 'D']) list(['A', 'L', 'OU', 'D'])]
[list([1.0, 0.324, 0.991]) list([0.18600000000000003, 1.0, 0.324, 0.991])]
[0.8916666666666666, 0.7545433579335794]


('beets', ['b', 'ee', 't', 's'])
[['B' 'E-E' 'T' 'S']
 ['B' 'EE' 'T' 'S']]
[[1.    0.252 0.973 0.868]
 [1.    0.979 0.973 0.868]]
[0.9462616822429907, 1.0]


('billed', ['b', ['ih', ['ee']], 'l', 'd'])
[['B' 'I-E' 'LL' 'D']
 ['B' 'I-E' 'LL' 'D']
 ['B' 'I-E' 'L' 'LD']
 ['B' 'I-E' 

In [154]:
m_score_dict = {}
for i, word_tuple in enumerate(test_words):
    word = word_tuple[0]
    phon = word_tuple[1]
          
    if len(m_scores[i])>0:
        m_score_dict[word] = np.mean(m_scores[i])
    else:
        m_score_dict[word] = np.nan

('chute', ['sh', 'oo', 't'])
[0.413367561640588]
[['CH' 'U-E' 'T']]
[[0.069 0.093 0.973]]


In [156]:
m_score_dict

{'ad': 1.0,
 'add': 1.0,
 'adds': 0.7127496159754224,
 'ads': 0.7127496159754224,
 'aid': 1.0,
 'aide': 1.0,
 'aides': 0.8563748079877112,
 'aids': 0.7127496159754224,
 'airs': 0.4593436214250137,
 'allowed': 0.9014760147601476,
 'aloud': 0.823105012300123,
 'bail': 1.0,
 'baits': 1.0,
 'bald': 0.7596863468634687,
 'bale': 1.0,
 'band': 1.0,
 'banned': 0.7964669738863287,
 'bare': 0.6881720430107526,
 'bates': 1.0,
 'bawled': 1.0,
 'bear': 0.8391203703703703,
 'beats': 1.0,
 'beets': 0.9731308411214954,
 'bell': 1.0,
 'belle': 1.0,
 'berry': 1.0,
 'billed': 0.8002499999999999,
 'blew': 0.8595909342177999,
 'blue': 0.7875986033880771,
 'boar': 0.845123258306538,
 'board': 0.8838424437299035,
 'bold': 1.0,
 'bore': 0.8426751592356687,
 'bored': 0.8820063694267516,
 'bowled': 0.875,
 'brakes': 1.0,
 'bread': 0.8793402777777778,
 'breaks': 0.809375,
 'bred': 1.0,
 'build': 1.0,
 'bury': 0.7503597122302158,
 'callous': 1.0,
 'callus': 0.8489208633093526,
 'capital': 0.8372285358927014,
 'ca

In [160]:
m_scores_df = pd.DataFrame({"word":list(m_score_dict.keys()),"m-score" : list(m_score_dict.values())})

In [161]:
m_scores_df

Unnamed: 0,word,m-score
0,ad,1.000000
1,add,1.000000
2,adds,0.712750
3,ads,0.712750
4,aid,1.000000
...,...,...
401,wring,0.834497
402,wringing,0.900698
403,write,0.863000
404,writes,0.897250


In [162]:
homophones_with_mscore = homophones_in_data_celex_mapped.merge(m_scores_df, on="word")

In [163]:
homophones_in_data_celex_mapped

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,cond_pred_next,has_pair,pron,celexPhon,pron_frequency,is_max,disc,clx,disc_no_bound,clx_no_bound
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.070000,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.017207,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.320000,38.540000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.660000,139.880000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.000264,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.530000,277.750000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.939999,415.159999,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.660000,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.150000,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.980000,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.000002,True,sp1d,sp1d,13,1,'sp1d,[speId],sp1d,speId
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.059999,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.001192,True,plVm,plVm,23,1,'plVm,[plVm],plVm,plVm


In [164]:
homophones_with_mscore

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,has_pair,pron,celexPhon,pron_frequency,is_max,disc,clx,disc_no_bound,clx_no_bound,m-score
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.070000,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt,1.000000
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.320000,38.540000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt,1.000000
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.660000,139.880000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt,1.000000
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.530000,277.750000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt,1.000000
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.939999,415.159999,0.22,high-confidence,no-error,no-error,no-error,no-error,...,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.660000,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk,0.806618
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.150000,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk,0.806618
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.980000,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,True,sp1d,sp1d,13,1,'sp1d,[speId],sp1d,speId,1.000000
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.059999,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,True,plVm,plVm,23,1,'plVm,[plVm],plVm,plVm,1.000000
