In [1]:
import preprocessing
import pandas as pd
import numpy as np
import importlib
from g2p_en import G2p # https://github.com/Kyubyong/g2p

In [2]:
celex_dict_file = "Data/epw.cd"
filename = "Data/2016_all_words_no_audio.pickle"
hom_filename = "Data/hom.csv"

In [3]:
df = preprocessing.read_dataframe(filename, remove_pauses=True, remove_errors=True, preprocessing=True, drop_error_columns=False)

read dataframe from Data/2016_all_words_no_audio.pickle
Preprocessing: extract pause information...
Remove pauses from data!
Preprocessing: apply word preprocessing...
Preprocessing: calculate word duration...
Preprocessing: calculate word frequency...
Preprocessing: extract context information...
Preprocessing: calculate letter length...
Preprocessing: calculate contextual predictability...
(18864660, 25) RangeIndex(start=0, stop=18864660, step=1)


In [12]:
source_files = ["2016-12-17_1330_US_KCET_Asia_Insight", "2016-10-25_2300_US_KABC_Eyewitness_News_4PM"]

In [13]:
sub_df = pd.read_csv("sub_df.csv", index_col="Unnamed: 0")
sub_df

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,prev_word_frequency,next_word,next_word_frequency,letter_length,prev_word_string,next_word_string,prev_word_string_frequency,next_word_string_frequency,cond_pred_prev,cond_pred_next
14828820,2016-10-25_2300_US_KABC_Eyewitness_News_4PM,police,0.29,0.67,0.38,high-confidence,no-error,no-error,no-error,no-error,...,,don't,32647.0,6,,police-don't,,37.0,,0.001133
14828821,2016-10-25_2300_US_KABC_Eyewitness_News_4PM,don't,0.67,0.88,0.21,high-confidence,no-error,no-error,no-error,no-error,...,18598.0,believe,9847.0,5,police-don't,don't-believe,37.0,540.0,0.001989,0.054839
14828822,2016-10-25_2300_US_KABC_Eyewitness_News_4PM,believe,0.88,1.21,0.33,high-confidence,no-error,no-error,no-error,no-error,...,32647.0,the,932396.0,7,don't-believe,believe-the,540.0,793.0,0.016541,0.000850
14828823,2016-10-25_2300_US_KABC_Eyewitness_News_4PM,the,1.22,1.40,0.18,high-confidence,no-error,no-error,no-error,no-error,...,9847.0,mother,3407.0,3,believe-the,the-mother,793.0,490.0,0.080532,0.143822
14828824,2016-10-25_2300_US_KABC_Eyewitness_News_4PM,mother,1.40,1.79,0.39,high-confidence,no-error,no-error,no-error,no-error,...,932396.0,or,57737.0,6,the-mother,mother-or,490.0,21.0,0.000526,0.000364
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
17805800,2016-12-17_1330_US_KCET_Asia_Insight,2017,1657.55,1657.71,0.16,high-confidence,no-error,no-error,no-error,no-error,...,335519.0,as,89095.0,4,in-2017,2017-as,151.0,15.0,0.000450,0.000168
17805801,2016-12-17_1330_US_KCET_Asia_Insight,as,1659.25,1659.49,0.24,high-confidence,no-error,no-error,no-error,no-error,...,597.0,the,932396.0,2,2017-as,as-the,15.0,6909.0,0.025126,0.007410
17805802,2016-12-17_1330_US_KCET_Asia_Insight,the,1659.50,1659.65,0.15,high-confidence,no-error,no-error,no-error,no-error,...,89095.0,world's,1593.0,3,as-the,the-world's,6909.0,1265.0,0.077546,0.794099
17805803,2016-12-17_1330_US_KCET_Asia_Insight,world's,1659.65,1660.25,0.60,high-confidence,no-error,no-error,no-error,no-error,...,932396.0,largest,1971.0,7,the-world's,world's-largest,1265.0,218.0,0.001357,0.110604


In [4]:
homophones_in_data, gahls_homophones, gahls_homophones_missing_in_data = preprocessing.read_and_extract_homophones(hom_filename, df)

read Gahls Homophone data from Data/hom.csv
406 out of 412 homophones found in Data:
Homophone Pairs found in Data: 200
Homophones without Pair:  ['flowers', 'holes', 'moose', 'naval', 'pairs', 'taught']
Missing homophones: ['flours' 'mousse' 'navel' 'pears' 'taut' 'wholes']


In [9]:
gahls_homophones.columns

Index(['spell', 'pron', 'lgPronCelFq', 'logCelFq', 'logAvgDur', 'stem',
       'is_complex', 'celexPhon', 'phonNeighCount', 'NearestSemNeighCor',
       'MeanCorTop20', 'AvCor', 'MedianCor', 'MeanCorTop20Unrel',
       'CossinTwinsStem', 'CossinTwinsFull', 'L2Ldiag', 'EuclidDistTwins',
       'SL1norm', 'CorrectLDLpred', 'SumChatWord', 'MinChatWord', 'L1ChatWord',
       'CorPredWord', 'LWLinkRatioWord', 'RankProd'],
      dtype='object')

In [5]:
def get_ARPAbet_phonetic_transcription(word_list):
    g2p = G2p()
    arpabet_word_list = []
    for word in word_list:
        transcription = g2p(word)
        arpabet_word_list.append(transcription)

    return arpabet_word_list


def get_english_phonology_from_celex(filename):
    phonology_dict = {"word":[], "disc":[], "clx":[]}
    with open(filename) as f:
        for line in f:
            line = line.strip().split("\\")
            word = line[1] # the word
            phonology_dict["word"].append(word)
            disc = line[6] # pronunciation in DISC notation, hyphens to mark syllable boundaries, inverted comma for primary stress and double quote for secondary stress (PhonStrsDISC)
            phonology_dict["disc"].append(disc)
            clx = line[8] # pronunciation in CELEX notation, with brackets (PhonSylBCLX)
            phonology_dict["clx"].append(clx)

    celex_phonology_dict = pd.DataFrame.from_dict(phonology_dict).drop_duplicates()
    celex_phonology_dict["disc_no_bound"] = celex_phonology_dict["disc"].apply(
        lambda x: x.replace("'", "").replace("-", ""))
    celex_phonology_dict["clx_no_bound"] = celex_phonology_dict["clx"].apply(
        lambda x: x.replace("[", "").replace("]", ""))
    return celex_phonology_dict



def get_celex_transcription(df, celex_phonology_dict):

    return df.merge(celex_phonology_dict[["word", "disc", "clx", "disc_no_bound", "clx_no_bound"]], how = "left", left_on=["word", "celexPhon"], right_on=["word","disc_no_bound"])


In [6]:
celex_phonology_dict = get_english_phonology_from_celex(celex_dict_file)

In [13]:
celex_phonology_dict

Unnamed: 0,word,disc,clx,disc_no_bound,clx_no_bound
0,a,'1,[eI],1,eI
2,A,'1,[eI],1,eI
4,AA,"""1-'1",[eI][eI],"""11",eIeI
6,AAs,"""1-'1z",[eI][eIz],"""11z",eIeIz
7,abaci,'{-b@-s2,[&][b@][saI],{b@s2,&b@saI
...,...,...,...,...,...
100620,Zouave,zu-'#v,[zu:][A:v],zu#v,zu:A:v
100621,Zouaves,zu-'#vz,[zu:][A:vz],zu#vz,zu:A:vz
100622,z's,'zEdz,[zEdz],zEdz,zEdz
100623,zucchini,zU-'ki-nI,[zU][ki:][nI],zUkinI,zUki:nI


In [7]:
homophones_in_data_celex_mapped = get_celex_transcription(homophones_in_data,celex_phonology_dict)

In [15]:
homophones_in_data_celex_mapped

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,cond_pred_next,has_pair,pron,celexPhon,pron_frequency,is_max,disc,clx,disc_no_bound,clx_no_bound
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.070000,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.017207,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.320000,38.540000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.660000,139.880000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.000264,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.530000,277.750000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.939999,415.159999,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.660000,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.150000,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.980000,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.000002,True,sp1d,sp1d,13,1,'sp1d,[speId],sp1d,speId
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.059999,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.001192,True,plVm,plVm,23,1,'plVm,[plVm],plVm,plVm


In [8]:
berndt_character_code = pd.read_csv("Data/celex_phonetic_character_code_berndt1987.csv", delimiter=";")
berndt_conditional_probs = pd.read_csv("Data/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv",delimiter=";")

In [17]:
berndt_character_code

Unnamed: 0,keyboard_compatible_phonetic_symbol,CELEX,g2p(ARPAbet),DISC,Example,Note
0,ay,eI,"EY1,EY2",1,ale,
1,ae,&,"AE0,AE1,AE2",{,add,
2,ee,i:,"IY0, IY1",i,bee,
3,eh,E,"EH0,EH1,EH2",E,end,
4,er,@r*,"ER0,ER1,ER2",@R,father,
5,ai,aI,"AY0,AY1,AY2",2,high,
6,ih,I,"IH0,IH1,IH2",I,bin,
7,o,@U,"OW0,OW1,OW2",5,boat,
8,ah,O,"AA1,AA2",Q,cot,
9,aw,O,"AO1,AO2",Q,soft,AA1 in soft


In [39]:
homophones_in_data_celex_mapped.word

0          right
1          right
2          right
3          right
4          right
           ...  
530336     franc
530337     franc
530338    spayed
530339     plumb
530340     reeks
Name: word, Length: 530341, dtype: object

In [18]:
#arpabet_encoded_words = get_ARPAbet_phonetic_transcription(homophones_in_data_celex_mapped.word)
#arpabet_used_in_data = set(sum(arpabet_encoded_words,[]))

disc_encoded_words = list(pd.Series(homophones_in_data_celex_mapped.disc.unique()).dropna().str.replace("'","").str.split("-"))
disc_used_in_data = set(sum(disc_encoded_words,[]))

#clx_encoded_words = list(homophones_in_data_celex_mapped.clx[pd.notnull(homophones_in_data_celex_mapped.clx)].str.replace("[","").str.split("]"))
#clx_used_in_data = set(filter(lambda x: x != "",sum(clx_encoded_words,[])))

In [34]:
pd.Series(homophones_in_data_celex_mapped.disc.unique()).dropna()

0         'r2t
1         'nju
2         'm$l
3      'w1-tIN
4        'list
        ...   
204      'k$dz
205       'mus
206      'sp1d
207      's{ks
208      'riks
Length: 208, dtype: object

In [137]:
problematic_words = []

for word in list(pd.Series(homophones_in_data_celex_mapped.disc.unique()).dropna()):
    if any(char in word for char in ['d','t']):#['#', '$', '3', '7', '8', 'q', "Q"]):  
        problematic_words.append((word,homophones_in_data_celex_mapped[homophones_in_data_celex_mapped.disc == word].word.iloc[0]))
print(problematic_words)

[("'r2t", 'right'), ("'w1-tIN", 'waiting'), ("'list", 'least'), ("'t2m", 'time'), ("'h#t", 'heart'), ("'s2t", 'site'), ("'h3d", 'heard'), ("'sEnt", 'sent'), ("'m1d", 'made'), ("'r5d", 'road'), ("'p#st", 'past'), ("'d2d", 'died'), ("'f2nd", 'find'), ("'str1t", 'straight'), ("'n2t", 'night'), ("'gr1t", 'great'), ("'fit", 'feet'), ("'t2d", 'tied'), ("@-'l6d", 'allowed'), ("'dju", 'due'), ("'w1t", 'wait'), ("'t{ks", 'tax'), ("'d2", 'die'), ("'b$d", 'board'), ("'sEnts", 'cents'), ("'Sut", 'shoot'), ("'gEst", 'guessed'), ("'k{-pI-tP", 'capital'), ("'b5ld", 'bold'), ("'b{nd", 'banned'), ("'tEk", 'tech'), ("'r5t", 'wrote'), ("'mIst", 'missed'), ("'{dz", 'adds'), ("'m5d", 'mode'), ("'h5ld", 'hold'), ("'1dz", 'aides'), ("'p1st", 'paste'), ("'dQk", 'dock'), ("'t$t", 'taught'), ("'t2z", 'ties'), ("'t2", 'tie'), ("'Suts", 'shoots'), ("'r2ts", 'rights'), ("'mit", 'meet'), ("'n2ts", 'nights'), ("'ni-dIN", 'needing'), ("'d7R", 'dear'), ("'p{kt", 'packed'), ("'pVts", 'puts'), ("'1d", 'aide'), ("'k$d", 

In [80]:
print(get_ARPAbet_phonetic_transcription(["fiancee"]))
print(get_ARPAbet_phonetic_transcription(["detente"]))
print(get_ARPAbet_phonetic_transcription(["fiancé"]))
print(get_ARPAbet_phonetic_transcription(["boston"]))
print(get_ARPAbet_phonetic_transcription(["chocolate"]))
print(get_ARPAbet_phonetic_transcription(["borrow"]))

[['F', 'IY0', 'AE1', 'N', 'S', 'IY0']]
[['D', 'EY0', 'T', 'AA1', 'N', 'T']]
[['F', 'IY0', 'AA1', 'N', 'S', 'EY2']]
[['B', 'AA1', 'S', 'T', 'AH0', 'N']]
[['CH', 'AO1', 'K', 'L', 'AH0', 'T']]
[['B', 'AA1', 'R', 'OW2']]


In [70]:
print(get_ARPAbet_phonetic_transcription(["pair"]))
print(get_ARPAbet_phonetic_transcription(["pairs"]))
print(get_ARPAbet_phonetic_transcription(["flair"]))
print(get_ARPAbet_phonetic_transcription(["fair"]))
print(get_ARPAbet_phonetic_transcription(["bear"]))
print(get_ARPAbet_phonetic_transcription(["airs"]))

[['P', 'EH1', 'R']]
[['P', 'EH1', 'R', 'Z']]
[['F', 'L', 'EH1', 'R']]
[['F', 'EH1', 'R']]
[['B', 'EH1', 'R']]
[['EH1', 'R', 'Z']]


In [68]:
print(get_ARPAbet_phonetic_transcription(["here"]))
print(get_ARPAbet_phonetic_transcription(["sheer"]))
print(get_ARPAbet_phonetic_transcription(["dear"]))
print(get_ARPAbet_phonetic_transcription(["pier"]))
print(get_ARPAbet_phonetic_transcription(["serial"]))
print(get_ARPAbet_phonetic_transcription(["peers"]))
print(get_ARPAbet_phonetic_transcription(["bin"]))

[['HH', 'IY1', 'R']]
[['SH', 'IH1', 'R']]
[['D', 'IH1', 'R']]
[['P', 'IH1', 'R']]
[['S', 'IH1', 'R', 'IY0', 'AH0', 'L']]
[['P', 'IH1', 'R', 'Z']]
[['B', 'IH1', 'N']]


In [61]:
print(get_ARPAbet_phonetic_transcription(["heart"]))
print(get_ARPAbet_phonetic_transcription(["past"]))
print(get_ARPAbet_phonetic_transcription(["marks"]))
print(get_ARPAbet_phonetic_transcription(["barn"]))
print(get_ARPAbet_phonetic_transcription(["cot"]))

[['HH', 'AA1', 'R', 'T']]
[['P', 'AE1', 'S', 'T']]
[['M', 'AA1', 'R', 'K', 'S']]
[['B', 'AA1', 'R', 'N']]
[['K', 'AA1', 'T']]
[['P', 'OW1', 'L', 'Z']]
[['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'AH0', 'S']]
[['D', 'AA1', 'K']]


In [81]:
print(get_ARPAbet_phonetic_transcription(["polls"]))
print(get_ARPAbet_phonetic_transcription(["populous"]))
print(get_ARPAbet_phonetic_transcription(["dock"]))
print(get_ARPAbet_phonetic_transcription(["over"]))

[['P', 'OW1', 'L', 'Z']]
[['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'AH0', 'S']]
[['D', 'AA1', 'K']]
[['OW1', 'V', 'ER0']]


In [51]:
print(get_ARPAbet_phonetic_transcription(["war"]))
print(get_ARPAbet_phonetic_transcription(["mall"]))
print(get_ARPAbet_phonetic_transcription(["hall"]))
print(get_ARPAbet_phonetic_transcription(["course"]))
print(get_ARPAbet_phonetic_transcription(["halls"]))
print(get_ARPAbet_phonetic_transcription(["cords"]))
print(get_ARPAbet_phonetic_transcription(["bore"]))

[['W', 'AO1', 'R']]
[['M', 'AO1', 'L']]
[['HH', 'AO1', 'L']]
[['K', 'AO1', 'R', 'S']]
[['HH', 'AO1', 'L', 'Z']]
[['K', 'AO1', 'R', 'D', 'Z']]
[['B', 'AO1', 'R']]


In [10]:
arpabet_used_in_data - set(berndt_character_code["g2p(ARPAbet)"])

{'AA2', 'AO1', 'EH0', 'ER1', 'EY2', 'IH0', 'IY0', 'OW0', 'OW2'}

In [20]:
disc_characters_used_in_data = set(''.join(list(disc_used_in_data)))
disc_characters_for_berndts_encoding = set(''.join([str(i) for i in list(set(berndt_character_code.DISC))]))

In [24]:
clx_characters_used_in_data = set(''.join(list(clx_used_in_data)))
clx_characters_for_berndts_encoding = set(''.join([str(i) for i in list(set(berndt_character_code.CELEX))]))

NameError: name 'clx_used_in_data' is not defined

In [21]:
disc_characters_used_in_data - disc_characters_for_berndts_encoding

{'#', '$', '3', '7', '8', 'q'}

In [170]:
clx_characters_used_in_data - clx_characters_for_berndts_encoding

{'A'}

In [173]:
for i,word in enumerate(clx_encoded_words):
    for j in word:
        if any(x in j for x in ['A']):
            print("Missing:", j, word, homophones_in_data_celex_mapped.word[pd.notnull(homophones_in_data_celex_mapped.clx)].iloc[i])

Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] past
Missing: pA:st ['pA:st', ''] passed
Missing: pA:st ['pA:st', ''] passed


In [10]:
berndt_conditional_probs_words = get_ARPAbet_phonetic_transcription(berndt_conditional_probs.Example)
arpabet_used_in_bernd_examples = set(sum(berndt_conditional_probs_words,[]))

In [30]:
for i,word in enumerate(berndt_conditional_probs_words):
    for j in word:
        if j in ["AO1"]:
            print(i,j, word, berndt_conditional_probs.Example.iloc[i])

4 AO1 ['AO1', 'L', 'S', 'OW0'] also
11 AO1 ['AO1', 'R', 'AH0', 'N', 'JH'] orange
13 AO1 ['F', 'AO1', 'L', 'S'] false
27 AO1 ['IH0', 'K', 'S', 'T', 'R', 'AO1', 'R', 'D', 'AH0', 'N', 'EH2', 'R', 'IY0'] extraordinary
28 AO1 ['F', 'AO1', 'S', 'AH0', 'T'] faucet
30 AO1 ['T', 'AO1', 'P'] taupe 
31 AO1 ['IH0', 'P', 'AO1', 'L', 'AH0', 'T'] epaulet
33 AO1 ['M', 'AO1', 'V'] mauve
36 AO1 ['L', 'AO1'] law
73 AO1 ['F', 'AO1', 'R', 'F', 'IH0', 'T'] forfeit
127 AO1 ['AO1', 'F'] off
137 AO1 ['G', 'AO1', 'N'] gone
142 AO1 ['B', 'R', 'AO1', 'D'] broad
143 AO1 ['K', 'AO1', 'R', 'S'] coarse
150 AO1 ['T', 'AO1', 'R', 'T', 'AH0', 'S'] tortoise
153 AO1 ['D', 'AO1', 'R'] door
155 AO1 ['IH0', 'N', 'AO1', 'R', 'M', 'AH0', 'S'] enormous
158 AO1 ['F', 'AO1', 'R'] four
164 AO1 ['K', 'AO1', 'R', 'S'] course
203 AO1 ['S', 'T', 'AO1', 'R', 'IY0'] story
223 AO1 ['K', 'AO1', 'R', 'AH0', 'S'] chorus
237 AO1 ['K', 'AO1', 'R', 'JH', 'AH0', 'L'] cordial
241 AO1 ['SH', 'AO1', 'R', 'T', 'AH0', 'N'] shorten
246 AO1 ['AO1', 'F

In [42]:
arpabet_used_in_data - arpabet_used_in_bernd_examples

{'AA2', 'OW2'}

## Get Keyboard compatible phonetic symbols

In [23]:
homophones_in_data_celex_mapped["clx"].unique()

array(['[raIt]', '[nju:]', '[mO:l]', '[weI][tIN]', '[li:st]', '[si:]',
       '[taIm]', '[hA:t]', '[saIt]', '[p@Ul]', '[h3:d]', '[sEnt]',
       '[wO:n]', '[meId]', '[m&][n@r*]', '[beIl]', '[kru:z]', '[si:n]',
       '[flu:]', '[r@Ud]', '[pA:st]', '[mO:][nIN]', '[hI@r*]', '[daId]',
       '[faInd]', '[weI]', '[hO:l]', '[r@Uz]', '[streIt]', '[naIt]',
       '[k@U][k@U]', '[si:m]', '[tSI][lI]', '[greIt]', '[reIn]', '[fIl]',
       '[pi:k]', '[si:mz]', '[fi:t]', '[seIl]', '[wi:k]', '[taId]',
       '[tSu:z]', '[sVn]', '[@][laUd]', '[dju:]', '[s@Ul]', '[hi:l]',
       '[fr&Nk]', '[rI][NIN]', '[haI][@r*]', '[hO:s]', '[h@Ul]',
       '[flaU][@z]', '[weIt]', '[kO:s]', '[t&ks]', '[daI]', '[li:]',
       '[bO:d]', '[sEnts]', '[pE@r*]', '[Su:t]', '[gEst]', '[rVf]',
       '[reIz]', '[k&][pI][tl,]', '[b@Uld]', '[wO:r*]', '[weIv]',
       '[b&nd]', '[tEk]', '[r@Ut]', '[mIst]', '[seIlz]', '[blu:]',
       '[&dz]', '[m@Ud]', '[rIN]', '[sE][l@z]', '[r&p]', '[r@Um]',
       '[h@Uld]', '[eIdz]', '[pOlz

In [116]:
berndt_character_code = pd.read_csv("Data/celex_phonetic_character_code_berndt1987.csv", delimiter=";")
berndt_conditional_probs = pd.read_csv("Data/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv",delimiter=";")

In [117]:
berndt_conditional_probs_words = get_ARPAbet_phonetic_transcription(berndt_conditional_probs.Example)
arpabet_used_in_bernd_examples = set(sum(berndt_conditional_probs_words,[]))

In [10]:
berndt_character_code

Unnamed: 0,keyboard_compatible_phonetic_symbol,CELEX,g2p(ARPAbet),DISC,Example,Note
0,ay,eI,"EY1,EY2",1,ale,
1,ae,&,"AE0,AE1,AE2",{,add,
2,ee,i:,"IY0, IY1",i,bee,
3,eh,E,"EH0,EH1,EH2",E,end,
4,er,@r*,"ER0,ER1,ER2",@R,father,
5,ai,aI,"AY0,AY1,AY2",2,high,
6,ih,I,"IH0,IH1,IH2",I,bin,
7,o,@U,"OW0,OW1,OW2",5,boat,
8,ah,O,"AA1,AA2",Q,cot,
9,aw,O,"AO1,AO2",Q,soft,AA1 in soft


In [118]:
berndt_arpabbet_dict = {}
for i,p in enumerate(berndt_character_code["g2p(ARPAbet)"]):
    if not p is np.nan:
        p = p.replace(" ","").split(",")       
        if len(p) > 1:
            for p_i in p:
                """
                if p_i in berndt_arpabbet_dict:
                    value_list = berndt_arpabbet_dict[p_i]
                    if isinstance(value_list, list): 
                        value_list += [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                    else:
                        value_list = [value_list] + [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                    
                    berndt_arpabbet_dict[p_i] = value_list
                else:
                """
                berndt_arpabbet_dict[p_i] = berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]
        else:
            """
            if p[0] in berndt_arpabbet_dict:
                value_list = berndt_arpabbet_dict[p[0]]
                if isinstance(value_list, list): 
                    value_list += [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                else:
                    value_list = [value_list] + [berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]]
                    
                berndt_arpabbet_dict[p[0]] = value_list
            else:   
            """
            berndt_arpabbet_dict[p[0]] = berndt_character_code.keyboard_compatible_phonetic_symbol.iloc[i]

In [119]:
berndt_arpabbet_dict

{'EY1': 'ay',
 'EY2': 'ay',
 'AE0': 'ae',
 'AE1': 'ae',
 'AE2': 'ae',
 'IY0': 'ee',
 'IY1': 'ee',
 'EH0': 'eh',
 'EH1': 'eh',
 'EH2': 'eh',
 'ER0': 'er',
 'ER1': 'er',
 'ER2': 'er',
 'AY0': 'ai',
 'AY1': 'ai',
 'AY2': 'ai',
 'IH0': 'ih',
 'IH1': 'ih',
 'IH2': 'ih',
 'OW0': 'o',
 'OW1': 'o',
 'OW2': 'o',
 'AA1': 'ah',
 'AA2': 'ah',
 'AO1': 'aw',
 'AO2': 'aw',
 'UW0': 'oo',
 'UW1': 'oo',
 'UH1': 'u',
 'Y-UW0': 'yu',
 'Y-UW1': 'yu',
 'Y-AH0': 'yu',
 'AH1': 'uh+',
 'OY1': 'oy',
 'OY2': 'oy',
 'AW0': 'au',
 'AW1': 'au',
 'AH0': 'uh-',
 'B': 'b',
 'D': 'd',
 'F': 'f',
 'G': 'g',
 'HH': 'h',
 'JH': 'dj',
 'K': 'k',
 'L': 'l',
 'M': 'm',
 'N': 'n',
 'P': 'p',
 'R': 'r',
 'S': 's',
 'T': 't',
 'V': 'v',
 'W': 'w',
 'Y': 'y',
 'Z': 'z',
 'CH': 'tch',
 'KS': 'ks',
 'GZ': 'gz',
 'KW': 'kw',
 'AH0-L': 'ul',
 'AH0-M': 'um',
 'AH0-N': 'un',
 'NG': 'ng',
 'SH': 'sh',
 'TH': 'th-',
 'DH': 'th+',
 'ZH': 'zh'}

In [199]:
def get_keyboard_phonetic_symbols(arpabet_word, dictionary):
    skip_word = False
    keyboard_encoding = []
    for i,p in enumerate(arpabet_word):
        if skip_word == True:
            skip_word = False
        else:
            if p == 'AH0':
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] == "L":
                        keyboard_encoding.append([dictionary[p + "-" + "L"],[dictionary[p],dictionary["L"]]])
                        skip_word = True
                    elif arpabet_word[i+1] == "M":
                        keyboard_encoding.append([dictionary[p + "-" + "M"],[dictionary[p],dictionary["M"]]])
                        skip_word = True
                    elif arpabet_word[i+1] == "N":
                        keyboard_encoding.append([dictionary[p + "-" + "N"],[dictionary[p],dictionary["N"]]])
                        skip_word = True
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            elif p == "K":
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] == "S":
                        keyboard_encoding.append([dictionary[p]+dictionary["S"],[dictionary[p],dictionary["S"]]])
                        skip_word = True
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            
            elif p=="Y":
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] in ["UW0", "UW1", "AH0"]:
                        keyboard_encoding.append([dictionary[p + "-" + arpabet_word[i+1]],[dictionary[p],dictionary[arpabet_word[i+1]]]])
                        skip_word = True
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            elif p=="AO1":
                if i<len(arpabet_word)-1:
                    if arpabet_word[i+1] == "R":
                        keyboard_encoding.append([dictionary[p],[dictionary["OW0"]]])
                    else:
                        keyboard_encoding.append(dictionary[p])
                else:
                    keyboard_encoding.append(dictionary[p])
            
            elif p=="IH1":
                keyboard_encoding.append([dictionary[p],[dictionary["IY1"]]])
            
            elif p in ["ER0", "ER1", "ER2"]:
                keyboard_encoding.append([dictionary[p],dictionary["R"],[dictionary[p],dictionary["R"]]])
            else:
                keyboard_encoding.append(dictionary[p])
    return keyboard_encoding

In [200]:
for i,word in enumerate(berndt_conditional_probs_words):
    print(word,berndt_conditional_probs.Example.iloc[i])
    print(get_keyboard_phonetic_symbols(word, berndt_arpabbet_dict))

['K', 'AE1', 'B'] cab
['k', 'ae', 'b']
['K', 'AH0', 'N', 'AE1', 'L'] canal
['k', ['un', ['uh-', 'n']], 'ae', 'l']
['EY1', 'N', 'JH', 'AH0', 'L'] angel
['ay', 'n', 'dj', ['ul', ['uh-', 'l']]]
['W', 'AA1', 'D'] wad
['w', 'ah', 'd']
['AO1', 'L', 'S', 'OW0'] also
['aw', 'l', 's', 'o']
['K', 'AW1', 'ER0', 'D'] coward
['k', 'au', ['er', 'r', ['er', 'r']], 'd']
['M', 'EH1', 'N', 'IY0'] many
['m', 'eh', 'n', 'ee']
['S', 'P', 'IH1', 'N', 'AH0', 'CH'] spinach
['s', 'p', ['ih', ['ee']], 'n', 'uh-', 'tch']
['EY1', 'T'] ate
['ay', 't']
['S', 'EH1', 'N', 'AH0', 'T'] senate
['s', 'eh', 'n', 'uh-', 't']
['M', 'AE1', 'D', 'AH0', 'M'] madame
['m', 'ae', 'd', ['um', ['uh-', 'm']]]
['AO1', 'R', 'AH0', 'N', 'JH'] orange
[['aw', ['o']], 'r', ['un', ['uh-', 'n']], 'dj']
['AA1', 'R'] are
['ah', 'r']
['F', 'AO1', 'L', 'S'] false
['f', 'aw', 'l', 's']
['P', 'AY1', 'R', 'AH0', 'T'] pirate
['p', 'ai', 'r', 'uh-', 't']
['AE1', 'L', 'JH', 'IY0'] algae
['ae', 'l', 'dj', 'ee']
['EH0', 'S', 'TH', 'EH1', 'T', 'IH0', 'K

In [201]:
berndt_computer_phonem_graph_prob_dict = {}
for phoneme in berndt_conditional_probs.Phoneme.unique():
    berndt_computer_phonem_graph_prob_dict[phoneme] = []

In [202]:
for i,row in berndt_conditional_probs.iterrows():
    grapheme_prior_cond = (row["Grapheme"], row['Prior_Probability'], row["Conditional_Probability"])
    berndt_computer_phonem_graph_prob_dict[row["Phoneme"]].append(grapheme_prior_cond)

In [205]:
berndt_computer_phonem_graph_prob_dict

{'ae': [('A', 0.0712, 0.542), ('A-E', 0.0111, 0.121), ('AI', 0.0026, 0.003)],
 'uh-': [('A', 0.0712, 0.18600000000000003),
  ('A-E', 0.0111, 0.002),
  ('AI', 0.0026, 0.031),
  ('AU', 0.0014, 0.006),
  ('E', 0.073, 0.096),
  ('E-E', 0.0032, 0.28600000000000003),
  ('EA', 0.0047, 0.005),
  ('El', 0.0005, 0.035),
  ('EO', 0.0001, 0.6659999999999999),
  ('EOU', 7e-05, 1.0),
  ('EY-E', 0.0688, 0.18),
  ('I-E', 0.0086, 6.0),
  ('IA', 1e-05, 1.0),
  ('IE', 0.0011, 0.171),
  ('IE-E', 0.0002, 0.129),
  ('O', 0.055, 0.26899999999999996),
  ('O-E', 0.0043, 0.044000000000000004),
  ('OI-E', 9e-05, 0.2),
  ('OU', 0.0064, 0.48),
  ('U', 0.0267, 0.102),
  ('U-E', 0.0033, 0.01),
  ('Y', 0.0193, 0.01)],
 'ay': [('A', 0.0712, 0.129),
  ('A-E', 0.0111, 0.6509999999999999),
  ('AI', 0.0026, 0.7340000000000001),
  ('AI-E', 0.0002, 0.818),
  ('AIGH', 2.9999999999999997e-05, 1.0),
  ('AU-E', 0.0001, 0.083),
  ('AY', 0.0012, 0.97),
  ('AY-E', 9e-06, 1.0),
  ('E', 0.073, 0.002),
  ('E-E', 0.0032, 0.017),
  ('E

In [123]:
unique_homophones = np.unique(homophones_in_data.word)
hom_phon_words = get_ARPAbet_phonetic_transcription(unique_homophones)

In [208]:
test_words = []
for i,word in enumerate(hom_phon_words):
    print(word,unique_homophones[i])
    print(get_keyboard_phonetic_symbols(word, berndt_arpabbet_dict))
    test_words.append((unique_homophones[i],get_keyboard_phonetic_symbols(word, berndt_arpabbet_dict)))

['AE1', 'D'] ad
['ae', 'd']
['AE1', 'D'] add
['ae', 'd']
['AE1', 'D', 'Z'] adds
['ae', 'd', 'z']
['AE1', 'D', 'Z'] ads
['ae', 'd', 'z']
['EY1', 'D'] aid
['ay', 'd']
['EY1', 'D'] aide
['ay', 'd']
['EY1', 'D', 'Z'] aides
['ay', 'd', 'z']
['EY1', 'D', 'Z'] aids
['ay', 'd', 'z']
['EH1', 'R', 'Z'] airs
['eh', 'r', 'z']
['AH0', 'L', 'AW1', 'D'] allowed
[['ul', ['uh-', 'l']], 'au', 'd']
['AH0', 'L', 'AW1', 'D'] aloud
[['ul', ['uh-', 'l']], 'au', 'd']
['B', 'EY1', 'L'] bail
['b', 'ay', 'l']
['B', 'EY1', 'T', 'S'] baits
['b', 'ay', 't', 's']
['B', 'AO1', 'L', 'D'] bald
['b', 'aw', 'l', 'd']
['B', 'EY1', 'L'] bale
['b', 'ay', 'l']
['B', 'AE1', 'N', 'D'] band
['b', 'ae', 'n', 'd']
['B', 'AE1', 'N', 'D'] banned
['b', 'ae', 'n', 'd']
['B', 'EH1', 'R'] bare
['b', 'eh', 'r']
['B', 'EY1', 'T', 'S'] bates
['b', 'ay', 't', 's']
['B', 'AO1', 'L', 'D'] bawled
['b', 'aw', 'l', 'd']
['B', 'EH1', 'R'] bear
['b', 'eh', 'r']
['B', 'IY1', 'T', 'S'] beats
['b', 'ee', 't', 's']
['B', 'IY1', 'T', 'S'] beets
['b', 

In [305]:
test_words[0:10]

[('ad', ['ae', 'd']),
 ('add', ['ae', 'd']),
 ('adds', ['ae', 'd', 'z']),
 ('ads', ['ae', 'd', 'z']),
 ('aid', ['ay', 'd']),
 ('aide', ['ay', 'd']),
 ('aides', ['ay', 'd', 'z']),
 ('aids', ['ay', 'd', 'z']),
 ('airs', ['eh', 'r', 'z']),
 ('allowed', [['ul', ['uh-', 'l']], 'au', 'd'])]

In [210]:
#test_words = [("soft", get_keyboard_phonetic_symbols(get_ARPAbet_phonetic_transcription(["soft"])[0], berndt_arpabbet_dict))]
#test_words = [("flower", get_keyboard_phonetic_symbols(get_ARPAbet_phonetic_transcription(["flower"])[0], berndt_arpabbet_dict))]


possible_grapheme_strings = [] # list for each word
possible_prior_probs = [] # list for each word
possible_cond_probs = [] # list for each word
word_rests = [] # list for each word

for i,word_pron in enumerate(test_words):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    print(word,pron)
    if word[0] == 'h' and pron[0] != 'h': #leading silent h         
        possible_grapheme_strings_i = [['$H']] # for each word a list of possible lists with grapheme given pronunciation strings 
        possible_prior_probs_i = [[0.0003]] # for each word a list of possible lists with corresponding prio probabilites 
        possible_cond_probs_i = [[1.000]] # for each word a list of possible lists possible corresponding conditional probabilities
        word_rests_i = [word.upper()[1:]] # for each word a list of remaining word characters after having splitted it into a list of possible graphemes  

    else:    
        possible_grapheme_strings_i = [[]] # for each word a list of possible lists with grapheme given pronunciation strings 
        possible_prior_probs_i = [[]] # for each word a list of possible lists with corresponding prio probabilites 
        possible_cond_probs_i = [[]] # for each word a list of possible lists possible corresponding conditional probabilities
        word_rests_i = [word.upper()] # for each word a list of remaining word characters after having splitted it into a list of possible graphemes  

    for j,p in enumerate(pron):  
        new_word_rests_i = [] # new rest of the word after looking at the current encoded syllable pronunciation
        new_possible_grapheme_strings_i = [] # new possible grapheme strings given the current syllable pronunciation
        new_possible_prior_probs_i = []
        new_possible_cond_probs_i = []
        if isinstance(p,list):
            for p_i in p:   
                if isinstance(p_i,list):
                    for p_ij in p_i:
                        for possible_grapheme in berndt_computer_phonem_graph_prob_dict[p_ij]: # all possible corresponding graphemes
                            grapheme = possible_grapheme[0].split("-") # account for silent e encoded by e.g. A-E
                            prior = possible_grapheme[1] # prior prob
                            cond = possible_grapheme[2] # cond prop
                            for k,word_rest in enumerate(word_rests_i): # for each possible combination we get a different word rest e.g. APPLE can have [A,P], [A,PP], [A-E,P], [A-E,PP] --> ["PLE", LE, PL, L]
                                if len(grapheme)>1: #silent E 
                                    # if word_rest.startswith(grapheme[0]) and word_rest.endswith(grapheme[1]): # check whether the grapheme fits to the rest of the word
                                     if word_rest.startswith(grapheme[0]) and grapheme[1] in word_rest[len(grapheme[0]):]:
                                        new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]]) # add the grapheme to the grapheme list which corresponds to the word ret we are currently looking at
                                        new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                        new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                        #new_word_rests_i.append(word_rest[len(grapheme[0]):-1]) # new word_rests 
                                        new_word_rests_i.append(word_rest[len(grapheme[0]):].replace(grapheme[1],"",1))
                                else:
                                    if word_rest.startswith(grapheme[0]): #no silent E
                                        new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]])
                                        new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                        new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                        new_word_rests_i.append(word_rest[len(grapheme[0]):])
                            
                        word_rests_i = new_word_rests_i #update word rests 
                        possible_grapheme_strings_i = new_possible_grapheme_strings_i #update possible grapheme strings 
                        possible_prior_probs_i = new_possible_prior_probs_i
                        possible_cond_probs_i = new_possible_cond_probs_i

                    
                else: # proceed like normal but without updating the words_rests
                    for possible_grapheme in berndt_computer_phonem_graph_prob_dict[p_i]: # all possible corresponding graphemes
                        grapheme = possible_grapheme[0].split("-") # account for silent e encoded by e.g. A-E
                        prior = possible_grapheme[1] # prior prob
                        cond = possible_grapheme[2] # cond prop

                        for k,word_rest in enumerate(word_rests_i): # for each possible combination we get a different word rest e.g. APPLE can have [A,P], [A,PP], [A-E,P], [A-E,PP] --> ["PLE", LE, PL, L]
                            if len(grapheme)>1: #silent E 
                                #if word_rest.startswith(grapheme[0]) and word_rest.endswith(grapheme[1]): # check whether the grapheme fits to the rest of the word
                                if word_rest.startswith(grapheme[0]) and grapheme[1] in word_rest[len(grapheme[0]):]:    
                                    new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]]) # add the grapheme to the grapheme list which corresponds to the word ret we are currently looking at
                                    new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                    new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                    #new_word_rests_i.append(word_rest[len(grapheme[0]):-1]) # new word_rests 
                                    new_word_rests_i.append(word_rest[len(grapheme[0]):].replace(grapheme[1],"",1))
                            else:
                                if word_rest.startswith(grapheme[0]): #no silent E
                                    new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]])
                                    new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                                    new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                                    new_word_rests_i.append(word_rest[len(grapheme[0]):])

                
            
        else:
            for possible_grapheme in berndt_computer_phonem_graph_prob_dict[p]: # all possible corresponding graphemes
                grapheme = possible_grapheme[0].split("-") # account for silent e encoded by e.g. A-E
                prior = possible_grapheme[1] # prior prob
                cond = possible_grapheme[2] # cond prop
                for k,word_rest in enumerate(word_rests_i): # for each possible combination we get a different word rest e.g. APPLE can have [A,P], [A,PP], [A-E,P], [A-E,PP] --> ["PLE", LE, PL, L]
                    if len(grapheme)>1: #silent E 
                        #if word_rest.startswith(grapheme[0]) and word_rest.endswith(grapheme[1]): # check whether the grapheme fits to the rest of the word
                        if word_rest.startswith(grapheme[0]) and grapheme[1] in word_rest[len(grapheme[0]):]:    
                            new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]]) # add the grapheme to the grapheme list which corresponds to the word ret we are currently looking at
                            new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                            new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                            #new_word_rests_i.append(word_rest[len(grapheme[0]):-1]) # new word_rests 
                            new_word_rests_i.append(word_rest[len(grapheme[0]):].replace(grapheme[1],"",1))

                    else:
                        if word_rest.startswith(grapheme[0]): #no silent E
                            new_possible_grapheme_strings_i.append(possible_grapheme_strings_i[k] + [possible_grapheme[0]])
                            new_possible_prior_probs_i.append(possible_prior_probs_i[k] + [prior])
                            new_possible_cond_probs_i.append(possible_cond_probs_i[k] + [cond])
                            new_word_rests_i.append(word_rest[len(grapheme[0]):])


            word_rests_i = new_word_rests_i #update word rests 
            possible_grapheme_strings_i = new_possible_grapheme_strings_i #update possible grapheme strings 
            possible_prior_probs_i = new_possible_prior_probs_i
            possible_cond_probs_i = new_possible_cond_probs_i
    
        #print(possible_grapheme_strings_i,word_rests_i)
    possible_grapheme_strings.append(possible_grapheme_strings_i)
    possible_prior_probs.append(possible_prior_probs_i)
    possible_cond_probs.append(possible_cond_probs_i)
    word_rests.append(word_rests_i)

ad ['ae', 'd']
add ['ae', 'd']
adds ['ae', 'd', 'z']
ads ['ae', 'd', 'z']
aid ['ay', 'd']
aide ['ay', 'd']
aides ['ay', 'd', 'z']
aids ['ay', 'd', 'z']
airs ['eh', 'r', 'z']
allowed [['ul', ['uh-', 'l']], 'au', 'd']
aloud [['ul', ['uh-', 'l']], 'au', 'd']
bail ['b', 'ay', 'l']
baits ['b', 'ay', 't', 's']
bald ['b', 'aw', 'l', 'd']
bale ['b', 'ay', 'l']
band ['b', 'ae', 'n', 'd']
banned ['b', 'ae', 'n', 'd']
bare ['b', 'eh', 'r']
bates ['b', 'ay', 't', 's']
bawled ['b', 'aw', 'l', 'd']
bear ['b', 'eh', 'r']
beats ['b', 'ee', 't', 's']
beets ['b', 'ee', 't', 's']
bell ['b', 'eh', 'l']
belle ['b', 'eh', 'l']
berry ['b', 'eh', 'r', 'ee']
billed ['b', ['ih', ['ee']], 'l', 'd']
blew ['b', 'l', 'oo']
blue ['b', 'l', 'oo']
boar ['b', ['aw', ['o']], 'r']
board ['b', ['aw', ['o']], 'r', 'd']
bold ['b', 'o', 'l', 'd']
bore ['b', ['aw', ['o']], 'r']
bored ['b', ['aw', ['o']], 'r', 'd']
bowled ['b', 'o', 'l', 'd']
brakes ['b', 'r', 'ay', ['ks', ['k', 's']]]
bread ['b', 'r', 'eh', 'd']
breaks ['b', 

In [241]:
print(possible_grapheme_strings[100])
print(word_rests[100])
print(get_ARPAbet_phonetic_transcription(["aloud"]))

[['F', 'L', 'AI', 'R']]
['']
[['AH0', 'L', 'AW1', 'D']]


In [309]:
#print(get_ARPAbet_phonetic_transcription(["aid"]))
#print(berndt_arpabbet_dict)
print(berndt_computer_phonem_graph_prob_dict["ks"])

[('CS', 0.0002, 1.0), ('X', 0.0033, 0.885)]


In [330]:
counter = 0 
for i,word_pron in enumerate(test_words):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(possible_grapheme_strings[i]) == 0:
        counter+=1
        print(word,pron)
        print(possible_grapheme_strings[i])
        print(possible_prior_probs[i])
        print(possible_cond_probs[i])
        print(word_rests[i])

allowed [['ul', ['uh-', 'l']], 'au', 'd']
[]
[]
[]
[]
banned ['b', 'ae', 'n', 'd']
[]
[]
[]
[]
bates ['b', 'ay', 't', 's']
[]
[]
[]
[]
bawled ['b', 'aw', 'l', 'd']
[]
[]
[]
[]
billed ['b', 'ih', 'l', 'd']
[]
[]
[]
[]
bored ['b', 'aw', 'r', 'd']
[]
[]
[]
[]
bowled ['b', 'o', 'l', 'd']
[]
[]
[]
[]
cellars ['s', 'eh', 'l', 'er', 'z']
[]
[]
[]
[]
chutes ['sh', 'oo', 't', 's']
[]
[]
[]
[]
course ['k', 'aw', 'r', 's']
[]
[]
[]
[]
dear ['d', 'ih', 'r']
[]
[]
[]
[]
dyed ['d', 'ai', 'd']
[]
[]
[]
[]
fined ['f', 'ai', 'n', 'd']
[]
[]
[]
[]
flour ['f', 'l', 'au', 'er']
[]
[]
[]
[]
flower ['f', 'l', 'au', 'er']
[]
[]
[]
[]
flowers ['f', 'l', 'au', 'er', 'z']
[]
[]
[]
[]
guessed ['g', 'eh', 's', 't']
[]
[]
[]
[]
guest ['g', 'eh', 's', 't']
[]
[]
[]
[]
guise ['g', 'ai', 'z']
[]
[]
[]
[]
heard ['h', 'er', 'd']
[]
[]
[]
[]
heirs ['eh', 'r', 'z']
[]
[]
[]
[]
herd ['h', 'er', 'd']
[]
[]
[]
[]
hire ['h', 'ai', 'er']
[]
[]
[]
[]
holed ['h', 'o', 'l', 'd']
[]
[]
[]
[]
hurts ['h', 'er', 't', 's']
[]
[]
[]
[

In [115]:
# Same as AHO-M, AH0-L .. can be pronunced as ul but also as uh- l --> Y-AH0,Y-UW1/UW0 as "yu" for u but youth also youth as Y and UW1 as "y" "oo" 
print(get_ARPAbet_phonetic_transcription(["populace"]))
print(get_ARPAbet_phonetic_transcription(["populous"]))
print(get_ARPAbet_phonetic_transcription(["unite"]))
print(get_ARPAbet_phonetic_transcription(["union"]))
print(get_ARPAbet_phonetic_transcription(["popular"]))
print(get_ARPAbet_phonetic_transcription(["yet"]))
print(get_ARPAbet_phonetic_transcription(["youth"]))
print(get_ARPAbet_phonetic_transcription(["argue"]))

[['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'AH0', 'S']]
[['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'AH0', 'S']]
[['Y', 'UW1', 'N', 'AY2', 'T']]
[['Y', 'UW1', 'N', 'Y', 'AH0', 'N']]
[['P', 'AA1', 'P', 'Y', 'AH0', 'L', 'ER0']]
[['Y', 'EH1', 'T']]
[['Y', 'UW1', 'TH']]
[['AA1', 'R', 'G', 'Y', 'UW0']]


In [107]:
# A01 as aw not encoded as OU --> maybe OUR as ER0 (see glamour)
print(get_ARPAbet_phonetic_transcription(["mourning"]))
print(get_ARPAbet_phonetic_transcription(["cough"]))
print(get_ARPAbet_phonetic_transcription(["thought"]))
print(get_ARPAbet_phonetic_transcription(["about"]))
print(get_ARPAbet_phonetic_transcription(["cloth"]))
print(get_ARPAbet_phonetic_transcription(["abroad"]))
print(get_ARPAbet_phonetic_transcription(["dawn"]))
print(get_ARPAbet_phonetic_transcription(["course"]))
print(get_ARPAbet_phonetic_transcription(["four"]))
print(get_ARPAbet_phonetic_transcription(["court"]))
print(get_ARPAbet_phonetic_transcription(["glamour"]))

[['M', 'AO1', 'R', 'N', 'IH0', 'NG']]
[['K', 'AA1', 'F']]
[['TH', 'AO1', 'T']]
[['AH0', 'B', 'AW1', 'T']]
[['K', 'L', 'AO1', 'TH']]
[['AH0', 'B', 'R', 'AO1', 'D']]
[['D', 'AO1', 'N']]
[['K', 'AO1', 'R', 'S']]
[['F', 'AO1', 'R']]
[['K', 'AO1', 'R', 'T']]
[['G', 'L', 'AE1', 'M', 'ER0']]


In [96]:
# ER0 as er not encoded as RE
print(get_ARPAbet_phonetic_transcription(["hire"]))

[['HH', 'AY1', 'ER0']]


In [95]:
# ERI as er not encoded as EAR
print(get_ARPAbet_phonetic_transcription(["heard"]))

[['HH', 'ER1', 'D']]


In [94]:
# EH1 as eh not coded for UE --> u and e as eh 
print(get_ARPAbet_phonetic_transcription(["guest"]))

[['G', 'EH1', 'S', 'T']]
[['B', 'L', 'UW1']]
[['K', 'Y', 'UW1']]


In [89]:
# EA not for ih codede
print(get_ARPAbet_phonetic_transcription(["dear"]))

[['D', 'IH1', 'R']]


In [92]:
# ER0 as er but as E-R 
print(get_ARPAbet_phonetic_transcription(["cellars"]))
print(get_ARPAbet_phonetic_transcription(["flower"]))
print(get_ARPAbet_phonetic_transcription(["baker"]))

[['S', 'EH1', 'L', 'ER0', 'Z']]
[['F', 'L', 'AW1', 'ER0']]
[['B', 'EY1', 'K', 'ER0']]


In [136]:
# silent e not only at the ending of an word
# when ed as t and when as d with silent e 
print(get_ARPAbet_phonetic_transcription(["warned"]))
print(get_ARPAbet_phonetic_transcription(["bawled"]))
print(get_ARPAbet_phonetic_transcription(["bates"]))
print(get_ARPAbet_phonetic_transcription(["banned"]))
print(get_ARPAbet_phonetic_transcription(["ended"]))
print(get_ARPAbet_phonetic_transcription(["danceable"]))
print(get_ARPAbet_phonetic_transcription(["guessed"]))

[['W', 'AO1', 'R', 'N', 'D']]
[['B', 'AO1', 'L', 'D']]
[['B', 'EY1', 'T', 'S']]
[['B', 'AE1', 'N', 'D']]
[['EH1', 'N', 'D', 'AH0', 'D']]
[['D', 'AE1', 'N', 'S', 'AH0', 'B', 'AH0', 'L']]
[['G', 'EH1', 'S', 'T']]


In [331]:
counter

41

In [211]:
counter = 0 
for i,word_pron in enumerate(test_words):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(possible_grapheme_strings[i]) == 0:
        counter+=1
        print(word,pron)
        print(possible_grapheme_strings[i])
        print(possible_prior_probs[i])
        print(possible_cond_probs[i])
        print(word_rests[i])

guessed ['g', 'eh', 's', 't']
[]
[]
[]
[]
guest ['g', 'eh', 's', 't']
[]
[]
[]
[]
guise ['g', 'ai', 'z']
[]
[]
[]
[]
thai ['t', 'ai']
[]
[]
[]
[]
thais ['t', 'ai', 'z']
[]
[]
[]
[]
weighed ['w', 'ay', 'd']
[]
[]
[]
[]


In [212]:
counter

6

In [231]:
print(get_ARPAbet_phonetic_transcription(["thai"]))
print(get_ARPAbet_phonetic_transcription(["aisle"]))
print(get_ARPAbet_phonetic_transcription(["daiquiri"]))
print(get_ARPAbet_phonetic_transcription(["aisle"]))
print(get_ARPAbet_phonetic_transcription(["tie"]))
print(get_ARPAbet_phonetic_transcription(["captain"]))
print(get_ARPAbet_phonetic_transcription(["villain"]))
print(get_ARPAbet_phonetic_transcription(["plaid"]))

[['T', 'AY1']]
[['AY1', 'L']]
[['D', 'AE1', 'K', 'ER0', 'IY0']]
[['AY1', 'L']]
[['T', 'AY1']]
[['K', 'AE1', 'P', 'T', 'AH0', 'N']]
[['V', 'IH1', 'L', 'AH0', 'N']]
[['P', 'L', 'AE1', 'D']]


In [198]:
print(get_ARPAbet_phonetic_transcription(["guise"]))# silent U
print(get_ARPAbet_phonetic_transcription(["guest"]))
print(get_ARPAbet_phonetic_transcription(["guy"]))
print(get_ARPAbet_phonetic_transcription(["hire"]))# RE nor coded as er 
print(get_ARPAbet_phonetic_transcription(["thai"])) # AI not as ai coded 
print(get_ARPAbet_phonetic_transcription(["weighed"])) # missing silent e for eigh-e
print(get_ARPAbet_phonetic_transcription(["dear"])) # EA not as ih coded 
print(get_ARPAbet_phonetic_transcription(["shear"])) # EA not as ih coded but hear arpabet encoding IY1 instead of "IH1"
print(get_ARPAbet_phonetic_transcription(["hear"])) 

[['G', 'AY1', 'Z']]
[['G', 'EH1', 'S', 'T']]
[['G', 'AY1']]
[['HH', 'AY1', 'ER0']]
[['T', 'AY1']]
[['W', 'EY1', 'D']]
[['D', 'IH1', 'R']]
[['SH', 'IH1', 'R']]
[['HH', 'IY1', 'R']]


In [235]:
print(get_ARPAbet_phonetic_transcription(["wax"])) 
print(get_keyboard_phonetic_symbols(get_ARPAbet_phonetic_transcription(["wax"])[0], berndt_arpabbet_dict))
print(berndt_computer_phonem_graph_prob_dict["k"])

[['W', 'AE1', 'K', 'S']]
['w', 'ae', 'k', 's']
[('C', 0.042, 0.757), ('CCH', 0.0007, 1.0), ('CCH', 9e-06, 1.0), ('CH', 0.0045, 0.29), ('CK', 0.0026, 1.0), ('CQ', 2e-05, 1.0), ('K', 0.0055, 1.0), ('KH', 2e-05, 1.0), ('Lk', 0.0001, 1.0), ('Q', 0.0001, 1.0), ('QU', 0.002, 0.12300000000000001), ('SC', 0.0008, 0.033)]
