In [1]:
import preprocessing
import pandas as pd
import numpy as np
import importlib
from english_contractions import ENGLISH_CONTRACTIONS
import merging_dataframes
import word_pronunciation_predictibility
import celex_files

In [2]:
# Input files used throughout the notebook, given relative to the working directory.
# The trailing comment on each line keeps an alternative absolute path
# (presumably the location on the shared corpus server -- confirm before switching).
celex_dict_file = "Data/english/epw/epw.cd" #"/mnt/shared/corpora/Celex/english/epw/epw.cd"
filename = "Data/2016_all_words_no_audio.pickle" #"/mnt/Restricted/Corpora/RedHen/2016_all_words_no_audio.pickle"
hom_filename = "Data/hom.csv" # "/mnt/Restricted/Corpora/RedHen/hom.csv"
berndt_character_coding_file = "Data/phonetic_character_code_berndt1987.csv" # "/mnt/Restricted/Corpora/RedHen/phonetic_character_code_berndt1987.csv"
berndt_conditional_probs_file = "Data/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv" # "/mnt/Restricted/Corpora/RedHen/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv"

# Pickled RedHen Dataframe 
## Preprocessing:
- include pause information
- word duration
- word frequency
- length in letters
- contextual predictability given previous and next word

In [3]:
# Read the pickled RedHen word table and run the preprocessing steps listed above.
# Pauses and erroneous rows are removed; the error columns themselves are kept.
df = preprocessing.read_dataframe(
    filename,
    remove_pauses=True,
    remove_errors=True,
    preprocessing=True,
    drop_error_columns=False,
)

read dataframe from Data/2016_all_words_no_audio.pickle
Preprocessing: extract pause information...
Remove pauses from data!
Preprocessing: apply word preprocessing...
Preprocessing: calculate word duration...
Preprocessing: calculate word frequency...
Preprocessing: extract context information...
Preprocessing: calculate length in letter...
Preprocessing: calculate contextual predictability...
(18864660, 25) RangeIndex(start=0, stop=18864660, step=1)


In [4]:
# Recordings (values of the `source_file` column) selected for the sub-sample.
source_files = [
    "2016-12-17_1330_US_KCET_Asia_Insight",
    "2016-10-25_2300_US_KABC_Eyewitness_News_4PM",
]

In [5]:
# Independent copy of the rows that belong to the selected recordings.
sub_df = df.loc[df["source_file"].isin(source_files)].copy()

In [6]:
# Show the columns available after preprocessing.
df.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next'],
      dtype='object')

# Gahl's Homophones extracted from RedHen Dataframe
## Preprocessing:
- is_pair for indicating whether homophones found in data have a matching pair 
- is_max factor for indicating most frequent homophone of pair (if not a pair always 1)
- pronunciation given by celex encoding and unbounded disc encoding (celexPhon)
- add further celex information

In [4]:
# Read the homophone list and pick the matching tokens out of the full word table.
# The call returns three objects: the tokens found in `df`, the homophone table,
# and the homophones that do not occur in the data.
(
    homophones_in_data,
    gahls_homophones,
    gahls_homophones_missing_in_data,
) = preprocessing.read_and_extract_homophones(hom_filename, df)

read Gahls Homophone data from Data/hom.csv
406 out of 412 homophones found in Data:
Homophone Pairs found in Data: 200
Homophones without Pair:  ['flowers', 'holes', 'moose', 'naval', 'pairs', 'taught']
Missing homophones: ['flours' 'mousse' 'navel' 'pears' 'taut' 'wholes']


In [5]:
# Load the CELEX phonology dictionary, then add the CELEX transcriptions
# to the homophone tokens.
celex_dict = word_pronunciation_predictibility.get_english_phonology_from_celex(
    celex_dict_file
)
homophones_in_data_celex_merged = merging_dataframes.get_celex_transcription(
    homophones_in_data,
    celex_dict,
)

In [8]:
# Columns of the homophone tokens before the CELEX transcriptions are added.
homophones_in_data.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max'],
      dtype='object')

In [9]:
# Columns after adding the CELEX transcriptions (compare with the previous cell).
homophones_in_data_celex_merged.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound'],
      dtype='object')

In [29]:
# Columns of the homophone table read from `hom_filename`.
gahls_homophones.columns

Index(['spell', 'pron', 'lgPronCelFq', 'logCelFq', 'logAvgDur', 'stem',
       'is_complex', 'celexPhon', 'phonNeighCount', 'NearestSemNeighCor',
       'MeanCorTop20', 'AvCor', 'MedianCor', 'MeanCorTop20Unrel',
       'CossinTwinsStem', 'CossinTwinsFull', 'L2Ldiag', 'EuclidDistTwins',
       'SL1norm', 'CorrectLDLpred', 'SumChatWord', 'MinChatWord', 'L1ChatWord',
       'CorPredWord', 'LWLinkRatioWord', 'RankProd'],
      dtype='object')

In [30]:
# Preview the first rows of the homophone table.
gahls_homophones.head()

Unnamed: 0,spell,pron,lgPronCelFq,logCelFq,logAvgDur,stem,is_complex,celexPhon,phonNeighCount,NearestSemNeighCor,...,L2Ldiag,EuclidDistTwins,SL1norm,CorrectLDLpred,SumChatWord,MinChatWord,L1ChatWord,CorPredWord,LWLinkRatioWord,RankProd
10963,gym,_Im,6.520621,4.290459,-1.007354,gym,False,_Im,17,,...,0.096807,,,,,,,,,
19868,jim,_Im,6.520621,6.40688,-1.30276,Jim,True,_Im,17,,...,,,,,,,,,,
331,aide,1d,7.020191,4.60517,-0.685675,aide,False,1d,37,0.582451,...,0.098318,0.050616,0.703443,True,1.479237,0.495043,1.479237,0.282871,4.0,1.0
255,aid,1d,7.020191,6.926577,-1.168132,aid,False,1d,37,0.879444,...,0.601393,0.050616,2.039614,True,1.250363,0.250344,1.250363,0.913718,8.0,1.0
336,aides,1dz,5.26269,4.330733,-0.770658,aide,True,1dz,17,0.582451,...,0.098318,0.050616,1.584693,True,1.988257,0.494036,1.988257,0.625906,6.122449,1.0


# Load additional information
- eaf files
- seg files
- gentle files

## EAF files
- information about present gestures

In [12]:
# Load the EAF information for the recordings in `sub_df` only (the selected source files).
eaf_data = preprocessing.get_additional_data_from_files(sub_df, "eaf")

Load and extract information from eaf files...


## SEG files
- information about Part Of Speech
- information about Phrase final marker

In [14]:
# Load the SEG information for the recordings in `sub_df` only (the selected source files).
seg_data = preprocessing.get_additional_data_from_files(sub_df, "seg")

Load and extract information from seg files...


## GENTLE files
- information about Phrase final marker

In [16]:
# Load the Gentle information for the recordings in `sub_df` only (the selected source files).
gentle_data = preprocessing.get_additional_data_from_files(sub_df, "gentle")

Load and extract information from gentle files...


## Video files
- information about the entropy of the situation in which the homophone was articulated

In [18]:
# Unlike the EAF/SEG/Gentle cells, this call is given the homophone tokens
# rather than `sub_df`.
video_data = preprocessing.get_additional_data_from_files(homophones_in_data_celex_merged, "video") # only for homophones

Load and extract information from video files...


## Celex 

In [112]:
# Development aid: re-import `celex_files` so edits to the module on disk are
# picked up without restarting the kernel.
importlib.reload(celex_files)

<module 'celex_files' from '/Users/paule/Desktop/Gahls_Homophones_in_RedHen/celex_files.py'>

In [113]:
# Read the CELEX file (with the reader's default arguments) and derive the
# syllable counts that are merged in later.
celex_data = celex_files.get_syl_counts(
    celex_files.read_celex_file()
)

# Word Pronunciation Predictability (Berndt et al. 1987)

### Load Berndt's tables for Phoneme Equivalents and Conditional Probabilities for Grapheme-to-Phoneme Correspondences

In [99]:
# Both Berndt (1987) tables are semicolon-separated CSV files.
berndt_character_code_df = pd.read_csv(berndt_character_coding_file, sep=";")
berndt_conditional_probs = pd.read_csv(berndt_conditional_probs_file, sep=";")

### ARPABET to corresponding Keyboard Compatible Phonemic (KCP) symbol dict 

In [100]:
# Build the ARPABET -> KCP symbol lookup from Berndt's character-code table.
berndt_arpabet_phon_dict = (
    word_pronunciation_predictibility.get_ARPABET_to_keyboard_phonetic_symbols_dict(
        berndt_character_code_df
    )
)

### KCP to Grapheme Symbols and Probabilities dict

In [101]:
# Build the KCP symbol -> grapheme / conditional-probability lookup from
# Berndt's conditional-probability table.
phonem_graphem_prob_dict = (
    word_pronunciation_predictibility.get_keyboard_phonetic_symbols_to_grapheme_cond_prob_dict(
        berndt_conditional_probs
    )
)

### Homophones with corresponding ARPABET transcription

In [102]:
# Sorted, de-duplicated homophone spellings and their ARPABET transcriptions.
unique_homophones = np.unique(homophones_in_data["word"])
hom_arpabet_words = word_pronunciation_predictibility.get_ARPABET_phonetic_transcription(
    unique_homophones
)

### Homophones with corresponding KCP transcription

In [124]:
# Pair every homophone spelling with the KCP transcription of its ARPABET form.
# `hom_arpabet_words` is indexed in parallel with `unique_homophones`.
hom_kcp_word_tuples = [
    (
        unique_homophones[position],
        word_pronunciation_predictibility.get_keyboard_phonetic_symbols_for_ARPABET(
            arpabet_transcription, berndt_arpabet_phon_dict
        ),
    )
    for position, arpabet_transcription in enumerate(hom_arpabet_words)
]

In [125]:
# Inspect the (spelling, KCP transcription) pairs built in the previous cell.
hom_kcp_word_tuples

[('ad', ['ae', 'd']),
 ('add', ['ae', 'd']),
 ('adds', ['ae', 'd', 'z']),
 ('ads', ['ae', 'd', 'z']),
 ('aid', ['ay', 'd']),
 ('aide', ['ay', 'd']),
 ('aides', ['ay', 'd', 'z']),
 ('aids', ['ay', 'd', 'z']),
 ('airs', ['eh', 'r', 'z']),
 ('allowed', [['ul', ['uh-', 'l']], 'au', 'd']),
 ('aloud', [['ul', ['uh-', 'l']], 'au', 'd']),
 ('bail', ['b', 'ay', 'l']),
 ('baits', ['b', 'ay', 't', 's']),
 ('bald', ['b', 'aw', 'l', 'd']),
 ('bale', ['b', 'ay', 'l']),
 ('band', ['b', 'ae', 'n', 'd']),
 ('banned', ['b', 'ae', 'n', 'd']),
 ('bare', ['b', 'eh', 'r']),
 ('bates', ['b', 'ay', 't', 's']),
 ('bawled', ['b', 'aw', 'l', 'd']),
 ('bear', ['b', 'eh', 'r']),
 ('beats', ['b', 'ee', 't', 's']),
 ('beets', ['b', 'ee', 't', 's']),
 ('bell', ['b', 'eh', 'l']),
 ('belle', ['b', 'eh', 'l']),
 ('berry', ['b', 'eh', 'r', 'ee']),
 ('billed', ['b', ['ih', ['ee']], 'l', 'd']),
 ('blew', ['b', 'l', 'oo']),
 ('blue', ['b', 'l', 'oo']),
 ('boar', ['b', ['aw', ['o']], 'r']),
 ('board', ['b', ['aw', ['o']], 'r

### Get possible (valid) Grapheme strings and probs for each KCP encoded homophone 
Note: 7 homophones not captured: 
- corps ['k', ['aw', ['o']], 'r'] -> Silent "PS"
- guessed: ['g', 'eh', 's', 't'] -> Grapheme 'GUE' as kcp 'g' but no mapping for Grapheme 'GU' as 'g' (silent U) 
- guest: ['g', 'eh', 's', 't']
- guise: ['g', 'ai', 'z']
- thai: ['t', 'ai'] -> KCP Symbol 'ai' not mapped to Grapheme 'AI' in Berndt's conditional probs
- thais: ['t', 'ai', 'z']
- weighed: ['w', 'ay', 'd'] -> Grapheme 'EIGH' as kcp 'ay' but not EIGH-E (silent E)

Possible Solution: 
- corps ['k', ['aw', ['o']], 'r']: add an 's' to the KCP encoding to capture the "PS" with conditional probability 1. This has the least influence on the probabilities and is comparable to the case of a silent 'H' at the beginning of a word. 
- guessed: ['g', 'eh', 's', 't'] add an 'e' --> 'gueessed' in order to get the 'eh' to 'E' mapping after the silent 'u' 
- guest: ['g', 'eh', 's', 't'] same fix as for guessed --> 'gueest' 
- guise: ['g', 'ai', 'z'] add an 'e' --> 'gueise' in order to get the hard 'g' for the 'G' with silent 'U'
- thai: ['t', 'ai'] add an 'e' at the end --> 'thaie', because 'AI-E' has a mapping for 'ai' 
- thais: ['t', 'ai', 'z'] same fix as for thai --> 'thaies' 
- weighed: ['w', 'ay', 'd'] --> drop the silent 'e' to get 'weighd' ['w', 'ay', 'd']

In [126]:
# For every (spelling, KCP transcription) pair, collect the candidate grapheme
# strings, their prior and conditional probabilities, and the word rests.
# The four results are indexed in parallel with `hom_kcp_word_tuples`.
(
    possible_grapheme_strings,
    possible_prior_probs,
    possible_cond_probs,
    word_rests,
) = word_pronunciation_predictibility.get_grapheme_string_with_conditional_prob_for_keyboard_phonetics(
    hom_kcp_word_tuples,
    phonem_graphem_prob_dict,
)

In [127]:
# homophones for which we have no valid grapheme string: 
counter = 0 
for i,word_pron in enumerate(hom_kcp_word_tuples):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(possible_grapheme_strings[i]) == 0 :
        counter+=1
        print(word,pron)
    else:
        empty_string = ['' != i for i in word_rests[i]]
        if np.sum(empty_string) == len(word_rests[i]):
            counter+=1
            print(word,pron)
    
print(counter)

corps ['k', ['aw', ['o']], 'r']
guessed ['g', 'eh', 's', 't']
guest ['g', 'eh', 's', 't']
guise ['g', 'ai', 'z']
thai ['t', 'ai']
thais ['t', 'ai', 'z']
weighed ['w', 'ay', 'd']
7


In [128]:
# Surrogate (spelling, KCP transcription) pairs for the homophones that could not
# be mapped above. The spellings are deliberately altered ('gueessed', 'thaie', ...)
# and 'corps' gets an extra 's' symbol so that a grapheme mapping can be found.
problematic_hom_kcp_word_tuples = [
    ('corps', ['k', ['aw', ['o']], 'r', 's']),
    ('gueessed',['g', 'eh', 's', 't']),
    ('gueest', ['g', 'eh', 's', 't']),
    ('gueise', ['g', 'ai', 'z']),
    ('thaie', ['t', 'ai']),
    ('thaies', ['t', 'ai', 'z']),
    ('weighd', ['w', 'ay', 'd'])
]

In [129]:
# Original spelling -> surrogate (spelling, KCP transcription) pair.
# The values repeat the entries of `problematic_hom_kcp_word_tuples`; keep both in sync.
problematic_hom_kcp_word_tuples_dict = {
    'corps': ('corps', ['k', ['aw', ['o']], 'r', 's']),
    'guessed' : ('gueessed',['g', 'eh', 's', 't']),
    'guest': ('gueest', ['g', 'eh', 's', 't']),
    'guise': ('gueise', ['g', 'ai', 'z']),
    'thai' :('thaie', ['t', 'ai']),
    'thais':('thaies', ['t', 'ai', 'z']),
    'weighed':('weighd', ['w', 'ay', 'd'])
}

In [130]:
# Repeat the grapheme-string search for the surrogate spellings only.
# The four results are indexed in parallel with `problematic_hom_kcp_word_tuples`.
(
    possible_grapheme_strings_problematic_homs,
    possible_prior_probs_problematic_homs,
    possible_cond_probs_problematic_homs,
    word_rests_problematic_homs,
) = word_pronunciation_predictibility.get_grapheme_string_with_conditional_prob_for_keyboard_phonetics(
    problematic_hom_kcp_word_tuples,
    phonem_graphem_prob_dict,
)

In [131]:
# Overwrite the results of the problematic homophones with the results computed
# for their surrogate spellings.
#
# The position `j` in the surrogate result lists is looked up explicitly: the
# surrogate pair registered for the word is located in
# `problematic_hom_kcp_word_tuples`, whose order the `*_problematic_homs` lists
# follow. A running counter would silently assign the wrong results as soon as a
# problematic word is missing from `hom_kcp_word_tuples` or the two collections
# are ordered differently.
for i, wp in enumerate(hom_kcp_word_tuples):
    word = wp[0]
    surrogate = problematic_hom_kcp_word_tuples_dict.get(word)
    if surrogate is None:
        continue  # regular homophone: keep the results already computed
    # Raises ValueError if the dict and the list have drifted apart.
    j = problematic_hom_kcp_word_tuples.index(surrogate)
    possible_grapheme_strings[i] = possible_grapheme_strings_problematic_homs[j]
    word_rests[i] = word_rests_problematic_homs[j]
    possible_prior_probs[i] = possible_prior_probs_problematic_homs[j]
    possible_cond_probs[i] = possible_cond_probs_problematic_homs[j]

In [132]:
# Reduce the candidates to the valid grapheme strings per homophone, together
# with their word rests and prior / conditional probabilities.
(
    valid_word_rests,
    valid_grapheme_strings,
    valid_prior_probs,
    valid_cond_probs,
) = word_pronunciation_predictibility.get_valid_grapheme_strings(
    hom_kcp_word_tuples,
    possible_grapheme_strings,
    word_rests,
    possible_prior_probs,
    possible_cond_probs,
)

In [133]:
# homophones for which we have no valid grapheme string: 
# Sanity check: list the homophones that still have no valid grapheme string.
counter = 0
for idx, (word, pron) in enumerate(hom_kcp_word_tuples):
    # word: spelling; pron: list of keyboard-compatible phonemic symbols
    if len(valid_grapheme_strings[idx]) != 0:
        continue
    counter += 1
    print(word, pron)
print(counter)

0


In [28]:
# Show the homophones with more than one valid grapheme string, together with
# the conditional probabilities of each alternative, and count them.
counter = 0
for idx, graphemes in enumerate(valid_grapheme_strings):
    if len(graphemes) <= 1:
        continue
    print(hom_kcp_word_tuples[idx])
    print(valid_cond_probs[idx])
    print(graphemes, "\n")
    counter += 1
print(counter)

('aides', ['ay', 'd', 'z'])
[[0.734 0.991 1.   ]
 [0.818 0.991 0.12 ]]
[['AI' 'D' 'ES']
 ['AI-E' 'D' 'S']] 

('allowed', [['ul', ['uh-', 'l']], 'au', 'd'])
[list([1.0, 1.0, 0.6659999999999999, 0.991])
 list([0.18600000000000003, 1.0, 1.0, 0.6659999999999999, 0.991])
 list([0.18600000000000003, 1.0, 0.6659999999999999, 0.991])]
[list(['AL', 'L', 'OW-E', 'D']) list(['A', 'L', 'L', 'OW-E', 'D'])
 list(['A', 'LL', 'OW-E', 'D'])] 

('aloud', [['ul', ['uh-', 'l']], 'au', 'd'])
[list([1.0, 0.324, 0.991]) list([0.18600000000000003, 1.0, 0.324, 0.991])]
[list(['AL', 'OU', 'D']) list(['A', 'L', 'OU', 'D'])] 

('beets', ['b', 'ee', 't', 's'])
[[1.    0.252 0.973 0.868]
 [1.    0.979 0.973 0.868]]
[['B' 'E-E' 'T' 'S']
 ['B' 'EE' 'T' 'S']] 

('billed', ['b', ['ih', ['ee']], 'l', 'd'])
[[1.    0.356 1.    0.991]
 [1.    0.046 1.    0.991]
 [1.    0.356 1.    1.   ]
 [1.    0.046 1.    1.   ]]
[['B' 'I-E' 'LL' 'D']
 ['B' 'I-E' 'LL' 'D']
 ['B' 'I-E' 'L' 'LD']
 ['B' 'I-E' 'L' 'LD']] 

('blue', ['b', 'l

In [20]:
# Maximum conditional probability per grapheme, taken from Berndt's table.
max_cond_prob_for_grapheme = (
    word_pronunciation_predictibility.get_max_cond_prob_for_grapheme(
        berndt_conditional_probs
    )
)

In [49]:
# Build the m-score table from the valid grapheme strings, their conditional
# probabilities, and the per-grapheme maxima.
m_score_data = word_pronunciation_predictibility.get_m_score_df(
    hom_kcp_word_tuples,
    valid_grapheme_strings,
    valid_cond_probs,
    max_cond_prob_for_grapheme,
)

# Merging Dataframes

In [119]:
# Development aid: re-import `merging_dataframes` so edits to the module on disk
# are picked up without restarting the kernel.
importlib.reload(merging_dataframes)

<module 'merging_dataframes' from '/Users/paule/Desktop/Gahls_Homophones_in_RedHen/merging_dataframes.py'>

### Merging eaf data

In [59]:
# Step 1 of the merge chain: add the EAF data to the homophone tokens.
homophones_in_data_celex_eaf = merging_dataframes.merge_eaf_df_to_homophone_data(
    homophones_in_data_celex_merged,
    eaf_data,
)

### Merging video data

In [71]:
# Step 2 of the merge chain: add the video data.
homophones_in_data_celex_eaf_video = merging_dataframes.merge_video_df_to_homophone_data(
    homophones_in_data_celex_eaf,
    video_data,
)

### Merging gentle data

In [82]:
# Step 3 of the merge chain: add the Gentle data.
homophones_in_data_celex_eaf_video_gentle = merging_dataframes.merge_gentle_df_to_homophone_data(
    homophones_in_data_celex_eaf_video,
    gentle_data,
)

### Merging seg data

In [86]:
# Step 4 of the merge chain: add the SEG data.
homophones_in_data_celex_eaf_video_gentle_seg = merging_dataframes.merge_seg_df_to_homophone_data(
    homophones_in_data_celex_eaf_video_gentle,
    seg_data,
)

### Merging m-scores data

In [89]:
# Step 5 of the merge chain: add the m-scores.
homophones_in_data_celex_eaf_video_gentle_seg_m_scores = merging_dataframes.merge_m_scores_df_to_homophone_data(
    homophones_in_data_celex_eaf_video_gentle_seg,
    m_score_data,
)

### Merging celex syllable counts data

In [120]:
# Step 6 of the merge chain: add the CELEX syllable counts.
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll = merging_dataframes.merge_celex_syl_counts_df_to_homophone_data(
    homophones_in_data_celex_eaf_video_gentle_seg_m_scores,
    celex_data,
)

In [99]:
# Columns after the m-score merge. Note that this inspects the frame *before*
# the syllable counts are added (the `_syll` frame).
homophones_in_data_celex_eaf_video_gentle_seg_m_scores.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gent

In [98]:
# Tokens whose SEG merge or Gentle merge (or both) was flagged "low-confidence".
low_confidence_homs = homophones_in_data_celex_eaf_video_gentle_seg_m_scores.loc[
    lambda frame: (frame["seg_merging"] == "low-confidence")
    | (frame["gentle_merging"] == "low-confidence")
]

In [106]:
# Compare the neighbouring words recorded in the word table, Gentle and SEG for
# the low-confidence tokens, next to the error and pause columns.
low_confidence_homs[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word',"seg_error",'eafgz_error','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,seg_error,eafgz_error,preceding_pause,subsequent_pause
19,see,let's,more,let's,if,us,if,no-error,no-error,False,False
22,time,the,been,the,they,the,they,no-error,no-error,False,False
28,time,this,,this,,this,,no-error,no-error,False,False
32,bail,million,new,million,new,million,ellen,no-error,no-error,True,True
33,new,bail,developments,bail,developments,ellen,developments,no-error,no-error,True,False
37,new,phone,allegations,phone,allegations,ellen,allegations,no-error,no-error,False,False
47,new,is,push,a,push,a,push,no-error,no-error,False,False
58,here,were,waiting,standing,waiting,standing,waiting,no-error,no-error,False,True
59,here,down,dmv,down,dmv,down,eileen,no-error,no-error,False,True
61,here,way,is,way,is,david,is,no-error,no-error,False,False
