In [1]:
import preprocessing
import pandas as pd
import numpy as np
import importlib
from english_contractions import ENGLISH_CONTRACTIONS
import merging_dataframes
import word_pronunciation_predictibility
import celex_files
import os
import timeit

In [2]:
celex_dict_file = "/mnt/shared/corpora/Celex/english/epw/epw.cd"
filename = "/mnt/Restricted/Corpora/RedHen/2016_all_words_no_audio.pickle"
hom_filename = "/mnt/Restricted/Corpora/RedHen/hom.csv"
berndt_character_coding_file = "/mnt/Restricted/Corpora/RedHen/phonetic_character_code_berndt1987.csv"
berndt_conditional_probs_file = "/mnt/Restricted/Corpora/RedHen/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv"

# Pickled RedHen Dataframe 
## Preprocessing:
- include pause information
- word duration
- word frequency
- length in letter
- contextual predictiaility given prev and next word

In [3]:
df = preprocessing.read_dataframe(filename, remove_pauses=True, remove_errors=True, preprocessing=True, drop_error_columns=False)

read dataframe from /mnt/Restricted/Corpora/RedHen/2016_all_words_no_audio.pickle
Preprocessing: extract pause information...
Remove pauses from data!
Preprocessing: apply word preprocessing...
Preprocessing: calculate word duration...
Preprocessing: calculate word frequency...
Preprocessing: extract context information...
Preprocessing: calculate length in letter...
Preprocessing: calculate contextual predictability...
(18864660, 25) RangeIndex(start=0, stop=18864660, step=1)


In [4]:
source_files = ["2016-12-17_1330_US_KCET_Asia_Insight", "2016-10-25_2300_US_KABC_Eyewitness_News_4PM"]

In [5]:
sub_df = df[df["source_file"].isin(source_files)].copy()

In [6]:
df.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next'],
      dtype='object')

# Gahls Homophones extracted from RedHen Dataframe
## Preprocessing:
- is_pair for indicating whether homophones found in data have a matching pair 
- is_max factor for indicating most frequent homophone of pair (if not a pair always 1)
- pronunciation given by celex encoding and unbounded disc encoding (celexPhon)
- add further celex information

In [4]:
homophones_in_data, gahls_homophones, gahls_homophones_missing_in_data = preprocessing.read_and_extract_homophones(hom_filename, df)

read Gahls Homophone data from /mnt/Restricted/Corpora/RedHen/hom.csv
406 out of 412 homophones found in Data:
Homophone Pairs found in Data: 200
Homophones without Pair:  ['flowers', 'holes', 'moose', 'naval', 'pairs', 'taught']
Missing homophones: ['flours' 'mousse' 'navel' 'pears' 'taut' 'wholes']


In [5]:
celex_dict = word_pronunciation_predictibility.get_english_phonology_from_celex(celex_dict_file)
homophones_in_data_celex_merged = merging_dataframes.get_celex_transcription(homophones_in_data, celex_dict)

In [6]:
#homophones_in_data_celex_merged.to_csv("2016_all_words_no_audio_homophones.csv")

In [3]:
homophones_in_data_celex_merged = pd.read_csv("2016_all_words_no_audio_homophones.csv", index_col = "Unnamed: 0")

In [4]:
homophones_in_data_celex_merged

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,cond_pred_next,has_pair,pron,celexPhon,pron_frequency,is_max,disc,clx,disc_no_bound,clx_no_bound
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.070000,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.017207,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.320000,38.540000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.660000,139.880000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.000264,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.530000,277.750000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.939999,415.159999,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.660000,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.150000,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.980000,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.000002,True,sp1d,sp1d,13,1,'sp1d,[speId],sp1d,speId
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.059999,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.001192,True,plVm,plVm,23,1,'plVm,[plVm],plVm,plVm


In [8]:
homophones_in_data.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max'],
      dtype='object')

In [9]:
homophones_in_data_celex_merged.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound'],
      dtype='object')

# Load additional information
- eaf files
- seg files
- gentle files

## EAF files
- information about present gestures

In [6]:
#eaf_data = preprocessing.get_additional_data_from_files(df, "eaf")

Load and extract information from eaf files...
Total files to laod and preprocess:  3647
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [7]:
#eaf_data.to_csv("2016_all_words_no_audio_eaf_gesture.csv")

In [7]:
eaf_data = pd.read_csv("2016_all_words_no_audio_eaf_gesture.csv",index_col = "Unnamed: 0")

  mask |= (ar1 == a)


In [8]:
eaf_data

Unnamed: 0,annotation,source_file,start,end,gesture,time_point,time_region_gesture,HeadMoving/MovingHorizontally,ShoulderMoving/NotWithHead,HeadMoving/MovingVertically,SpeakerOnScreen,ShoulderMoving/SlidingWindow,HandMoving,PersonOnScreen,none,ShoulderMoving/NoSlidingWindow,is_gesture
0,,2016-01-01_0100_US_KNBC_Channel_4_News,0,490.0,['PersonOnScreen'],"(0, 'PersonOnScreen', 'start', 'gesture')","(0, 2535)",0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,True
1,Klemack,2016-01-01_0100_US_KNBC_Channel_4_News,490,690.0,['PersonOnScreen'],"(490, 'Klemack', 'start', 'annotation')",,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,True
2,happened,2016-01-01_0100_US_KNBC_Channel_4_News,920,1340.0,['PersonOnScreen'],"(920, 'happened', 'start', 'annotation')",,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,True
3,to,2016-01-01_0100_US_KNBC_Channel_4_News,1340,1460.0,['PersonOnScreen'],"(1340, 'to', 'start', 'annotation')",,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,True
4,be,2016-01-01_0100_US_KNBC_Channel_4_News,1470,1730.0,['PersonOnScreen'],"(1470, 'be', 'start', 'annotation')",,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0,0.0,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
20250186,,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,3573807,3573873.0,['SpeakerOnScreen'],"(3573807, 'SpeakerOnScreen', 'start', 'gesture')","(3573807, 3577911)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,True
20250187,,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,3573873,3577911.0,"['SpeakerOnScreen', 'PersonOnScreen']","(3573873, 'PersonOnScreen', 'start', 'gesture')","(3573807, 3577911)",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0.0,True
20250188,,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,3579646,3579946.0,['SpeakerOnScreen'],"(3579646, 'SpeakerOnScreen', 'start', 'gesture')","(3579646, 3581381)",0.0,0.0,0.0,1.0,0.0,0.0,0.0,0,0.0,True
20250189,,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,3579946,3580347.0,"['SpeakerOnScreen', 'PersonOnScreen']","(3579946, 'PersonOnScreen', 'start', 'gesture')","(3579646, 3581381)",0.0,0.0,0.0,1.0,0.0,0.0,1.0,0,0.0,True


## SEG files
- information about Part Of Speech
- information about Phrase final marker

In [4]:
#seg_data = preprocessing.get_additional_data_from_files(df, "seg")

Load and extract information from seg files...
Total files to laod and preprocess:  3647
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [5]:
#seg_data.to_csv("2016_all_words_no_audio_seg_data.csv")

In [9]:
seg_data = pd.read_csv("2016_all_words_no_audio_seg_data.csv",index_col = "Unnamed: 0")

  mask |= (ar1 == a)


## GENTLE files
- information about Phrase final marker

In [4]:
gentle_data = preprocessing.get_additional_data_from_files(df, "gentle")

Load and extract information from gentle files...
Total files to laod and preprocess:  3647
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [5]:
#gentle_data.to_csv("2016_all_words_no_audio_gentle_data.csv")

In [10]:
gentle_data = pd.read_csv("2016_all_words_no_audio_gentle_data.csv",index_col = "Unnamed: 0")

In [12]:
gentle_data

Unnamed: 0,word,prev,next,end_of_sentence,start_of_sentence,preceding_marker,subsequent_marker,source_file,prev_word,next_word
0,Reporter,,John,False,False,False,False,2016-01-01_0100_US_KNBC_Channel_4_News,,John
1,John,Reporter,Cadiz,False,False,False,False,2016-01-01_0100_US_KNBC_Channel_4_News,Reporter,Cadiz
2,Cadiz,John,Klemack,False,False,False,False,2016-01-01_0100_US_KNBC_Channel_4_News,John,Klemack
3,Klemack,Cadiz,happened,False,False,False,False,2016-01-01_0100_US_KNBC_Channel_4_News,Cadiz,happened
4,happened,Klemack,to,False,False,False,False,2016-01-01_0100_US_KNBC_Channel_4_News,Klemack,to
...,...,...,...,...,...,...,...,...,...,...
21696119,story,the,all,False,False,False,False,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,the,all
21696120,all,story,evening,False,False,False,False,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,story,evening
21696121,evening,all,.,True,False,False,True,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,all,
21696122,.,evening,,False,False,False,False,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,,


## Video files
- information about entropy of situation in which the homophones was articulated

In [None]:
video_data = preprocessing.get_additional_data_from_files(homophones_in_data_celex_merged, "video") # only for homophones

Load and extract information from video files...
Total files to laod and preprocess:  3646
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200


In [None]:
#video_data.to_csv("2016_all_words_no_audio_video_data.csv")

In [11]:
video_data = pd.read_csv("2016_all_words_no_audio_video_data.csv",index_col = "Unnamed: 0")

In [13]:
video_data

Unnamed: 0,source_file,word,start,end,video_snippet_size
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.070000,12.280000,45944
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.320000,38.540000,47025
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.660000,139.880000,50005
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.530000,277.750000,53674
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.939999,415.159999,57177
...,...,...,...,...,...
530336,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,bare,2905.670000,2905.990000,182489
530337,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,steal,2145.440000,2145.770000,98687
530338,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,tract,2497.570000,2497.950000,147019
530339,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,heels,470.610000,471.000000,9981


## Celex 

In [22]:
celex_data = celex_files.get_syl_counts(celex_files.read_celex_file())

# Word Pronunciation Predictability (Berndt et al. 1987)

### Load Berndt's tables for Phoneme Equivalents and Conditional Probabilities for Grapheme-to-Phoneme Correspondences

In [4]:
berndt_character_code_df = pd.read_csv(berndt_character_coding_file, delimiter=";")
berndt_conditional_probs = pd.read_csv(berndt_conditional_probs_file,delimiter=";")

### APPABET to corresponding Keyboard Compatible Phonemic (KCP) symbol dict 

In [5]:
berndt_arpabet_phon_dict = word_pronunciation_predictibility.get_ARPABET_to_keyboard_phonetic_symbols_dict(berndt_character_code_df)

### KCP to Grapheme Symbols and Probabilities dict

In [6]:
phonem_graphem_prob_dict = word_pronunciation_predictibility.get_keyboard_phonetic_symbols_to_grapheme_cond_prob_dict(berndt_conditional_probs)

### Homophones with corresponding ARPABET transcription

In [8]:
unique_homophones = np.unique(homophones_in_data_celex_merged.word)
hom_arpabet_words = word_pronunciation_predictibility.get_ARPABET_phonetic_transcription(unique_homophones)

### Homophones with corresponding KCP transcription

In [9]:
hom_kcp_word_tuples = []
for i,arpabet_word in enumerate(hom_arpabet_words):
    kcp_word = word_pronunciation_predictibility.get_keyboard_phonetic_symbols_for_ARPABET(arpabet_word, berndt_arpabet_phon_dict)
    #print(unique_homophones[i],arpabet_word,kcp_word)
    hom_kcp_word_tuples.append((unique_homophones[i],kcp_word))

### Get possible (valid) Grapheme strings and probs for each KCP encoded homophone 
Note: 6 homophones not captured: 
- corps ['k', ['aw', ['o']], 'r'] -> Silent "PS"
- guessed: ['g', 'eh', 's', 't’] —> Grapheme 'GUE' as kcp 'g' but no mapping for Grapheme 'GU' as 'g' (silent U) 
- guest: ['g', 'eh', 's', ’t']
- guise: ['g', 'ai', 'z‘]
- thai: ['t', 'ai‘] —> KCP Symbol 'ai' not mapped to Grapheme 'AI' in Berndt'sconditional probs
- thais: ['t', 'ai', 'z']
- weighed: ['w', 'ay', 'd‘] —> Grapheme 'EIGH' as kcp 'ay' but not EIGH-E (silent E)

Possible Solution: 
- corps ['k', ['aw', ['o']], 'r'] add the mapping for grapheme "PS" to KCP ' ' to the conditional probability table to capture the silent "PS" with conditional probability 0.00001. In order to be a valid probability I changed the other possible outcome given "PS" --> 's' to have conditional prbabbility 0.99999. 
- guessed: ['g', 'eh', 's', 't’] add an 'e' --> 'gueessed' in order to get the 'eh' to 'E' mapping after the silent u 
- guest: ['g', 'eh', 's', 't’] 
- guise: ['g', 'ai', 'z‘] add an 'e' --> 'gueise' in order to get the hard 'g' for the 'G' with silent 'U'
- thai : ['t', 'ai‘] add an 'e' at the end because 'AI-E' has a mapping for 'ai' 
- thais: 
- weighed: ['w', 'ay', 'd‘] --> drop the silent 'e' to get 'weighd' ['w', 'ay', 'd‘]

In [10]:
possible_grapheme_strings, possible_prior_probs, possible_cond_probs, word_rests = word_pronunciation_predictibility.get_grapheme_string_with_conditional_prob_for_keyboard_phonetics(hom_kcp_word_tuples, 
phonem_graphem_prob_dict)

In [11]:
# homophones for which we have no valid grapheme string: 
counter = 0 
for i,word_pron in enumerate(hom_kcp_word_tuples):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(possible_grapheme_strings[i]) == 0 :
        counter+=1
        print(word,pron)
    else:
        empty_string = ['' != i for i in word_rests[i]]
        if np.sum(empty_string) == len(word_rests[i]):
            counter+=1
            print(word,pron)
    
print(counter)

corps ['k', ['aw', ['o']], 'r']
guessed ['g', 'eh', 's', 't']
guest ['g', 'eh', 's', 't']
guise ['g', 'ai', 'z']
thai ['t', 'ai']
thais ['t', 'ai', 'z']
weighed ['w', 'ay', 'd']
7


In [12]:
problematic_hom_kcp_word_tuples = [
    ('corps', ['k', ['aw', ['o']], 'r', ' ']),
    ('gueessed',['g', 'eh', 's', 't']),
    ('gueest', ['g', 'eh', 's', 't']),
    ('gueise', ['g', 'ai', 'z']),
    ('thaie', ['t', 'ai']),
    ('thaies', ['t', 'ai', 'z']),
    ('weighd', ['w', 'ay', 'd'])
]

In [13]:
problematic_hom_kcp_word_tuples_dict = {
    'corps': ('corps', ['k', ['aw', ['o']], 'r', ' ']),
    'guessed' : ('gueessed',['g', 'eh', 's', 't']),
    'guest': ('gueest', ['g', 'eh', 's', 't']),
    'guise': ('gueise', ['g', 'ai', 'z']),
    'thai' :('thaie', ['t', 'ai']),
    'thais':('thaies', ['t', 'ai', 'z']),
    'weighed':('weighd', ['w', 'ay', 'd'])
}

In [14]:
possible_grapheme_strings_problematic_homs, possible_prior_probs_problematic_homs, possible_cond_probs_problematic_homs, word_rests_problematic_homs = word_pronunciation_predictibility.get_grapheme_string_with_conditional_prob_for_keyboard_phonetics(problematic_hom_kcp_word_tuples, 
phonem_graphem_prob_dict)

In [15]:
j = 0
for i,wp in enumerate(hom_kcp_word_tuples):
    word = wp[0]
    phon = wp[1]
    if word in problematic_hom_kcp_word_tuples_dict:
        possible_grapheme_strings[i] = possible_grapheme_strings_problematic_homs[j]
        word_rests[i] = word_rests_problematic_homs[j]
        possible_prior_probs[i] = possible_prior_probs_problematic_homs[j]
        possible_cond_probs[i] = possible_cond_probs_problematic_homs[j]
        j+=1

In [16]:
valid_word_rests,valid_grapheme_strings,valid_prior_probs, valid_cond_probs = word_pronunciation_predictibility.get_valid_grapheme_strings(hom_kcp_word_tuples, possible_grapheme_strings, word_rests,possible_prior_probs, possible_cond_probs)

In [17]:
# homophones for which we have no valid grapheme string: 
counter = 0 
for i,word_pron in enumerate(hom_kcp_word_tuples):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(valid_grapheme_strings[i]) == 0:
        counter+=1
        print(word,pron)        
print(counter)

0


In [18]:
max_cond_prob_for_grapheme = word_pronunciation_predictibility.get_max_cond_prob_for_grapheme(berndt_conditional_probs)

In [19]:
m_score_data = word_pronunciation_predictibility.get_m_score_df(hom_kcp_word_tuples, valid_grapheme_strings,valid_cond_probs,max_cond_prob_for_grapheme)

In [20]:
m_score_data

Unnamed: 0,word,m_score
0,ad,1.000000
1,add,1.000000
2,adds,0.712750
3,ads,0.712750
4,aid,1.000000
...,...,...
401,wring,0.834497
402,wringing,0.900698
403,write,0.863000
404,writes,0.897250


# Merging Dataframes

In [34]:
homophones_in_data_celex_merged

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,cond_pred_next,has_pair,pron,celexPhon,pron_frequency,is_max,disc,clx,disc_no_bound,clx_no_bound
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.07,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.017207,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
1,2016-01-01_0100_US_KNBC_Channel_4_News,right,38.32,38.540000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
2,2016-01-01_0100_US_KNBC_Channel_4_News,right,139.66,139.880000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.000264,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
3,2016-01-01_0100_US_KNBC_Channel_4_News,right,277.53,277.750000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
4,2016-01-01_0100_US_KNBC_Channel_4_News,right,414.94,415.159999,0.22,high-confidence,no-error,no-error,no-error,no-error,...,0.188603,True,r2t,r2t,41655,1,'r2t,[raIt],r2t,raIt
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.66,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.15,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.000005,True,frANk,fr{Nk,515,1,'fr{Nk,[fr&Nk],fr{Nk,fr&Nk
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.98,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.000002,True,sp1d,sp1d,13,1,'sp1d,[speId],sp1d,speId
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.06,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.001192,True,plVm,plVm,23,1,'plVm,[plVm],plVm,plVm


### Merging eaf data

In [33]:
homophones_in_data_celex_eaf = merging_dataframes.merge_eaf_df_to_homophone_data(homophones_in_data_celex_merged, eaf_data)

In [36]:
len(homophones_in_data_celex_eaf)

530341

### Merging video data

In [37]:
homophones_in_data_celex_eaf_video = merging_dataframes.merge_video_df_to_homophone_data(homophones_in_data_celex_eaf, video_data)

In [38]:
len(homophones_in_data_celex_eaf_video)

530341

### Merging gentle data

In [43]:
importlib.reload(merging_dataframes)

<module 'merging_dataframes' from '/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/merging_dataframes.py'>

In [44]:
homophones_in_data_celex_eaf_video_gentle = merging_dataframes.merge_gentle_df_to_homophone_data(homophones_in_data_celex_eaf_video, gentle_data)

Merge gentle data for 3646 unique files!
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [45]:
len(homophones_in_data_celex_eaf_video_gentle)

530341

### Merging seg data

In [46]:
homophones_in_data_celex_eaf_video_gentle_seg = merging_dataframes.merge_seg_df_to_homophone_data(homophones_in_data_celex_eaf_video_gentle, seg_data)

Merge seg data for 3646 unique files!
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [None]:
len(homophones_in_data_celex_eaf_video_gentle_seg)

### Merging m-scores data

In [47]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores = merging_dataframes.merge_m_scores_df_to_homophone_data(homophones_in_data_celex_eaf_video_gentle_seg,m_score_data)

### Merging celex syllable counts data

In [48]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll = merging_dataframes.merge_celex_syl_counts_df_to_homophone_data(homophones_in_data_celex_eaf_video_gentle_seg_m_scores,celex_data)

In [50]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gent

### Merging Model Predictions for conditional predictibality of Homophones 

In [13]:
model_predictions = pd.read_csv("conditional_prob_of_homophones_model_predictions.csv",index_col = "Unnamed: 0")

In [14]:
model_predictions

Unnamed: 0,prev_word,homophone,next_word,prev_prob,next_prob
0,is,made,possible,1.015943e-02,1.604936e-02
1,is,made,possible,1.015943e-02,1.604936e-02
2,have,made,it,1.084759e-02,4.488743e-03
3,been,made,aware,1.313667e-02,1.515031e-03
4,been,made,now,1.313667e-02,5.712711e-04
...,...,...,...,...,...
530336,storm,cellars,underground,6.491179e-07,2.672024e-07
530337,this,bale,of,2.374624e-02,3.302273e-02
530338,this,bale,t,2.374624e-02,1.209081e-04
530339,this,bale,of,2.374624e-02,3.302273e-02


In [63]:
model_predictions.drop_duplicates(subset = ["prev_word", "homophone", "next_word"], inplace = True)
model_predictions.rename(columns= {"homophone": "word", "prev_prob": "prob|prev_model", "next_prob" : "prob|next_model"}, inplace=True)

In [64]:
model_predictions

Unnamed: 0,prev_word,word,next_word,prob|prev_model,prob|next_model
0,is,made,possible,1.015943e-02,1.604936e-02
2,have,made,it,1.084759e-02,4.488743e-03
3,been,made,aware,1.313667e-02,1.515031e-03
4,been,made,now,1.313667e-02,5.712711e-04
5,what,made,them,2.493435e-03,1.142306e-02
...,...,...,...,...,...
530334,ingredient,kneading,the,1.242674e-02,1.660406e-02
530335,sugar,beets,grown,5.856671e-04,8.358862e-07
530336,storm,cellars,underground,6.491179e-07,2.672024e-07
530337,this,bale,of,2.374624e-02,3.302273e-02


In [65]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions = homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll.merge(model_predictions, on = ["prev_word", "word", "next_word"])

In [66]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,seg_merging,seg_index,pos,rel1,rel2,lemma,m_score,SylCnt,prob|prev_model,prob|next_model
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.07,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,47.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
1,2016-02-08_0000_US_FOX-News_Fox_Report,right,1230.72,1230.920000,0.20,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,2244770.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
2,2016-02-24_2000_US_FOX-News_Shephard_Smith_Rep...,right,2170.14,2170.479999,0.34,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,3174746.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
3,2016-03-15_1900_US_CNN_Newsroom,right,171.51,171.800000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,4187674.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
4,2016-03-15_1900_US_CNN_Newsroom,right,2611.58,2611.840000,0.26,high-confidence,no-error,no-error,no-error,no-error,...,,,,,,,1.000000,1.0,2.993142e-04,2.300740e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.66,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,high-confidence,15556856.0,NN,I-NP,I-PNP,franc,0.806618,1.0,1.286732e-02,2.851425e-02
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.15,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,18199726.0,NN,I-NP,I-PNP,franc,0.806618,1.0,4.299633e-02,1.658379e-02
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.98,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,19202327.0,VBN,I-VP,O,spay,1.000000,1.0,2.965835e-02,4.454051e-02
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.06,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,19840499.0,NN,I-NP,O,plumb,1.000000,1.0,4.565296e-07,7.513650e-07


In [67]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gent

In [68]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions.rename(columns= {"cond_pred_prev": "prob|prev_text", "cond_pred_next" : "prob|next_text"}, inplace=True)

In [69]:
homophones_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'prob|prev_text', 'prob|next_text', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gent

In [71]:
#homophones_in_data_celex_eaf_video_gentle_seg_m_scores_model_predictions.to_csv("2016_all_words_no_audio_preprocessed.csv")

## Preprocessed Data

In [4]:
preprocessed_df = pd.read_csv("2016_all_words_no_audio_preprocessed.csv",index_col = "Unnamed: 0")

In [4]:
preprocessed_df

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,seg_merging,seg_index,pos,rel1,rel2,lemma,m_score,SylCnt,prob|prev_model,prob|next_model
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.07,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,47.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
1,2016-02-08_0000_US_FOX-News_Fox_Report,right,1230.72,1230.920000,0.20,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,2244770.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
2,2016-02-24_2000_US_FOX-News_Shephard_Smith_Rep...,right,2170.14,2170.479999,0.34,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,3174746.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
3,2016-03-15_1900_US_CNN_Newsroom,right,171.51,171.800000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,low-confidence,4187674.0,JJ,I-ADJP,O,right,1.000000,1.0,2.993142e-04,2.300740e-03
4,2016-03-15_1900_US_CNN_Newsroom,right,2611.58,2611.840000,0.26,high-confidence,no-error,no-error,no-error,no-error,...,,,,,,,1.000000,1.0,2.993142e-04,2.300740e-03
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.66,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,high-confidence,15556856.0,NN,I-NP,I-PNP,franc,0.806618,1.0,1.286732e-02,2.851425e-02
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.15,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,18199726.0,NN,I-NP,I-PNP,franc,0.806618,1.0,4.299633e-02,1.658379e-02
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.98,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,19202327.0,VBN,I-VP,O,spay,1.000000,1.0,2.965835e-02,4.454051e-02
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.06,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,19840499.0,NN,I-NP,O,plumb,1.000000,1.0,4.565296e-07,7.513650e-07


In [5]:
len(preprocessed_df.word.unique())

406

# Homophones merged with low confidence 

In [11]:
low_confidence_homs = preprocessed_df[np.logical_or(preprocessed_df.seg_merging == "low-confidence",preprocessed_df.gentle_merging == "low-confidence")]

In [12]:
low_confidence_homs[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word',"seg_error",'eafgz_error','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,seg_error,eafgz_error,preceding_pause,subsequent_pause
0,right,that's,there,that's,there,'s,there,no-error,no-error,False,False
1,right,that's,there,that's,there,'s,there,no-error,no-error,False,False
2,right,that's,there,that's,there,'s,there,no-error,no-error,False,False
3,right,that's,there,that's,99,'s,there,no-error,no-error,False,False
5,right,that's,there,that's,99,'s,there,no-error,no-error,False,False
...,...,...,...,...,...,...,...,...,...,...,...
530294,hertz,67,global,19.67,global,19.67,global,no-error,no-error,True,False
530300,hertz,77,dropped,35.77,dropped,35.77,dropped,no-error,no-error,True,False
530321,chutes,emergency,300,emergency,and,emergency,and,no-error,no-error,False,True
530322,chutes,emergency,but,emergency,300,emergency,300,no-error,no-error,False,True


In [13]:
#subset = ["word", "prev_word", "next_word", "gentle_prev_word", "gentle_next_word", "seg_prev_word", "seg_next_word"]
low_confidence_homs.drop_duplicates()[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
0,right,that's,there,that's,there,'s,there,False,False
1,right,that's,there,that's,there,'s,there,False,False
2,right,that's,there,that's,there,'s,there,False,False
3,right,that's,there,that's,99,'s,there,False,False
5,right,that's,there,that's,99,'s,there,False,False
...,...,...,...,...,...,...,...,...,...
530294,hertz,67,global,19.67,global,19.67,global,True,False
530300,hertz,77,dropped,35.77,dropped,35.77,dropped,True,False
530321,chutes,emergency,300,emergency,and,emergency,and,False,True
530322,chutes,emergency,but,emergency,300,emergency,300,False,True


## Seg Data

In [17]:
low_confidence_homs_seg = preprocessed_df[preprocessed_df.seg_merging == "low-confidence"][["word", 'prev_word', 'next_word', 'seg_prev_word', 'seg_next_word','preceding_pause','subsequent_pause']]
#low_confidence_homs_seg.drop_duplicates(subset = ["word", "prev_word", "next_word", "gentle_prev_word", "gentle_next_word", "seg_prev_word", "seg_next_word"])[["word", 'prev_word', 'next_word', 'seg_prev_word', 'seg_next_word','preceding_pause','subsequent_pause'], inplace = True]
low_confidence_homs_seg

Unnamed: 0,word,prev_word,next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
0,right,that's,there,'s,there,False,False
1,right,that's,there,'s,there,False,False
2,right,that's,there,'s,there,False,False
3,right,that's,there,'s,there,False,False
5,right,that's,there,'s,there,False,False
...,...,...,...,...,...,...,...
530294,hertz,67,global,19.67,global,True,False
530300,hertz,77,dropped,35.77,dropped,True,False
530321,chutes,emergency,300,emergency,and,False,True
530322,chutes,emergency,but,emergency,300,False,True


## Gentle Data

In [19]:
low_confidence_homs_gentle = preprocessed_df[preprocessed_df.gentle_merging == "low-confidence"][["word", 'prev_word', 'next_word',"gentle_prev_word", "gentle_next_word" ,'preceding_pause','subsequent_pause']]
#low_confidence_homs_gentle.drop_duplicates(subset = ["word", "prev_word", "next_word", "gentle_prev_word", "gentle_next_word"])[["word", 'prev_word', 'next_word',"gentle_prev_word", "gentle_next_word" ,'preceding_pause','subsequent_pause']]
low_confidence_homs_gentle

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,preceding_pause,subsequent_pause
3,right,that's,there,that's,99,False,False
5,right,that's,there,that's,99,False,False
35,right,and,now,happening,now,False,False
36,right,and,now,fames,now,False,False
42,right,and,now,60s,now,False,False
...,...,...,...,...,...,...,...
530253,flea,advil,can,advil,bites,False,False
530294,hertz,67,global,19.67,global,True,False
530300,hertz,77,dropped,35.77,dropped,True,False
530321,chutes,emergency,300,emergency,and,False,True


# Find Substrings
Because of differences in splitting it might happen that only a subpart of the word is listed in prev-/ next-word columns. Check wether the previous word column ends with the same string and check wether the next column starts with the same string.

## Seg Data

### Prev word

In [53]:
low_confidence_homs_seg_because_prev_word = low_confidence_homs_seg[low_confidence_homs_seg.seg_prev_word != low_confidence_homs_seg.prev_word]
print(len(low_confidence_homs_seg_because_prev_word))
matched_prev_word_substring_seg = low_confidence_homs_seg_because_prev_word.apply(lambda row: str(row.seg_prev_word).endswith(str(row.prev_word)) or str(row.prev_word).endswith(str(row.seg_prev_word)), axis = 1)
low_confidence_homs_seg_because_prev_word[matched_prev_word_substring_seg]
#low_confidence_homs_seg[low_confidence_homs_seg.apply(lambda row: str(row.seg_prev_word) in str(row.prev_word) or str(row.prev_word) in str(row.seg_prev_word), axis = 1)]

25382


Unnamed: 0,word,prev_word,next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
0,right,that's,there,'s,there,False,False
1,right,that's,there,'s,there,False,False
2,right,that's,there,'s,there,False,False
3,right,that's,there,'s,there,False,False
5,right,that's,there,'s,there,False,False
...,...,...,...,...,...,...,...
529773,piers,cnn's,morgan,'s,morgan,False,False
529774,piers,cnn's,morgan,'s,morgan,False,False
530256,flea,barrymore's,market,'s,market,False,False
530294,hertz,67,global,19.67,global,True,False


### Next word

In [54]:
low_confidence_homs_seg_because_next_word = low_confidence_homs_seg[low_confidence_homs_seg.seg_next_word != low_confidence_homs_seg.next_word]
print(len(low_confidence_homs_seg_because_next_word))
matched_next_word_substring_seg = low_confidence_homs_seg_because_next_word.apply(lambda row: str(row.seg_next_word).startswith(str(row.next_word)) or str(row.next_word).startswith(str(row.seg_next_word)), axis = 1)
low_confidence_homs_seg_because_next_word[matched_next_word_substring_seg]

22608


Unnamed: 0,word,prev_word,next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
816,right,be,back,be,back.ay,False,False
1135,right,be,back,be,back.k,False,False
1176,right,be,back,be,back.,False,False
1190,right,be,back,be,back.ight,False,False
1356,right,be,back,be,backe,False,False
...,...,...,...,...,...,...,...
526910,tail,the,r,the,r.,False,False
527018,tail,the,,the,,False,False
528207,souls,people's,we're,'s,we,False,False
528362,bore,full,1,full,1.4,False,False


## Gentle Data

### Prev word

In [55]:
low_confidence_homs_gentle_because_prev_word = low_confidence_homs_gentle[low_confidence_homs_gentle.gentle_prev_word != low_confidence_homs_gentle.prev_word]
print(len(low_confidence_homs_gentle_because_prev_word))
matched_prev_word_substring_gentle = low_confidence_homs_gentle_because_prev_word.apply(lambda row: str(row.gentle_prev_word).endswith(str(row.prev_word)) or str(row.prev_word).endswith(str(row.gentle_prev_word)), axis = 1)
low_confidence_homs_gentle_because_prev_word[matched_prev_word_substring_gentle]
#low_confidence_homs_seg[low_confidence_homs_seg.apply(lambda row: str(row.seg_prev_word) in str(row.prev_word) or str(row.prev_word) in str(row.seg_prev_word), axis = 1)]

14164


Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,preceding_pause,subsequent_pause
1633,right,is,now,this,now,False,False
3387,right,eliquis,for,is,for,False,False
10114,right,on,now,trillion,now,True,False
12227,right,com,behind,coveredca.com,behind,False,False
12298,right,00,here,5:00,here,False,False
...,...,...,...,...,...,...,...
529618,lumber,91,liquidators,17.91,liquidators,False,False
529638,lumber,82,liquidators,18.82,liquidators,False,False
529684,bored,syrian,or,jordanian/syrian,or,False,True
530294,hertz,67,global,19.67,global,True,False


### Next word

In [56]:
low_confidence_homs_gentle_because_next_word = low_confidence_homs_gentle[low_confidence_homs_gentle.gentle_next_word != low_confidence_homs_gentle.next_word]
print(len(low_confidence_homs_gentle_because_next_word))
matched_next_word_substring_gentle = low_confidence_homs_gentle_because_next_word.apply(lambda row: str(row.gentle_next_word).startswith(str(row.next_word)) or str(row.next_word).startswith(str(row.gentle_next_word)), axis = 1)
low_confidence_homs_gentle_because_next_word[matched_next_word_substring_gentle]

16300


Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,preceding_pause,subsequent_pause
381,right,but,now,hughesnet,now's,False,False
816,right,be,back,be,back.ay,False,False
1135,right,be,back,be,back.k,False,False
1190,right,be,back,be,back.ight,False,False
1356,right,be,back,be,backe,False,False
...,...,...,...,...,...,...,...
525912,wit,118,,118,,False,False
525933,wit,in,,in,,True,False
527018,tail,the,,the,,False,False
528362,bore,full,1,full,1.4,False,False


## Change merging confidence to medium

In [61]:
low_confidence_homs["matched_prev_word_substring_seg"] = np.nan
low_confidence_homs["matched_next_word_substring_seg"] = np.nan
low_confidence_homs["matched_prev_word_substring_gentle"] = np.nan
low_confidence_homs["matched_next_word_substring_gentle"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_prev_word_substring_seg"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_next_word_substring_seg"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_prev_word_substring_gentle"] = np.nan
A value is

In [62]:
low_confidence_homs

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'prob|prev_text', 'prob|next_text', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gent

In [88]:
low_confidence_homs.loc[matched_prev_word_substring_seg.index,"matched_prev_word_substring_seg"] = matched_prev_word_substring_seg.replace(False,-1)
low_confidence_homs.loc[matched_next_word_substring_seg.index,"matched_next_word_substring_seg"] = matched_next_word_substring_seg.replace(False,-1)
low_confidence_homs.loc[matched_prev_word_substring_gentle.index,"matched_prev_word_substring_gentle"] = matched_prev_word_substring_gentle.replace(False,-1)
low_confidence_homs.loc[matched_next_word_substring_gentle.index,"matched_next_word_substring_gentle"] = matched_next_word_substring_gentle.replace(False,-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [90]:
low_confidence_homs["matched_prev_word_substring_seg"] = low_confidence_homs["matched_prev_word_substring_seg"].replace(np.nan,0)
low_confidence_homs["matched_next_word_substring_seg"] = low_confidence_homs["matched_next_word_substring_seg"].replace(np.nan,0)
low_confidence_homs["matched_prev_word_substring_gentle"] = low_confidence_homs["matched_prev_word_substring_gentle"].replace(np.nan,0)
low_confidence_homs["matched_next_word_substring_gentle"] = low_confidence_homs["matched_next_word_substring_gentle"].replace(np.nan,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_prev_word_substring_seg"] = low_confidence_homs["matched_prev_word_substring_seg"].replace(np.nan,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_next_word_substring_seg"] = low_confidence_homs["matched_next_word_substring_seg"].replace(np.nan,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [91]:
low_confidence_homs[["matched_prev_word_substring_seg","matched_next_word_substring_seg","matched_prev_word_substring_gentle", "matched_next_word_substring_gentle"]]

Unnamed: 0,matched_prev_word_substring_seg,matched_next_word_substring_seg,matched_prev_word_substring_gentle,matched_next_word_substring_gentle
0,1.0,0.0,0.0,0.0
1,1.0,0.0,0.0,0.0
2,1.0,0.0,0.0,0.0
3,1.0,0.0,0.0,-1.0
5,1.0,0.0,0.0,-1.0
...,...,...,...,...
530294,1.0,0.0,1.0,0.0
530300,1.0,0.0,1.0,0.0
530321,0.0,-1.0,0.0,-1.0
530322,0.0,-1.0,0.0,-1.0


In [97]:
medium_confidence_seg = low_confidence_homs[low_confidence_homs["matched_prev_word_substring_seg"] + low_confidence_homs["matched_next_word_substring_seg"] > 0].index

In [99]:
medium_confidence_gentle = low_confidence_homs[low_confidence_homs["matched_prev_word_substring_gentle"] + low_confidence_homs["matched_next_word_substring_gentle"] > 0].index

In [100]:
preprocessed_df.loc[medium_confidence_seg,"seg_merging"] = "medium-confidence"
preprocessed_df.loc[medium_confidence_gentle,"gentle_merging"] = "medium-confidence"

# Remaining Low confidence homophones

In [72]:
preprocessed_df = pd.read_csv("2016_all_words_no_audio_preprocessed.csv",index_col = "Unnamed: 0")

In [76]:
remaining_low_confidence_homs = preprocessed_df[np.logical_or(preprocessed_df.seg_merging == "low-confidence",preprocessed_df.gentle_merging == "low-confidence")]

In [77]:
remaining_low_confidence_homs[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word',"seg_error",'eafgz_error','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,seg_error,eafgz_error,preceding_pause,subsequent_pause
3,right,that's,there,that's,99,'s,there,no-error,no-error,False,False
5,right,that's,there,that's,99,'s,there,no-error,no-error,False,False
35,right,and,now,happening,now,happening,now,no-error,no-error,False,False
36,right,and,now,fames,now,fames,now,no-error,no-error,False,False
42,right,and,now,60s,now,s,now,no-error,no-error,False,False
...,...,...,...,...,...,...,...,...,...,...,...
530254,flea,insurance,bites,insurance,bites,),bites,no-error,no-error,False,False
530255,flea,you,bites,you,bites,),bites,no-error,no-error,False,False
530321,chutes,emergency,300,emergency,and,emergency,and,no-error,no-error,False,True
530322,chutes,emergency,but,emergency,300,emergency,300,no-error,no-error,False,True


In [78]:
remaining_low_confidence_homs.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'prob|prev_text', 'prob|next_text', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gent

In [79]:
remaining_low_confidence_homs[["preceding_pause", "subsequent_pause", "gentle_start_of_sentence", "gentle_end_of_sentence", "seg_start_of_sentence", "seg_end_of_sentence","seg_preceding_marker",
       "seg_subsequent_marker", "gentle_preceding_marker","gentle_subsequent_marker"]]

Unnamed: 0,preceding_pause,subsequent_pause,gentle_start_of_sentence,gentle_end_of_sentence,seg_start_of_sentence,seg_end_of_sentence,seg_preceding_marker,seg_subsequent_marker,gentle_preceding_marker,gentle_subsequent_marker
3,False,False,False,False,False,False,False,True,False,True
5,False,False,False,False,False,False,False,True,False,True
35,False,False,False,False,False,False,False,False,False,False
36,False,False,False,False,False,False,False,False,False,False
42,False,False,True,False,True,False,True,False,True,False
...,...,...,...,...,...,...,...,...,...,...
530254,False,False,True,False,False,False,False,False,True,False
530255,False,False,True,False,False,False,False,False,True,False
530321,False,True,False,True,False,True,False,True,False,True
530322,False,True,False,True,False,True,False,True,False,True


In [69]:
not_index = preprocessed_df.index.isin(remaining_low_confidence_homs.index)
high_confidence_homs = preprocessed_df[~not_index]

In [80]:
print("High Confidence rows: ", len(high_confidence_homs))
print("Low Confidence rows: ", len(remaining_low_confidence_homs))
print("------------------------------------------------")
print(len(preprocessed_df), "Dataframe rows") 
print(len(remaining_low_confidence_homs)/len(preprocessed_df) * 100, "%")

High Confidence rows:  494139
Low Confidence rows:  36968
------------------------------------------------
530341 Dataframe rows
6.970609475790105 %


# Ambiguous Pause and Marker Information although high/medium confidence

# Find matching pauses and markers 
From gentle and seg files we get information about wether the homophone is the beginnning or ending of a sentence  and wether it is preceding or subsequent to a marker.
We want to check whether we get consistent information from both files (also as an indicator for the right merging).

In [81]:
high_confidence_homs

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,seg_merging,seg_index,pos,rel1,rel2,lemma,SylCnt,prob|prev_model,prob|next_model,m_score
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.07,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,medium-confidence,47.0,JJ,I-ADJP,O,right,1.0,2.993142e-04,2.300740e-03,1.000000
1,2016-02-08_0000_US_FOX-News_Fox_Report,right,1230.72,1230.920000,0.20,high-confidence,no-error,no-error,no-error,no-error,...,medium-confidence,2244770.0,JJ,I-ADJP,O,right,1.0,2.993142e-04,2.300740e-03,1.000000
2,2016-02-24_2000_US_FOX-News_Shephard_Smith_Rep...,right,2170.14,2170.479999,0.34,high-confidence,no-error,no-error,no-error,no-error,...,medium-confidence,3174746.0,JJ,I-ADJP,O,right,1.0,2.993142e-04,2.300740e-03,1.000000
3,2016-03-15_1900_US_CNN_Newsroom,right,171.51,171.800000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,medium-confidence,4187674.0,JJ,I-ADJP,O,right,1.0,2.993142e-04,2.300740e-03,1.000000
4,2016-03-15_1900_US_CNN_Newsroom,right,2611.58,2611.840000,0.26,high-confidence,no-error,no-error,no-error,no-error,...,,,,,,,1.0,2.993142e-04,2.300740e-03,1.000000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.66,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,high-confidence,15556856.0,NN,I-NP,I-PNP,franc,1.0,1.286732e-02,2.851425e-02,0.806618
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.15,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,18199726.0,NN,I-NP,I-PNP,franc,1.0,4.299633e-02,1.658379e-02,0.806618
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.98,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,19202327.0,VBN,I-VP,O,spay,1.0,2.965835e-02,4.454051e-02,1.000000
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.06,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,high-confidence,19840499.0,NN,I-NP,O,plumb,1.0,4.565296e-07,7.513650e-07,1.000000


In [52]:
matched_pauses_seg = high_confidence_homs.apply(lambda row: (row.preceding_pause == row.seg_start_of_sentence or 
                                                         row.preceding_pause == row.seg_preceding_marker) and 
                                                         (row.subsequent_pause == row.seg_end_of_sentence or 
                                                         row.subsequent_pause == row.seg_subsequent_marker), axis = 1)
matched_pauses_gentle = high_confidence_homs.apply(lambda row: (row.preceding_pause == row.gentle_start_of_sentence or 
                                                         row.preceding_pause == row.gentle_preceding_marker) and 
                                                         (row.subsequent_pause == row.gentle_end_of_sentence or 
                                                         row.subsequent_pause == row.gentle_subsequent_marker), axis = 1)

In [85]:
matched_seg_gentle_marker = high_confidence_homs.apply(lambda row: (row.seg_start_of_sentence == row.gentle_start_of_sentence and 
                                                         row.seg_preceding_marker == row.gentle_preceding_marker and 
                                                         row.seg_end_of_sentence == row.gentle_end_of_sentence and 
                                                         row.seg_subsequent_marker == row.gentle_subsequent_marker), axis = 1)

In [92]:
print("High Confidence rows: ", len(high_confidence_homs))
print("Low Confidence rows: ", len(remaining_low_confidence_homs))
print("High Confidence Consistent Marker rows: ", len(matched_seg_gentle_marker[matched_seg_gentle_marker]))
print("------------------------------------------------")
print(len(matched_seg_gentle_marker) - np.sum(matched_seg_gentle_marker), "rows with inconsisten information...")
print(len(preprocessed_df), "Dataframe rows") 
print((len(matched_seg_gentle_marker) - np.sum(matched_seg_gentle_marker))/len(high_confidence_homs) * 100, "% High Confidence inconsisten marker...")
print((len(matched_seg_gentle_marker) - np.sum(matched_seg_gentle_marker) + len(remaining_low_confidence_homs))/len(preprocessed_df) * 100,"% Homophones with wrong or inconsistent context information...")

High Confidence rows:  494139
Low Confidence rows:  36968
High Confidence Consistent Marker rows:  480177
------------------------------------------------
13962 rows with inconsisten information...
530341 Dataframe rows
2.82552075428169 % High Confidence inconsisten marker...
9.60325526406595 % Homophones with wrong or inconsistent context information...


In [30]:
BASE = '/mnt/Restricted/Corpora/RedHen'
DATA_FOLDER = os.path.join(BASE, 'original')
DF_SOURCE_PATH = os.path.join(BASE, '2016_all_words_no_audio.pickle')
DF_HOMEOHONES_PATH = os.path.join(BASE, 'homophone_analysis_scripts/2016_all_words_no_audio_preprocessed_speaking_rate_heuristic.csv')
celex_dict_file = "/mnt/shared/corpora/Celex/english/epw/epw.cd"

#STRETCHES_PATH = BASE + '/homophone_analysis_scripts'
STRETCHES_PATH = ''#'/mnt/shared/people/elnaz/homophones/10sec_stretch/'

NS = '<non-speech>'

In [31]:
df_hom_speaking_rate = pd.read_csv(DF_HOMEOHONES_PATH, index_col = "Unnamed: 0")

In [13]:
#df_hom = pd.read_csv(os.path.join(BASE, 'homophone_analysis_scripts/2016_all_words_no_audio_preprocessed.csv'), index_col = "idx1")

In [32]:
df_hom_speaking_rate

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,ShoulderMoving/NoSlidingWindow,none,ShoulderMoving/SlidingWindow,is_gesture,speaking_rate_prev_heuristic,speaking_rate_next_heuristic,prev_stretch_duration_heuristic,next_stretch_duration_heuristic,prev_stretch_syl_count_heuristic,next_stretch_syl_count_heuristic
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.07,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,5.278884,4.083665,10.040000,10.040000,53.0,41.0
1,2016-02-08_0000_US_FOX-News_Fox_Report,right,1230.72,1230.920000,0.20,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,2.222222,4.232283,0.450000,10.160000,1.0,43.0
2,2016-02-24_2000_US_FOX-News_Shephard_Smith_Rep...,right,2170.14,2170.479999,0.34,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,1.538464,3.872217,0.649999,10.330000,1.0,40.0
3,2016-03-15_1900_US_CNN_Newsroom,right,171.51,171.800000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,3.448276,1.818182,0.290000,1.100000,1.0,2.0
4,2016-03-15_1900_US_CNN_Newsroom,right,2611.58,2611.840000,0.26,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,4.365079,3.584229,7.560000,8.370000,33.0,30.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.66,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,,,,,3.529412,1.149425,0.850000,9.570000,3.0,11.0
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.15,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,1.0,True,4.871060,2.371542,6.980000,10.120000,34.0,24.0
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.98,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,1.650165,3.891051,6.060000,10.280000,10.0,40.0
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.06,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,0.0,True,5.641026,2.197805,9.749999,0.909999,55.0,2.0


In [33]:
df_hom_speaking_rate.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'prob|prev_text', 'prob|next_text', 'has_pair', 'pron', 'celexPhon',
       'pron_frequency', 'is_max', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'video_snippet_size', 'gentle_prev_word',
       'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gentle_preceding_marker',
       'gentle_subsequent_marker', 'gentle_merging', 'gentle_index', 'SylCnt',
       'prob|prev_model', 'prob|next_model', 'm_score', 'gesture',
       'HandMoving', 'PersonOnScreen', 'SpeakerOnScreen',
       'HeadMoving/MovingVertically', 'Shoulde

In [146]:
import importlib
importlib.reload(celex_files)

<module 'celex_files' from '/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/celex_files.py'>

In [147]:
noun_quotient = celex_files.main("/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/hom.csv", "spell")

In [143]:
noun_quotient

Unnamed: 0,word,NQuot
0,gym,1.000000e+00
1,jim,1.000000e+00
2,aide,1.000000e+00
3,aid,9.998861e-09
4,aides,1.000000e+00
...,...,...
301,wax,9.966446e-05
302,week,1.000000e+00
303,whit,1.000000e+00
304,wit,9.950522e-02


In [156]:
nquot[nquot.word == "franc"]

Unnamed: 0,word,NQuot
78,franc,1.0


In [150]:
nquot = pd.read_pickle("/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/Nquot.pickle")

In [151]:
df_hom_speaking_rate_nquote = pd.merge(df_hom_speaking_rate,nquot, how="left", on='word')

In [152]:
df_hom_speaking_rate_nquote

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,none,ShoulderMoving/SlidingWindow,is_gesture,speaking_rate_prev_heuristic,speaking_rate_next_heuristic,prev_stretch_duration_heuristic,next_stretch_duration_heuristic,prev_stretch_syl_count_heuristic,next_stretch_syl_count_heuristic,NQuot
0,2016-01-01_0100_US_KNBC_Channel_4_News,right,12.07,12.280000,0.21,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,5.278884,4.083665,10.040000,10.040000,53.0,41.0,0.139996
1,2016-02-08_0000_US_FOX-News_Fox_Report,right,1230.72,1230.920000,0.20,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,2.222222,4.232283,0.450000,10.160000,1.0,43.0,0.139996
2,2016-02-24_2000_US_FOX-News_Shephard_Smith_Rep...,right,2170.14,2170.479999,0.34,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,1.538464,3.872217,0.649999,10.330000,1.0,40.0,0.139996
3,2016-03-15_1900_US_CNN_Newsroom,right,171.51,171.800000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,3.448276,1.818182,0.290000,1.100000,1.0,2.0,0.139996
4,2016-03-15_1900_US_CNN_Newsroom,right,2611.58,2611.840000,0.26,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,4.365079,3.584229,7.560000,8.370000,33.0,30.0,0.139996
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
530336,2016-09-20_0030_US_KCET_Nightly_Business_Report,franc,1146.66,1147.190000,0.53,low-confidence,no-error,no-error,no-error,no-error,...,,,,3.529412,1.149425,0.850000,9.570000,3.0,11.0,1.000000
530337,2016-11-09_1300_US_KNBC_Today_in_LA_at_5am,franc,2347.15,2347.520000,0.37,high-confidence,no-error,no-error,no-error,no-error,...,0.0,1.0,True,4.871060,2.371542,6.980000,10.120000,34.0,24.0,1.000000
530338,2016-11-22_0200_US_KCBS_CBS_2_News_at_6PM,spayed,753.98,754.290000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,1.650165,3.891051,6.060000,10.280000,10.0,40.0,0.000000
530339,2016-11-30_1400_US_CNN_Newsroom,plumb,803.06,803.339999,0.28,high-confidence,no-error,no-error,no-error,no-error,...,0.0,0.0,True,5.641026,2.197805,9.749999,0.909999,55.0,2.0,0.000000


In [157]:
#df_hom_speaking_rate_nquote.to_csv("2016_all_words_no_audio_preprocessed_speaking_rate_heuristic.csv")