In [4]:
import os
import preprocessing
import pandas as pd
import numpy as np
import importlib
from english_contractions import ENGLISH_CONTRACTIONS
import merging_dataframes
import word_pronunciation_predictibility
import celex_files

In [2]:
celex_dict_file = "/mnt/shared/corpora/Celex/english/epw/epw.cd"
filename = "/mnt/Restricted/Corpora/RedHen/2016_all_words_no_audio.pickle"
hom_filename = "/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/hom.csv"
berndt_character_coding_file = "/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/phonetic_character_code_berndt1987.csv"
berndt_conditional_probs_file = "/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/Conditional_Probabilities_for_Grapheme-to-Phoneme_Correspondences_Berndt1987.csv"

In [3]:
df = preprocessing.read_dataframe(filename, remove_pauses=True, remove_errors=True, preprocessing=True, drop_error_columns=False)

read dataframe from /mnt/Restricted/Corpora/RedHen/2016_all_words_no_audio.pickle
Preprocessing: extract pause information...
Remove pauses from data!
Preprocessing: apply word preprocessing...
Preprocessing: calculate word duration...
Preprocessing: calculate word frequency...
Preprocessing: extract context information...
Preprocessing: calculate length in letter...
Preprocessing: calculate contextual predictability...
(18864660, 25) RangeIndex(start=0, stop=18864660, step=1)


# Homographs

### Problematic cases:
- excuse (N) is pronounced with a /s/ and excuse (V) is pronounces with a /z/. 
- practice : is listed only as Noun in celex. This is because the verb is spelled as "practise" in British English (celex) and "practice" in American English.

In [6]:
homograph_pairs = ["act", "answer","attack", "break", "call", "calls", "care", "cause", "change", "check", "control",      
                   "cook", "cost", "cut", "deal", "dress", "drive", "end", "face", "fall", "fight", "fire",
                   "focus", "help", "hope", "limit", "look", "love", "matter", "mind", "name","need", "needs","notice",
                   "offer", "order", "pass", "pay", "plan", "play", "raise","rent", "respect","ride","run",
                   "set", "show", "shows", "sign", "sleep", "sound", "stand", "struggle", "study", "support", "talk", "touch", 
                   "turn", "vote", "waste", "work"] 

In [7]:
gahls_homophones = pd.read_csv(hom_filename, index_col="Unnamed: 0")

## Homographs In Data

In [9]:
df

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,prev_word_frequency,next_word,next_word_frequency,length_in_letter,prev_word_string,next_word_string,prev_word_string_frequency,next_word_string_frequency,cond_pred_prev,cond_pred_next
0,2016-01-01_0100_US_KNBC_Channel_4_News,klemack,0.27,0.69,0.42,high-confidence,no-error,no-error,no-error,no-error,...,,happened,8596.0,7,,klemack-happened,,1.0,,0.000116
1,2016-01-01_0100_US_KNBC_Channel_4_News,happened,0.92,1.34,0.42,high-confidence,no-error,no-error,no-error,no-error,...,33.0,to,539420.0,8,klemack-happened,happened-to,1.0,910.0,0.030303,0.001687
2,2016-01-01_0100_US_KNBC_Channel_4_News,to,1.34,1.46,0.12,high-confidence,no-error,no-error,no-error,no-error,...,8596.0,be,102918.0,2,happened-to,to-be,910.0,37236.0,0.105863,0.361803
3,2016-01-01_0100_US_KNBC_Channel_4_News,be,1.47,1.73,0.26,high-confidence,no-error,no-error,no-error,no-error,...,539420.0,there,59861.0,2,to-be,be-there,37236.0,686.0,0.069030,0.011460
4,2016-01-01_0100_US_KNBC_Channel_4_News,there,1.73,1.97,0.24,high-confidence,no-error,no-error,no-error,no-error,...,102918.0,on,151861.0,5,be-there,there-on,686.0,478.0,0.006666,0.003148
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
18864655,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,follow,3458.11,3458.37,0.26,high-confidence,no-error,no-error,no-error,no-error,...,128106.0,the,932396.0,6,we-follow,follow-the,69.0,389.0,0.000539,0.000417
18864656,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,the,3458.38,3458.53,0.15,high-confidence,no-error,no-error,no-error,no-error,...,1791.0,story,7440.0,3,follow-the,the-story,389.0,1904.0,0.217197,0.255914
18864657,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,story,3458.53,3459.05,0.52,high-confidence,no-error,no-error,no-error,no-error,...,932396.0,all,62248.0,5,the-story,story-all,1904.0,29.0,0.002042,0.000466
18864658,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,all,3459.10,3459.39,0.29,high-confidence,no-error,no-error,no-error,no-error,...,7440.0,evening,4470.0,3,story-all,all-evening,29.0,14.0,0.003898,0.003132


In [8]:
homographs_in_data = df[df.word.isin(homograph_pairs)]

In [10]:
print("%d out of %d homographs found in Data:" % (len(np.unique(homographs_in_data.word)), len(homograph_pairs)))

61 out of 61 homographs found in Data:


# Celex information

In [11]:
celex_dict = word_pronunciation_predictibility.get_english_phonology_from_celex(celex_dict_file)

In [12]:
words, counts = np.unique(celex_dict[celex_dict.word.isin(homograph_pairs)].word, return_counts = True)
print(words[np.where(counts==2)])

[]


In [13]:
clex_info = celex_dict[celex_dict.word.isin(homograph_pairs)]#.groupby('word').first().reset_index()

In [14]:
clex_info

Unnamed: 0,word,disc,clx,disc_no_bound,clx_no_bound
669,act,'{kt,[&kt],{kt,&kt
2702,answer,'#n-s@R,[A:n][s@r*],#ns@R,A:ns@r*
4201,attack,@-'t{k,[@][t&k],@t{k,@t&k
9582,break,'br1k,[breIk],br1k,breIk
11570,call,'k$l,[kO:l],k$l,kO:l
...,...,...,...,...,...
92154,touch,'tVJ,[tVtS],tVJ,tVtS
93820,turn,'t3n,[t3:n],t3n,t3:n
96903,vote,'v5t,[v@Ut],v5t,v@Ut
97626,waste,'w1st,[weIst],w1st,weIst


In [15]:
homographs_in_data_celex_merged = homographs_in_data.merge(clex_info,
                    how = "left",
                    on = "word")

In [16]:
print(len(homographs_in_data_celex_merged))
print(len(homographs_in_data))

294818
294818


In [17]:
#homographs_in_data_celex_merged.to_csv("2016_all_words_no_audio_homographs.csv")

In [3]:
homographs_in_data_celex_merged = pd.read_csv("2016_all_words_no_audio_homographs.csv", index_col = "Unnamed: 0")

In [4]:
homographs_in_data_celex_merged

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,prev_word_string,next_word_string,prev_word_string_frequency,next_word_string_frequency,cond_pred_prev,cond_pred_next,disc,clx,disc_no_bound,clx_no_bound
0,2016-01-01_0100_US_KNBC_Channel_4_News,fire,21.750000,22.110000,0.36,high-confidence,no-error,no-error,no-error,no-error,...,this-fire,fire-broke,229.0,66.0,0.001380,0.046381,'f2-@R,[faI][@r*],f2@R,faI@r*
1,2016-01-01_0100_US_KNBC_Channel_4_News,show,43.280000,43.450000,0.17,high-confidence,no-error,no-error,no-error,no-error,...,fireworks-show,show-to,26.0,70.0,0.047016,0.000130,'S5,[S@U],S5,S@U
2,2016-01-01_0100_US_KNBC_Channel_4_News,fire,46.349999,46.879999,0.53,high-confidence,no-error,no-error,no-error,no-error,...,the-fire,fire-the,1104.0,200.0,0.001184,0.000215,'f2-@R,[faI][@r*],f2@R,faI@r*
3,2016-01-01_0100_US_KNBC_Channel_4_News,fire,48.070000,48.290000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,the-fire,fire-started,1104.0,69.0,0.001184,0.011880,'f2-@R,[faI][@r*],f2@R,faI@r*
4,2016-01-01_0100_US_KNBC_Channel_4_News,fire,68.990000,69.380000,0.39,high-confidence,no-error,no-error,no-error,no-error,...,hotel-fire,fire-nearby,2.0,1.0,0.001541,0.001164,'f2-@R,[faI][@r*],f2@R,faI@r*
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294813,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,end,3135.330000,3135.820000,0.49,high-confidence,no-error,no-error,no-error,no-error,...,high-end,end-nightclub,153.0,3.0,0.014494,0.005587,'End,[End],End,End
294814,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,check,3149.670000,3149.980000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,also-check,check-out,11.0,504.0,0.000428,0.008911,'JEk,[tSEk],JEk,tSEk
294815,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,attack,3252.360000,3252.980000,0.62,high-confidence,no-error,no-error,no-error,no-error,...,this-attack,attack-this,458.0,84.0,0.002761,0.000506,@-'t{k,[@][t&k],@t{k,@t&k
294816,2016-12-31_2300_US_CNN_CNN_Newsroom_With_Poppy...,end,3356.230000,3356.390000,0.16,high-confidence,no-error,no-error,no-error,no-error,...,high-end,end-nightclub,153.0,3.0,0.014494,0.005587,'End,[End],End,End


# Load Additional Information
- eaf files
- seg files
- gentle files

## EAF Files
- information about present gestures

In [7]:
eaf_data = pd.read_csv("2016_all_words_no_audio_eaf_gesture.csv",index_col = "Unnamed: 0")

  mask |= (ar1 == a)


## SEG files
- information about Part Of Speech
- information about Phrase final marker

In [8]:
seg_data = pd.read_csv("2016_all_words_no_audio_seg_data.csv",index_col = "Unnamed: 0")

## GENTLE files
- information about Phrase final marker

In [9]:
gentle_data = pd.read_csv("2016_all_words_no_audio_gentle_data.csv",index_col = "Unnamed: 0")

## Video files
- information about entropy of situation in which the homophones was articulated

In [5]:
#video_data = preprocessing.get_additional_data_from_files(homographs_in_data_celex_merged, "video") # only for homophones

Load and extract information from video files...
Total files to laod and preprocess:  3645
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [6]:
#video_data.to_csv("2016_all_words_no_audio_homographs_video_data.csv")

In [None]:
video_data = pd.read_csv("2016_all_words_no_audio_homographs_video_data.csv",index_col = "Unnamed: 0")

## Celex
- syllable count information

In [10]:
celex_data = celex_files.get_syl_counts(celex_files.read_celex_file())

# Word Pronunciation Predictability (Berndt et al. 1987)

### Load Berndt's tables for Phoneme Equivalents and Conditional Probabilities for Grapheme-to-Phoneme Correspondences

In [11]:
berndt_character_code_df = pd.read_csv(berndt_character_coding_file, delimiter=";")
berndt_conditional_probs = pd.read_csv(berndt_conditional_probs_file,delimiter=";")

### APPABET to corresponding Keyboard Compatible Phonemic (KCP) symbol dict 

In [12]:
berndt_arpabet_phon_dict = word_pronunciation_predictibility.get_ARPABET_to_keyboard_phonetic_symbols_dict(berndt_character_code_df)

### KCP to Grapheme Symbols and Probabilities dict

In [13]:
phonem_graphem_prob_dict = word_pronunciation_predictibility.get_keyboard_phonetic_symbols_to_grapheme_cond_prob_dict(berndt_conditional_probs)

### Homographs with corresponding ARPABET transcription

In [14]:
unique_homographs = np.unique(homographs_in_data_celex_merged.word)
hom_arpabet_words = word_pronunciation_predictibility.get_ARPABET_phonetic_transcription(unique_homographs)

### Homophones with corresponding KCP transcription

In [15]:
hom_kcp_word_tuples = []
for i,arpabet_word in enumerate(hom_arpabet_words):
    kcp_word = word_pronunciation_predictibility.get_keyboard_phonetic_symbols_for_ARPABET(arpabet_word, berndt_arpabet_phon_dict)
    #print(unique_homophones[i],arpabet_word,kcp_word)
    hom_kcp_word_tuples.append((unique_homographs[i],kcp_word))

### Get possible (valid) Grapheme strings and probs for each KCP encoded homograph
Note: 2 homographs not captured: 
- cause ['k', 'ah', 'z']: the AU-E is coded by aw not ah 

Possible Solution: 
- cause ['k', 'aw', 'z']

In [16]:
possible_grapheme_strings, possible_prior_probs, possible_cond_probs, word_rests = word_pronunciation_predictibility.get_grapheme_string_with_conditional_prob_for_keyboard_phonetics(hom_kcp_word_tuples, 
phonem_graphem_prob_dict)

In [17]:
# homophones for which we have no valid grapheme string: 
counter = 0 
for i,word_pron in enumerate(hom_kcp_word_tuples):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(possible_grapheme_strings[i]) == 0 :
        counter+=1
        print(word,pron)
    else:
        empty_string = ['' != i for i in word_rests[i]]
        if np.sum(empty_string) == len(word_rests[i]):
            counter+=1
            print(word,pron)
    
print(counter)

cause ['k', 'ah', 'z']
1


In [18]:
problematic_hom_kcp_word_tuples = [
    ('cause', ['k', 'aw', 'z']),
]

In [19]:
problematic_hom_kcp_word_tuples_dict = {
    'cause': ('cause', ['k', 'aw', 'z']),
}

In [20]:
possible_grapheme_strings_problematic_homs, possible_prior_probs_problematic_homs, possible_cond_probs_problematic_homs, word_rests_problematic_homs = word_pronunciation_predictibility.get_grapheme_string_with_conditional_prob_for_keyboard_phonetics(problematic_hom_kcp_word_tuples, 
phonem_graphem_prob_dict)

In [21]:
j = 0
for i,wp in enumerate(hom_kcp_word_tuples):
    word = wp[0]
    phon = wp[1]
    if word in problematic_hom_kcp_word_tuples_dict:
        possible_grapheme_strings[i] = possible_grapheme_strings_problematic_homs[j]
        word_rests[i] = word_rests_problematic_homs[j]
        possible_prior_probs[i] = possible_prior_probs_problematic_homs[j]
        possible_cond_probs[i] = possible_cond_probs_problematic_homs[j]
        j+=1

In [22]:
valid_word_rests,valid_grapheme_strings,valid_prior_probs, valid_cond_probs = word_pronunciation_predictibility.get_valid_grapheme_strings(hom_kcp_word_tuples, possible_grapheme_strings, word_rests,possible_prior_probs, possible_cond_probs)

In [23]:
# homophones for which we have no valid grapheme string: 
counter = 0 
for i,word_pron in enumerate(hom_kcp_word_tuples):
    word = word_pron[0] # word string
    pron = word_pron[1] # list of keyboard compatible phon characters
    if len(valid_grapheme_strings[i]) == 0:
        counter+=1
        print(word,pron)        
print(counter)

0


In [24]:
max_cond_prob_for_grapheme = word_pronunciation_predictibility.get_max_cond_prob_for_grapheme(berndt_conditional_probs)

In [25]:
m_score_data = word_pronunciation_predictibility.get_m_score_df(hom_kcp_word_tuples, valid_grapheme_strings,valid_cond_probs,max_cond_prob_for_grapheme)

In [26]:
m_score_data

Unnamed: 0,word,m_score
0,act,1.000000
1,answer,0.857661
2,attack,0.835793
3,break,0.761719
4,call,0.679582
...,...,...
56,touch,0.695833
57,turn,0.797962
58,vote,1.000000
59,waste,1.000000


# Merging Dataframes

In [None]:
homographs_in_data_celex_merged

In [28]:
import importlib
importlib.reload(merging_dataframes)

<module 'merging_dataframes' from '/mnt/Restricted/Corpora/RedHen/homophone_analysis_scripts/merging_dataframes.py'>

### Merging eaf data

In [29]:
homographs_in_data_celex_eaf = merging_dataframes.merge_eaf_df_to_homophone_data(homographs_in_data_celex_merged, eaf_data)

1  /  3645
101  /  3645
201  /  3645
301  /  3645
401  /  3645
501  /  3645
601  /  3645
701  /  3645
801  /  3645
901  /  3645
1001  /  3645
1101  /  3645
1201  /  3645
1301  /  3645
1401  /  3645
1501  /  3645
1601  /  3645
1701  /  3645
1801  /  3645
1901  /  3645
2001  /  3645
2101  /  3645
2201  /  3645
2301  /  3645
2401  /  3645
2501  /  3645
2601  /  3645
2701  /  3645
2801  /  3645
2901  /  3645
3001  /  3645
3101  /  3645
3201  /  3645
3301  /  3645
3401  /  3645
3501  /  3645
3601  /  3645


In [None]:
len(homographs_in_data_celex_eaf)

### Merging video data

In [31]:
homographs_in_data_celex_eaf_video = merging_dataframes.merge_video_df_to_homophone_data(homographs_in_data_celex_eaf, video_data)

In [None]:
len(homophones_in_data_celex_eaf_video)

### Merging gentle data

In [33]:
homographs_in_data_celex_eaf_video_gentle = merging_dataframes.merge_gentle_df_to_homophone_data(homographs_in_data_celex_eaf_video, gentle_data)

Merge gentle data for 3645 unique files!
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [None]:
len(homographs_in_data_celex_eaf_video_gentle)

### Merging seg data

In [34]:
homographs_in_data_celex_eaf_video_gentle_seg = merging_dataframes.merge_seg_df_to_homophone_data(homographs_in_data_celex_eaf_video_gentle, seg_data)

Merge seg data for 3645 unique files!
File:  0
File:  100
File:  200
File:  300
File:  400
File:  500
File:  600
File:  700
File:  800
File:  900
File:  1000
File:  1100
File:  1200
File:  1300
File:  1400
File:  1500
File:  1600
File:  1700
File:  1800
File:  1900
File:  2000
File:  2100
File:  2200
File:  2300
File:  2400
File:  2500
File:  2600
File:  2700
File:  2800
File:  2900
File:  3000
File:  3100
File:  3200
File:  3300
File:  3400
File:  3500
File:  3600


In [None]:
len(homographs_in_data_celex_eaf_video_gentle_seg)

### Merging m-scores data

In [35]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores = merging_dataframes.merge_m_scores_df_to_homophone_data(homographs_in_data_celex_eaf_video_gentle_seg,m_score_data)

### Merging celex syllable counts data

In [37]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll = merging_dataframes.merge_celex_syl_counts_df_to_homophone_data(homographs_in_data_celex_eaf_video_gentle_seg_m_scores,celex_data)

In [41]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,seg_preceding_marker,seg_subsequent_marker,seg_merging,seg_index,pos,rel1,rel2,lemma,m_score,SylCnt
0,2016-01-01_0100_US_KNBC_Channel_4_News,fire,21.75,22.110000,0.36,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,83.0,NN,I-NP,O,fire,0.863,2
1,2016-01-01_0100_US_KNBC_Channel_4_News,fire,46.35,46.879999,0.53,high-confidence,no-error,no-error,no-error,no-error,...,False,True,high-confidence,161.0,NN,I-NP,I-PNP,fire,0.863,2
2,2016-01-01_0100_US_KNBC_Channel_4_News,fire,48.07,48.290000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,164.0,NN,I-NP,O,fire,0.863,2
3,2016-01-01_0100_US_KNBC_Channel_4_News,fire,68.99,69.380000,0.39,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,240.0,NN,I-NP,O,fire,0.863,2
4,2016-01-01_0100_US_KNBC_Channel_4_News,fire,99.12,99.370000,0.25,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,317.0,WDT,I-NP,O,FIRE,0.863,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294813,2016-12-29_1400_US_CNN_CNN_Newsroom_With_Carol...,dress,182.11,182.440000,0.33,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,22092644.0,NN,I-NP,O,dress,1.000,1
294814,2016-12-29_1500_US_CNN_CNN_Newsroom_With_Carol...,dress,1350.66,1351.050000,0.39,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,22105392.0,NN,I-NP,I-PNP,dress,1.000,1
294815,2016-12-31_0230_US_KCBS_CBS_Evening_News_With_...,dress,1550.44,1550.730000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,,,,,,,,,1.000,1
294816,2016-12-31_1300_US_MSNBC_MSNBC_Live,dress,674.31,674.609999,0.30,high-confidence,no-error,no-error,no-error,no-error,...,False,False,high-confidence,22303686.0,NN,I-NP,I-PNP,dress,1.000,1


In [38]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll.to_csv("2016_all_words_no_audio_homographs_preprocessed.csv")

### Merging Model Predictions for conditional predictibality of Homophones 
- missing TODO

In [None]:
model_predictions = pd.read_csv("conditional_prob_of_homophones_model_predictions.csv",index_col = "Unnamed: 0")

In [None]:
model_predictions.drop_duplicates(subset = ["prev_word", "homophone", "next_word"], inplace = True)
model_predictions.rename(columns= {"homophone": "word", "prev_prob": "prob|prev_model", "next_prob" : "prob|next_model"}, inplace=True)

In [None]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions = homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll.merge(model_predictions, on = ["prev_word", "word", "next_word"])

In [None]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions.columns

In [None]:
homographs_in_data_celex_eaf_video_gentle_seg_m_scores_syll_model_predictions.rename(columns= {"cond_pred_prev": "prob|prev_text", "cond_pred_next" : "prob|next_text"}, inplace=True)

# Homographs merged with low confidence 

In [42]:
preprocessed_df = pd.read_csv("2016_all_words_no_audio_homographs_preprocessed.csv",index_col = "Unnamed: 0")

In [43]:
low_confidence_homs = preprocessed_df[np.logical_or(preprocessed_df.seg_merging == "low-confidence",preprocessed_df.gentle_merging == "low-confidence")]

In [44]:
low_confidence_homs[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word',"seg_error",'eafgz_error','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,seg_error,eafgz_error,preceding_pause,subsequent_pause
21,fire,hotel,holly,hotel,holly,hotel,quijano,no-error,no-error,False,True
52,fire,open,with,open,with,open,reporter,no-error,no-error,True,False
71,fire,on,who,on,who,on,reporter,no-error,no-error,False,False
74,fire,in,the,the,the,the,the,no-error,no-error,False,True
96,fire,down,and,the,and,the,and,no-error,no-error,False,False
...,...,...,...,...,...,...,...,...,...,...,...
294769,dress,icon's,sold,icon's,sold,'s,sold,no-error,no-error,True,False
294791,dress,woman's,stained,woman's,stained,'s,stained,no-error,no-error,False,True
294792,dress,term,for,``,for,"""",for,no-error,no-error,False,False
294799,dress,to,garang,to,garang,to,reporter,no-error,no-error,False,False


In [45]:
#subset = ["word", "prev_word", "next_word", "gentle_prev_word", "gentle_next_word", "seg_prev_word", "seg_next_word"]
low_confidence_homs.drop_duplicates()[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
21,fire,hotel,holly,hotel,holly,hotel,quijano,False,True
52,fire,open,with,open,with,open,reporter,True,False
71,fire,on,who,on,who,on,reporter,False,False
74,fire,in,the,the,the,the,the,False,True
96,fire,down,and,the,and,the,and,False,False
...,...,...,...,...,...,...,...,...,...
294769,dress,icon's,sold,icon's,sold,'s,sold,True,False
294791,dress,woman's,stained,woman's,stained,'s,stained,False,True
294792,dress,term,for,``,for,"""",for,False,False
294799,dress,to,garang,to,garang,to,reporter,False,False


## Seg Data

In [46]:
low_confidence_homs_seg = preprocessed_df[preprocessed_df.seg_merging == "low-confidence"][["word", 'prev_word', 'next_word', 'seg_prev_word', 'seg_next_word','preceding_pause','subsequent_pause']]
#low_confidence_homs_seg.drop_duplicates(subset = ["word", "prev_word", "next_word", "gentle_prev_word", "gentle_next_word", "seg_prev_word", "seg_next_word"])[["word", 'prev_word', 'next_word', 'seg_prev_word', 'seg_next_word','preceding_pause','subsequent_pause'], inplace = True]
low_confidence_homs_seg

Unnamed: 0,word,prev_word,next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
21,fire,hotel,holly,hotel,quijano,False,True
52,fire,open,with,open,reporter,True,False
71,fire,on,who,on,reporter,False,False
74,fire,in,the,the,the,False,True
96,fire,down,and,the,and,False,False
...,...,...,...,...,...,...,...
294769,dress,icon's,sold,'s,sold,True,False
294791,dress,woman's,stained,'s,stained,False,True
294792,dress,term,for,"""",for,False,False
294799,dress,to,garang,to,reporter,False,False


## Gentle Data

In [47]:
low_confidence_homs_gentle = preprocessed_df[preprocessed_df.gentle_merging == "low-confidence"][["word", 'prev_word', 'next_word',"gentle_prev_word", "gentle_next_word" ,'preceding_pause','subsequent_pause']]
#low_confidence_homs_gentle.drop_duplicates(subset = ["word", "prev_word", "next_word", "gentle_prev_word", "gentle_next_word"])[["word", 'prev_word', 'next_word',"gentle_prev_word", "gentle_next_word" ,'preceding_pause','subsequent_pause']]
low_confidence_homs_gentle

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,preceding_pause,subsequent_pause
74,fire,in,the,the,the,False,True
96,fire,down,and,the,and,False,False
115,fire,on,rating,on,'',False,False
125,fire,a,captain,l.a.,captain,False,False
126,fire,a,department's,l.a.,department's,False,False
...,...,...,...,...,...,...,...
294744,dress,islamic,codes,islamic,codes.s,False,False
294753,dress,not,mrs,not,mrs.,False,False
294760,dress,the,all,the,and,False,False
294792,dress,term,for,``,for,False,False


# Find Substrings
Because of differences in splitting it might happen that only a subpart of the word is listed in prev-/ next-word columns. Check wether the previous word column ends with the same string and check wether the next column starts with the same string.

## Seg Data

### Prev word

In [48]:
low_confidence_homs_seg_because_prev_word = low_confidence_homs_seg[low_confidence_homs_seg.seg_prev_word != low_confidence_homs_seg.prev_word]
print(len(low_confidence_homs_seg_because_prev_word))
matched_prev_word_substring_seg = low_confidence_homs_seg_because_prev_word.apply(lambda row: str(row.seg_prev_word).endswith(str(row.prev_word)) or str(row.prev_word).endswith(str(row.seg_prev_word)), axis = 1)
low_confidence_homs_seg_because_prev_word[matched_prev_word_substring_seg]
#low_confidence_homs_seg[low_confidence_homs_seg.apply(lambda row: str(row.seg_prev_word) in str(row.prev_word) or str(row.prev_word) in str(row.seg_prev_word), axis = 1)]

11592


Unnamed: 0,word,prev_word,next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
361,fire,clinton's,wall,'s,wall,False,False
552,fire,plane's,suppression,'s,suppression,False,False
610,fire,clinton's,wall,'s,wall,False,False
656,fire,solon's,battalion,'s,battalion,True,False
708,fire,rubio's,wall,'s,wall,False,False
...,...,...,...,...,...,...,...
293581,study,today's,suggested,'s,suggested,False,False
293713,study,com,a,bankrate.com,a,False,True
293823,study,school's,abroad,'s,abroad,False,False
294769,dress,icon's,sold,'s,sold,True,False


### Next word

In [49]:
low_confidence_homs_seg_because_next_word = low_confidence_homs_seg[low_confidence_homs_seg.seg_next_word != low_confidence_homs_seg.next_word]
print(len(low_confidence_homs_seg_because_next_word))
matched_next_word_substring_seg = low_confidence_homs_seg_because_next_word.apply(lambda row: str(row.seg_next_word).startswith(str(row.next_word)) or str(row.next_word).startswith(str(row.seg_next_word)), axis = 1)
low_confidence_homs_seg_because_next_word[matched_next_word_substring_seg]

10421


Unnamed: 0,word,prev_word,next_word,seg_prev_word,seg_next_word,preceding_pause,subsequent_pause
126,fire,a,department's,l.a.,department,False,False
754,fire,a,a,a,and,False,False
816,fire,opens,a,opens,at,False,False
878,fire,there's,there's,'s,there,True,True
1332,fire,a,the,a,then,False,False
...,...,...,...,...,...,...,...
292792,waste,and,it,and,it.ep,False,False
293244,study,study,i,study,i.,False,False
294282,study,new,a,new,a.a.a.,False,False
294744,dress,islamic,codes,islamic,codes.s,False,False


## Gentle Data

### Prev word

In [50]:
low_confidence_homs_gentle_because_prev_word = low_confidence_homs_gentle[low_confidence_homs_gentle.gentle_prev_word != low_confidence_homs_gentle.prev_word]
print(len(low_confidence_homs_gentle_because_prev_word))
matched_prev_word_substring_gentle = low_confidence_homs_gentle_because_prev_word.apply(lambda row: str(row.gentle_prev_word).endswith(str(row.prev_word)) or str(row.prev_word).endswith(str(row.gentle_prev_word)), axis = 1)
low_confidence_homs_gentle_because_prev_word[matched_prev_word_substring_gentle]
#low_confidence_homs_seg[low_confidence_homs_seg.apply(lambda row: str(row.seg_prev_word) in str(row.prev_word) or str(row.prev_word) in str(row.seg_prev_word), axis = 1)]

6747


Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,preceding_pause,subsequent_pause
1035,fire,oh,they're,uh-oh,aaagh,False,True
1081,fire,alarm,the,three-alarm,and,False,False
3626,fire,moving,out,fast-moving,barely,False,False
7609,show,000,up,200000,up,False,False
7745,show,free,9,commercial-free,9:00,False,False
...,...,...,...,...,...,...,...
289430,sign,com,up,myheritage.com,up,True,False
289451,sign,com,up,myheritage.com,up,True,False
289696,sign,com,up,myheritage.com,up,True,False
293069,waste,24,management,36.24,management,True,False


### Next word

In [51]:
low_confidence_homs_gentle_because_next_word = low_confidence_homs_gentle[low_confidence_homs_gentle.gentle_next_word != low_confidence_homs_gentle.next_word]
print(len(low_confidence_homs_gentle_because_next_word))
matched_next_word_substring_gentle = low_confidence_homs_gentle_because_next_word.apply(lambda row: str(row.gentle_next_word).startswith(str(row.next_word)) or str(row.next_word).startswith(str(row.gentle_next_word)), axis = 1)
low_confidence_homs_gentle_because_next_word[matched_next_word_substring_gentle]

7603


Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,preceding_pause,subsequent_pause
754,fire,a,a,a,and,False,False
816,fire,opens,a,opens,at,False,False
1332,fire,a,the,a,then,False,False
1723,fire,sniper,u,sniper,u.s.-led,False,False
1873,fire,brush,l,brush,l.a.,False,True
...,...,...,...,...,...,...,...
292792,waste,and,it,and,it.ep,False,False
293244,study,study,i,study,i.,False,False
294282,study,new,a,new,a.a.a.,False,False
294744,dress,islamic,codes,islamic,codes.s,False,False


## Change merging confidence to medium

In [52]:
low_confidence_homs["matched_prev_word_substring_seg"] = np.nan
low_confidence_homs["matched_next_word_substring_seg"] = np.nan
low_confidence_homs["matched_prev_word_substring_gentle"] = np.nan
low_confidence_homs["matched_next_word_substring_gentle"] = np.nan

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_prev_word_substring_seg"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_next_word_substring_seg"] = np.nan
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_prev_word_substring_gentle"] = np.nan
A value is

In [53]:
low_confidence_homs

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,pos,rel1,rel2,lemma,m_score,SylCnt,matched_prev_word_substring_seg,matched_next_word_substring_seg,matched_prev_word_substring_gentle,matched_next_word_substring_gentle
21,2016-01-01_0229_US_KCBS_CBS_Evening_News,fire,133.54,134.000000,0.46,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,I-PNP,fire,0.863,2,,,,
52,2016-01-02_0200_UK_KCET_BBC_World_News,fire,964.47,964.610000,0.14,low-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,I-PNP,fire,0.863,2,,,,
71,2016-01-02_0230_US_KNBC_NBC_Nightly_News,fire,1697.64,1697.920000,0.28,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,I-PNP,fire,0.863,2,,,,
74,2016-01-02_0700_US_KNBC_Channel_4_News_at_11PM,fire,392.87,393.290000,0.42,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,I-PNP,fire,0.863,2,,,,
96,2016-01-04_0735_US_KNBC_NBC_4_News,fire,218.01,218.620000,0.61,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,I-PNP,fire,0.863,2,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294769,2016-11-19_0230_US_KCBS_CBS_Evening_News_With_...,dress,1179.20,1179.740000,0.54,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,O,dress,1.000,1,,,,
294791,2016-12-19_0130_US_KCBS_CBS_Weekend_News,dress,637.00,637.430000,0.43,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,O,dress,1.000,1,,,,
294792,2016-12-19_0700_US_KNBC_Channel_4_News_at_11PM,dress,927.99,928.370000,0.38,high-confidence,no-error,no-error,no-error,no-error,...,NN,I-NP,O,dress,1.000,1,,,,
294799,2016-12-22_0837_US_KABC_Nightline,dress,1082.80,1083.160000,0.36,high-confidence,no-error,no-error,no-error,no-error,...,VB,I-VP,O,dress,1.000,1,,,,


In [54]:
low_confidence_homs.loc[matched_prev_word_substring_seg.index,"matched_prev_word_substring_seg"] = matched_prev_word_substring_seg.replace(False,-1)
low_confidence_homs.loc[matched_next_word_substring_seg.index,"matched_next_word_substring_seg"] = matched_next_word_substring_seg.replace(False,-1)
low_confidence_homs.loc[matched_prev_word_substring_gentle.index,"matched_prev_word_substring_gentle"] = matched_prev_word_substring_gentle.replace(False,-1)
low_confidence_homs.loc[matched_next_word_substring_gentle.index,"matched_next_word_substring_gentle"] = matched_next_word_substring_gentle.replace(False,-1)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self.obj[item] = s


In [55]:
low_confidence_homs["matched_prev_word_substring_seg"] = low_confidence_homs["matched_prev_word_substring_seg"].replace(np.nan,0)
low_confidence_homs["matched_next_word_substring_seg"] = low_confidence_homs["matched_next_word_substring_seg"].replace(np.nan,0)
low_confidence_homs["matched_prev_word_substring_gentle"] = low_confidence_homs["matched_prev_word_substring_gentle"].replace(np.nan,0)
low_confidence_homs["matched_next_word_substring_gentle"] = low_confidence_homs["matched_next_word_substring_gentle"].replace(np.nan,0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_prev_word_substring_seg"] = low_confidence_homs["matched_prev_word_substring_seg"].replace(np.nan,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  low_confidence_homs["matched_next_word_substring_seg"] = low_confidence_homs["matched_next_word_substring_seg"].replace(np.nan,0)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/use

In [56]:
low_confidence_homs[["matched_prev_word_substring_seg","matched_next_word_substring_seg","matched_prev_word_substring_gentle", "matched_next_word_substring_gentle"]]

Unnamed: 0,matched_prev_word_substring_seg,matched_next_word_substring_seg,matched_prev_word_substring_gentle,matched_next_word_substring_gentle
21,0.0,-1.0,0.0,0.0
52,0.0,-1.0,0.0,0.0
71,0.0,-1.0,0.0,0.0
74,-1.0,0.0,-1.0,0.0
96,-1.0,0.0,-1.0,0.0
...,...,...,...,...
294769,1.0,0.0,0.0,0.0
294791,1.0,0.0,0.0,0.0
294792,-1.0,0.0,-1.0,0.0
294799,0.0,-1.0,0.0,0.0


In [57]:
medium_confidence_seg = low_confidence_homs[low_confidence_homs["matched_prev_word_substring_seg"] + low_confidence_homs["matched_next_word_substring_seg"] > 0].index

In [58]:
medium_confidence_gentle = low_confidence_homs[low_confidence_homs["matched_prev_word_substring_gentle"] + low_confidence_homs["matched_next_word_substring_gentle"] > 0].index

In [59]:
preprocessed_df.loc[medium_confidence_seg,"seg_merging"] = "medium-confidence"
preprocessed_df.loc[medium_confidence_gentle,"gentle_merging"] = "medium-confidence"

# Remaining Low confidence homophones

In [None]:
#preprocessed_df = pd.read_csv("2016_all_words_no_audio_preprocessed.csv",index_col = "Unnamed: 0")

In [60]:
remaining_low_confidence_homs = preprocessed_df[np.logical_or(preprocessed_df.seg_merging == "low-confidence",preprocessed_df.gentle_merging == "low-confidence")]

In [61]:
remaining_low_confidence_homs[["word", 'prev_word', 'next_word', 'gentle_prev_word', 'gentle_next_word', 'seg_prev_word', 'seg_next_word',"seg_error",'eafgz_error','preceding_pause','subsequent_pause']]

Unnamed: 0,word,prev_word,next_word,gentle_prev_word,gentle_next_word,seg_prev_word,seg_next_word,seg_error,eafgz_error,preceding_pause,subsequent_pause
21,fire,hotel,holly,hotel,holly,hotel,quijano,no-error,no-error,False,True
52,fire,open,with,open,with,open,reporter,no-error,no-error,True,False
71,fire,on,who,on,who,on,reporter,no-error,no-error,False,False
74,fire,in,the,the,the,the,the,no-error,no-error,False,True
96,fire,down,and,the,and,the,and,no-error,no-error,False,False
...,...,...,...,...,...,...,...,...,...,...,...
294743,dress,can't,that,can't,that,not,that,no-error,no-error,False,False
294760,dress,the,all,the,and,the,and,no-error,no-error,False,False
294792,dress,term,for,``,for,"""",for,no-error,no-error,False,False
294799,dress,to,garang,to,garang,to,reporter,no-error,no-error,False,False


In [62]:
remaining_low_confidence_homs.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gentle_preceding_marker',
       'gentle_subsequent_marker', 'gentle_mer

In [63]:
remaining_low_confidence_homs[["preceding_pause", "subsequent_pause", "gentle_start_of_sentence", "gentle_end_of_sentence", "seg_start_of_sentence", "seg_end_of_sentence","seg_preceding_marker",
       "seg_subsequent_marker", "gentle_preceding_marker","gentle_subsequent_marker"]]

Unnamed: 0,preceding_pause,subsequent_pause,gentle_start_of_sentence,gentle_end_of_sentence,seg_start_of_sentence,seg_end_of_sentence,seg_preceding_marker,seg_subsequent_marker,gentle_preceding_marker,gentle_subsequent_marker
21,False,True,False,True,False,True,False,True,False,True
52,True,False,False,False,False,True,False,True,False,False
71,False,False,False,True,False,True,False,True,False,True
74,False,True,False,True,False,True,False,True,False,True
96,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...
294743,False,False,False,False,False,False,False,False,False,False
294760,False,False,False,False,False,False,False,False,False,False
294792,False,False,False,False,False,False,False,False,False,False
294799,False,False,False,True,False,True,False,True,False,True


In [64]:
not_index = preprocessed_df.index.isin(remaining_low_confidence_homs.index)
high_confidence_homs = preprocessed_df[~not_index]

In [69]:
medium_confidence_homs = preprocessed_df[np.logical_or(preprocessed_df.seg_merging == "medium-confidence",preprocessed_df.gentle_merging == "medium-confidence")]

In [70]:
print("High Confidence rows: ", len(high_confidence_homs)-len(medium_confidence_homs))
print("Medium Confidence rows: ", len(medium_confidence_homs))
print("Low Confidence rows: ", len(remaining_low_confidence_homs))
print("------------------------------------------------")
print(len(preprocessed_df), "Dataframe rows") 
print(len(remaining_low_confidence_homs)/len(preprocessed_df) * 100, "%")

High Confidence rows:  273061
Medium Confidence rows:  3629
Low Confidence rows:  18128
------------------------------------------------
294818 Dataframe rows
6.148878291013439 %


In [67]:
preprocessed_df[np.logical_or(preprocessed_df.seg_merging == "medium-confidence",preprocessed_df.gentle_merging == "medium-confidence")]

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,seg_preceding_marker,seg_subsequent_marker,seg_merging,seg_index,pos,rel1,rel2,lemma,m_score,SylCnt
361,2016-01-20_0230_US_KNBC_NBC_Nightly_News,fire,377.14,377.459999,0.32,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,1267038.0,NN,I-NP,O,fire,0.863,2
552,2016-02-11_0230_US_KNBC_NBC_Nightly_News,fire,657.74,658.080000,0.34,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,2329196.0,NN,I-NP,O,fire,0.863,2
610,2016-02-16_1700_US_CNN_Legal_View_With_Ashleig...,fire,1375.60,1376.020000,0.42,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,2650735.0,NN,I-NP,O,fire,0.863,2
656,2016-02-19_2302_US_WEWS_NewsChannel_5_at_6pm,fire,740.28,740.590000,0.31,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,2904383.0,NN,I-NP,O,fire,0.863,2
708,2016-02-24_2000_US_FOX-News_Shephard_Smith_Rep...,fire,316.81,317.099999,0.29,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,3170657.0,NN,I-NP,O,fire,0.863,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294282,2016-12-07_0230_US_KCBS_CBS_Evening_News_With_...,study,935.07,935.540000,0.47,high-confidence,no-error,no-error,no-error,no-error,...,False,True,medium-confidence,20251518.0,NN,I-NP,I-PNP,study,1.000,2
294744,2016-11-02_0100_US_KOCE_The_PBS_Newshour,dress,1656.21,1656.559999,0.35,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,17955382.0,NN,I-NP,I-PNP,dress,1.000,1
294753,2016-11-16_0030_US_KCAL_Inside_Edition,dress,1092.77,1093.150000,0.38,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,18657740.0,VB,I-VP,O,dress,1.000,1
294769,2016-11-19_0230_US_KCBS_CBS_Evening_News_With_...,dress,1179.20,1179.740000,0.54,high-confidence,no-error,no-error,no-error,no-error,...,False,False,medium-confidence,19008807.0,NN,I-NP,O,dress,1.000,1


In [71]:
#preprocessed_df.to_csv("2016_all_words_no_audio_homographs_preprocessed.csv")

In [5]:
BASE = '/mnt/Restricted/Corpora/RedHen'
DATA_FOLDER = os.path.join(BASE, 'original')
DF_SOURCE_PATH = os.path.join(BASE, '2016_all_words_no_audio.pickle')
DF_HOMOGRAPH_PATH = os.path.join(BASE, 'homophone_analysis_scripts/2016_all_words_no_audio_homographs_preprocessed.csv')
celex_dict_file = "/mnt/shared/corpora/Celex/english/epw/epw.cd"

#STRETCHES_PATH = BASE + '/homophone_analysis_scripts'
STRETCHES_PATH = ''#'/mnt/shared/people/elnaz/homophones/10sec_stretch/'

NS = '<non-speech>'

In [6]:
df_hom = pd.read_csv(DF_HOMOGRAPH_PATH, index_col = "Unnamed: 0")

In [9]:
df_hom = df_hom.drop(columns=['seg_prev_word', 'seg_next_word', 'seg_end_of_sentence',
       'seg_start_of_sentence', 'seg_preceding_marker',
       'seg_subsequent_marker', 'seg_merging', 'seg_index', 'pos', 'rel1',
       'rel2', 'lemma'])

In [10]:
df_hom.columns

Index(['source_file', 'word', 'start', 'end', 'duration', 'label_type',
       'mp4_error', 'aac_error', 'aac2wav_error', 'eafgz_error', 'seg_error',
       'preceding_pause', 'subsequent_pause', 'word_frequency', 'prev_word',
       'prev_word_frequency', 'next_word', 'next_word_frequency',
       'length_in_letter', 'prev_word_string', 'next_word_string',
       'prev_word_string_frequency', 'next_word_string_frequency',
       'cond_pred_prev', 'cond_pred_next', 'disc', 'clx', 'disc_no_bound',
       'clx_no_bound', 'gesture', 'HandMoving', 'PersonOnScreen',
       'SpeakerOnScreen', 'HeadMoving/MovingVertically',
       'ShoulderMoving/NotWithHead', 'HeadMoving/MovingHorizontally',
       'ShoulderMoving/NoSlidingWindow', 'none',
       'ShoulderMoving/SlidingWindow', 'is_gesture', 'video_snippet_size',
       'gentle_prev_word', 'gentle_next_word', 'gentle_end_of_sentence',
       'gentle_start_of_sentence', 'gentle_preceding_marker',
       'gentle_subsequent_marker', 'gentle_mer

In [12]:
df_hom

Unnamed: 0,source_file,word,start,end,duration,label_type,mp4_error,aac_error,aac2wav_error,eafgz_error,...,gentle_prev_word,gentle_next_word,gentle_end_of_sentence,gentle_start_of_sentence,gentle_preceding_marker,gentle_subsequent_marker,gentle_merging,gentle_index,m_score,SylCnt
0,2016-01-01_0100_US_KNBC_Channel_4_News,fire,21.75,22.110000,0.36,high-confidence,no-error,no-error,no-error,no-error,...,this,broke,False,False,False,False,high-confidence,81.0,0.863,2
1,2016-01-01_0100_US_KNBC_Channel_4_News,fire,46.35,46.879999,0.53,high-confidence,no-error,no-error,no-error,no-error,...,the,the,True,False,False,True,high-confidence,158.0,0.863,2
2,2016-01-01_0100_US_KNBC_Channel_4_News,fire,48.07,48.290000,0.22,high-confidence,no-error,no-error,no-error,no-error,...,the,started,False,False,False,False,high-confidence,161.0,0.863,2
3,2016-01-01_0100_US_KNBC_Channel_4_News,fire,68.99,69.380000,0.39,high-confidence,no-error,no-error,no-error,no-error,...,hotel,nearby,False,False,False,False,high-confidence,235.0,0.863,2
4,2016-01-01_0100_US_KNBC_Channel_4_News,fire,99.12,99.370000,0.25,high-confidence,no-error,no-error,no-error,no-error,...,caught,is,False,False,False,False,high-confidence,310.0,0.863,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
294813,2016-12-29_1400_US_CNN_CNN_Newsroom_With_Carol...,dress,182.11,182.440000,0.33,high-confidence,no-error,no-error,no-error,no-error,...,subway,at,False,False,False,False,high-confidence,21452902.0,1.000,1
294814,2016-12-29_1500_US_CNN_CNN_Newsroom_With_Carol...,dress,1350.66,1351.050000,0.39,high-confidence,no-error,no-error,no-error,no-error,...,eccentric,and,False,False,False,False,high-confidence,21465305.0,1.000,1
294815,2016-12-31_0230_US_KCBS_CBS_Evening_News_With_...,dress,1550.44,1550.730000,0.29,high-confidence,no-error,no-error,no-error,no-error,...,,,,,,,,,1.000,1
294816,2016-12-31_1300_US_MSNBC_MSNBC_Live,dress,674.31,674.609999,0.30,high-confidence,no-error,no-error,no-error,no-error,...,at,rehearsal,False,False,False,False,high-confidence,21657916.0,1.000,1


In [11]:
#df_hom.to_csv("2016_all_words_no_audio_homographs_preprocessed.csv")