# Using syllabifyARPA

In [1]:
from syllabifyARPA import syllabifyARPA

## With Python strings and arrays

In [2]:
# The same transcription in both accepted input forms:
# a list of ARPAbet phones and a single space-delimited string
ARPAarray = ['HH', 'AE', 'NG', 'M', 'AE', 'N']
ARPAstring = ' '.join(ARPAarray)

In [3]:
# syllabifyARPA accepts either a space-delimited string or a phone array;
# both inputs print the same result
for phones in (ARPAstring, ARPAarray):
    print(syllabifyARPA(phones))

0    HH AE NG
1      M AE N
dtype: object
0    HH AE NG
1      M AE N
dtype: object


In [4]:
# By default syllabifyARPA returns a Pandas Series with one syllable per element;
# pass return_list=True to get a plain Python list of syllable strings instead
syllabifyARPA(ARPAarray, return_list=True)

['HH AE NG', 'M AE N']

In [5]:
import sys

# A transcription that cannot be syllabified makes syllabifyARPA raise a ValueError
unsyllabifiable = 'M G L AA'
try:
    syllabifyARPA(unsyllabifiable)
except ValueError as err:
    print("Caught the following error:")
    # The message itself goes to stderr, separate from the stdout line above
    print(err, file=sys.stderr)

Caught the following error:


Bad onset cluster in M G L AA


## With Pandas Series and DataFrames

### Importing CMU dictionary data from the text file

In [6]:
import pandas as pd
from syllabifyARPA import syllabifyARPA

# Load the dictionary with one entry per row in a single 'dict' column.
# pd.read_csv(..., delimiter='\n') is rejected with a ValueError by current pandas
# releases, so the file is read line by line with plain Python instead.
# The previous parsing rules are kept: everything from a '#' onwards is treated as
# a comment, blank (or fully commented) lines are skipped, and quote characters
# are left untouched.
DICT_PATH = 'cmudict.txt'  # Point this at 'cmusubset.txt' for quick testing
with open(DICT_PATH, encoding='utf-8') as dict_file:
    entries = [line.rstrip('\r\n').split('#', 1)[0] for line in dict_file]
df = pd.DataFrame({'dict': [entry for entry in entries if entry.strip()]})
df.head()

Unnamed: 0,dict
0,!EXCLAMATION-POINT EH2 K S K L AH0 M EY1 SH A...
1,"""CLOSE-QUOTE K L OW1 Z K W OW1 T"
2,"""DOUBLE-QUOTE D AH1 B AH0 L K W OW1 T"
3,"""END-OF-QUOTE EH1 N D AH0 V K W OW1 T"
4,"""END-QUOTE EH1 N D K W OW1 T"


### Preprocessing data

In [7]:
# Keep only the rows made up exclusively of capital letters, the digits 0-2 and
# spaces; any row containing another character (e.g. punctuation) is removed.
# na=True flags missing values as "bad" so they are dropped as well, which is what
# the earlier `== False` comparison did implicitly.
has_other_chars = df['dict'].str.contains(r'[^A-Z0-2 ]', na=True)
# Split each remaining row into the headword and the rest of the line
df = df.loc[~has_other_chars, 'dict'].str.extract(
    r'(?P<word>\w+) (?P<transcription>.+)', expand=True
)
# Splitting into an array of phonemes described in phoneset.txt
df['transcription'] = df['transcription'].str.split()
df.head()

Unnamed: 0,word,transcription
38,A,[AH0]
45,AABERG,"[AA1, B, ER0, G]"
46,AACHEN,"[AA1, K, AH0, N]"
47,AAKER,"[AA1, K, ER0]"
48,AALSETH,"[AA1, L, S, EH0, TH]"


### Applying syllabifyARPA and manipulating the returned DataFrame

In [8]:
# silence_warnings=True: any ValueErrors thrown by the function are ignored
syllables = df['transcription'].apply(syllabifyARPA, silence_warnings=True)
# Preview without the unsyllabifiable transcriptions (rows with no syllable at all).
# dropna returns a new frame, so `syllables` itself still contains those rows.
syllables.dropna(how='all').head()

Unnamed: 0,0,1,2,3,4,5,6,7,8
38,AH0,,,,,,,,
45,AA1,B ER0 G,,,,,,,
46,AA1,K AH0 N,,,,,,,
47,AA1,K ER0,,,,,,,
48,AA1 L,S EH0 TH,,,,,,,


In [9]:
# Attach the per-position syllable columns to the word/transcription columns.
# Selecting those two columns explicitly keeps this cell re-runnable: otherwise a
# second execution would concatenate another copy of the syllable columns onto df.
df = pd.concat([df[['word', 'transcription']], syllables], axis=1)
# Reshape to long format with one row per (word, syllable position), then drop
# all NaN positions, e.g., syllables 3+ in a disyllabic word
melted_df = (
    df.melt(id_vars=['word', 'transcription'], value_name='syllable', var_name='position')
    .dropna()
)

In [10]:
# Order the rows alphabetically by word. Only 'word' is a sort key, so the
# syllables of one word are not necessarily listed in position order.
melted_df = melted_df.sort_values(by='word')
melted_df.head()

Unnamed: 0,word,transcription,position,syllable
0,A,[AH0],0,AH0
102287,AABERG,"[AA1, B, ER0, G]",1,B ER0 G
1,AABERG,"[AA1, B, ER0, G]",0,AA1
2,AACHEN,"[AA1, K, AH0, N]",0,AA1
102288,AACHEN,"[AA1, K, AH0, N]",1,K AH0 N
