# CMU dictionary

http://svn.code.sf.net/p/cmusphinx/code/trunk/cmudict/

Digits appended to vowel phones mark lexical stress: `0` = no stress, `1` = primary stress, `2` = secondary stress.

In [1]:
import functools as ft
import itertools as it
import json
import re

from paprotka.dataset import reddots

In [2]:
# Dataset locations are read from paths.json instead of being hardcoded here.
with open('paths.json') as paths_file:
    paths = json.load(paths_file)

# Root directories of the RedDots corpus and of the CMU dictionary checkout.
reddots_root, cmudict_root = paths['reddots_root'], paths['cmudict_root']

In [3]:
def load_phones(path):
    """Read the phone symbols from a tab-separated phones file.

    Each non-blank line is expected to hold a phone symbol, a tab, and the
    phone's type. Only the symbol (the first field) is kept.

    Args:
        path: Location of the phones file.

    Returns:
        list of str: Phone symbols in file order.
    """
    phones = []
    with open(path) as opened:
        for line in opened:
            line = line.strip()
            # A blank (e.g. trailing) line has no fields to unpack; skip it
            # instead of failing.
            if not line:
                continue
            # The phone type column is not used, so only the first field is read.
            phones.append(line.split('\t', 1)[0])
    return phones

In [4]:
# Phone inventory shipped alongside the 0.7b dictionary file.
unique_phones = load_phones(cmudict_root + '/cmudict-0.7b.phones')

In [15]:
# Sanity check: number of phones in the loaded inventory.
len(unique_phones)

39

In [5]:
def load_cmudict(path):
    """Load a CMU dictionary file into a word -> phones mapping.

    Lines starting with ``;;;`` are treated as comments and skipped, as are
    blank lines. Every other line must hold a word, a double space, and the
    space-separated phones of that word.

    Args:
        path: Location of the dictionary file (decoded as latin-1).

    Returns:
        dict: Maps the word field of each entry to its list of phone strings.
        A later line with the same word field overwrites an earlier one.

    Raises:
        ValueError: If a non-comment, non-blank line has no double-space
            separator.
    """
    dictionary = {}
    with open(path, encoding='latin1') as opened:
        for line in opened:
            if line.startswith(';;;'):
                continue
            # Remove only the line terminator. Slicing off the last character
            # would truncate the final phone when the file does not end with
            # a newline.
            line = line.rstrip('\r\n')
            if not line.strip():
                continue
            # Split on the first double space only, so the unpacking cannot
            # fail on a second double space later in the line.
            word, phone_line = line.split('  ', 1)
            # Whitespace splitting avoids empty phones from stray spaces.
            dictionary[word] = phone_line.split()
    return dictionary

In [6]:
# Word -> phone-list mapping used by the conversion cells below.
cmudict = load_cmudict(cmudict_root + '/cmudict-0.7b')

## RedDots conversion

In [7]:
# RedDots sentence script; the cells below read its `content` column.
script = reddots.load_script(reddots_root + '/infos/script.txt')

In [9]:
# Word separator: any run of characters that are neither word characters nor
# apostrophes, so contractions such as "WE'LL" stay in one piece. Written as a
# raw string because "\w" is an invalid escape in a normal string literal.
non_word_regex = re.compile(r"[^\w']+")

def convert_content(cmudict, content):
    """Convert a sentence into a phone string using a pronunciation dictionary.

    The sentence is upper-cased and split into words. Each word is looked up
    in ``cmudict``; its phones are joined with single spaces, and words are
    separated by ``' _ '``.

    Args:
        cmudict: Mapping from upper-case word to a list of phone strings.
        content: Sentence text to convert.

    Returns:
        The phone string, or ``None`` if any word is missing from ``cmudict``.
        Content without any words yields an empty string.
    """
    words = [word for word in non_word_regex.split(content.upper()) if word]
    phone_pack_strings = []
    for word in words:
        phone_pack = cmudict.get(word)
        # Check for an unknown word explicitly instead of catching TypeError,
        # which would also hide unrelated type errors.
        if phone_pack is None:
            return None
        phone_pack_strings.append(' '.join(phone_pack))
    return ' _ '.join(phone_pack_strings)
# Bind the dictionary once so the converter takes only the sentence text.
convert_content_cmudict = ft.partial(convert_content, cmudict)

# Spot-check two script entries: print the sentence, then its phone string.
for sentence_id in (51, 99):
    sample_content = script.loc[sentence_id].content
    sample_phones = convert_content(cmudict, sample_content)
    print(sample_content, sample_phones, sep='\n')

So how do the cowboys beat the blitz
S OW1 _ HH AW1 _ D UW1 _ DH AH0 _ K AW1 B OY2 Z _ B IY1 T _ DH AH0 _ B L IH1 T S
We'll do everything to find him he said
W IY1 L _ D UW1 _ EH1 V R IY0 TH IH2 NG _ T UW1 _ F AY1 N D _ HH IH1 M _ HH IY1 _ S EH1 D


In [10]:
# Adds a `phones` column to `script` in place; sentences that could not be
# converted end up with a missing value.
script['phones'] = script.content.apply(convert_content_cmudict)

In [11]:
# Sentences for which no phone string could be produced.
missing_phones = script.loc[script.phones.isna()]
print(missing_phones.count())
missing_phones

content    27
phones      0
dtype: int64


Unnamed: 0_level_0,content,phones
sentence_id,Unnamed: 1_level_1,Unnamed: 2_level_1
10049,Meine Stimme ist mein Passwort,
10085,Kobyla ma maly bok,
10194,Multi modal biometrics is best,
10136,Beleive you can and you're half way there,
10086,Hovno s makem a tabakem,
10178,dota is goog game,
10199,"Hola, cómo estás y en que te puedo ayudar?",
10105,je suis ton père,
10106,mangeons chers amis et buvons,
10200,Mas vale pajaro en mano que cien volando,


In [12]:
# Write the script table, including the `phones` column, as a ';'-separated file.
script.to_csv(reddots_root + '/infos/phones.csv', sep=';')