# Reformed English Dictionary

This document explores what it might look like to map english words to phonetic spellings. The phonetic dictionary is from [CMU Sphinx](https://github.com/cmusphinx/cmudict).

In [1]:
import re
from random import shuffle
import textwrap

In [2]:
# Read the phonetic alphabet
phonetic_alphabet = {}
with open("cmudict/cmudict.phones") as file:
    for line in file:
       (phoneme, type) = line.split()
       phonetic_alphabet[str(phoneme)] = type
print(phonetic_alphabet)

{'AA': 'vowel', 'AE': 'vowel', 'AH': 'vowel', 'AO': 'vowel', 'AW': 'vowel', 'AY': 'vowel', 'B': 'stop', 'CH': 'affricate', 'D': 'stop', 'DH': 'fricative', 'EH': 'vowel', 'ER': 'vowel', 'EY': 'vowel', 'F': 'fricative', 'G': 'stop', 'HH': 'aspirate', 'IH': 'vowel', 'IY': 'vowel', 'JH': 'affricate', 'K': 'stop', 'L': 'liquid', 'M': 'nasal', 'N': 'nasal', 'NG': 'nasal', 'OW': 'vowel', 'OY': 'vowel', 'P': 'stop', 'R': 'liquid', 'S': 'fricative', 'SH': 'fricative', 'T': 'stop', 'TH': 'fricative', 'UH': 'vowel', 'UW': 'vowel', 'V': 'fricative', 'W': 'semivowel', 'Y': 'semivowel', 'Z': 'fricative', 'ZH': 'fricative'}


In [3]:
# Read the dictionary
dictionary = {}
with open("cmudict/cmudict.dict") as file:
    for line in file:
       list = line.split('#')[0].split()
       dictionary[str(list[0])] = list[1::]

In [4]:
# Make a map from phoneme to how frequently it appears
phoneme_frequency = {}
for word in dictionary.keys():
    for phoneme in dictionary[word]:
        if phoneme not in phoneme_frequency:
            phoneme_frequency[phoneme] = 0
        phoneme_frequency[phoneme] += 1
sorted(phoneme_frequency.items())

[('AA0', 4966),
 ('AA1', 16960),
 ('AA2', 3368),
 ('AE0', 1730),
 ('AE1', 16910),
 ('AE2', 3404),
 ('AH0', 63133),
 ('AH1', 6873),
 ('AH2', 1143),
 ('AO0', 1505),
 ('AO1', 8176),
 ('AO2', 1886),
 ('AW0', 377),
 ('AW1', 2346),
 ('AW2', 652),
 ('AY0', 1201),
 ('AY1', 6930),
 ('AY2', 3393),
 ('B', 21432),
 ('CH', 4954),
 ('D', 32558),
 ('DH', 587),
 ('EH0', 2928),
 ('EH1', 20837),
 ('EH2', 3932),
 ('ER0', 23954),
 ('ER1', 4581),
 ('ER2', 612),
 ('EY0', 965),
 ('EY1', 9358),
 ('EY2', 3416),
 ('F', 13961),
 ('G', 13689),
 ('HH', 9376),
 ('IH0', 30198),
 ('IH1', 15769),
 ('IH2', 4462),
 ('IY0', 22141),
 ('IY1', 10321),
 ('IY2', 2640),
 ('JH', 6394),
 ('K', 43077),
 ('L', 49963),
 ('M', 29741),
 ('N', 61234),
 ('NG', 9989),
 ('OW0', 8212),
 ('OW1', 8667),
 ('OW2', 2458),
 ('OY0', 124),
 ('OY1', 957),
 ('OY2', 207),
 ('P', 19988),
 ('R', 46468),
 ('S', 50432),
 ('SH', 8812),
 ('T', 49074),
 ('TH', 2943),
 ('UH0', 255),
 ('UH1', 1601),
 ('UH2', 478),
 ('UW0', 2060),
 ('UW1', 6712),
 ('UW2', 119

In [5]:
# Read the top 1,000 most frequently used English words
popular_words = []
with open("popular_words") as file:
    for line in file:
        popular_words.append(line.strip().lower())

# Get a phonetic spelling of each popular word
popular_dictionary = {}
for word in popular_words:
    if word in dictionary:
        popular_dictionary[word] = dictionary[word]
        
examples = {}
for phoneme in phoneme_frequency:
    s = [x for x in popular_dictionary if phoneme in popular_dictionary[x]]
    shuffle(s)
    examples[phoneme] = s[:5]

print("example_words = {")
for k, v in sorted(examples.items()):
    print("    '{}': {},".format(k, v))
print("}")

example_words = {
    'AA0': ['participant'],
    'AA1': ['policy', 'far', 'watch', 'project', 'party'],
    'AA2': ['responsibility', 'somebody', 'population', 'operation', 'particularly'],
    'AE0': ['activity', 'campaign', 'accept', 'administration'],
    'AE1': ['glass', 'plant', 'challenge', 'chance', 'tax'],
    'AE2': ['program', 'democrat'],
    'AH0': ['social', 'provide', 'position', 'interest', 'record'],
    'AH1': ['public', 'husband', 'once', 'must', 'love'],
    'AH2': ['everyone', 'understand', 'whatever', 'anyone', 'someone'],
    'AO0': ['already', 'resource'],
    'AO1': ['draw', 'short', 'performance', 'your', 'wall'],
    'AO2': ['organization', 'although'],
    'AW0': [],
    'AW1': ['out', 'account', 'throughout', 'sound', 'without'],
    'AW2': ['however'],
    'AY0': ['identify', 'idea'],
    'AY1': ['describe', 'site', 'entire', 'try', 'violence'],
    'AY2': ['myself', 'recognize', 'environmental', 'realize', 'identify'],
    'B': ['member', 'maybe', 'body',

In [6]:
# Map phonemes to strings that may represent them.
phonemes_to_letters = {
    'AA': 'a',
    'AA1': 'o',
    'AE': 'a',
    'AH': 'a',
    'AH0': 'ah',
    'AH1': 'u',
    'AO': 'o',
    'A01': 'o',
    'AW': 'aw',
    'AW1': 'aw',
    'AY': 'i',
    'AY1': 'ii',
    'B': 'b',
    'CH': 'ch',
    'D': 'd',
    'DH': 'th',
    'EH': 'e',
    'EH1': 'eh',
    'ER': 'er',
    'ER0': 'er',
    'EY': 'ey',
    'F': 'f',
    'G': 'g',
    'HH': 'h',
    'IH': 'i',
    'IY': 'iy',
    'IY1': 'ee',
    'JH': 'j',
    'K': 'k',
    'L': 'l',
    'M': 'm',
    'N': 'n',
    'NG': 'ng',
    'OW': 'ow',
    'OY': 'oy',
    'P': 'p',
    'R': 'r',
    'S': 's',
    'SH': 'sh',
    'T': 't',
    'TH': 'th',
    'UH': 'u',
    'UW': 'uw',
    'V': 'v',
    'W': 'w',
    'Y': 'y',
    'Z': 'z',
    'ZH': 'z'
}

In [7]:
# Map a phoneme to a string representing it under the new dictionary
def mapPhoneme(phoneme):
    if phoneme in phonemes_to_letters:
        return phonemes_to_letters[phoneme]
    return phonemes_to_letters[re.split("[0-9]", phoneme)[0]]

# Map a word to a new spelling for that word
def mapWord(word):
    return ''.join(mapPhoneme(phoneme) for phoneme in dictionary[word.lower()])

def mapSloppyWord(sloppyWord):
    terms = re.split('[^a-z]$', sloppyWord.lower())
    for term in terms:
        if term in dictionary:
            sloppyWord = sloppyWord.lower().replace(term, mapWord(term))
    return sloppyWord

# Map a paragraph to a paragraph under the new dictionary
def mapParagraph(paragraph):
    return ' '.join(map(mapSloppyWord, paragraph.split()))    

In [8]:
# Look at some example words under the new spelling
for word in popular_words:
    print("{} → {}".format(word, mapWord(word)))


a → ah
ability → ahbilahtiy
able → eybahl
about → ahbawt
above → ahbuv
accept → aksehpt
according → ahkording
account → ahkawnt
across → ahkros
act → akt
action → akshahn
activity → aktivahtiy
actually → akchuwahliy
add → ad
address → adres
administration → administreyshahn
admit → ahdmit
adult → ahdult
affect → ahfehkt
after → after
again → ahgehn
against → ahgehnst
age → eyj
agency → eyjahnsiy
agent → eyjahnt
ago → ahgow
agree → ahgree
agreement → ahgreemahnt
ahead → ahhehd
air → ehr
all → ol
allow → ahlaw
almost → olmowst
alone → ahlown
along → ahlong
already → olrehdiy
also → olsow
although → olthow
always → olweyz
american → ahmehrahkahn
among → ahmung
amount → ahmawnt
analysis → ahnalahsahs
and → ahnd
animal → anahmahl
another → ahnuther
answer → anser
any → ehniy
anyone → ehniywan
anything → ehniything
appear → ahpir
apply → ahplii
approach → ahprowch
area → ehriyah
argue → orgyuw
arm → orm
around → erawnd
arrive → eriiv
art → ort
article → ortahkahl
artist → ortahst
as → az
ask

KeyError: "n't"

In [9]:
# Debug
term = "a"
print(term, dictionary[term], mapSloppyWord(term))

phoneme = 'AH0'
print(phoneme, examples[phoneme])
for word in examples[phoneme]:
    print("\t{} → {}".format(word, mapWord(word)))

a ['AH0'] ah
AH0 ['social', 'provide', 'position', 'interest', 'record']
	social → sowshahl
	provide → prahviid
	position → pahzishahn
	interest → intrahst
	record → rahkord


In [10]:
# Create a few sample paragraphs
paragraphs = {
    'Moby Dick': "Call me Ishmael. Some years ago — never mind how long precisely — having little or no money in my purse, and nothing particular to interest me on shore, I thought I would sail about a little and see the watery part of the world. It is a way I have of driving off the spleen, and regulating the circulation. Whenever I find myself growing grim about the mouth; whenever it is a damp, drizzly November in my soul; whenever I find myself involuntarily pausing before coffin warehouses, and bringing up the rear of every funeral I meet; and especially whenever my hypos get such an upper hand of me, that it requires a strong moral principle to prevent me from deliberately stepping into the street, and methodically knocking people’s hats off — then, I account it high time to get to sea as soon as I can. This is my substitute for pistol and ball. With a philosophical flourish Cato throws himself upon his sword; I quietly take to the ship. There is nothing surprising in this. If they but knew it, almost all men in their degree, some time or other, cherish very nearly the same feelings towards the ocean with me.",
    'A Confederacy of Dunces': "A green hunting cap squeezed the top of the fleshy balloon of a head. The green earflaps, full of large ears and uncut hair and the fine bristles that grew in the ears themselves, stuck out on either side like turn signals indicating two directions at once. Full, pursed lips protruded beneath the bushy black moustache and, at their corners, sank into little folds filled with disapproval and potato chip crumbs. In the shadow under the green visor of the cap Ignatius J. Reilly’s supercilious blue and yellow eyes looked down upon the other people waiting under the clock at the D.H. Holmes department store, studying the crowd of people for signs of bad taste in dress. Several of the outfits, Ignatius noticed, were new enough and expensive enough to be properly considered offenses against taste and decency. Possession of anything new or expensive only reflected a person’s lack of theology and geometry; it could even cast doubts upon one’s soul.",
    'The Stranger': "Mother died today. Or maybe yesterday, I don’t know. I had a telegram from the home: ‘Mother passed away. Funeral tomorrow. Yours sincerely.’ That doesn’t mean anything. It may have been yesterday.",
    'The Bible': "In the beginning God created the heaven and the earth. And the earth was without form, and void; and darkness was upon the face of the deep. And the Spirit of God moved upon the face of the waters.",
    'American Tabloid': "America was never innocent. We popped our cherry on the boat over and looked back with no regrets. You can’t ascribe our fall from grace to any single event or set of circumstances. You can’t lose what you lacked at conception."
}

# print(textwrap.fill(paragraphs['Moby Dick'], 80))

In [11]:
# Convert the sample paragraphs
for name in paragraphs:
    print("{}\n{}\n{}\n\n".format(
        mapParagraph(name),
        '_'*80,
        textwrap.fill(mapParagraph(paragraphs[name]), 80)
    ))

mowbiy dik
________________________________________________________________________________
kol mee ishmiyl. sum yirz ahgow — nehver miind haw long prisiisliy — having
litahl or now muniy in mii pers, ahnd nuthing pertikyahler tuw intrahst mee on
shor, ii thot ii wud seyl ahbawt ah litahl ahnd see thah woteriy port uv thah
werld. it iz ah wey ii hav uv driiving of thah spleen, ahnd rehgyahleyting thah
serkyahleyshahn. wenehver ii fiind misehlf growing grim ahbawt thah mawth;
wenehver it iz ah damp, drizliy nowvehmber in mii sowl; wenehver ii fiind
misehlf invowlunteriliy pozing bifor kofin wehrhawziz, ahnd bringing up thah rir
uv ehveriy fyuwnerahl ii meet; ahnd ahspehshliy wenehver mii hypos geht such an
uper hand uv mee, that it riykwiierz ah strong morahl prinsahpahl tuw privehnt
mee frum diliberahtliy stehping intuw thah street, ahnd mahthodikahliy noking
people’s hats of — thehn, ii ahkawnt it hii tiim tuw geht tuw see az suwn az ii
kan. this iz mii substahtuwt for pistahl ahnd bo

In [12]:
# Take note of some tough words
toughies = {
    'AH0': ['interest', 'a', 'the', 'analysis', 'towards'],
    'AA1': ['on', 'audience'],
    'ER0': ['forget'],
    'EH1': ['air']
}

for phoneme in toughies:
    for word in toughies[phoneme]:
        print('{} → {}'.format(word, mapSloppyWord(word)))

interest → intrahst
a → ah
the → thah
analysis → ahnalahsahs
towards → tahwordz
on → on
audience → odiyahns
forget → fergeht
air → ehr


In [13]:
list = []
for word in dictionary:
    if word == mapWord(word):
        list.append(word)
len(list)

5518

In [14]:
len(list)


5518

In [15]:
list = []
for word in popular_words:
    if word == mapSloppyWord(word):
        list.append(word)
list

['after',
 'ask',
 'at',
 'bad',
 'bag',
 'big',
 'bit',
 'born',
 'boy',
 'bring',
 'but',
 'deep',
 'dog',
 'drop',
 'drug',
 'during',
 'enjoy',
 'fast',
 'feel',
 'feeling',
 'film',
 'finish',
 'fish',
 'for',
 'form',
 'former',
 'free',
 'fund',
 'gas',
 'green',
 'grow',
 'growth',
 'gun',
 'hand',
 'hang',
 'her',
 'him',
 'hit',
 'hot',
 'if',
 'in',
 'indeed',
 'it',
 'its',
 'job',
 'just',
 'keep',
 'kid',
 'land',
 'last',
 'list',
 'long',
 'lot',
 'low',
 'man',
 'meet',
 'meeting',
 'modern',
 'morning',
 'much',
 'must',
 'need',
 'nor',
 'north',
 'not',
 "n't",
 'number',
 'on',
 'or',
 'order',
 'own',
 'owner',
 'past',
 'per',
 'perform',
 'perhaps',
 'plan',
 'plant',
 'push',
 'put',
 'rather',
 'rich',
 'risk',
 'run',
 'see',
 'seek',
 'seem',
 'short',
 'shot',
 'show',
 'sing',
 'sister',
 'sit',
 'skin',
 'song',
 'sort',
 'speech',
 'sport',
 'spring',
 'stand',
 'stop',
 'street',
 'strong',
 'such',
 'task',
 'term',
 'than',
 'that',
 'they',
 'thing',

In [29]:
list = []
for word in popular_words:
    if (len(mapSloppyWord(word)) - len(word)) >= 5:
        list.append(word)
for word in list:
    print(word, mapSloppyWord(word))

sexual sehkshuwahl
usually yuwzahwahliy


In [34]:
list = []
for word in dictionary:
    if (len(mapSloppyWord(word)) - len(word)) >= 6:
        list.append(word)
for word in list:
    print(word, mapSloppyWord(word))

aaa tripahley
accumulation ahkyuwmyahleyshahn
accumulations ahkyuwmyahleyshahnz
ach eyseeeych
adhd eydiyeychdee
adsl eydiyesehl
agrarianism ahgrehriyahnizahm
ahasuerus ahhashahwehrahs
americanization ahmerahkahnahzeyshahn
anfal eyehnehfeyehl
antidisestablishmentarianism antidisahstablishmahntehriyahnizahm
asap eyehseypee
asexual eysehksyuwahl
atx eyteeehks
authoritarianism ahthorahtehriyahnizahm
bbc biybiysee
bbq beebiykyuw
bmw beeemdubahlyuw
byu beewiiyuw
cbc siybiysee
cbs siybiyehs
ccd siysiydee
ccs seeseeehs
cctv seesiyteeviy
cdc siydiysee
cmu seeehmyuw
cnbc's seeehnbeeseez
cnn seeehnehn
cnn.com seeehnehndotkom
cnnfn seeehnehnehfehn
coeducational kowehjahkeyshahnahl
colonialism kahlowniyahlizahm
conceptualization kahnsehpchwahlizeyshahn
confucianism kahnfyuwshahnizahm
cps seepiyehs
cpu siypiyyuw
cspi seeehspeeii
csv seeehsvee
cxc seeehkssee
ddt diydiytee
decriminalization diykrimahnahlahzeyshahn
dehumanization diyhyuwmahnahzeyshahn
deinstitutionalization diyinstituwshahnahlahzeyshah