In [1]:
import re
import os
import json
import pandas as pd
from pathlib import Path

DATA_DIRECTORY = os.path.join(Path(os.getcwd()).parent.absolute(), 'data')
CMU_DIRECTORY = os.path.join(DATA_DIRECTORY, 'cmu')
PARENS_CONTENT_REGEX = re.compile(r"\([^()]*\)")


def add_field_to_dict(field_name, dictionary, value={}):
    if field_name not in dictionary:
        dictionary[field_name] = value
    return dictionary

def remove_digits_from_string(string):
    return ''.join([char for char in string if not char.isdigit()])

def any_item_startswith(prefix, array):
    return any(item.startswith(prefix) for item in array)

def remove_parens_content(string):
    try:
        result = re.sub(PARENS_CONTENT_REGEX, "", string)
        return result
    except:
        return string

# Using Researched Phoneme-Grapheme Correspondences

**Source:** [Phoneme-Grapheme Correspondences as Cues to Spelling
    Improvement (1966)](https://files.eric.ed.gov/fulltext/ED128835.pdf)

In [2]:
with open(os.path.join(DATA_DIRECTORY, 'phoneme_to_grapheme_map.json')) as f:
    phoneme_grapheme_correspondence = json.load(f)
    
silent_graphemes = {}
rare_graphemes = {}
for phoneme, graphemes in phoneme_grapheme_correspondence.items():
    for grapheme, metadata in graphemes.items():
        if 'silent' in metadata:
            silent_graphemes[grapheme] = metadata
        if 'rare' in metadata:
            rare_graphemes[grapheme] = metadata

In [3]:
phoneme_grapheme_correspondence

{'AA': {'o': {'examples': ['odd', 'drop'],
   'count': 1578,
   'syllabic_position': {'initial': True, 'medial': True, 'final': False}},
  'a': {'examples': ['are', 'father', 'schwa'],
   'count': 673,
   'syllabic_position': {'initial': True, 'medial': True, 'final': True}},
  'ea': {'examples': ['heart'],
   'count': 18,
   'silent_replacement': '_a',
   'syllabic_position': {'initial': False, 'medial': True, 'final': False}},
  'e': {'examples': ['encore'],
   'count': 10,
   'syllabic_position': {'initial': True, 'medial': False, 'final': False}},
  'ow': {'examples': ['knowledge'],
   'count': 4,
   'silent_replacement': 'o_',
   'syllabic_position': {'initial': False, 'medial': True, 'final': False}},
  'ah': {'examples': ['dahl', 'bah'],
   'count': 3,
   'silent_replacement': 'a_',
   'syllabic_position': {'initial': False, 'medial': True, 'final': True}},
  'aa': {'examples': ['aardvark'],
   'count': 1,
   'silent_replacement': 'a_',
   'syllabic_position': {'initial': True, 

## CMU (\~134K Words)

**Source:** [CMU](http://www.speech.cs.cmu.edu/cgi-bin/cmudict?in=urge)

**Description:** Dataset containing words and their phonemes. It does not break down into syllables or graphemes though.

In [4]:
cmudict_7b = pd.read_csv(
    os.path.join(DATA_DIRECTORY, 'cmu', 'cmudict-0.7b.txt'),
    encoding = "ISO-8859-1",
    header=None,
    names=['word', 'phonemes'],
    keep_default_na=False,
)

## Webster - Proper Nouns & Adjectives

  **Source:** [Scrapmaker Webster Dictionary](https://www.scrapmaker.com/data/wordlists/dictionaries/webster-dictionary.txt)
  
  **Method:** Extracted all capitalized nouns and adjectives from original file (alphabetized at the top)

In [5]:
proper_words = pd.read_csv(
    os.path.join(DATA_DIRECTORY, 'webster', 'webster_dictionary_proper.txt'),
    encoding = "ISO-8859-1",
    header=None,
    names=['word'],
    keep_default_na=False
)
proper_words = proper_words.set_index('word')

## Webster (\~282K words)
  **Source:** [Scrapmaker Webster Dictionary](https://www.scrapmaker.com/data/wordlists/dictionaries/webster-dictionary.txt)
  
  **Method:** Removed all proper nouns and adjectives from original file (alphabetized at the top)

In [6]:
webster_words = pd.read_csv(
    os.path.join(DATA_DIRECTORY, 'webster', 'webster_dictionary.txt'),
    encoding = "ISO-8859-1",
    header=None,
    names=['word'],
    keep_default_na=False
)
webster_words = webster_words.set_index('word')

## OED (\~89K words)
  **Source:** [Oxford-Dictionary-Json](https://github.com/cduica/Oxford-Dictionary-Json)
  
  **Method:** Remove all the oids's and just keep words and definitions.

In [7]:
with open(os.path.join(DATA_DIRECTORY, 'oed', 'oed.json')) as f:
    oed_words = json.load(f)
oed_words = {key.lower(): value for key, value in oed_words.items()}

## Unix (\~200K words)

  **Source:** `/usr/share/dict/words`
  
  **Method:** Unix systems come with a file of English words. Copy into file with `cat /usr/share/dict/words > ~/Desktop/words.txt`

In [8]:
unix_words = pd.read_csv(
    os.path.join(DATA_DIRECTORY, 'unix', 'words.txt'),
    encoding = "ISO-8859-1",
    header=None,
    names=['word'],
    keep_default_na=False
)
unix_words = unix_words[~unix_words['word'].str[0].str.isupper()]
unix_words = unix_words.set_index('word')

## GradyAugmented (\~122K words)

#### R Code to produce the GradyAugmented dictionary
    require(qdapDictionaries)
    write.csv(GradyAugmented, "<PATH_TO_FILE>/GradyAugmentedR.csv",  row.names = FALSE)

In [9]:
grady_words = pd.read_csv(
    os.path.join(DATA_DIRECTORY, 'grady', 'GradyAugmentedR.csv'),
    encoding = "ISO-8859-1",
    header=None,
    names=['word'],
    keep_default_na=False
)
grady_words = grady_words.set_index('word')
grady_words = grady_words[~grady_words.index.isin(proper_words.index.str.lower())]

## CMU English Word Check (\~36K Words)

**Method**: If a CMU word is in at least 2 of the above dictionaries (after proper word removal), count it.

In [10]:
def in_english(word, oed, unix, grady, webster):
    num_dicts = 0
    if word in oed_words:
        num_dicts += 1
    if word in unix:
        num_dicts += 1
    if word in grady:
        num_dicts += 1
    if word in webster:
        num_dicts += 1
    if num_dicts > 1:
        return True
    return False

In [11]:
cmudict_7b['in_english'] = [in_english(
    remove_parens_content(word.lower()),
    oed_words,
    unix_words.index,
    grady_words.index,
    webster_words.index
) for word in cmudict_7b['word']]

In [12]:
cmudict_7b_english = cmudict_7b[cmudict_7b['in_english']]
cmudict_7b_english = cmudict_7b_english.reset_index(drop=True)

In [13]:
cmudict_7b_english

Unnamed: 0,word,phonemes,in_english
0,A,AH0,True
1,A(1),EY1,True
2,AA,EY2 EY1,True
3,AARDVARK,AA1 R D V AA2 R K,True
4,ABA,EY2 B IY2 EY1,True
...,...,...,...
35797,ZORRO,Z AO1 R OW2,True
35798,ZOSTER,Z AA1 S T ER0,True
35799,ZOUNDS,Z AW1 N D Z,True
35800,ZUCCHINI,Z UW0 K IY1 N IY0,True


## Infer Graphemes

**Method:** We ostensibly have a dictionary mapping of all possible phonemes-to-graphemes. This should allow us to cycle through all possible mapped graphemes, which we can use to derive the correct graphemes of the original word.

In [14]:
from itertools import chain, combinations, product
from itertools import chain, combinations

def has_invalid_phoneme(phoneme_string, phoneme_grapheme_correspondence):
    phonemes = phoneme_string.split(' ')
    return any(phoneme not in phoneme_grapheme_correspondence for phoneme in phonemes)

def all_subsets(array, non_empty=True):
    subsets = []
    for L in range(0, len(array)+1):
        for subset in itertools.combinations(array, L):
            if non_empty and len(list(subset)) == 0:
                continue    
            subsets.append(list(subset))
    return subsets

def get_char_indices_in_string(character, string):
    return [pos for pos, char in enumerate(string) if char == character]

def remove_char(string, n):
    first_part = string[:n] 
    last_part = string[n+1:]
    return first_part + last_part

def powerset(iterable):
    "powerset([1,2,3]) --> () (1,) (2,) (3,) (1,2) (1,3) (2,3) (1,2,3)"
    s = list(iterable)
    return chain.from_iterable(combinations(s, r) for r in range(len(s)+1))

def get_all_diphoneme_permutations(string, grapheme, diphoneme):
    diphoneme_permutations = []
    grapheme_not_preceded_by_colon = r'(?<!:){0}'.format(grapheme)
    string = re.sub(grapheme_not_preceded_by_colon, diphoneme, string)
    
    graphemes = string.split()
    indices = [i for i, v in enumerate(graphemes) if v == diphoneme]
    
    for replacements in powerset(indices):
        new_graphemes = list(graphemes)
        for index in replacements:
            new_graphemes[index] = grapheme
        diphoneme_permutations.append(" ".join(new_graphemes))

    return diphoneme_permutations

def get_all_pronunciations(phoneme_string, phoneme_grapheme_correspondence):
    phoneme_string = remove_digits_from_string(phoneme_string).replace('HH W', 'HH:W')

    ambiguous_diphonemes = {
        'AH L': 'AH:L',
        'AH M': 'AH:M',
        'AH R': 'AH:R',
        'AH W': 'AH:W',
        'G Z': 'G:Z',
        'G ZH': 'G:ZH',
        'K S': 'K:S',
        'K SH': 'K:SH',
        'N Y AH': 'N:Y:AH',
        'T TH': 'T:TH',
        'T S': 'T:S',
        'W AA': 'W:AA',
        'W AH': 'W:AH',
        'Y AH': 'Y:AH',
        'Y UH': 'Y:UH',
        'Y UW': 'Y:UW',
    }

    applicable_diphonemes = {k:v for k,v in ambiguous_diphonemes.items() if k in phoneme_string}
    
    possible_pronunciations = [phoneme_string]
    for grapheme, diphoneme in applicable_diphonemes.items():
        new_pronunciations = []
        for possible_pronunciation in possible_pronunciations:
            if grapheme in possible_pronunciation:
                possible_pronunciations = list(set(possible_pronunciations + get_all_diphoneme_permutations(possible_pronunciation, grapheme, diphoneme)))
    
    pronunciations = set([p for p in possible_pronunciations if not has_invalid_phoneme(p, phoneme_grapheme_correspondence)])
    return pronunciations

def get_graphemes_from_word_and_phonemes(word, phoneme_string, phoneme_grapheme_correspondence, silent_graphemes, rare_graphemes):
    word = remove_parens_content(word).replace('-','')
    phonemes = remove_digits_from_string(phoneme_string).split(' ')

    possible_grapheme_splits = []
    for i, phoneme in enumerate(phonemes):

        # For a given phoneme, see whether any of its graphemes can come next in the sequence
        graphemes = phoneme_grapheme_correspondence[phoneme]

        possible_graphemes = []
        for grapheme, metadata in graphemes.items():
            
            if 'very_rare' in metadata:
                continue

            grapheme = grapheme.upper()

            if len(possible_grapheme_splits) > 0:
                for j, possible_grapheme_split in enumerate(possible_grapheme_splits):
                    starting_string = ''.join(possible_grapheme_split) + grapheme
                    
                    
                    if word.startswith(starting_string):
                            possible_graphemes.append('{0};{1}'.format(grapheme, j))

            elif word.startswith(grapheme):
                possible_grapheme_splits.append([])
                possible_graphemes.append('{0};{1}'.format(grapheme, 0))
        
        for value in possible_graphemes:
            value_split = value.split(';')
            possible_grapheme = value_split[0]
            index = int(value_split[1])
            a = possible_grapheme_splits[index] + [possible_grapheme]
            possible_grapheme_splits.append(a)

        indices_to_remove = [value.split(';')[1] for value in possible_graphemes]
        possible_grapheme_splits = [v for i, v in enumerate(possible_grapheme_splits) if i not in indices_to_remove]
        
    final_grapheme_splits = [split for split in possible_grapheme_splits if ''.join(split) == word and len(split) == len(phonemes)]
        
    non_silent_grapheme_splits = [split for split in final_grapheme_splits if not any((grapheme.lower() in silent_graphemes) for grapheme in split)]
    if len(non_silent_grapheme_splits) > 0:
        
        non_rare_grapheme_splits = [split for split in non_silent_grapheme_splits if not any((grapheme.lower() in rare_graphemes) for grapheme in split)]
        if len(non_rare_grapheme_splits) > 0:
            return non_rare_grapheme_splits, phonemes
        
        return non_silent_grapheme_splits, phonemes
    
    non_rare_grapheme_splits = [split for split in final_grapheme_splits if not any((grapheme.lower() in rare_graphemes) for grapheme in split)]
    if len(non_rare_grapheme_splits) > 0:
        return non_rare_grapheme_splits, phonemes
    
    return final_grapheme_splits, phonemes

def get_double_vowel_tiebreaker(possible_graphemes):
    double_vowels = ['AA', 'EE', 'II', 'OO', 'UU']
    
    a = possible_graphemes[0]
    b = possible_graphemes[1]
    
    diffA, diffB = get_string_diffs(a, b)
    
    if any(d in double_vowels for d in diffA):
        return a, 0
        
    elif any(d in double_vowels for d in diffB):
        return b, 1
    
    else:
        return possible_graphemes, None
    
def get_string_diffs(stringA, stringB):
    splitA = set(stringA.split(" "))
    splitB = set(stringB.split(" "))

    diffA = splitA.difference(splitB)
    diffB = splitB.difference(splitA)
    
    return diffA, diffB

In [15]:
def add_initial_graphemes_phonemes_to_df(df, phoneme_grapheme_correspondence, silent_graphemes, rare_graphemes):
    num_spellings = []
    all_graphemes = []
    all_phonemes = []
    for i, row in df.iterrows():
        if ((i + 1) % 10000) == 0:
            print('Processed num: {0}'.format(i+1))
        spellings = 0
        word = row['word']
        phoneme_string = row['phonemes']

        num_solutions = 0
        pronunciations = get_all_pronunciations(phoneme_string, phoneme_grapheme_correspondence)

        graphemes_strings = []
        phonemes_strings = []
        for pronunciation in pronunciations:
            graphemes, phonemes = get_graphemes_from_word_and_phonemes(word, pronunciation, phoneme_grapheme_correspondence, silent_graphemes, rare_graphemes)
            if len(graphemes) == 0:
                continue
            spellings += len(graphemes)

            graphemes_string = [' '.join(grapheme) for grapheme in graphemes]
            for g in graphemes_string:
                graphemes_strings.append(g)
                phonemes_strings.append(pronunciation)

        # Logic to override some ambiguous spellings (a lot of trouble with silent E overriding double EE)
        if len(graphemes_strings) == 2:
            winner, tiebreak_index = get_double_vowel_tiebreaker(graphemes_strings)
            if tiebreak_index is not None:
                graphemes_strings = [winner]
                phonemes_strings = [phonemes_strings[tiebreak_index]]
                spellings = 1

        all_graphemes.append('; '.join(graphemes_strings))
        all_phonemes.append('; '.join(phonemes_strings))
        num_spellings.append(spellings)

    df['num_initial_spellings'] = num_spellings
    df['initial_graphemes'] = all_graphemes
    df['initial_phonemes'] = all_phonemes
    
    return df

In [16]:
cmudict_7b_english = add_initial_graphemes_phonemes_to_df(
    cmudict_7b_english,
    phoneme_grapheme_correspondence,
    silent_graphemes,
    rare_graphemes
)

Processed num: 10000
Processed num: 20000
Processed num: 30000


In [17]:
cmudict_7b_english['num_initial_spellings'].value_counts()

1    29532
2     5180
0      551
4      416
3       93
6       14
8       11
5        5
Name: num_initial_spellings, dtype: int64

## Phonetisaurus

**Source:** [Phonetisaurus](https://github.com/AdolfVonKleist/Phonetisaurus), [Aligned CMUDict](https://github.com/ckw017/aligned-cmudict)

**Description:** Adding Phonetisaurus mapped grapheme-phoneme splits to the dataframe.

In [18]:
grapheme_phoneme_split_filename = os.path.join(DATA_DIRECTORY, 'phonetisaurus', 'graphemes_to_phonemes.json')
with open(grapheme_phoneme_split_filename) as f:
    grapheme_phoneme_split = json.load(f)

In [19]:
phonetisaurus_graphemes = []
phonetisaurus_phonemes = []
for word in cmudict_7b_english['word']:
    word = remove_parens_content(word).lower()
    if word in grapheme_phoneme_split:
        phonetisaurus_graphemes.append(' '.join(grapheme_phoneme_split[word]['graphemes']).replace('|', '').upper())
        phonetisaurus_phonemes.append(remove_digits_from_string(' '.join(grapheme_phoneme_split[word]['phonemes']).replace('|', ':')))
    else:
        phonetisaurus_graphemes.append('')
        phonetisaurus_phonemes.append('')

cmudict_7b_english['phonetisaurus_graphemes'] = phonetisaurus_graphemes
cmudict_7b_english['phonetisaurus_phonemes'] = phonetisaurus_phonemes

## m2m Aligner

**Source:** [m2m-aligner](https://github.com/letter-to-phoneme/m2m-aligner)

**Description:** Adding the results of the m2m-aligner for splitting graphemes and phonemes.

In [20]:
m2m = pd.read_csv(
    os.path.join(DATA_DIRECTORY, 'm2m', 'cmudict.txt.m-mAlign.2-2.delX.1-best.conYX.align'),
    encoding = "ISO-8859-1",
    header=None,
    names=['graphemes', 'phonemes'],
    sep="\t",
    keep_default_na=False
)

m2m['word'] = [grapheme.replace('|', '').replace(':', '').upper().strip() for grapheme in m2m['graphemes']]
m2m['phonemes'] = [phoneme.replace('|', ' ').strip() for phoneme in m2m['phonemes']]
m2m['graphemes'] = [grapheme.replace('|', ' ').replace(':', '').upper().strip() for grapheme in m2m['graphemes']]
m2m = m2m.set_index('word')

In [21]:
m2m_graphemes = []
m2m_phonemes = []

for word in cmudict_7b_english['word']:
    word = remove_parens_content(word)
    
    if word in m2m.index:
        m2m_graphemes.append(m2m.loc[word, 'graphemes'])
        m2m_phonemes.append(m2m.loc[word, 'phonemes'])
    else:
        m2m_graphemes.append('')
        m2m_phonemes.append('')

cmudict_7b_english['m2m_graphemes'] = m2m_graphemes
cmudict_7b_english['m2m_phonemes'] = m2m_phonemes

## Defaulting to m2m Graphemes / Phonemes (if initial attempt fails)

In [22]:
with open(os.path.join(DATA_DIRECTORY, 'manual_graphemes_phonemes.json')) as f:
    manual_graphemes_phonemes = json.load(f)

In [23]:
with open(os.path.join(DATA_DIRECTORY, 'words_to_remove.json')) as f:
    words_to_remove = json.load(f)

In [24]:
def add_final_graphemes_phonemes_to_df(df, manual_graphemes_phonemes, words_to_remove):
    indices_to_remove = []

    final_graphemes = []
    final_phonemes = []
    final_num_spellings = []

    for i, row in df.iterrows():
        word = row['word']
        cmu_phonemes = row['phonemes']
        m2m_phonemes = row['m2m_phonemes']
        m2m_graphemes = row['m2m_graphemes']

        if word in words_to_remove:
            if cmu_phonemes == words_to_remove[word]:
                indices_to_remove.append(i)
                continue
        
        if word in manual_graphemes_phonemes:
            final_graphemes.append(manual_graphemes_phonemes[word]['graphemes'])
            final_phonemes.append(manual_graphemes_phonemes[word]['phonemes'])
            final_num_spellings.append(1)
        elif row['num_initial_spellings'] == 1:
            final_graphemes.append(row['initial_graphemes'])
            final_phonemes.append(row['initial_phonemes'])
            final_num_spellings.append(row['num_initial_spellings'])
        elif '_' not in m2m_phonemes:
            final_graphemes.append(m2m_graphemes)
            final_phonemes.append(m2m_phonemes)
            final_num_spellings.append(1)
        elif m2m_phonemes.count('_') == 1 and m2m_phonemes.endswith('_') and m2m_graphemes.endswith(' E'):
            final_graphemes.append(m2m_graphemes[:-2] + 'E')
            final_phonemes.append(m2m_phonemes[:-2])
            final_num_spellings.append(1)
        else:
            final_graphemes.append(row['initial_graphemes'])
            final_phonemes.append(row['initial_phonemes'])
            final_num_spellings.append(row['num_initial_spellings'])

    df = df.drop(indices_to_remove)
    df['final_graphemes'] = final_graphemes
    df['final_phonemes'] = final_phonemes
    df['num_final_spellings'] = final_num_spellings
    
    return df

In [25]:
cmudict_7b_english = add_final_graphemes_phonemes_to_df(
    cmudict_7b_english,
    manual_graphemes_phonemes,
    words_to_remove
)

In [26]:
cmudict_7b_english['num_final_spellings'].value_counts()

1    35735
Name: num_final_spellings, dtype: int64

In [27]:
cmudict_7b_english.to_csv(os.path.join(DATA_DIRECTORY, 'cmu_7b_graphemes_phonemes.csv'), index=False)

## Getting Empirical Grapheme-Phoneme Mappings

In [28]:
def get_grapheme_phoneme_mappings(df, grapheme_colname='final_graphemes', phoneme_colname='final_phonemes'):
    # df needs to have these columns: "word", "final_graphemes", "final_phonemes"

    graphemes_to_phonemes = {}
    phonemes_to_graphemes = {}

    i = 0
    for i, row in df.iterrows():
        word = row['word']
        graphemes = row[grapheme_colname].split(' ')
        phonemes = row[phoneme_colname].split(' ')

        for zipped in zip(graphemes, phonemes):
            grapheme = zipped[0]
            phoneme = remove_digits_from_string(zipped[1])

            # (1) Update grapheme_to_phonemes dictionary
            if grapheme not in graphemes_to_phonemes:
                graphemes_to_phonemes[grapheme] = {}
            if phoneme not in graphemes_to_phonemes[grapheme]:
                 graphemes_to_phonemes[grapheme][phoneme] = {'count': 0, 'examples': [], 'probability': None}
            graphemes_to_phonemes[grapheme][phoneme]['count'] += 1
            if len(word) > 1 and not any_item_startswith(word[0:1], graphemes_to_phonemes[grapheme][phoneme]['examples']):
                graphemes_to_phonemes[grapheme][phoneme]['examples'].append(word)

            # (2) Update phonemes_to_graphemes dictionary
            if phoneme not in phonemes_to_graphemes:
                phonemes_to_graphemes[phoneme] = {}
            if grapheme not in phonemes_to_graphemes[phoneme]:
                phonemes_to_graphemes[phoneme][grapheme] = {'count': 0, 'examples': [], 'probability': None}
            phonemes_to_graphemes[phoneme][grapheme]['count'] += 1
            if len(word) > 1 and not any_item_startswith(word[0:1], phonemes_to_graphemes[phoneme][grapheme]['examples']):
                phonemes_to_graphemes[phoneme][grapheme]['examples'].append(word)
        i += 1
        
    return graphemes_to_phonemes, phonemes_to_graphemes


def check_for_unaccounted_graphemes_phonemes(df, graphemes_to_phonemes, phonemes_to_graphemes, phoneme_grapheme_correspondence):
    unaccounted_phonemes = []
    unaccounted_graphemes = []
    
    print("Unaccounted phonemes:")
    # Checking if there are any unaccounted phonemes in phoneme_to_grapheme_map.json
    for key in phonemes_to_graphemes.keys():
        if key not in phoneme_grapheme_correspondence:
            print(key)
            unaccounted_phonemes.append(key)
    
    # DO NOT CONTINUE IF THERE ARE ANY UNACCOUNTED FOR PHONEMES
    # FIX THEM FIRST
    if len(unaccounted_phonemes) > 0:
        print("WARNING: DO NOT CONTINUE IF THERE ARE ANY UNACCOUNTED FOR PHONEMES - FIX THEM FIRST")
        return unaccounted_phonemes, unaccounted_graphemes
    print()
    
    # Checking if there are any unaccounted graphemes in phoneme_to_grapheme_map.json
    print("Unaccounted graphemes:")
    for phoneme, graphemes in phonemes_to_graphemes.items():
        for grapheme, metadata in graphemes.items():
            if grapheme.lower() not in phoneme_grapheme_correspondence[phoneme]:
                print(phoneme)
                print('    ',grapheme.lower())
                for example in metadata['examples']:
                    example_row = df[df['word'].str.startswith(example)]
                    for i, row in example_row.iterrows():
                        w = row['word']
                        gs = row['final_graphemes']
                        ps = row['final_phonemes']

                        if grapheme in gs and phoneme in ps:
                            unaccounted_graphemes.append(grapheme)
                            print('    "{0}": "{1}",'.format(w, row['phonemes']))
                            print('    "{0}": {{\n        "graphemes": "{1}",\n        "phonemes": "{2}"\n    }},'.format(w, gs, ps))
                            print()
                            
    if len(unaccounted_graphemes) > 0:
        print("WARNING: DO NOT CONTINUE IF THERE ARE ANY UNACCOUNTED FOR GRAPHEMES - FIX THEM FIRST")
        return unaccounted_phonemes, unaccounted_graphemes
    
    return unaccounted_phonemes, unaccounted_graphemes

In [29]:
graphemes_to_phonemes, phonemes_to_graphemes = get_grapheme_phoneme_mappings(cmudict_7b_english)

In [30]:
# Check for any unaccounted phonemes or graphemes in the dataset
# If there are unaccounted graphemes: 
#     (1) If the grapheme is valid, add the grapheme into phoneme_to_grapheme_map.json
#            (A) Rerun the step to reload phoneme_to_grapheme_map.json at the top
#            (B) Rerun the step containing add_initial_graphemes_phonemes_to_df
#            (C) Rerun the section titled "Defaulting to m2m Graphemes / Phonemes (if initial attempt fails)" down to here
#     (2) If the grapheme is invalid, add the correct graphemes and phonemes into manual_graphemes_phonemes.json
#            (A) Rerun the section titled "Defaulting to m2m Graphemes / Phonemes (if initial attempt fails)" down to here
#     (3) If the word is invalid, add it to words_to_remove.json
#            (A) Rerun the section titled "Defaulting to m2m Graphemes / Phonemes (if initial attempt fails)" down to here
unaccounted_phonemes, unaccounted_graphemes = check_for_unaccounted_graphemes_phonemes(
    cmudict_7b_english,
    graphemes_to_phonemes,
    phonemes_to_graphemes,
    phoneme_grapheme_correspondence
)

Unaccounted phonemes:

Unaccounted graphemes:


## Manually Examining Issues With Grapheme & Phoneme Mappings

In [31]:
def get_unaccounted_phoneme_rows(df, unaccounted_phoneme):
    return df[df['final_phonemes'].str.contains(unaccounted_phoneme)][[
        'word',
        'phonemes',
        'initial_phonemes',
        'initial_graphemes',
        'm2m_phonemes',
        'm2m_graphemes',
        'final_phonemes',
        'final_graphemes'
    ]]

def get_unaccounted_grapheme_rows(df, unaccounted_grapheme):
    return df[
        (df['final_graphemes'].str.contains(' '+unaccounted_grapheme+' ')) |
        (df['final_graphemes'].str.startswith(unaccounted_grapheme+' ')) | 
        (df['final_graphemes'].str.endswith(' '+unaccounted_grapheme))
    ][[
        'word',
        'phonemes',
        'initial_phonemes',
        'initial_graphemes',
        'm2m_phonemes',
        'm2m_graphemes',
        'final_phonemes',
        'final_graphemes'
    ]]

# Format rows to be added to the manual_grapheme_phoneme.json file if needed
def format_rows_for_manual_correction(rows):
    for i, row in rows.iterrows():
        word = row['word']
        graphemes = row['final_graphemes']
        phonemes = row['final_phonemes']
        print(
            '"{0}": {{\n    "graphemes": "{1}",\n    "phonemes": "{2}"\n}},'.format(word, graphemes, phonemes)
        )

# Format rows to be added to the words_to_remove.json file if needed
def format_rows_for_exclusion(rows):
    for i, row in rows.iterrows():
        word = row['word']
        phonemes = row['phonemes']
        print(
            '"{0}": "{1}",'.format(word, phonemes)
        )
        
def format_grapheme_to_add_to_mapping(grapheme, df):
    formatted_grapheme = grapheme.lower().strip()
    example = df[df['final_graphemes'].str.contains(grapheme)]['word'].iloc[0].lower()
    print(
        '"{0}": {{\n    "examples": ["{1}"],\n    "count": 1,\n    "very_rare": true,\n    "silent": [""],\n    "syllabic_position": {{\n        "initial": false,\n        "medial": false,\n        "final": false\n    }},\n}}'.format(formatted_grapheme, example)
    )

### Examining Unaccounted for Phonemes

In [32]:
# print(unaccounted_phonemes)
unaccounted_phoneme = '_'
unaccounted_phoneme_rows = get_unaccounted_phoneme_rows(cmudict_7b_english, unaccounted_phoneme)
format_rows_for_manual_correction(unaccounted_phoneme_rows)

In [33]:
format_rows_for_exclusion(unaccounted_phoneme_rows)

### Examining Unaccounted for Graphemes

In [34]:
# print(unaccounted_graphemes)
unaccounted_grapheme = "RRH"
unaccounted_grapheme_rows = get_unaccounted_grapheme_rows(cmudict_7b_english, unaccounted_grapheme)
format_rows_for_manual_correction(unaccounted_grapheme_rows)

"PYRRHIC": {
    "graphemes": "P Y RRH I C",
    "phonemes": "P IH R IH K"
},


In [35]:
format_rows_for_exclusion(unaccounted_grapheme_rows)

"PYRRHIC": "P IH1 R IH0 K",


In [36]:
format_grapheme_to_add_to_mapping(unaccounted_grapheme, cmudict_7b_english)

"rrh": {
    "examples": ["arrhythmia"],
    "count": 1,
    "very_rare": true,
    "silent": [""],
    "syllabic_position": {
        "initial": false,
        "medial": false,
        "final": false
    },
}


### Final Check to See if there are Mismatched Graphemes & Phonemes By Length

In [37]:
for i, row in cmudict_7b_english.iterrows():
    num_phonemes = len(row['final_graphemes'].split(' '))
    num_graphemes = len(row['final_phonemes'].split(' '))
    if num_phonemes != num_graphemes:
        print(row['word'])

## Add Silent _ Into Final Graphemes & Phonemes

In [38]:
def add_faux_silent_item(graphemes, phonemes, phoneme_grapheme_correspondence):
    graphemes = graphemes.split(' ')
    phonemes = phonemes.split(' ')
    
    new_graphemes = []
    new_phonemes = []
    for item in zip(graphemes, phonemes):
        grapheme = item[0]
        phoneme = item[1]
        
        grapheme_metadata = phoneme_grapheme_correspondence[phoneme][grapheme.lower()]
        if "silent_replacement" in grapheme_metadata:
            silent_pattern = grapheme_metadata['silent_replacement'].upper()
            for i, char in enumerate(silent_pattern):
                if char == '_':
                    new_graphemes.append(grapheme[i])
                    new_phonemes.append('_')
                else:
                    new_graphemes.append(grapheme[i])
                    new_phonemes.append(phoneme)
        else:
            new_graphemes.append(grapheme)
            new_phonemes.append(phoneme)
        
    new_grapheme_string = ' '.join(new_graphemes)
    new_phoneme_string = ' '.join(new_phonemes)
    
    return new_grapheme_string, new_phoneme_string

def add_faux_silent_item_to_df(df, phoneme_grapheme_correspondence):
    all_silent_graphemes = []
    all_silent_phonemes = []

    for i, row in df.iterrows():
        graphemes = row['final_graphemes']
        phonemes = row['final_phonemes']
        silent_graphemes, silent_phonemes = add_faux_silent_item(graphemes, phonemes, phoneme_grapheme_correspondence)
        all_silent_graphemes.append(silent_graphemes)
        all_silent_phonemes.append(silent_phonemes)

    df['silent_graphemes'] = all_silent_graphemes
    df['silent_phonemes'] = all_silent_phonemes
    
    return df

In [39]:
cmudict_7b_english = add_faux_silent_item_to_df(cmudict_7b_english, phoneme_grapheme_correspondence)

In [40]:
silent_graphemes_to_phonemes, silent_phonemes_to_graphemes = get_grapheme_phoneme_mappings(
    cmudict_7b_english,
    "silent_graphemes",
    "silent_phonemes"
)

## Write Output to CSV

In [41]:
cmudict_7b_english.to_csv(os.path.join(DATA_DIRECTORY, 'cmu_7b_graphemes_phonemes.csv'), index=False)

In [42]:
with open(os.path.join(DATA_DIRECTORY, 'cmu_7b_graphemes_to_phonemes.json'), 'w') as f:
    json.dump(graphemes_to_phonemes, f)
    
with open(os.path.join(DATA_DIRECTORY, 'cmu_7b_phonemes_to_graphemes.json'), 'w') as f:
    json.dump(phonemes_to_graphemes, f)
    
with open(os.path.join(DATA_DIRECTORY, 'cmu_7b_silent_graphemes_to_phonemes.json'), 'w') as f:
    json.dump(silent_graphemes_to_phonemes, f)
    
with open(os.path.join(DATA_DIRECTORY, 'cmu_7b_silent_phonemes_to_graphemes.json'), 'w') as f:
    json.dump(silent_phonemes_to_graphemes, f)

## Format Output for JavaScript Files

In [45]:
def output_grapheme_counts_js(g2p):
    g2p_counts = []
    for k, v in g2p.items():
        summation = 0
        for i,j in g2p[k].items():
            summation += g2p[k][i]['count']
        num_vowels = 0
        num_chars = 0
        for c in k:
            num_chars += 1
            if c in ['A', 'E', 'I', 'O', 'U']:
                num_vowels += 1
        colors = {}
        if num_vowels == num_chars:
            colors = {"fill": "#e983a0", "light": "#fecad4", "stroke": "#B75470"}
        elif num_vowels > 0:
            colors = {"fill": "#c480e5", "light": "#e8cbf6", "stroke": "#6418b9"}
        else:
            colors = {"fill": "#5b8fcd", "light": "#a9cce4", "stroke": "#3568A4"}
        g2p_counts.append({
            "id": k,
            "count": summation,
            "fill": colors["fill"],
            "light": colors["light"],
            "stroke": colors["stroke"],
        })
    with open(os.path.join(DATA_DIRECTORY, 'grapheme_counts.js'), 'w') as outfile:
        outfile.write("export default ")
        outfile.write(json.dumps(g2p_counts))
        
        
def output_phoneme_counts_js_and_phoneme_to_grapheme_js(p2g, p2gMap):
    manual_corrections = {
        "['c*a*nv', '*a*', 's']": ['canv', '*a*', 's'],
        "['*u*k', '*u*', 'lele']": ['uk', '*u*', 'lele'],
        "['r*e*b', '*e*', 'l']": ['reb', '*e*', 'l'],
        "['porph*y*r', '*y*']": ['porph', '*y*', 'ry'],
        "['c*ou*rage', '*ou*', 's']": ['courage', '*ou*', 's'],
        "['m*es*dam', '*es*']": ['m', '*es*', 'dames'],
        "['*e*ncor', '*e*']": ['*e*', 'ncore'],
        "['l*i*nger', '*i*', 'e']": ['l', '*i*', 'ngerie'],
        "['a*ca*demi', '*ca*', 'lly']": ['academi', '*ca*', 'lly'],
        "['o*c*eani', '*c*']": ['o', '*c*', 'eanic'],
        "['ini*t*ia', '*t*', 'e']": ['ini', '*t*', 'iate'],
        "['m*o*nsign', '*o*', 'r']": ['monsign', '*o*', 'r'],
        "['*j*o', '*j*', 'oba']": ['*j*', 'o', '*j*', 'oba'],
        "['*l*itt', '*l*', 'e']": ['litt', '*le*'],
        "['a*c**c*ountan', '*c*', 'y']": ['accountan', '*c*', 'y'],
        "['semi*a*nnu', '*a*', 'l']": ['semiannu', '*a*', 'l'],
        "['ev*a*cu', '*a*', 'te']": ['evacu', '*a*', 'te'],
        "['*i*ns*i*gn*i*f', '*i*', 'cance']": ['insign', '*i*', 'ficance'],
        "['vign*e*tt', '*e*']": ['vign', '*e*', 'tte']
    }

    p2g_counts = []
    p2g_formatted = {}
    for k, v in p2g.items():
        summation = 0
        for i,j in p2g[k].items():
            summation += p2g[k][i]['count']

        p2g_formatted[k] = []

        if ":" in k:
            colors = {"fill": "#c480e5", "light": "#e8cbf6", "stroke": "#6418b9"}
        elif k[0] in ['A', 'E', 'I', 'O', 'U']:
            colors = {"fill": "#e983a0", "light": "#fecad4", "stroke": "#B75470"}
        else:
            colors = {"fill": "#5b8fcd", "light": "#a9cce4", "stroke": "#3568A4"}
        p2g_counts.append({
            "id": k,
            "count": summation,
            "fill": colors["fill"],
            "light": colors["light"],
            "stroke": colors["stroke"],
        })

        for i,j in p2g[k].items():
            p2g[k][i]["probability"] = float(p2g[k][i]["count"]) / float(summation)

            longest_example =  max(p2gMap[k][i.lower()]["examples"], key=len)
            longest_example = longest_example.replace(i.lower(), "*" + i.lower() + "*" )
            asterisk_count = len(re.findall("\*", longest_example))
            regex_search = re.search(r'(.*)(\*.*\*)(.*)', longest_example)
            longest_example = list(filter(None, regex_search.groups()))
            if asterisk_count > 2:
                longest_example = manual_corrections[str(longest_example)]

            p2g[k][i]["example"] = longest_example
            p2g[k][i]["grapheme"] = i

            p2g_formatted[k].append(p2g[k][i])
        numItems = len(p2g[k].items())

    with open(os.path.join(DATA_DIRECTORY, 'phoneme_to_grapheme.js'), 'w') as outfile:
        outfile.write("export default ")
        outfile.write(json.dumps(p2g_formatted))    

    with open(os.path.join(DATA_DIRECTORY, 'phoneme_counts.js'), 'w') as outfile:
        outfile.write("export default ")
        outfile.write(json.dumps(p2g_counts))

In [47]:
output_grapheme_counts_js(graphemes_to_phonemes)
output_phoneme_counts_js_and_phoneme_to_grapheme_js(phonemes_to_graphemes, phoneme_grapheme_correspondence)