Skip to content
Permalink
Browse files

add JSON data support

  • Loading branch information...
mphilli committed Feb 20, 2019
1 parent 031abc3 commit 0f98981345f91c922f96f69c0ce691e408992254
@@ -31,6 +31,9 @@ markings (ˈ, ˌ) should be retained. Understood arguments are:
* "secondary" - retains secondary stress only
* "both" - to keep both primary and secondary stress markers.

* **mode** : *string, optional (default='sql')* - Accepts "sql" or "json", depending on which version of the database you'd like to use.
Alternatively, JSON users can call the function `jonvert` instead of `convert`; it is the same as calling `convert` with `mode="json"`.

The `ipa_list` function returns a list of each word as a list of all its possible transcriptions. It has all the same
optional `stress_marks` and `keep_punct` parameters as `convert`.
```Python
@@ -53,6 +56,7 @@ The `get_rhymes` function returns a list of rhymes for a word or set of words.
>>> ipa.get_rhymes("rhyming function")
[['climbing', 'diming', 'liming', 'priming', 'timing'], ['compunction', 'conjunction', 'dysfunction', 'injunction', 'junction', 'malfunction']]
```
*Use the `jhymes` function instead to force usage of the JSON database.*

The `syllable_count` function returns an integer, corresponding to the number of syllables in a word. Returns a list of
syllable counts if more than one word is provided in the input string.
@@ -2,4 +2,6 @@
from .stress import *
from .rhymes import *
from .syllables import *
from .transcriber import *

__all__ = ['en_to_ipa', 'rhymes', 'stress', 'syllables']
BIN +0 Bytes (100%) eng_to_ipa/resources/CMU_dict.db
Binary file not shown.

Large diffs are not rendered by default.

@@ -1,5 +1,5 @@
# Simple rhyming support. Call get_rhymes() on a word to find rhymes from the CMU dictionary.
from eng_to_ipa.transcribe import c, get_cmu, preprocess
from eng_to_ipa.transcribe import mode_type, get_cmu, preprocess


def remove_onset(word_in):
@@ -9,15 +9,30 @@ def remove_onset(word_in):
return ' '.join(phone_list[i:])


def get_rhymes(word, mode="sql"):
    """Return the rhymes found in the CMU dictionary for ``word``.

    A dictionary entry counts as a rhyme when one of its phoneme strings ends
    with the value ``remove_onset`` returns for the word, the entry is not the
    word itself, and the pronunciation is not identical to the word's first
    listed pronunciation (same sound, different spelling).

    :param word: a single word, or several whitespace-separated words.
    :param mode: ``"sql"`` or ``"json"`` (case-insensitive); selects the data
        source handed back by ``mode_type``.
    :return: a sorted list of rhymes for a single word, or a list of such
        lists (one per word) when several words are given.
    :raises ValueError: if ``mode`` is neither ``"sql"`` nor ``"json"``.
    """
    backend = mode.lower()
    if backend not in ("sql", "json"):
        raise ValueError(f"mode must be 'sql' or 'json', got {mode!r}")

    tokens = word.split()
    if len(tokens) > 1:
        # Propagate the backend so every word is looked up in the same source.
        return [get_rhymes(token, mode=backend) for token in tokens]

    phones = remove_onset(preprocess(word))
    # Use the selected backend here too, so JSON mode never touches sqlite.
    phones_full = get_cmu([preprocess(word)], db_type=backend)[0][0]

    if backend == "sql":
        cursor = mode_type(backend)
        try:
            # Bound parameters instead of string interpolation: the word comes
            # straight from the caller and may contain quote characters.
            cursor.execute(
                "SELECT word, phonemes FROM dictionary WHERE phonemes LIKE ? "
                "AND NOT word = ? "       # don't count word as its own rhyme
                "AND NOT phonemes = ?",   # skip homophones spelled differently
                (f"%{phones}", word, phones_full),
            )
            rows = cursor.fetchall()
        finally:
            # mode_type opens a fresh connection on every call; release it.
            cursor.connection.close()
        return sorted({row[0] for row in rows})

    rhymes = {
        key
        for key, pronunciations in mode_type(backend).items()
        if key != word
        and any(p.endswith(phones) and p != phones_full for p in pronunciations)
    }
    return sorted(rhymes)


def jhymes(word):
    """Shortcut for ``get_rhymes`` that always uses the JSON data source."""
    json_mode = "json"
    return get_rhymes(word, json_mode)


if __name__ == "__main__":
@@ -30,9 +30,9 @@ def cmu_syllable_count(word):
return nuclei


def syllable_count(word: str, db_type="sql"):
    """Transcribe a regular word to CMU and return its syllable count.

    :param word: a single word, or several whitespace-separated words.
    :param db_type: data source passed through to ``transcribe.get_cmu``
        (``"sql"`` or ``"json"``).
    :return: the value of ``cmu_syllable_count`` for the word's first listed
        transcription, or a list of such values (one per word) when several
        words are given.
    """
    tokens = word.split()
    if len(tokens) > 1:
        # Forward db_type so each word uses the backend the caller selected.
        return [syllable_count(token, db_type=db_type) for token in tokens]
    cmu = transcribe.get_cmu([transcribe.preprocess(word)], db_type=db_type)
    return cmu_syllable_count(cmu[0][0])
@@ -2,11 +2,20 @@
import re
from os.path import join, abspath, dirname
import eng_to_ipa.stress as stress
import sqlite3
from collections import defaultdict

conn = sqlite3.connect(join(abspath(dirname(__file__)), "./resources/CMU_dict.db"))
c = conn.cursor()

def mode_type(mode_in):
    """Return the data source for the requested mode.

    :param mode_in: ``"sql"`` or ``"json"`` (case-insensitive).
    :return: for ``"sql"``, a cursor on a newly opened sqlite connection to
        ``resources/CMU_dict.db``; for ``"json"``, the object parsed from
        ``resources/CMU_dict.json``.
    :raises ValueError: if ``mode_in`` is neither ``"sql"`` nor ``"json"``.
    """
    mode = mode_in.lower()
    here = abspath(dirname(__file__))
    if mode == "sql":
        import sqlite3
        # The cursor keeps a reference to its connection (cursor.connection),
        # so callers can close it when they are done.
        conn = sqlite3.connect(join(here, "./resources/CMU_dict.db"))
        return conn.cursor()
    if mode == "json":
        import json
        # Context manager so the file handle is released once parsing is done.
        with open(join(here, "../eng_to_ipa/resources/CMU_dict.json"), encoding="UTF-8") as json_file:
            return json.load(json_file)
    raise ValueError(f"mode must be 'sql' or 'json', got {mode_in!r}")


def preprocess(words):
@@ -52,20 +61,28 @@ def _punct_replace_word(original, transcription):
return transcription


def fetch_words(words_in, db_type="sql"):
    """Fetch a list of words from the selected data source.

    :param words_in: list of words to look up.
    :param db_type: ``"sql"`` or ``"json"`` (case-insensitive).
    :return: a list of ``(word, phoneme_list)`` pairs, one per word that was
        found; words that are not present are omitted.
    :raises ValueError: if ``db_type`` is neither ``"sql"`` nor ``"json"``.
    """
    backend = db_type.lower()
    if backend not in ("sql", "json"):
        raise ValueError(f"db_type must be 'sql' or 'json', got {db_type!r}")

    asset = mode_type(backend)

    if backend == "json":
        # Direct key lookups instead of scanning every dictionary entry;
        # dict.fromkeys de-duplicates the request while keeping its order.
        return [(w, asset[w]) for w in dict.fromkeys(words_in) if w in asset]

    placeholders = ", ".join("?" * len(words_in))
    try:
        asset.execute(
            f"SELECT word, phonemes FROM dictionary WHERE word IN ({placeholders})",
            words_in,
        )
        rows = asset.fetchall()
    finally:
        # mode_type opens a fresh connection on every call; release it.
        asset.connection.close()

    # A word may have several rows (one per pronunciation): group them.
    grouped = defaultdict(list)
    for found_word, phonemes in rows:
        grouped[found_word].append(phonemes)
    return list(grouped.items())


def get_cmu(tokens_in, db_type="sql"):
"""query the SQL database for the words and return the phonemes in the order of user_in"""
result = fetch_words(tokens_in)
result = fetch_words(tokens_in, db_type)
ordered = []
for word in tokens_in:
this_word = [[i[1] for i in result if i[0] == word]][0]
@@ -152,36 +169,41 @@ def get_all(ipa_list):
return sorted([sent[:-1] for sent in list_all])


def ipa_list(words_in, keep_punct=True, stress_marks='both', db_type="sql"):
    """Return a list of all the discovered IPA transcriptions for each word.

    :param words_in: a string (split on whitespace) or an iterable of words.
    :param keep_punct: when true, the result is passed through
        ``_punct_replace_word`` together with the punctuation-split words.
    :param stress_marks: forwarded to ``cmu_to_ipa`` as ``stress_marking``.
    :param db_type: data source forwarded to ``get_cmu``.
    """
    # isinstance (rather than an exact type comparison) so str subclasses are
    # also split into words instead of being iterated character by character.
    tokens = words_in.split() if isinstance(words_in, str) else words_in
    words = [preserve_punc(token.lower())[0] for token in tokens]
    cmu = get_cmu([w[1] for w in words], db_type=db_type)
    ipa = cmu_to_ipa(cmu, stress_marking=stress_marks)
    if keep_punct:
        ipa = _punct_replace_word(words, ipa)
    return ipa


def isin_cmu(word, db_type="sql"):
    """Check whether a word is in the CMU dictionary. Doesn't strip punctuation.

    If given more than one word, returns True only if all words are present.

    :param word: a string (split on whitespace, each piece run through
        ``preprocess``) or a list of words, which is looked up as given.
    :param db_type: data source forwarded to ``fetch_words``.
    :return: True when the number of distinct words found equals the number
        of distinct words requested.
    """
    # isinstance (rather than an exact type comparison) so str subclasses are
    # treated as text too.
    if isinstance(word, str):
        word = [preprocess(w) for w in word.split()]
    found = {entry[0] for entry in fetch_words(word, db_type)}
    return len(found) == len(set(word))


def convert(text, retrieve_all=False, keep_punct=True, stress_marks='both', mode="sql"):
    """Convert English text (a string or a list of words) to IPA.

    The transcriptions come from ``ipa_list``; they are passed to ``get_all``
    when ``retrieve_all`` is true and to ``get_top`` otherwise.
    """
    transcriptions = ipa_list(
        words_in=text,
        keep_punct=keep_punct,
        stress_marks=stress_marks,
        db_type=mode,
    )
    if not retrieve_all:
        return get_top(transcriptions)
    return get_all(transcriptions)


def jonvert(text, retrieve_all=False, keep_punct=True, stress_marks='both'):
    """Same as ``convert``, but always reads phoneme data in JSON mode."""
    return convert(
        text,
        retrieve_all=retrieve_all,
        keep_punct=keep_punct,
        stress_marks=stress_marks,
        mode="json",
    )
@@ -0,0 +1,26 @@
# 2/19 - not yet implemented

import re
from os.path import join, abspath, dirname
import eng_to_ipa.stress
import sqlite3
from collections import defaultdict


class Transcriber:
    """Work-in-progress holder for a data-source mode and stress setting.

    ``mode`` stores the lower-cased mode name; ``c`` holds an sqlite cursor
    when the mode is "sql" and stays ``None`` otherwise.
    """

    def __init__(self, mode="sql", stress="both"):
        # Initialise the cursor slot BEFORE assigning _mode: the setter may
        # install a cursor, and resetting it afterwards would discard it.
        self.c = None  # potential SQL cursor
        self.mode = ""
        self._mode = mode
        self.stress = stress

    @property
    def _mode(self):
        """The currently selected mode name."""
        return self.mode

    @_mode.setter
    def _mode(self, value):
        normalized = value.lower()
        if normalized == "sql":
            conn = sqlite3.connect(join(abspath(dirname(__file__)), "./resources/CMU_dict.db"))
            self.c = conn.cursor()
        # Remember what was requested so the getter reports the real mode.
        self.mode = normalized
@@ -0,0 +1,34 @@
import json
import re
from os.path import join, abspath, dirname


def create_json(source_path=None, json_path=None):
    """Build the JSON pronunciation dictionary from the CMU source file.

    Each non-blank, non-comment source line is read as a word followed by its
    phonemes. Alternate pronunciations marked ``word(1)``, ``word(2)``, ... are
    appended to the entry of the base word. The result, a mapping of
    lower-cased word to a list of lower-cased phoneme strings, is written as
    JSON.

    :param source_path: CMU dictionary text file; defaults to
        ``resources/CMU_source_files/cmudict-0.7b.txt`` in the package.
    :param json_path: output file; defaults to ``resources/CMU_dict.json``
        in the package.
    """
    here = abspath(dirname(__file__))
    if source_path is None:
        # Forward slashes work on every platform; the previous backslash path
        # only resolved on Windows and relied on invalid escape sequences.
        source_path = join(here, "../eng_to_ipa/resources/CMU_source_files/cmudict-0.7b.txt")
    if json_path is None:
        json_path = join(here, "../eng_to_ipa/resources/CMU_dict.json")

    variant_marker = re.compile(r"\(\d\)")
    data_dict = {}
    with open(source_path, "r", encoding="UTF-8") as source_file:
        for line in source_file:
            entry = line.strip()
            # Skip blank lines and ";;;" comment lines of the CMU format.
            if not entry or entry.startswith(";;;"):
                continue
            # The word is everything up to the first space; the remainder
            # (minus the separator padding) is the phoneme string.
            word_token, _, rest = entry.partition(" ")
            word_token = word_token.lower()
            word = variant_marker.sub("", word_token)
            phonemes = rest.strip().lower()
            if word_token != word and word in data_dict:
                # already encountered, append
                data_dict[word].append(phonemes)
            else:
                data_dict[word] = [phonemes]

    with open(json_path, "w", encoding="UTF-8") as j_file:
        json.dump(data_dict, j_file)


if __name__ == "__main__":
    create_json()
    # small test to verify valid database creation:
    # (context manager so the file handle is closed once the data is loaded)
    with open(join(abspath(dirname(__file__)), "../eng_to_ipa/resources/CMU_dict.json"), encoding="UTF-8") as json_file:
        json_obj = json.load(json_file)
    # print every pronunciation of every word containing "rose"
    for key, val in json_obj.items():
        if "rose" in key:
            for v in val:
                print(key, v)
@@ -38,6 +38,6 @@ def insert_dictionary_values():
create_dictionary_table()
insert_dictionary_values()
# small test to verify valid database creation:
c.execute("SELECT * FROM dictionary WHERE word like \"%rose%\"")
c.execute("SELECT * FROM dictionary WHERE word like \"%the%\"")
for r in c.fetchall():
print(str(r))

0 comments on commit 0f98981

Please sign in to comment.
You can’t perform that action at this time.