In [1]:
# !pip install pronouncing



In [2]:
# !pip install Metaphone

Collecting Metaphone
  Downloading Metaphone-0.6.tar.gz (14 kB)
  Preparing metadata (setup.py) ... done
Building wheels for collected packages: Metaphone
  Building wheel for Metaphone (setup.py) ... done
  Created wheel for Metaphone: filename=Metaphone-0.6-py3-none-any.whl size=13902 sha256=018fc34d9085ec7a233d64dee2185133aeeb18655454a66ca82ce9bf5d9f2cc4
  Stored in directory: /home/byczong/.cache/pip/wheels/23/dd/1d/6cdd346605db62bde1f60954155e9ce48f4681c243f265b704
Successfully built Metaphone
Installing collected packages: Metaphone
Successfully installed Metaphone-0.6


In [1]:
import pronouncing
from metaphone import doublemetaphone
import re

### Finding rhymes

- Goal: Find set of words (unigrams) that rhyme with a given word

- Rough algorithm plan:
    - Input: `s`: str
    - Expand any abbreviations, symbols and numbers in `s` ('mr' -> 'mister', '%' -> 'percent')
    - Convert with the chosen phonetic algorithm (Soundex, Metaphone ...) to phonetic form (as pronounced in English)
    - Find similar, existing words (from unigrams file) based on matching suffixes (need to choose the appropriate suffix length)

- https://en.wikipedia.org/wiki/Soundex
    - (1918)
- https://en.wikipedia.org/wiki/Metaphone
    - Metaphone (1990)
    - Double Metaphone (2000)
    - Metaphone 3 (2009) - commercial product

**CMU ARPABET dictionary**

In [3]:
# Look up the CMU dictionary pronunciation(s) of a word as ARPABET phone strings.
pronouncing.phones_for_word("sighs")

['S AY1 Z']

In [4]:
# List the words the library reports as rhyming with the given word.
pronouncing.rhymes("failings")

['mailings', 'railings', 'tailings']

**Double Metaphone**

- does not take vowels (other than the first character) into account :(
- it can return both a primary and a secondary code for a string

In [5]:
def comp(t1, t2):
    """Print two values on consecutive lines so they can be compared by eye.

    Each argument is rendered with its default string formatting; the first
    goes on one line and the second on the line directly below it.
    Nothing is returned.
    """
    rendered = [format(value) for value in (t1, t2)]
    print("\n".join(rendered))

In [6]:
# Rhyming words can still get different codes; 'smell' also yields a secondary code.
comp(doublemetaphone("shell"), doublemetaphone("smell"))

('XL', '')
('SML', 'XML')


In [7]:
# The codes of 'degenerate' are those of 'generate' with an extra leading 'T'.
comp(doublemetaphone("generate"), doublemetaphone("degenerate"))

('JNRT', 'KNRT')
('TJNRT', 'TKNRT')


In [20]:
# Only consonant skeletons remain in the codes; the vowels of both words are gone.
comp(doublemetaphone("mykola"), doublemetaphone("cocacola"))

('MKL', '')
('KKKL', '')


In [21]:
# Changing the final vowel ('mykola' -> 'mykole') leaves the code unchanged.
comp(doublemetaphone("mykole"), doublemetaphone("cocacola"))

('MKL', '')
('KKKL', '')


**Example "rhyme representation" algorithm**

- Input string s
- m = doublemetaphone(s)
- result = insert the vowels from s inside m  # (todo: how)
- to find the set of rhymes, match words whose results share the same suffix  # (todo: how long should the suffix be)

In [9]:
# Print the word above its primary Double Metaphone code ([0] selects the primary).
comp('generate',
     doublemetaphone('generate')[0])
# Hand-written target form: the code's consonants with the word's vowels re-inserted.
# NOTE(review): the printed primary code is 'JNRT' (no leading 'T'), so the intended
# literal may be 'JeNeRaTe' — confirm.
'TJeNeRaTe'

generate
JNRT


'TJeNeRaTe'

In [10]:
# Print the word above its primary Double Metaphone code ([0] selects the primary).
comp('failings',
     doublemetaphone('failings')[0])
# Hand-written target form: the code's consonants with the word's vowels re-inserted.
'FaiLiNKS'

failings
FLNKS


'FaiLiNKS'

In [12]:
# NOTE(review): the full (primary, secondary) tuple is passed here, whereas the
# 'generate' and 'failings' cells pass element [0] — confirm which is intended.
comp("mykola", 
     doublemetaphone("mykola"))

# Hand-written target form: the code's consonants with the word's vowels re-inserted.
'MyKoLa'

mykola
('MKL', '')


'MyKoLa'

In [16]:
# NOTE(review): the full (primary, secondary) tuple is passed here, whereas the
# 'generate' and 'failings' cells pass element [0] — confirm which is intended.
comp("mass", 
     doublemetaphone("mass"))

# Hand-written target form; the doubled 's' appears once, as in the printed code 'MS'.
'MaS'

mass
('MS', '')


'MaS'

**ARPABET model?**

In [1]:
!pip install g2p_en

Collecting g2p_en
  Downloading g2p_en-2.1.0-py3-none-any.whl (3.1 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 3.1/3.1 MB 13.3 MB/s eta 0:00:00
Collecting nltk>=3.2.4 (from g2p_en)
  Downloading nltk-3.8.1-py3-none-any.whl (1.5 MB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 1.5/1.5 MB 35.6 MB/s eta 0:00:00
Collecting inflect>=0.3.1 (from g2p_en)
  Downloading inflect-6.0.4-py3-none-any.whl (34 kB)
Collecting distance>=0.1.3 (from g2p_en)
  Downloading Distance-0.1.3.tar.gz (180 kB)
     ━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━ 180.3/180.3 kB 39.0 MB/s eta 0:00:00
  Preparing metadata (setup.py) ... done
Collecting pydantic>=1.9.1 (from inflect>=0.3.1->g2p_en)
  Downloading pydantic-1.10.8-cp310-cp310-manylinux_2_17_x86_64.manylinux201

In [30]:
from time import perf_counter
from g2p_en import G2p


# Build the grapheme-to-phoneme converter once, outside the timed region.
g2p = G2p()

# Time only the conversion itself: the second timestamp is taken before
# printing so console I/O does not inflate the measured latency.
t1 = perf_counter()
out = g2p("qwertyuiop")
t2 = perf_counter()

print(out)
print(f'{t2-t1} s')

['K', 'W', 'ER1', 'T', 'IY0', 'AH0', 'W', 'AH0', 'M']
0.017724068999996234 s


In [23]:
# Regex search over pronunciations: words whose phones end in "N IH0 S T"; show the first five.
pronouncing.search('N IH0 S T$')[:5]

['abortionist', 'africanist', 'agonist', 'anticommunist', 'arsonist']

In [25]:
# Regex search over pronunciations: all words whose phones end in "AA1 L ER0 Z".
pronouncing.search('AA1 L ER0 Z$')

['ahlers',
 'collars',
 "collor's",
 "controllers'",
 "dollar's",
 'dollars',
 "dollars'",
 'hollars',
 'hollers',
 "mahler's",
 "scholar's",
 'scholars',
 'sollars',
 'sollers',
 'wahlers',
 'zollars']

**Modify double metaphone to preserve vowels**