Import relevant packages

In [None]:
import numpy as np
import pandas as pd

#CMU pronunciation (phoneme) dictionary
import cmudict

#Natural Language Toolkit
import nltk
from nltk import word_tokenize
from nltk import pos_tag
from nltk.stem import WordNetLemmatizer
#shared WordNet lemmatizer instance, reused by the lemmatization and rhyme-finder cells
lemmas = WordNetLemmatizer()

#NLTK data requested for this notebook:
#--'punkt' / 'punkt_tab': tokenizer data for word_tokenize
#--'averaged_perceptron_tagger' / 'averaged_perceptron_tagger_eng': tagger data for pos_tag
#--'wordnet': data behind WordNetLemmatizer
#--'cmudict': NLTK's copy of the CMU pronouncing dictionary
#Recent NLTK releases load the '_tab' / '_eng' resources while older releases load the
#original names, so both sets are requested; fetching a resource that the installed
#version does not use is harmless.
for resource in ('cmudict',
                 'punkt',
                 'punkt_tab',
                 'averaged_perceptron_tagger',
                 'averaged_perceptron_tagger_eng',
                 'wordnet'):
    nltk.download(resource)

Word tokenization from raw text string

In [None]:
#raw sentence to be split into tokens
text='Good afternoon everyone, happy halloween!'

#tokenize the sentence; the result is reused by the tagging cell that follows
tokens=word_tokenize(text)

print(tokens)

Part-of-speech tagging tokenized text

In [None]:
#attach a part-of-speech tag to every token
#(FULL LIST OF TAGS: https://www.ling.upenn.edu/courses/Fall_2003/ling001/penn_treebank_pos.html)
tagged=pos_tag(tokens)

print(tagged)

Context-dependent POS tagging 

In [None]:
#two sentences containing "jump":
#--first sentence:  the first "jump" is used as a VERB
#--second sentence: "jump" is used as a NOUN
for text in ('Matt can jump a really high jump.',
             'That was a high jump.'):

    tokens=word_tokenize(text)
    tagged=pos_tag(tokens)

    #show the (token, tag) pair for the first "jump" in the sentence
    jump_position=tokens.index('jump')
    print(tagged[jump_position])



Word lemmatization by syntactic category

In [None]:
#part-of-speech keys passed to the lemmatizer
#Noun:      'n'
#Adjective: 'a'
#Verb:      'v'

#lemmatize one example word for each syntactic category
for example_word,pos_key in (('biggest','a'),
                             ('giraffes','n'),
                             ('fighting','v')):
    print(lemmas.lemmatize(example_word,pos_key))

Lexical frequencies with downloadable SUBTLEX database

In [None]:
#load the SUBTLEX spreadsheet into a DataFrame
subtlex = pd.read_excel('SUBTLEX.xlsx')

#words, in spreadsheet row order
subtlex_wordlist=list(subtlex['Word'])

#Log10 word frequency for each row
freqs=np.array(subtlex['Lg10WF'])

#map each word to its Log10 word frequency
word_freqs={entry:log_freq for entry,log_freq in zip(subtlex_wordlist,freqs)}

In [None]:
#look up the Log10 frequency of a few example words
for probe in ('the','unscrupulous','ghost'):
    print(word_freqs[probe])

Get phonetic information with importable CMU phoneme dictionary

In [None]:
#load the CMU phoneme dictionary (indexed by word below)
phonemes=cmudict.dict()

#take the first pronunciation listed for the word
word='halloween'
phons=phonemes[word][0]
print(phons)

#count syllables: keep only the phonemes that contain a digit (0, 1, or 2)
syl_phons=[]
for ph in phons:
    if any(char.isdigit() for char in ph):
        syl_phons.append(ph)

num_syls=len(syl_phons)
print(num_syls)

A simple rhyme finder with part-of-speech, plurality, syllable and lexical frequency constraints

In [None]:
#TARGET CRITERIA
#--Tagged as a noun by pos_tag
#--Singular noun
#--Syllables ≥ 2
#--Lexical frequency ≥ 2.0 (Log10 SUBTLEX frequency)
#--Last two phonemes match the target word's

#set target word
word1='moon'

#get target phonemes (first listed pronunciation)
phon1=phonemes[word1][0]

#strip numerals from phonemes
phon1=[''.join(c for c in ph if not c.isdigit()) for ph in phon1]

#the rhyme test compares the final two phonemes, so the target needs at least two;
#fail here with a clear message rather than with an IndexError inside the loop
if len(phon1) < 2:
    raise ValueError('target word needs at least two phonemes: '+word1)

#ending that a candidate must share with the target
rhyme_tail=phon1[-2:]

#loop through the phoneme dictionary and print every word that meets all criteria.
#The criteria are independent yes/no filters, so they are applied cheapest-first
#(dictionary lookups, phoneme comparison, then the lemmatizer and tagger calls):
#the same words are printed in the same order, but the tagger runs on far fewer words.
for word2 in phonemes:

    #a word is not a rhyme for itself
    if word2 == word1:
        continue

    #continue loop if word has frequency < 2.0
    if word2 not in word_freqs or word_freqs[word2] < 2.0:
        continue

    #get word2 phonemes
    phon2=phonemes[word2][0]

    #count syllables (phonemes that contain a digit)
    syl_phons=[ph for ph in phon2 if any(char.isdigit() for char in ph)]
    num_syls=len(syl_phons)

    #continue if word has less than 2 syllables
    if num_syls < 2:
        continue

    #strip numerals from phonemes
    phon2=[''.join(c for c in ph if not c.isdigit()) for ph in phon2]

    #continue unless the last two phonemes match the target's
    #(a slice comparison also rejects words with fewer than two phonemes)
    if phon2[-2:] != rhyme_tail:
        continue

    #continue loop if not a singular noun
    if word2 != lemmas.lemmatize(word2,'n'):
        continue

    #continue loop if not tagged as a noun (the word is tagged on its own, without context)
    if 'NN' not in pos_tag([word2])[0][1]:
        continue

    print(word2)
          

What's wrong with the rhyme finder's output? We can get more accurate part-of-speech info from SUBTLEX.

In [None]:
#get all possible parts of speech for each word
all_POS=np.array(subtlex['All_PoS_SUBTLEX'])

#get the frequency recorded for each of those parts of speech
all_POS_freqs=np.array(subtlex['All_freqs_SUBTLEX'])

#build, for every row, a list of [part of speech, proportion of occurrences] pairs
pos_freq_list=[]
for pos,pos_freqs in zip(all_POS,all_POS_freqs):

    #if word has multiple possible parts of speech ('.'-separated labels).
    #The test looks at the label cell, which is the one being split into labels;
    #a frequency cell can contain '.' for other reasons (e.g. if it was read in as a number).
    if isinstance(pos,str) and '.' in pos:

        #split lists into items; str() also covers a frequency cell that is not text
        pos=pos.split('.')
        pos_freqs=str(pos_freqs).split('.')

        #change freqs from string to float
        pos_freqs=[float(pf) for pf in pos_freqs]

        #normalize pos frequencies to proportions that sum to 1
        #(total is computed once; an all-zero total gives 0.0 proportions instead of NaN)
        total=sum(pos_freqs)
        pos_freqs=[pf/total if total else 0.0 for pf in pos_freqs]

        #add to list
        word_pos_freqs=[[p,f] for p,f in zip(pos,pos_freqs)]

    #if word has 1 possible part of speech, set proportion to 1 (i.e. 100%)
    else:
        word_pos_freqs=[[pos,1]]

    pos_freq_list.append(word_pos_freqs)

#set word part-of-speech dict
word_POS_freqs=dict(zip(subtlex_wordlist,pos_freq_list))


print(word_POS_freqs['jump'])

Try the rhyme finder again, targeting words that are nouns > 50% of the time

In [None]:
#TARGET CRITERIA
#--Noun ≥ 50% of occurrences (SUBTLEX part-of-speech proportions)
#--Singular noun
#--Syllables ≥ 2
#--Lexical frequency ≥ 2.0 (Log10 SUBTLEX frequency)
#--Last two phonemes match the target word's

#set target word
word1='moon'

#get target phonemes (first listed pronunciation)
phon1=phonemes[word1][0]

#strip numerals from phonemes
phon1=[''.join(c for c in ph if not c.isdigit()) for ph in phon1]

#the rhyme test compares the final two phonemes, so the target needs at least two;
#fail here with a clear message rather than with an IndexError inside the loop
if len(phon1) < 2:
    raise ValueError('target word needs at least two phonemes: '+word1)

#ending that a candidate must share with the target
rhyme_tail=phon1[-2:]

#loop through the phoneme dictionary and print every word that meets all criteria.
#The criteria are independent yes/no filters, so they are applied cheapest-first
#(dictionary lookups, phoneme comparison, then the lemmatizer call):
#the same words are printed in the same order, with fewer lemmatizer calls.
for word2 in phonemes:

    #a word is not a rhyme for itself
    if word2 == word1:
        continue

    #continue loop if word has frequency < 2.0
    if word2 not in word_freqs or word_freqs[word2] < 2.0:
        continue

    #################################################NEW CODE
    #continue loop if word has no part-of-speech entry
    if word2 not in word_POS_freqs:
        continue

    #separate the part-of-speech labels from their proportions
    pos_freqs=word_POS_freqs[word2]
    pf_pos=[pf[0] for pf in pos_freqs]
    pf_freq=[pf[1] for pf in pos_freqs]

    #continue loop if word is never a noun, or is a noun in under 50% of occurrences
    if 'Noun' not in pf_pos or pf_freq[pf_pos.index('Noun')] < 0.50:
        continue
    #################################################NEW CODE

    #get word2 phonemes
    phon2=phonemes[word2][0]

    #count syllables (phonemes that contain a digit)
    syl_phons=[ph for ph in phon2 if any(char.isdigit() for char in ph)]
    num_syls=len(syl_phons)

    #continue if word has less than 2 syllables
    if num_syls < 2:
        continue

    #strip numerals from phonemes
    phon2=[''.join(c for c in ph if not c.isdigit()) for ph in phon2]

    #continue unless the last two phonemes match the target's
    #(a slice comparison also rejects words with fewer than two phonemes)
    if phon2[-2:] != rhyme_tail:
        continue

    #continue loop if not a singular noun
    if word2 != lemmas.lemmatize(word2,'n'):
        continue

    print(word2)