In [1]:
# Author: Christopher Dare
# Date: 29th October 2020

# Evaluation of Rhymes produced by YouShen poetry model

## Plan of Attack
We need to break each word down into phonemes (graphemes).
Then, we will evaluate how similar the 2 words are by rhyme. (This is quite complex).

## Rhyme evaluation
For any 2 lines of a poem, we consider the last word in each line to score their rhyme similarity.
We can classify two lines as rhyme vs not rhyme, as well as quantify how much they rhyme with each other. (more on this needs to be thought through).

Consider the following poem line pairs:  

*The way he cleaned the bowl was thorough*  
*And then he came up to ask for more*

*In the morning Cinderella would fetch water,  
At night she was the dazzling queen of the gala*

*Everytime the sun rises to shine  
I am reminded of you, beautiful love of mine*

*My brother never trusted santa claus  
So his name always fostered banter wars*

From the pairs above, we can see that there is variable length of phonemes that consitute a valid rhyme. These can range from one phoneme in for a single word to many phonemes in one word...and even many phonemes in phrases.

Key question: What's the best mathematical way of determining that both words rhyme?


## Identifying phonemes
Phonemes can be identified from pronunciation dictionary mappings or prediction models.
Two promising packages with rich datasets that could prove useful for this projects are
1. CMU pronouncing dictionary (potentially faster since it's just a dictionary)
2. Big Phoney (Has preprocessing capabilities. Could be slower since it relies on a predictive model for words not existent in a dicitonary)
Other packages can be found here [on github](https://github.com/topics/phonetics)

To get quickly off the ground, we will use BigPhoney

In [127]:
# imports
from pathlib import Path
from typing import List, Union

In [None]:
!git clone https://github.com/repp/big-phoney.git big_phoney -q 
!cd big_phoney && python setup.py install

In [None]:
!pip install keras

In [None]:
import keras
from big_phoney.big_phoney import BigPhoney
phoney = BigPhoney()
## Looks like the packages has dependency management issues. I'll come back to the code later.

In [2]:
import pronouncing
pronouncing.phones_for_word("permit")

['P ER0 M IH1 T', 'P ER1 M IH2 T']

In [14]:
sample_corpus = "\
The way he cleaned the bowl was thorough\
And then he came up to ask for more\
In the morning Cinderella would fetch water,\
At night she was the dazzling queen of the gala\
Everytime the sun rises to shine\
I am reminded of you, beautiful love of mine\
My brother never trusted santa claus\
So his name always fostered banter wars\
"

In [155]:
with open(Path("samples/sample_rhyme.txt")) as rhyme_sample:
    sample_corpus = rhyme_sample.read()

In [156]:
sample_corpus

"this is all about grandma who's proud\n\nof her years in society's crowd\n\nshe has got a big raise\n\nin those fungal-type ways"

In [7]:
valid_lines = [line for line in sample_corpus.splitlines() if line]
valid_lines

['The way he cleaned the bowl was thorough',
 'And then he came up to ask for more',
 'In the morning Cinderella would fetch water,',
 'At night she was the dazzling queen of the gala',
 'Everytime the sun rises to shine',
 'I am reminded of you, beautiful love of mine',
 'My brother never trusted santa claus',
 'So his name always fostered banter wars']

In [8]:
last_words = [line.split()[-1] for line in valid_lines]
last_words

['thorough', 'more', 'water,', 'gala', 'shine', 'mine', 'claus', 'wars']

### Pseudocode for scoring rhymes
for each poem:
    break down into verses
    for each verse, break down into lines
    
represent each word in every line as a set of phonemes
reverse the phoneme order for each word

for each pair of phonemes in a pair of lines to compare:
    get the max number (n_max) of syllables for all reversed order phonemes
    trim each phoneme set to the first n_max phonemes
    create an empty list of rhyming scores all possible phoneme pairs
    for each possible phoneme set A construction:
        for each possible phoneme set B construction:
        compare each corresponding phoneme. If they match, assign a one, else 0.
    



In [188]:
class Poem:
    
    def __init__(self, text: str, rhyme_patterns: List):
        self.word_lines = [line for line in text.splitlines() if line]
        
        last_words = [line.split()[-1] for line in self.word_lines]
        self.last_word_rhyming_part_pairs = {word:self.__get_rhyming_parts(word) for word in last_words}
        self.rhyme_patterns = rhyme_patterns
 
    def __get_phonemes(self, text:Union[str, List]):
        """returns all possible pronunciation of a word as phonemes
        Language used: American English. Style: Arpabet
        """
        if type(text) == str:
            phonemes = pronouncing.phones_for_word(text)
        else:
            phonemes = [pronouncing.phones_for_word(word) for word in text]
        return phonemes
    
    def __get_rhyming_parts(self, word:str):
        phonemes = self.__get_phonemes(word)
        rhyming_parts = [pronouncing.rhyming_part(phoneme) for phoneme in phonemes]
        return rhyming_parts
    
    
    def __get_valid_rhyme_patterns(self):
        valid_patterns = [pattern for pattern in self.rhyme_patterns if 
            not any(i > len(self.word_lines)-1 for i in pattern)
        ]
        return valid_patterns
    
    
    def score(self, line_pair: List):
        print("scoring")
        first_word = self.word_lines[line_pair[0]]
        first_word_rhymes = self.__get_rhyming_parts(first_word) 
        second_word = self.word_lines[line_pair[1]]
        second_word_rhymes = self.__get_rhyming_parts(second_word)
        rhyme_score = 0
        for first_word_rhyme in first_word_rhymes:
            for second_word_rhyme in second_word_rhymes:
                is_rhyming = first_word_rhyme == second_word_rhyme
                print(is_rhyming)
                if (is_rhyming):
                    rhyme_score = 1
        return rhyme_score
    
        
    def get_rhyme_score(self):
        """returns a rhyming score for the poem between 0 and 1.
        """
        valid_patterns = self.__get_valid_rhyme_patterns()
        scores = [self.score(pattern) for pattern in valid_patterns]
        return scores
        
            

In [189]:
limerick_pattern = [
    [0,1], [2,3], [0,4]
]
limerick = Poem(sample_corpus, rhyme_patterns=limerick_pattern)

In [190]:
limerick.get_rhyme_score()

scoring
scoring


[0, 0]

In [182]:
pronouncing.rhyming_part(pronouncing.phones_for_word("crowd")[0]) == pronouncing.rhyming_part(pronouncing.phones_for_word("proud")[0])

True

In [27]:
poem.word_lines[0]

'The way he cleaned the bowl was thorough'

In [28]:
pronouncing.phones_for_word(poem.word_lines[0])

[]

In [None]:
def read_poem(file_path: Path):
    """reads a file containing poems and returns a 
    """

In [46]:
[pronouncing.phones_for_word(last_word) for last_word in last_words]

[['TH ER1 OW0', 'TH AO1 R OW0'],
 ['M AO1 R'],
 [],
 ['G AE1 L AH0', 'G EY1 L AH0'],
 ['SH AY1 N'],
 ['M AY1 N'],
 ['K L AO1 Z'],
 ['W AO1 R Z']]