In [1]:
# Author: Christopher Dare
# Date: 29th October 2020

# Evaluation of Rhymes produced by YouShen poetry model

## Plan of Attack
We need to break each word down into phonemes (graphemes).
Then, we will evaluate how similar the 2 words are by rhyme. (This is quite complex).

## Rhyme evaluation
For any 2 lines of a poem, we consider the last word in each line to score their rhyme similarity.
We can classify two lines as rhyme vs not rhyme, as well as quantify how much they rhyme with each other. (more on this needs to be thought through).

Consider the following poem line pairs:  

*The way he cleaned the bowl was thorough*  
*And then he came up to ask for more*

*In the morning Cinderella would fetch water,  
At night she was the dazzling queen of the gala*

*Everytime the sun rises to shine  
I am reminded of you, beautiful love of mine*

*My brother never trusted santa claus  
So his name always fostered banter wars*

From the pairs above, we can see that there is variable length of phonemes that consitute a valid rhyme. These can range from one phoneme in for a single word to many phonemes in one word...and even many phonemes in phrases.

Key question: What's the best mathematical way of determining that both words rhyme?


## Identifying phonemes
Phonemes can be identified from pronunciation dictionary mappings or prediction models.
Two promising packages with rich datasets that could prove useful for this projects are
1. CMU pronouncing dictionary (potentially faster since it's just a dictionary)
2. Big Phoney (Has preprocessing capabilities. Could be slower since it relies on a predictive model for words not existent in a dicitonary)
Other packages can be found here [on github](https://github.com/topics/phonetics)

To get quickly off the ground, we will use BigPhoney

In [327]:
# imports
import re
from pathlib import Path
from typing import List, Union

import pronouncing

In [218]:
# drop big phoney..for now. 
# Package has internal dependency/import issues or works with a particular python version 
# Package has some good concepts like predicting phonemes for words not found in CMU's pronouncing dictionary. 
# TODO: revisit
# !git clone https://github.com/repp/big-phoney.git big_phoney -q 
# !cd big_phoney && python setup.py install
# !pip install keras
# import keras
# from big_phoney.big_phoney import BigPhoney
# phoney = BigPhoney()

### Pseudocode for scoring rhymes
for each poem:
    break down into verses
    for each verse, break down into lines
    
represent each word in every line as a set of phonemes
reverse the phoneme order for each word

Scoring rhymes in verses (discrete approach)  
for each pair of phonemes in a pair of lines to compare:  
-    get the max number (n_max) of syllables for all reversed order phonemes  
-    trim each phoneme set to the first n_max phonemes  
-    create an empty list of rhyming scores all possible phoneme pairs  
-    for each possible phoneme set A construction:  
        1. for each possible phoneme set B construction:  
        2. compare each corresponding phoneme. If they match, assign a one, else 0.  
        3. choose the max of all possible scores as the rhyme score  
    



In [447]:
class Verse:
    
    def __init__(self, lines: str, rhyme_patterns: List, max_length:int=None):
        self.verse_lines = lines
        if max_length:
            self.verse_lines = self.verse_lines[0:max_length]
        self.last_words = [line.split()[-1] for line in self.verse_lines]
        self.last_word_rhyming_part_pairs = {word:self.__get_rhyming_parts(word) for word in self.last_words}
        self.rhyme_patterns = rhyme_patterns
 
    def __get_phonemes(self, text:Union[str, List]):
        """returns all possible pronunciation of a word as phonemes
        Language used: American English. Style: Arpabet
        """
        if type(text) == str:
            phonemes = pronouncing.phones_for_word(text)
        else:
            phonemes = [pronouncing.phones_for_word(word) for word in text]
        return phonemes
    
    def __get_rhyming_parts(self, word:str):
        phonemes = self.__get_phonemes(word)
        rhyming_parts = [pronouncing.rhyming_part(phoneme) for phoneme in phonemes]
        return rhyming_parts
    
    
    def __get_valid_rhyme_patterns(self):
        valid_patterns = [pattern for pattern in self.rhyme_patterns if 
            not any(i > len(self.verse_lines)-1 for i in pattern)
        ]
        return valid_patterns
    
    
    def score(self, line_pair: List):
        first_word = self.last_words[line_pair[0]]
        second_word = self.last_words[line_pair[1]]
        first_word_rhymes = self.__get_rhyming_parts(first_word) 
        second_word_rhymes = self.__get_rhyming_parts(second_word)
        rhyme_score = 0
        for first_word_rhyme in first_word_rhymes:
            for second_word_rhyme in second_word_rhymes:
                is_rhyming = first_word_rhyme == second_word_rhyme
                if (is_rhyming):
                    rhyme_score = 1
                    status = "successfully matched"
                else:
                    status = "could not match"
                print(f" {status} -> {first_word}({first_word_rhyme}) and {second_word}({second_word_rhyme})")
        return int(rhyme_score)
    
        
    def get_rhyme_score(self):
        """returns a rhyming score for the poem between 0 and 1.
        """
        valid_patterns = self.__get_valid_rhyme_patterns()
        scores = [self.score(pattern) for pattern in valid_patterns]
        return sum(scores)/len(scores)

In [468]:
class Limerick:
    def __init__(self, text:str,  rhyme_patterns: List, verse_length:int, blacklist:str=None,):
        self.blacklist = blacklist
        if blacklist:
            for term in blacklist:
                text = re.sub(term, "", text)
        self.poem_lines = [line for line in text.splitlines() if line]
        self.verse_length = verse_length
        self.rhyme_patterns = rhyme_patterns
        intervals = list(range(0,len(lines),verse_length))
        verse_lines_list = [self.poem_lines[x:x+5] for x in intervals]
        print(verse_lines_list)
        self.verses = [Verse(lines=verse_lines, rhyme_patterns=self.rhyme_patterns, max_length=self.verse_length) 
                       for verse_lines in verse_lines_list if len(verse_lines)>4]
            
    
    def get_rhyme_score(self):
        print(verse.verse_lines for verse in self.verses)
        scores = [verse.get_rhyme_score() for verse in self.verses]
        return sum(scores)/len(scores)

In [455]:
with open(sample_rhyme) as rhyme_sample:
    sample_corpus = rhyme_sample.read()
verse_lines = [line for line in sample_corpus.splitlines() if line]
verse = Verse(verse_lines, rhyme_patterns=limerick_pattern, max_length=5)
print("Scoring verse...")
score = verse.get_rhyme_score()
print(f"Rhyme score is {score}")

Scoring verse...
 successfully matched -> shine(AY1 N) and thine(AY1 N)
 successfully matched -> gate(EY1 T) and late(EY1 T)
 successfully matched -> shine(AY1 N) and mine(AY1 N)
Rhyme score is 1.0


In [450]:
limerick_pattern = [
    [0,1], [2,3], [0,4]
]
sample_gen = Path("samples/.tmp/samples-200")
sample_rhyme = Path("samples/sample_rhyme.txt")
blacklist =[
    "=+.*=+",
    "(<.endoftext)*.>"
]

In [467]:
with open(sample_rhyme) as rhyme_sample:
    sample_corpus = rhyme_sample.read()
limerick = Limerick(text=sample_corpus, rhyme_patterns=limerick_pattern, verse_length=5, blacklist=blacklist)
limerick.get_rhyme_score()

[['Everytime the sun rises to shine', 'I am reminded of you and the beautiful love of thine', 'Onward ye soldiers to the gate', 'For with the siege, I cannot be late', 'The city is truly mine'], [], [], []]
<generator object Limerick.get_rhyme_score.<locals>.<genexpr> at 0x7fc2f166ce50>
 successfully matched -> shine(AY1 N) and thine(AY1 N)
 successfully matched -> gate(EY1 T) and late(EY1 T)
 successfully matched -> shine(AY1 N) and mine(AY1 N)
[1.0]


In [241]:
pronouncing.rhyming_part(pronouncing.phones_for_word("crowd")[0]) == pronouncing.rhyming_part(pronouncing.phones_for_word("proud")[0])

True

In [440]:
def read_poem(file_path: Path):
    """reads a file containing poems and returns a 
    """
    pass