In [1]:
# Author: LickMyRicks
# Date: 29th March 2023

# Evaluation of Rhymes produced by YouShen poetry model

## Plan of Attack
We need to break each word down into phonemes (the sound units of its pronunciation, as opposed to graphemes, its written letters).
Then, we will evaluate how similar the 2 words are by rhyme. (This is quite complex).

## Rhyme evaluation
For any 2 lines of a poem, we consider the last word in each line to score their rhyme similarity.
We can classify two lines as rhyme vs not rhyme, as well as quantify how much they rhyme with each other. (more on this needs to be thought through).

Consider the following poem line pairs:  

*The way he cleaned the bowl was thorough*  
*And then he came up to ask for more*

*In the morning Cinderella would fetch water,  
At night she was the dazzling queen of the gala*

*Everytime the sun rises to shine  
I am reminded of you, beautiful love of mine*

*My brother never trusted santa claus  
So his name always fostered banter wars*

From the pairs above, we can see that a variable number of phonemes can constitute a valid rhyme. This can range from one phoneme in a single word to many phonemes in one word...and even many phonemes across phrases.

Key question: What's the best mathematical way of determining that both words rhyme?


## Identifying phonemes
Phonemes can be identified from pronunciation dictionary mappings or prediction models.
Two promising packages with rich datasets that could prove useful for this project are
1. CMU pronouncing dictionary (potentially faster since it's just a dictionary)
2. Big Phoney (has preprocessing capabilities; could be slower since it relies on a predictive model for words that do not exist in a dictionary)
Other packages can be found here [on github](https://github.com/topics/phonetics)

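To get off the ground quickly, the initial plan was to use BigPhoney; because of the installation issues noted further down, the code below uses the `pronouncing` package (CMU pronouncing dictionary) instead.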

In [1]:
# installs the required package; comment this line out if `pronouncing` is already installed
!pip install pronouncing -q

In [2]:
# imports
import re
from pathlib import Path
from typing import List, Union

import numpy as np
import pronouncing

In [4]:
# drop big phoney..for now. 
# Package has internal dependency/import issues or works with a particular python version 
# Package has some good concepts like predicting phonemes for words not found in CMU's pronouncing dictionary. 
# TODO: revisit
# !git clone https://github.com/repp/big-phoney.git big_phoney -q 
# !cd big_phoney && python setup.py install
# !pip install keras
# import keras
# from big_phoney.big_phoney import BigPhoney
# phoney = BigPhoney()

### Pseudocode for scoring rhymes
for each poem:
    break down into verses
    for each verse, break down into lines
    
represent each word in every line as a set of phonemes
reverse the phoneme order for each word

Scoring rhymes in verses (discrete approach)  
for each pair of phonemes in a pair of lines to compare:  
-    get the max number (n_max) of syllables for all reversed order phonemes  
-    trim each phoneme set to the first n_max phonemes  
-    create an empty list of rhyming scores for all possible phoneme pairs  
-    for each possible phoneme set A construction:  
        1. for each possible phoneme set B construction:  
        2. compare each corresponding phoneme. If they match, assign a one, else 0.  
        3. choose the max of all possible scores as the rhyme score  
    



### Real deal

In [3]:
class Limerick:
    """A single verse whose end-of-line rhymes are scored against a rhyme scheme.

    Parameters
    ----------
    lines: list of str
        The lines of the verse, in order.
    rhyme_patterns: list
        Pairs of line indices whose last words are expected to rhyme,
        e.g. ``[[0, 1], [2, 3], [0, 4]]``.
    max_length: int, optional
        When given (and non-zero), only the first ``max_length`` lines are kept.
    """

    def __init__(self, lines: List[str], rhyme_patterns: List, max_length: int = None):
        self.verse_lines = lines
        if max_length:
            self.verse_lines = self.verse_lines[0:max_length]
        self.last_words = [self._last_word(line) for line in self.verse_lines]
        self.last_word_rhyming_part_pairs = {
            word: self.__get_rhyming_parts(word) for word in self.last_words
        }
        self.rhyme_patterns = rhyme_patterns

    @staticmethod
    def _last_word(line: str) -> str:
        """Returns the last real word of ``line`` without surrounding punctuation.

        Tokens are scanned from the end so that a trailing token made only of
        punctuation (e.g. ``...``) is skipped. Apostrophes are kept because
        they can be part of a word. Returns an empty string when the line
        contains no word, instead of raising ``IndexError``.
        """
        for token in reversed(line.split()):
            # strip edge punctuation: "water," must be looked up as "water"
            word = re.sub(r"^[^\w']+|[^\w']+$", "", token)
            if word:
                return word
        return ""

    def __get_phonemes(self, text: Union[str, List]):
        """returns all possible pronunciation of a word as phonemes
        Language used: American English. Style: Arpabet

        For a single word the result is a list of pronunciations; for a list
        of words it is a list of such lists. Unknown words yield an empty list.
        """
        if isinstance(text, str):
            phonemes = pronouncing.phones_for_word(text)
            if not phonemes and text.strip("'") != text:
                # quoted word such as 'hello': retry without the quote marks
                phonemes = pronouncing.phones_for_word(text.strip("'"))
        else:
            phonemes = [pronouncing.phones_for_word(word) for word in text]
        return phonemes

    def __get_rhyming_parts(self, word: str):
        """Returns the rhyming part of every known pronunciation of ``word``."""
        phonemes = self.__get_phonemes(word)
        rhyming_parts = [pronouncing.rhyming_part(phoneme) for phoneme in phonemes]
        return rhyming_parts

    def __get_valid_rhyme_patterns(self):
        """Returns the rhyme patterns whose line indices all exist in this verse."""
        line_count = len(self.verse_lines)
        valid_patterns = [
            pattern for pattern in self.rhyme_patterns
            # the lower bound keeps an out-of-range negative index from
            # reaching ``score`` and raising IndexError there
            if all(-line_count <= i < line_count for i in pattern)
        ]
        return valid_patterns

    def score(self, line_pair: List, verbose: bool = False):
        """Returns 1 if the last words of the two lines rhyme exactly, else 0.

        Two words rhyme when any pronunciation of the first shares its rhyming
        part with any pronunciation of the second.

        Parameters
        ----------
        line_pair: list
            Two indices into the verse lines.
        verbose: bool, default = False
            When True, prints the outcome of every pronunciation comparison.
        """
        first_word = self.last_words[line_pair[0]]
        second_word = self.last_words[line_pair[1]]
        first_word_rhymes = self.__get_rhyming_parts(first_word)
        second_word_rhymes = self.__get_rhyming_parts(second_word)
        rhyme_score = 0
        for first_word_rhyme in first_word_rhymes:
            for second_word_rhyme in second_word_rhymes:
                is_rhyming = first_word_rhyme == second_word_rhyme
                if is_rhyming:
                    rhyme_score = 1
                    status = "successfully matched"
                else:
                    status = "could not match"
                if verbose:
                    print(f" {status} -> {first_word}({first_word_rhyme}) and {second_word}({second_word_rhyme})")
        return int(rhyme_score)

    def score_edit_distance(self, line_pair: List):
        """Returns the smallest edit distance between the rhyming parts of two lines.

        Every pronunciation of the first line's last word is compared with
        every pronunciation of the second line's last word. Returns None when
        either word has no known pronunciation.
        """
        first_word = self.last_words[line_pair[0]]
        second_word = self.last_words[line_pair[1]]
        first_word_rhymes = self.__get_rhyming_parts(first_word)
        second_word_rhymes = self.__get_rhyming_parts(second_word)
        rhyme_scores = []
        for first_word_rhyme in first_word_rhymes:
            for second_word_rhyme in second_word_rhymes:
                # rhyming parts are space-separated phone strings; split them so
                # the distance is counted in phonemes rather than characters
                distance = calculate_edit_distance(
                    first_word_rhyme.split(), second_word_rhyme.split()
                )
                rhyme_scores.append(distance)
        # there's a possibility that rhyme_scores will be an empty list.
        # this will be if no rhyming parts for a given set of words is found.
        # in that case, return None
        if len(rhyme_scores) > 0:
            rhyme_score = min(rhyme_scores)
        else:
            rhyme_score = None
        return rhyme_score

    def get_rhyme_score(self, verbose: bool = False):
        """returns a rhyming score for the poem between 0 and 1.

        The score is the fraction of applicable rhyme patterns whose line pair
        rhymes. A verse too short for any pattern scores 0.0 rather than
        raising ZeroDivisionError.
        """
        valid_patterns = self.__get_valid_rhyme_patterns()
        if not valid_patterns:
            return 0.0
        scores = [self.score(pattern, verbose=verbose) for pattern in valid_patterns]
        return sum(scores) / len(scores)

    def __repr__(self):
        return repr("\n".join(self.verse_lines))

In [4]:
class SamplePoem:
    """A generated text sample split into fixed-length verses that are scored for rhyme.

    Parameters
    ----------
    text: str
        Raw text of the sample; blank and whitespace-only lines are ignored.
    rhyme_patterns: list
        Pairs of line indices expected to rhyme within each verse.
    verse_length: int
        Number of lines per verse.
    blacklist: list, optional
        Accepted for backward compatibility; not used by this class.
    """

    def __init__(self, text:str,  rhyme_patterns: List, verse_length:int, blacklist:List=None,):
        # whitespace-only lines carry no last word, so drop them with the empty ones
        self.lines = [line for line in text.splitlines() if line.strip()]
        self.verse_length = verse_length
        self.rhyme_patterns = rhyme_patterns
        intervals = list(range(0, len(self.lines), verse_length))
        # slice by verse_length (previously a hard-coded 5) so verses longer
        # than five lines are not cut short
        verse_lines_list = [self.lines[x:x + verse_length] for x in intervals]
        self.verses = [Limerick(lines=verse_lines, rhyme_patterns=self.rhyme_patterns, max_length=self.verse_length)
                       for verse_lines in verse_lines_list]

    def __getitem__(self, key):
        """Returns the verse (or slice of verses) at ``key``."""
        return self.verses[key]

    # the method was originally misspelled; keep the old name callable
    __get_item__ = __getitem__

    def get_rhyme_score(self):
        """Returns the mean rhyme score of all verses, or None when there are no verses."""
        if len(self.verses):
            scores = [verse.get_rhyme_score() for verse in self.verses]
            score = sum(scores)/len(scores)
        else:
            score = None
        return score

    def __repr__(self):
        return repr(self.verses)

In [5]:
def clean(text: str, blacklist:List):
    """Removes every match of each blacklist regular expression from ``text``.

    Patterns are applied one after another, in the order given, so a later
    pattern sees the text already stripped by the earlier ones.
    """
    cleaned = text
    for pattern in blacklist:
        cleaned = re.compile(pattern).sub("", cleaned)
    return cleaned

In [6]:
def read_poems(file_path: Path, blacklist):
    """Reads a file of generated samples and returns the cleaned, non-empty poems.

    The file is split on the ``<|endoftext|>`` marker; each non-empty piece is
    passed through ``clean`` with ``blacklist`` and kept only if something
    remains afterwards.
    """
    with open(file_path) as handle:
        raw_text = handle.read()
    cleaned_poems = []
    for sample in raw_text.split("<|endoftext|>"):
        if len(sample) == 0:
            continue
        poem = clean(sample, blacklist)
        if len(poem) > 0:
            cleaned_poems.append(poem)
    return cleaned_poems

In [7]:
def test_scoring_limerick(sample_rhyme: Path):
    """Sanity check: scores the limerick stored in ``sample_rhyme`` and checks the score is a float.

    Uses the module-level ``limerick_pattern`` as the rhyme scheme and keeps
    at most the first five non-empty lines of the file.
    """
    with open(sample_rhyme) as handle:
        raw_text = handle.read()
    non_empty_lines = list(filter(None, raw_text.splitlines()))
    verse = Limerick(lines=non_empty_lines, rhyme_patterns=limerick_pattern, max_length=5)
    print("Scoring limerick...")
    rhyme_score = verse.get_rhyme_score()
    print(f"Rhyme score is {rhyme_score}")
    assert type(rhyme_score) == float

In [8]:
def score_poems(file_path: Path, last_word_pattern: List, blacklist: List, verse_length: int = 5):
    """Reads limericks in generated samples and scores them between 0 and 1

    Parameters
    ----------
    file_path: Path
        File containing the generated samples.
    last_word_pattern: list
        Pairs of line indices whose last words are expected to rhyme. This
        argument is now used as the rhyme scheme; previously it was ignored
        and the global ``limerick_pattern`` was read instead.
    blacklist: list
        Regular expressions removed from each sample before scoring.
    verse_length: int, default = 5
        Number of lines per verse.

    Returns
    -------
    list
        One score per poem, in file order.
    """
    poems = [SamplePoem(text=poem_sample, rhyme_patterns=last_word_pattern, verse_length=verse_length)
             for poem_sample in read_poems(file_path, blacklist=blacklist) if len(poem_sample) > 0]
    # a sample can be non-empty yet contain no usable lines; skip those
    poems = [poem for poem in poems if poem.lines]
    poem_scores = [poem.get_rhyme_score() for poem in poems]
    return poem_scores

### Testing and sampling

In [9]:
# Rhyme scheme expressed as pairs of zero-based line indices whose last words
# are compared: lines 0/1, lines 2/3 and lines 0/4 (i.e. AABBA).
limerick_pattern = [
    [0,1], [2,3], [0,4]
]
sample_gen = Path("samples/.tmp/samples-200") #path to file containing sample GPT2 poems 
# Regular expressions handed to clean(), which deletes every match.
# The first matches a run of "=" characters, anything on the same line, then another run of "=".
# NOTE(review): the second also matches any single character followed by ">",
# not only the "<|endoftext|>" marker; confirm that is intended.
blacklist =[
    "=+.*=+",
    "(<.endoftext)*.>"
]

In [10]:
# Sanity-check the scorer on a single limerick read from disk.
# The path is relative to the working directory, so the file must exist there.
sample_rhyme = Path("samples/sample_rhyme.txt")
test_scoring_limerick(sample_rhyme)

FileNotFoundError: [Errno 2] No such file or directory: 'samples/sample_rhyme.txt'

In [None]:
# Score every generated sample in `sample_gen`; the result holds one entry per
# poem (None for a poem that produced no verses).
poem_scores = score_poems(file_path=sample_gen, last_word_pattern=limerick_pattern, blacklist=blacklist)
print(f"\n\nScores for detected limerick attemps -> {poem_scores}")

### Next steps
1. Improve rhyming similarities. Shine does rhyme with time although their rhyming parts are not a 100% match. Give a smoothed score for rhymes so that checking for e.g. AA is not just 0 or 1 but any float within the range.
2. Crack the code to identifying the rhyme parts of a word.

In [None]:
def sigmoid(X):
    """Returns the logistic function 1 / (1 + e^(-X)), applied element-wise by numpy."""
    denominator = 1 + np.exp(-X)
    return 1 / denominator

In [17]:
def calculate_edit_distance(phoneme_set_a: List[str], phoneme_set_b: List[str], levenshtein=True):
    """Calculates the minimum edit distance between 2 sequences of phonemes

    The distance is the cheapest way of turning ``phoneme_set_a`` into
    ``phoneme_set_b`` using insertions and deletions (cost 1 each) and
    substitutions (cost 2 when ``levenshtein`` is True, otherwise 1).
    It is computed with the standard dynamic-programming recurrence.

    Parameters
    ----------
    phoneme_set_a: list
        word or rhyming part to be compared to.
        This is represented as a string or list of phonemes representing a word or its rhyming part.
    phoneme_set_b: list
        word or rhyming part for which we want compute how different it is from phoneme_set_a
        This is also represented as a string or a list of phonemes representing a word or its rhyming part.
    levenshtein: bool, default = True
        Boolean indicating whether a substitution costs 2 (True) or 1 (False)

    Returns
    -------
    int
        Total cost of the cheapest sequence of edits; 0 for equal inputs.

    Notes
    -----
    A string is compared item by item, i.e. character by character. To compare
    a space-separated phone string phoneme by phoneme, pass ``phones.split()``.

    Examples
    --------
    wonder = ["AH1","N","D","ER0"]
    one = ["AH1","N"]

    difference = calculate_edit_distance(wonder, one, levenshtein=False)  # 2
    """
    source = list(phoneme_set_a)
    target = list(phoneme_set_b)
    substitution_cost = 2 if levenshtein else 1

    # previous_row[j] is the cost of turning the source prefix handled so far
    # into the first j items of target; only two rows are ever needed.
    previous_row = list(range(len(target) + 1))
    for row_index, source_item in enumerate(source, start=1):
        # turning a non-empty source prefix into an empty target is all deletions
        current_row = [row_index]
        for column_index, target_item in enumerate(target, start=1):
            if source_item == target_item:
                diagonal = previous_row[column_index - 1]
            else:
                diagonal = previous_row[column_index - 1] + substitution_cost
            deletion = previous_row[column_index] + 1
            insertion = current_row[column_index - 1] + 1
            current_row.append(min(deletion, insertion, diagonal))
        previous_row = current_row

    return previous_row[-1]

In [None]:
# Example from the calculate_edit_distance docstring: the second phoneme list
# is the first one without its last two items.
wonder = ["AH1","N","D","ER0"]
one = ["AH1","N"]

difference = calculate_edit_distance(wonder, one, levenshtein=True)
difference

In [None]:
# Plain strings are indexed item by item, so these are compared character by character.
difference = calculate_edit_distance("execution", "intention")
difference

In [None]:
# Same call with two short strings of different lengths.
difference = calculate_edit_distance("ry", "hey")
difference

In [None]:
# Scratch check of the tail slice and the length difference used inside
# calculate_edit_distance; only the last expression is displayed.
temp = "execution"
temp2 = "inte*ntion"
temp2[-len(temp):]
abs(len(temp) - len(temp2))

In [None]:
# Scratch check: None is falsy, so this evaluates to True.
not None