In [None]:
import numpy as np
import csv
import os 
import sys
import pdb
import string

## Chris's Rhyming Scorer

In [None]:
# imports
import re
from pathlib import Path
from typing import List, Union

import numpy as np
import pronouncing

from youshen.util import calculate_edit_distance, clean

# Pairs of zero-based line indices whose last words are compared for rhyme
# when a verse is scored: lines 1 & 2, lines 3 & 4, and lines 1 & 5.
LIMERICK_PATTERN = [[0, 1], [2, 3], [0, 4]]
# Pattern strings passed to `clean` together with each raw sample.
# NOTE(review): presumably regexes for text to strip (separator banners and
# the end-of-text marker) -- confirm against youshen.util.clean. The second
# pattern's group is starred, so it can also match any single character
# followed by ">"; verify that this is intended.
BLACKLIST =[
    "=+.*=+",
    "(<.endoftext)*.>"
]


def get_top_limerick(text: str) -> str:
    """Return the best-rhyming limerick found in a block of generated samples.

    The text is split on the ``<|endoftext|>`` marker, each piece is passed
    through ``clean`` with ``BLACKLIST``, and every piece that still has at
    least five non-empty lines is scored with ``SamplePoem.get_rhyme_score``.

    Parameters
    ----------
    text: str
        Samples of text generated by GPT-2

    Returns
    ----------
    top_limerick: str
        The non-empty lines of the highest-scoring sample, joined with
        newlines. When several samples share the top score the earliest one
        in ``text`` is returned.

    Raises
    ----------
    ValueError
        If ``text`` contains no sample with at least five non-empty lines.

    Examples
    ----------
    >>> from pathlib import Path
    >>> from youshen.poem import get_top_limerick
    >>> samples = Path("samples/samples.txt")
    >>> with open(samples) as file:
    >>>     text = file.read()
    >>> top = get_top_limerick(text)
    """
    cleaned_samples = [
        clean(poem_sample, BLACKLIST)
        for poem_sample in text.split("<|endoftext|>")
        if len(poem_sample) > 0
    ]

    # Keep only samples that still contain enough lines for one limerick.
    candidate_texts = []
    for sample in cleaned_samples:
        verse_lines = [line for line in sample.splitlines() if line]
        if len(verse_lines) > 4:
            candidate_texts.append("\n".join(verse_lines))

    candidates = [
        SamplePoem(text=poem_text, rhyme_patterns=LIMERICK_PATTERN, verse_length=5)
        for poem_text in candidate_texts
    ]
    if not candidates:
        raise ValueError(
            "no limerick candidates with at least five lines found in text"
        )

    def _sortable_score(poem):
        # get_rhyme_score may return None for a poem without verses; rank
        # those below every real score.
        score = poem.get_rhyme_score()
        return float("-inf") if score is None else score

    # Select by score. Taking max() over an {index: score} dict would compare
    # the indices and always pick the last poem.
    best_poem = max(candidates, key=_sortable_score)
    return "\n".join(best_poem.lines)

class Limerick:
    """A single verse whose line endings are checked for rhyme.

    Attributes
    ----------
    verse_lines:
        The lines of the verse that contain at least one word, truncated to
        ``max_length`` when one is given.
    last_words:
        The last whitespace-separated token of every line in ``verse_lines``.
    last_word_rhyming_part_pairs:
        Maps each last word to the rhyming parts of its pronunciations.
    rhyme_patterns:
        Pairs of zero-based line indices whose last words should rhyme.
    """

    def __init__(self, lines: List[str], rhyme_patterns: List, max_length: int = None):
        # A whitespace-only line has no last word, so it is dropped up front
        # rather than failing on the split below.
        self.verse_lines = [line for line in lines if line.strip()]
        if max_length:
            self.verse_lines = self.verse_lines[0:max_length]
        self.last_words = [line.split()[-1] for line in self.verse_lines]
        self.last_word_rhyming_part_pairs = {
            word: self.__get_rhyming_parts(word) for word in self.last_words
        }
        self.rhyme_patterns = rhyme_patterns

    def __get_phonemes(self, text: Union[str, List]):
        """returns all possible pronunciation of a word as phonemes
        Language used: American English. Style: Arpabet

        For a single word the result is a list of pronunciations; for a list
        of words it is one such list per word.
        """
        if isinstance(text, str):
            phonemes = pronouncing.phones_for_word(text)
        else:
            phonemes = [pronouncing.phones_for_word(word) for word in text]
        return phonemes

    def __get_rhyming_parts(self, word: str):
        """Return the rhyming part of every known pronunciation of ``word``.

        The list is empty when ``pronouncing`` has no pronunciation for it.
        """
        phonemes = self.__get_phonemes(word)
        rhyming_parts = [pronouncing.rhyming_part(phoneme) for phoneme in phonemes]
        return rhyming_parts

    def __get_valid_rhyme_patterns(self):
        """Return the patterns whose indices all fall inside ``verse_lines``."""
        line_count = len(self.verse_lines)
        valid_patterns = [
            pattern
            for pattern in self.rhyme_patterns
            if all(i < line_count for i in pattern)
        ]
        return valid_patterns

    def score(self, line_pair: List, verbose: bool = False):
        """Return 1 if the last words of the two indexed lines rhyme, else 0.

        Two words rhyme when any pronunciation of the first shares its
        rhyming part with any pronunciation of the second.

        Parameters
        ----------
        line_pair:
            Two zero-based indices into the verse's lines.
        verbose:
            When true, print the outcome of every pronunciation comparison.
        """
        first_word = self.last_words[line_pair[0]]
        second_word = self.last_words[line_pair[1]]
        first_word_rhymes = self.__get_rhyming_parts(first_word)
        second_word_rhymes = self.__get_rhyming_parts(second_word)
        rhyme_score = 0
        for first_word_rhyme in first_word_rhymes:
            for second_word_rhyme in second_word_rhymes:
                is_rhyming = first_word_rhyme == second_word_rhyme
                if is_rhyming:
                    rhyme_score = 1
                    status = "successfully matched"
                else:
                    status = "could not match"
                if verbose:
                    print(
                        f" {status} -> {first_word}({first_word_rhyme}) and {second_word}({second_word_rhyme})"
                    )
        return int(rhyme_score)

    def get_rhyme_score(self):
        """returns a rhyming score for the poem between 0 and 1.

        The score is the fraction of applicable rhyme patterns that are
        satisfied; it is 0.0 when no pattern fits the number of lines.
        """
        valid_patterns = self.__get_valid_rhyme_patterns()
        scores = [self.score(pattern) for pattern in valid_patterns]
        if len(scores) == 0:
            return 0.0
        return sum(scores) / len(scores)

    def __repr__(self):
        return repr("\n".join(self.verse_lines))


class SamplePoem:
    """A generated sample split into consecutive verses of a fixed length.

    Attributes
    ----------
    lines:
        The non-empty lines of the sample text.
    verse_length:
        Number of lines per verse.
    rhyme_patterns:
        Pairs of line indices handed to every ``Limerick`` verse.
    verses:
        One ``Limerick`` per consecutive chunk of ``verse_length`` lines;
        the final chunk may be shorter.
    """

    def __init__(
        self, text: str, rhyme_patterns: List, verse_length: int, blacklist: List = None
    ):
        # `blacklist` is accepted for interface compatibility but is not used.
        self.lines = [line for line in text.splitlines() if line]
        self.verse_length = verse_length
        self.rhyme_patterns = rhyme_patterns
        # Chunk by verse_length itself; a fixed slice width of 5 would lose
        # lines whenever verse_length is larger than 5.
        verse_starts = range(0, len(self.lines), verse_length)
        self.verses = [
            Limerick(
                lines=self.lines[start : start + verse_length],
                rhyme_patterns=self.rhyme_patterns,
                max_length=self.verse_length,
            )
            for start in verse_starts
        ]

    def __getitem__(self, key):
        """Return the verse (or slice of verses) at ``key``."""
        return self.verses[key]

    # The method used to be spelled `__get_item__`, which Python never calls
    # for `poem[key]`; the old name is kept for any explicit callers.
    __get_item__ = __getitem__

    def get_rhyme_score(self):
        """Return the mean rhyme score of all verses, or None without verses."""
        if len(self.verses):
            scores = [verse.get_rhyme_score() for verse in self.verses]
            score = sum(scores) / len(scores)
        else:
            score = None
        return score

    def __repr__(self):
        return repr(self.verses)


def read_poems(file_path: Path, blacklist):
    """Load a samples file and return its cleaned, non-empty poem samples.

    The file content is split on the ``<|endoftext|>`` marker; every
    non-empty piece is passed through ``clean`` with ``blacklist`` and kept
    only if the cleaned text is non-empty.
    """
    with open(file_path) as handle:
        raw_text = handle.read()

    cleaned_poems = []
    for chunk in raw_text.split("<|endoftext|>"):
        if len(chunk) == 0:
            continue
        cleaned = clean(chunk, blacklist)
        if len(cleaned) > 0:
            cleaned_poems.append(cleaned)
    return cleaned_poems


def score_poems(file_path: Path, last_word_pattern: str, blacklist: List):
    """Score every limerick sample in a file; returns one rhyme score per poem.

    Samples come from ``read_poems``. Each is wrapped in a ``SamplePoem`` with
    ``LIMERICK_PATTERN`` and five-line verses; poems without any lines are
    skipped. ``last_word_pattern`` is accepted but not used.
    """
    candidate_poems = []
    for poem_sample in read_poems(file_path, blacklist=blacklist):
        if len(poem_sample) > 0:
            candidate_poems.append(
                SamplePoem(
                    text=poem_sample,
                    rhyme_patterns=LIMERICK_PATTERN,
                    verse_length=5,
                )
            )

    poem_scores = []
    for poem in candidate_poems:
        if poem.lines:
            poem_scores.append(poem.get_rhyme_score())
    return poem_scores


def test_scoring_limerick(sample_rhyme: Path, limerick_pattern: List):
    """Sanity check to test scoring of a single limerick
    """
    with open(sample_rhyme) as rhyme_sample:
        sample_corpus = rhyme_sample.read()
    limerick_lines = [line for line in sample_corpus.splitlines() if line]
    limerick = Limerick(
        lines=limerick_lines, rhyme_patterns=limerick_pattern, max_length=5
    )
    print("Scoring limerick...")
    score = limerick.get_rhyme_score()
    print(f"Rhyme score is {score}")
    assert type(score) == float


## Tony's Coref Scorer

In [None]:
"""# Environmental Setup"""
# !git clone https://github.com/huggingface/neuralcoref.git
# !cd neuralcoref
# !pip install -r neuralcoref/requirements.txt
# !pip install -e neuralcoref/
# !pip install -U spacy
# !python -m spacy download en

In [None]:
# imports
import neuralcoref
import spacy
import nltk

# Shared spaCy pipeline used by coref_score below.
nlp = spacy.load('en_core_web_sm')
# Fetch the 'punkt' resource through nltk's downloader
# (presumably required by nltk.word_tokenize in clean_tokenize -- confirm).
nltk.download('punkt')

# Do not resolve 1st/2nd person pronouns
# NOTE(review): that is the stated intent of blacklist=True; confirm the exact
# semantics of `blacklist` and `greedyness` against the neuralcoref docs.
# This call attaches the coreference component to `nlp`, which is what makes
# `doc._.coref_clusters` available in coref_score.
neuralcoref.add_to_pipe(nlp, blacklist=True, greedyness=0.5)

In [None]:
def clean_tokenize(poem_set):
    """Tokenise poems and return one space-padded string per poem.

    ``poem_set`` is an iterable of poems, each an iterable of line strings.
    Every line is tokenised with ``nltk.word_tokenize`` and its tokens are
    joined with single spaces; lines are joined with ``" \\n "`` and the whole
    poem is wrapped in one leading and one trailing space, so that every
    token is surrounded by white space.
    """
    prepared = []
    for poem in poem_set:
        token_lines = [" ".join(nltk.word_tokenize(line)) for line in poem]
        joined = " \n ".join(token_lines)
        # Presumably normalises the tokenizer's opening-quote token -- confirm.
        prepared.append(" " + joined.replace("``", "''") + " ")
    return prepared


def coref_score(poem):
    """Return minus the number of unresolved third-person pronouns in ``poem``.

    ``poem`` is run through the module-level ``nlp`` pipeline. The text of
    every mention in every coreference cluster is padded with spaces and its
    first occurrence in the poem is replaced by a single space. Third-person
    pronouns that remain as space-delimited tokens are counted, so the score
    is 0 at best and negative otherwise.

    The matching is case-sensitive and relies on tokens being surrounded by
    spaces (the shape produced by ``clean_tokenize``).
    """
    doc = nlp(poem)

    # Space-padded text of every mention that belongs to some cluster.
    padded_mentions = [
        " " + mention.text + " "
        for cluster in doc._.coref_clusters
        for mention in cluster.mentions
    ]

    # Blank out one occurrence per resolved mention.
    remaining = poem
    for padded in padded_mentions:
        remaining = remaining.replace(padded, " ", 1)

    # The lookahead lets neighbouring pronouns that share a separating space
    # each be found.
    pronoun_3p = r'(?=(( he )|( him )|( his )|( himself )|( she )|( her )|( hers )|( herself )|( it )|( its )|( itself )|( they )|( them )|( their )|( theirs )|( themself )|( themselves )))'
    unresolved = re.findall(pronoun_3p, remaining)

    # Non-positive score, maximum at 0.
    return -len(unresolved)

## Separate Text and Score

In [None]:
# Parse the raw sample file into five-line limerick candidates and score each
# candidate's rhymes.
filename = "samples_singh.txt"
limericks = []
limerick = []
with open(filename, "r") as file:
    for raw_line in file:
        # Sample banner lines are skipped.
        # NOTE(review): the current candidate is not reset here, so lines of a
        # sample that lacks an end-of-text marker carry over into the next
        # sample -- confirm whether that is intended.
        if "======================================== " in raw_line:
            continue
        # The end-of-text marker closes the current candidate; only
        # candidates with exactly five lines are kept.
        if "<|endoftext|>" in raw_line:
            if len(limerick) == 5:
                limericks.append(limerick)
            limerick = []
            continue
        if raw_line == '\n' or raw_line == ' \n':
            continue
        # Lines keep their trailing newline so ''.join() rebuilds the text.
        limerick.append(raw_line)

# Every entry of `limericks` has exactly five lines, so `poems` is built from
# all of them and poems[i] corresponds to limericks[i].
poems = [
    SamplePoem(text=''.join(poem_sample), rhyme_patterns=LIMERICK_PATTERN, verse_length=5)
    for poem_sample in limericks
]
# enumerate gives each poem its position directly, avoiding a list search per
# poem.
indexed_scores = {index: poem.get_rhyme_score() for index, poem in enumerate(poems)}

In [None]:
# Indices of the poems whose rhyme score is perfect.
inds = [
    poem_index
    for poem_index, rhyme_score in indexed_scores.items()
    if rhyme_score == 1.0
]

In [None]:
# Raw line lists of the perfectly rhyming limericks.
out = [limericks[poem_index] for poem_index in inds]

In [None]:
# Keep only the limericks whose coreference score is non-negative, i.e. those
# with no unresolved third-person pronoun.
next_poems = clean_tokenize(out)
next_out = [
    out[position]
    for position, tokenized in enumerate(next_poems)
    if coref_score(tokenized) >= 0
]

# Sizes noted by the author:
# len(out)=1097
# len(next_out)=773

In [None]:
# Write the surviving limericks to disk, one after another.
filename = "best_limericks.txt"
with open(filename, "w") as file:
    # Each limerick's lines still end in "\n"; the extra '\r\n' adds a
    # separator line after every limerick.
    # NOTE(review): this mixes "\n" and "\r\n" line endings in one file --
    # confirm that the carriage return is wanted.
    file.writelines(''.join(lim) + '\r\n' for lim in next_out)