# News to poems

## 1. Grammar

In [26]:
import nltk
from pprint import pprint
from nltk.parse.stanford import StanfordDependencyParser

In [27]:
partial_grammar = """
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
"""
title = "British scientist says memories of spending the night with Marilyn Monroe could be implanted into the brain"
title_words = nltk.word_tokenize(title)
title_pos = nltk.pos_tag(title_words)
print title_pos

[('British', 'JJ'), ('scientist', 'NN'), ('says', 'VBZ'), ('memories', 'NNS'), ('of', 'IN'), ('spending', 'VBG'), ('the', 'DT'), ('night', 'NN'), ('with', 'IN'), ('Marilyn', 'NNP'), ('Monroe', 'NNP'), ('could', 'MD'), ('be', 'VB'), ('implanted', 'VBN'), ('into', 'IN'), ('the', 'DT'), ('brain', 'NN')]


In [28]:
def lex_grammar(words_pos):
    lexicon = {}
    for word, pos in sorted(words_pos, key=lambda (a, b): b):
        if pos not in lexicon:
            lexicon[pos] = []
        lexicon[pos].append(word)
    return "\n".join("{pos} -> {words}".format(pos=pos, words="|".join(ws)) for pos, ws in lexicon.iteritems())

In [29]:
second_part_of_grammar = lex_grammar(title_pos)
grammar = partial_grammar + second_part_of_grammar

In [30]:
print grammar


S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
MD -> could
VB -> be
VBG -> spending
NN -> scientist|night|brain
VBN -> implanted
JJ -> British
IN -> of|with|into
VBZ -> says
DT -> the|the
NNS -> memories
NNP -> Marilyn|Monroe


Before proceeding one should download the zip-archive of the stanford parser:

http://nlp.stanford.edu/software/stanford-parser-full-2015-04-20.zip

We need only two files from this folder:
    - stanford/stanford-parser.jar
    - stanford/stanford-parser-3.5.2-models.jar

Create a directory called 'stanford' and copy them there.

In [31]:
parser = StanfordDependencyParser(path_to_jar='stanford/stanford-parser.jar',\
                                  path_to_models_jar='stanford/stanford-parser-3.5.2-models.jar')
parsed_tree = list(parser.parse(title_words))

print title
pprint(list(parsed_tree[0].triples()))
    

British scientist says memories of spending the night with Marilyn Monroe could be implanted into the brain
[((u'says', u'VBZ'), u'nsubj', (u'scientist', u'NN')),
 ((u'scientist', u'NN'), u'amod', (u'British', u'JJ')),
 ((u'says', u'VBZ'), u'dobj', (u'memories', u'NNS')),
 ((u'memories', u'NNS'), u'nmod', (u'spending', u'NN')),
 ((u'spending', u'NN'), u'case', (u'of', u'IN')),
 ((u'says', u'VBZ'), u'nmod:tmod', (u'night', u'NN')),
 ((u'night', u'NN'), u'det', (u'the', u'DT')),
 ((u'says', u'VBZ'), u'advcl', (u'implanted', u'VBN')),
 ((u'implanted', u'VBN'), u'mark', (u'with', u'IN')),
 ((u'implanted', u'VBN'), u'nsubjpass', (u'Monroe', u'NNP')),
 ((u'Monroe', u'NNP'), u'compound', (u'Marilyn', u'NNP')),
 ((u'implanted', u'VBN'), u'aux', (u'could', u'MD')),
 ((u'implanted', u'VBN'), u'auxpass', (u'be', u'VB')),
 ((u'implanted', u'VBN'), u'nmod', (u'brain', u'NN')),
 ((u'brain', u'NN'), u'case', (u'into', u'IN')),
 ((u'brain', u'NN'), u'det', (u'the', u'DT'))]


In [32]:
test_sent = "A ball is put into the box"
pprint(list(list(parser.parse(test_sent.split(' ')))[0].triples()))

[((u'put', u'VBN'), u'nsubjpass', (u'ball', u'NN')),
 ((u'ball', u'NN'), u'det', (u'A', u'DT')),
 ((u'put', u'VBN'), u'auxpass', (u'is', u'VBZ')),
 ((u'put', u'VBN'), u'nmod', (u'box', u'NN')),
 ((u'box', u'NN'), u'case', (u'into', u'IN')),
 ((u'box', u'NN'), u'det', (u'the', u'DT'))]


## 2. Morphological analysis

In [33]:
import re
def tokenize(s, pattern=r'\W+'):
    return filter(None, re.split(pattern, s))

In [40]:
from nltk.corpus import cmudict
from itertools import izip_longest, chain
stress_dictionary = cmudict.dict()

poem_file = open('data/guinea_pig.txt')
corpus = poem_file.readlines()
poem_file.close()

#list of poem structures
poem_structure_1 = [
    (8, [1,3,5,7],),
    (8, [1,3,5,7],),
    (8, [1,3,5,7],),
    (8, [1,3,5,7],),
]
# for line in corpus:
#     for token in tokenize(line):
#         print token, d.get(token.lower())

In the cell below, 3 python files are imported. The first one is called phonetics.py and contains the following functions:
    - def get_phonemes(word)
    - def is_vowel(phoneme)
    - def is_consonant(phoneme)
    - def num_of_syllables(phoneme)
    - def is_stressed(phoneme)
    - def is_rhyme(phonemes1, phonemes2)
    - def stress_mask(phonemes)
    ((- def get_phonetic_combinations(words)))
The second file is called utils.py and merely contains the function called
    - def combinations(*args)
Finally there is a file named validate.py. This script contains 2 functions:
    - def validate_line(phonemes, structure)
    - def validate_couplet(couplet, structure)
For more information, consult the files themselves. As they contain docstrings, it becomes clear what input they require and what output they provide.

Under # EXAMPLES, one can see how couplets are validated. Each validate function has as input both phonemes or a couplet respectively, as well as a structure. The latter defines the kind of poem we would like to validate. For example, a structure can consist of a rhyme scheme(abab) and a stress pattern(stressed/true, unstressed/false, stressed/true ...).
In the output below the caption, the line which fits the requirements from the pattern is chosen from all combinations possible provided by cmudict'. However, cmudict predicted for words consisting of one syllable to be stressed at any time. Therefore,a function making sure one-syllable words can be both stressed ans unstressed has been added.

In [45]:
import phonetics as ph
import utils
#import validate


# helper function of validate_couplet(x,y)
def validate_line(line, structure):
    line_phonemes = [] 
    words = line.split(' ')
    for word in words:
        word_phonemes = ph.get_phonemes(word)
        if len(word_phonemes) == 1:
            if ph.num_of_syllables(word_phonemes[0]) == 1:
                word_phonemes.append([sound if not ph.is_stressed(sound) else sound.replace('1', '0') for sound in word_phonemes[0]])
        line_phonemes.append(word_phonemes)
    for candidate in utils.combinations(*line_phonemes):
        stresses = ph.stress_mask(chain.from_iterable(candidate))
        #print candidate, stresses
        if stresses == structure:
            return candidate
        

# returns true or false; does a couplet answer to the above mentioned scheme or not? Do lines (have to) rhyme?
def validate_couplet(couplet, structure):
    pass


# EXAMPLES
utils.combinations([1,2,3],[4],[5,6],[7,8,9])
print validate_line("there was a little guinea pig", [False, True, False, True, False, True, False, True])
print
print validate_line("who being little was not big", [False, True, False, True, False, True, False, True])
print
print ph.is_rhyme(validate_line("there was a little guinea pig", [False, True, False, True, False, True, False, True])[-1],
               validate_line("who being little was not big",  [False, True, False, True, False, True, False, True])[-1])

[[u'DH', u'EH0', u'R'], [u'W', u'AA1', u'Z'], [u'AH0'], [u'L', u'IH1', u'T', u'AH0', u'L'], [u'G', u'IH1', u'N', u'IY0'], [u'P', u'IH1', u'G']]

[[u'HH', u'UW0'], [u'B', u'IY1', u'IH0', u'NG'], [u'L', u'IH1', u'T', u'AH0', u'L'], [u'W', u'AA1', u'Z'], [u'N', u'AA0', u'T'], [u'B', u'IH1', u'G']]

True
