# News to poems

In [20]:
import nltk
from pprint import pprint
from nltk.parse.stanford import StanfordDependencyParser

In [21]:
partial_grammar = """
S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
"""
title = "British scientist says memories of spending the night with Marilyn Monroe could be implanted into the brain"
title_words = nltk.word_tokenize(title)
title_pos = nltk.pos_tag(title_words)
print title_pos

[('British', 'JJ'), ('scientist', 'NN'), ('says', 'VBZ'), ('memories', 'NNS'), ('of', 'IN'), ('spending', 'VBG'), ('the', 'DT'), ('night', 'NN'), ('with', 'IN'), ('Marilyn', 'NNP'), ('Monroe', 'NNP'), ('could', 'MD'), ('be', 'VB'), ('implanted', 'VBN'), ('into', 'IN'), ('the', 'DT'), ('brain', 'NN')]


In [22]:
def lex_grammar(words_pos):
    lexicon = {}
    for word, pos in sorted(words_pos, key=lambda (a, b): b):
        if pos not in lexicon:
            lexicon[pos] = []
        lexicon[pos].append(word)
    return "\n".join("{pos} -> {words}".format(pos=pos, words="|".join(ws)) for pos, ws in lexicon.iteritems())

In [23]:
second_part_of_grammar = lex_grammar(title_pos)
grammar = partial_grammar + second_part_of_grammar

In [24]:
print grammar


S -> NP VP
PP -> P NP
NP -> Det N | Det N PP | 'I'
VP -> V NP | VP PP
MD -> could
VB -> be
VBG -> spending
NN -> scientist|night|brain
VBN -> implanted
JJ -> British
IN -> of|with|into
VBZ -> says
DT -> the|the
NNS -> memories
NNP -> Marilyn|Monroe


Before proceeding you should download zip-archive with stanford parser:

http://nlp.stanford.edu/software/stanford-parser-full-2015-04-20.zip

We need two files from there:
- stanford/stanford-parser.jar
- stanford/stanford-parser-3.5.2-models.jar

Create a directory 'stanford' and put them there.

In [25]:
parser = StanfordDependencyParser(path_to_jar='stanford/stanford-parser.jar',\
                                  path_to_models_jar='stanford/stanford-parser-3.5.2-models.jar')
parsed_tree = list(parser.parse(title_words))

print title
pprint(list(parsed_tree[0].triples()))
    

British scientist says memories of spending the night with Marilyn Monroe could be implanted into the brain
[((u'says', u'VBZ'), u'nsubj', (u'scientist', u'NN')),
 ((u'scientist', u'NN'), u'amod', (u'British', u'JJ')),
 ((u'says', u'VBZ'), u'dobj', (u'memories', u'NNS')),
 ((u'memories', u'NNS'), u'nmod', (u'spending', u'NN')),
 ((u'spending', u'NN'), u'case', (u'of', u'IN')),
 ((u'says', u'VBZ'), u'nmod:tmod', (u'night', u'NN')),
 ((u'night', u'NN'), u'det', (u'the', u'DT')),
 ((u'says', u'VBZ'), u'advcl', (u'implanted', u'VBN')),
 ((u'implanted', u'VBN'), u'mark', (u'with', u'IN')),
 ((u'implanted', u'VBN'), u'nsubjpass', (u'Monroe', u'NNP')),
 ((u'Monroe', u'NNP'), u'compound', (u'Marilyn', u'NNP')),
 ((u'implanted', u'VBN'), u'aux', (u'could', u'MD')),
 ((u'implanted', u'VBN'), u'auxpass', (u'be', u'VB')),
 ((u'implanted', u'VBN'), u'nmod', (u'brain', u'NN')),
 ((u'brain', u'NN'), u'case', (u'into', u'IN')),
 ((u'brain', u'NN'), u'det', (u'the', u'DT'))]


In [26]:
test_sent = "A ball is put into the box"
pprint(list(list(parser.parse(test_sent.split(' ')))[0].triples()))

[((u'put', u'VBN'), u'nsubjpass', (u'ball', u'NN')),
 ((u'ball', u'NN'), u'det', (u'A', u'DT')),
 ((u'put', u'VBN'), u'auxpass', (u'is', u'VBZ')),
 ((u'put', u'VBN'), u'nmod', (u'box', u'NN')),
 ((u'box', u'NN'), u'case', (u'into', u'IN')),
 ((u'box', u'NN'), u'det', (u'the', u'DT'))]


## Morphological analysis

In [27]:
import re
def tokenize(s, pattern=r'\W+'):
    return filter(None, re.split(pattern, s))

In [28]:
from nltk.corpus import cmudict
d = cmudict.dict()

poem_file = open('data/guinea_pig.txt')
corpus = poem_file.readlines()
poem_file.close()

poem_structure = [
    (8, [1,3,5,7],),
    (8, [1,3,5,7],),
    (8, [1,3,5,7],),
    (8, [1,3,5,7],),
]
for line in corpus:
    for token in tokenize(line):
        print token, d.get(token.lower())

There [[u'DH', u'EH1', u'R']]
was [[u'W', u'AA1', u'Z'], [u'W', u'AH1', u'Z'], [u'W', u'AH0', u'Z'], [u'W', u'AO1', u'Z']]
a [[u'AH0'], [u'EY1']]
little [[u'L', u'IH1', u'T', u'AH0', u'L']]
guinea [[u'G', u'IH1', u'N', u'IY0']]
pig [[u'P', u'IH1', u'G']]
Who [[u'HH', u'UW1']]
being [[u'B', u'IY1', u'IH0', u'NG']]
little [[u'L', u'IH1', u'T', u'AH0', u'L']]
was [[u'W', u'AA1', u'Z'], [u'W', u'AH1', u'Z'], [u'W', u'AH0', u'Z'], [u'W', u'AO1', u'Z']]
not [[u'N', u'AA1', u'T']]
big [[u'B', u'IH1', u'G']]
He [[u'HH', u'IY1']]
always [[u'AO1', u'L', u'W', u'EY2', u'Z'], [u'AO1', u'L', u'W', u'IY0', u'Z']]
walked [[u'W', u'AO1', u'K', u'T']]
upon [[u'AH0', u'P', u'AA1', u'N']]
his [[u'HH', u'IH1', u'Z'], [u'HH', u'IH0', u'Z']]
feet [[u'F', u'IY1', u'T']]
And [[u'AH0', u'N', u'D'], [u'AE1', u'N', u'D']]
never [[u'N', u'EH1', u'V', u'ER0']]
fasted [[u'F', u'AE1', u'S', u'T', u'IH0', u'D']]
when [[u'W', u'EH1', u'N'], [u'HH', u'W', u'EH1', u'N'], [u'W', u'IH1', u'N'], [u'HH', u'W', u'IH1', u'N']

In [45]:
# TODO:
def num_of_syllables(phonetic_translation):
    number_of_syllables = 0
    for el in phonetic_translation:
        if el[-1].isdigit() == True:
            number_of_syllables += 1
        else:
            pass
    return number_of_syllables
        

def is_stressed(phoneme):
    if phoneme[-1] == '1':
        return True
    else:
        return False

def apply_structure(line):
    pass

def is_vowel(sound):
    return sound[-1].isdigit()

def is_rhyme(pron1, pron2):
    vowels = [(i, sound) for i, sound in enumerate(pron1) if is_vowel(sound)]
    if vowels:
        idx, last_vowel = vowels[-1]
        return pron1[-(idx + 1):] == pron2[-(idx + 1):]
    else:
        return False

for pron1 in d.get('was'):
    for pron2 in d.get('because'):
        print pron1, pron2, is_rhyme(pron1, pron2)

[u'W', u'AA1', u'Z'] [u'B', u'IH0', u'K', u'AO1', u'Z'] False
[u'W', u'AA1', u'Z'] [u'B', u'IH0', u'K', u'AH1', u'Z'] False
[u'W', u'AA1', u'Z'] [u'B', u'IH0', u'K', u'AA1', u'Z'] True
[u'W', u'AA1', u'Z'] [u'B', u'IH0', u'K', u'AH0', u'Z'] False
[u'W', u'AH1', u'Z'] [u'B', u'IH0', u'K', u'AO1', u'Z'] False
[u'W', u'AH1', u'Z'] [u'B', u'IH0', u'K', u'AH1', u'Z'] True
[u'W', u'AH1', u'Z'] [u'B', u'IH0', u'K', u'AA1', u'Z'] False
[u'W', u'AH1', u'Z'] [u'B', u'IH0', u'K', u'AH0', u'Z'] False
[u'W', u'AH0', u'Z'] [u'B', u'IH0', u'K', u'AO1', u'Z'] False
[u'W', u'AH0', u'Z'] [u'B', u'IH0', u'K', u'AH1', u'Z'] False
[u'W', u'AH0', u'Z'] [u'B', u'IH0', u'K', u'AA1', u'Z'] False
[u'W', u'AH0', u'Z'] [u'B', u'IH0', u'K', u'AH0', u'Z'] True
[u'W', u'AO1', u'Z'] [u'B', u'IH0', u'K', u'AO1', u'Z'] True
[u'W', u'AO1', u'Z'] [u'B', u'IH0', u'K', u'AH1', u'Z'] False
[u'W', u'AO1', u'Z'] [u'B', u'IH0', u'K', u'AA1', u'Z'] False
[u'W', u'AO1', u'Z'] [u'B', u'IH0', u'K', u'AH0', u'Z'] False
