In [5]:
import nltk
import numpy as np
from nltk.corpus import cmudict
from nltk.corpus import stopwords
import re
import requests
from bs4 import BeautifulSoup
import pandas as pd
import json
import string
from newsapi import NewsApiClient
import spacy
nlp = spacy.load('en_core_web_sm')
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

So, this notebook will be working on taking summaries made up of the indicative sentences from a text and trying to maniuplate them so they can be fit into a limerick type form. These sentences are likely to be long, made up of multiple clauses. I'm going to try to assemble some rules based approaches to extract sub-sentences. Perhaps simply throwing out the subordinate clauses or paring down sentences by removing some auxillary words.

First some auxillary functions that I'm copying in from elsewhere. I will definitely need to:
1. scrape out news text
2. parse the text appropriately so that I can:
3. run it through my text summarizer and extract the key sentences

In [2]:
nyt_api = 'key goes here'
alt_key = 'key goes here'

def scrape_articles_text(url):

    session = requests.Session()

    req = session.get(url)
    soup = BeautifulSoup(req.text, 'lxml')

    paragraph_tags = soup.find_all('p', class_= 'css-exrw3m evys1bk0')
    if paragraph_tags == []:
        paragraph_tags = soup.find_all('p', itemprop = 'articleBody')

    article = ''
    for p in paragraph_tags:
        article = article + ' ' + p.get_text()

    # Clean article replacing unicode characters
    article = article.replace(u'\u2018', u"'").replace(u'\u2019', u"'").replace(u'\u201c', u'"').replace(u'\u201d', u'"')

    return article

In [22]:
def text_sent_parser(text):
    temp_sentences = []
    #using SpaCy to parse the text for sentences
    text = nlp(text)
    #unfortunately will need to reconvert back into strings
    for sent in text.sents:
        temp = str()
        for token in sent:
            temp += token.text
            temp += ' '
        temp_sentences.append(temp)
    #Lastly cutting out repeated sentences, to get rid of things like 'subscribe here'
    #That come up depending on where you've scraped the text
    indices_to_avoid = []
    #first identify where the repeated sentences are
    for n in range(0,len(temp_sentences)):
        for i in range(n+1,len(temp_sentences)):
            if temp_sentences[n] == temp_sentences[i]:
                indices_to_avoid.append(n)
                indices_to_avoid.append(i)
    sentences = []
    #and then exclude them
    for n in range(0,len(temp_sentences)):
        if n not in indices_to_avoid:
            sentences.append(temp_sentences[n])
    return sentences

def sent_comparer(sentence_list):
    #sklearn vectorizer
    vectorizer = TfidfVectorizer()
    transformed = vectorizer.fit_transform(sentence_list)
    #Using cosine similarity on the whole list of vectors produces a matrix
    #where each sentence's vector is compared to every other sentence
    similars = cosine_similarity(transformed,transformed)
    #now we collapse into single values
    averages = []
    for i in similars:
        averages.append(i.mean())
    #And return the indices of the highest values
    #Sorted by index, so that our summary is chronological
    n_sents = int(len(sentence_list)**.5)
#    sent_indices = sorted(list(np.argsort(averages)[-n_sents:]))
    sent_indices = list(np.argsort(averages)[-n_sents:])
    return sent_indices

def summarizer(text):
    sentence_list = text_sent_parser(text)
    summary_indices = sent_comparer(sentence_list)
    return [sentence_list[n] for n in summary_indices]

Ok, here's an example article. When I first looked at the NYT homepage, the only article that seemed to be not entirely depressing was this one about a possible way to make small amounts of renewable power at night.

For this example, I'll work with the sentence identified as being most similar with the rest of the sentences in the article.

In [30]:
url = 'https://www.nytimes.com/2019/09/12/science/solar-energy-power-electricity.html'
text = scrape_articles_text(url)

summary = summarizer(text)
summary


['In the paper , Dr. Raman described how the device , when connected to a voltage converter , was able to power a white LED . ',
 'Earlier this year , they also tested an infrared photodiode , similar to the technology used in most solar cells but which uses warmth , not sunlight , to generate wisps of electricity in the darkness . ',
 'The village was not equipped with electricity , and Dr. Raman , an electrical engineer at the University of California , Los Angeles , was unaware he was in a village until he heard the voices of shadowed human figures . ',
 'At its heart is an off - the shelf gadget called a thermoelectric generator , which uses the difference in temperature between opposite sides of the device to generate a current . ',
 'This is a neat combination of radiative cooling — a technique where Raman has pioneered real working devices — with thermoelectric materials that generate electricity if one side is hotter than the other side , " said Ellen D. Williams , a physics pr

In [31]:
example = nlp(summary[-1])

In [32]:
example

Dr. Raman wondered whether he could use all that darkness to make something to light it up , not unlike the way that solar panels generate electricity from the sun 's heat and light .   

This is sort of a complicated sentence when you think about it! We have most of the actual content of the sentence in subordinate clauses - the core of the independent clause is really just "Dr. Rman wondered", which illustrates both the difficulty of compressing news-speech into short summaries and the limitations of each line of the limerick form, since those three words already take up 6 syllables.

Mapping out the parts of speech really illustrate the challenges:

In [35]:
for token in example:
    print(f'word: {token.text:{15}} POS: {token.pos_:{10}} {token.tag_:{5}} {token.dep_:{7}} {spacy.explain(token.dep_)}')

word: Dr.             POS: PROPN      NNP   compound compound
word: Raman           POS: PROPN      NNP   nsubj   nominal subject
word: wondered        POS: VERB       VBD   ROOT    None
word: whether         POS: ADP        IN    mark    marker
word: he              POS: PRON       PRP   nsubj   nominal subject
word: could           POS: VERB       MD    aux     auxiliary
word: use             POS: VERB       VB    ccomp   clausal complement
word: all             POS: DET        DT    predet  None
word: that            POS: DET        DT    det     determiner
word: darkness        POS: NOUN       NN    dobj    direct object
word: to              POS: PART       TO    aux     auxiliary
word: make            POS: VERB       VB    xcomp   open clausal complement
word: something       POS: NOUN       NN    dobj    direct object
word: to              POS: PART       TO    aux     auxiliary
word: light           POS: VERB       VB    relcl   relative clause modifier
word: it              PO

In [33]:
displacy.render(example)

What are the different ways we might extract useful fragments, that both looked grammatical and contained actual content? If I were tasked with that challenge, I would think of things like "he could use all that darkness", "solar panels generate" or even just "generate electricity".

The added challenge is then making sure that each of these fragments conforms to either the 5 or 7 syllable line lengths. My thinking is that first you create a list of acceptable sub-clauses, taking each verb, say, and extracting it's appropriate N-V-O phrase and then you prune these down in various ways, first cutting extraneous adjectives, say, or chopping off the object.

You could build up a long list of these sub-parts and then simply search all the possible permutations to look for any rhymes that naturally occured. That just might work on a sufficiently long article! If not, you could then maybe do something similar on multiple articles and build a summary limerick of the news that has a line from each from multiple stories.

Spacy conveniently groups noun chunks together, but doesn't do something similar for 'verb-phrases'. I think the first step is a function that will take in a noun and search for all the appropriate others parts of the phrase using the dependencies.

Spacy has a good sense of sentence structure, but unfortunately for my purposes, the tree structure into which the sentence is parsed is not super helpful, since the sub-tree for the main verb is essentially the whole sentence:

In [107]:
noun_index = []
for n in range(0,len(example)):
    if example[n].dep_ == 'nsubj':
        noun_index.append(n)
        
for index in noun_index:
    print(example[index].head)

wondered
use
generate


In [72]:
for token in example[2].subtree:
    print(token)

Dr.
Raman
wondered
whether
he
could
use
all
that
darkness
to
make
something
to
light
it
up
,
not
unlike
the
way
that
solar
panels
generate
electricity
from
the
sun
's
heat
and
light
.
  


I think I can build on the semi-recursive nature of the structure however. I'll parse the sentence from the bottom up, and then I can keep the different verb groups distinct

In [61]:
for token in example[6].subtree:
    print(token)

whether
he
could
use
all
that
darkness
to
make
something
to
light
it
up
,
not
unlike
the
way
that
solar
panels
generate
electricity
from
the
sun
's
heat
and
light


In [70]:
for token in example[11].subtree:
    print(token)

to
make
something
to
light
it
up
,


In [77]:
#Let's gather up all the verbs
verb_index = []
for n in range(0,len(example)):
    if example[n].pos_ == 'VERB':
        verb_index.append(n)
        
#and see what sort of subtrees they have        
for index in verb_index:
    print(' '.join([token.text for token in example[index].subtree]))

Dr. Raman wondered whether he could use all that darkness to make something to light it up , not unlike the way that solar panels generate electricity from the sun 's heat and light .   
could
whether he could use all that darkness to make something to light it up , not unlike the way that solar panels generate electricity from the sun 's heat and light
to make something to light it up ,
to light it up
that solar panels generate electricity from the sun 's heat and light


In [194]:
#Let's try breaking apart the tree by making cuts where the next sub tree starts
for n in range(0,len(verb_index)-1):
    index = verb_index[n]
    end_i = example[verb_index[n+1]].left_edge.i
    print(' '.join([token.text for token in example[index].subtree if token.i < end_i]))
print(' '.join([token.text for token in example[verb_index[-1]].subtree]))

Dr. Raman wondered whether he

whether he could use all that darkness
to make something
to light it up
that solar panels generate electricity from the sun 's heat and light


The challenge here is the nested/overlapping nature of the tree. The word "could" is a subtree unto itself, as an auxilliary in the phrase "whether he could use". First task is probably to simply throw away any sub trees that are that short.

I also want to look for a way to break out noun phrases, not just the chunks that spacy identifies. Spacy captures adjectives and articles, but doesn't keep things joined by a conjuction together. I want to automatically pull out "the sun's heat and light" as one phrase. So far the only instance I know I want to overcome is this one, so it should be easy to fix.

First I need to bring my pronunciation parsers back in so I can also test these things for syllable length.

In [234]:
#NLTK pronunciation dictionary to search for length (and also rhymes later on)
entries = nltk.corpus.cmudict.entries()
pron_dict = {}
for entry in entries:
    pron_dict[entry[0]]=entry[1]

def word_pronouncer(word):
    pron = pron_dict[word.lower()]
    stresses = str()
    for phoneme in pron:
        if phoneme[-1].isdigit():
            #NLTK has markings of 0 for unstressed and 1 and 2 for stressed (primary and secondary)
            if phoneme[-1] == '0':
                stresses += phoneme[-1]
            else:
                stresses += '1'
    return {'stresses':stresses, 'syllables':len(stresses)}

def span_syllables(span):
    syllables = 0
    for n in range(0,len(span)):
        #One time fix to handle spacy breaking out the possisve 's and 
        #it adding syllables to the overall count
        if span[n].text.lower() == "'s":
            pass
        elif span[n].pos_ != 'PUNCT':
            syllables += word_pronouncer(span[n].text.lower())['syllables']
    return syllables


#Challenge here when the text contains words not in the pronunciation dictionary!
#I'm currently using this alternate version that essentially disregards any span with a word it can't find
# def span_syllables(span):
#     syllables = 0
#     for n in range(0,len(span)):
#         #One time fix to handle spacy breaking out the possisve 's and 
#         #it adding syllables to the overall count
#         if span[n].text.lower() == "'s":
#             pass
#         elif span[n].pos_ != 'PUNCT':
#             try:
#                 syllables += word_pronouncer(span[n].text.lower())['syllables']
#             except:
#                 return 0
#     return syllables


#Here's another version that also takes into account the stresses in case I get there
def span_syllables_meter(span):
    syllables = 0
    temp = []
    for n in range(0,len(span)):
        temp.append(word_pronouncer(span[n].text.lower()))
    stresses = str()
    for item in temp:
        stresses += item['stresses']
    return {'stresses':stresses,'syllables':len(stresses)}

In [451]:
def test_conjuction(text,noun_chunk):
    if text[noun_chunk.end+1].tag == 'CC':
        return True

def noun_phrase_extract(text):
    phrases = []
    chunks = [chunk for chunk in text.noun_chunks]
    for n in range(0,len(chunks)):
        if span_syllables(chunks[n]) == 5 or span_syllables(chunks[n]) == 7:
            phrases.append(chunks[n])
        if chunks[n].conjuncts != ():
            end = max([span.i for span in chunks[n].conjuncts])+1
            temp = text[chunks[n].start:end]
            if span_syllables(temp) == 5 or span_syllables(temp) == 7:
                   phrases.append(temp)
    return [phrase.text for phrase in phrases]

In [292]:
noun_phrase_extract(example)

[electricity, the sun 's heat and light]

Ok, now to handle the verb phrases.
1. compile master list of all spans that make sense
2. add any that align syllable wise to a running list
3. throw out any that are far too short
4. consider ways to pare down longer phrases

In [210]:
def verb_spans(text):
    verb_index = []
    for n in range(0,len(example)):
        if example[n].pos_ == 'VERB':
            verb_index.append(n)

    sub_spans = []
    for n in range(0,len(verb_index)-1):
        index = verb_index[n]
        end_i = example[verb_index[n+1]].left_edge.i
        temp = text[min([token.i for token in text[index].subtree]):end_i]
        if len(temp) > 0:
            sub_spans.append(temp)
    sub_spans.append(text[min([token.i for token in text[verb_index[-1]].subtree]):
                          max([token.i for token in text[verb_index[-1]].subtree])])
    return sub_spans

In [211]:
verb_spans(example)

[Dr. Raman wondered whether he,
 whether he could use all that darkness,
 to make something,
 to light it up , not unlike the way,
 that solar panels generate electricity from the sun 's heat and]

How can we pare these fragments down if need be? What's the bare S-V-O structure minimum? If we're only one or two syllables too long, we should try a minor fix, if we're way over, try paring all the way back
1. Get rid of initial auxilliaries (to, whether)
2. Pare down descriptors of children in the grammatical structure (strip out 'all that' from 'use all that darkness'
3. Is there a smart way to remove something like the 'dr' from Dr Raman's name?
4. S-V-O
5. S-V
6. V-O

In [375]:
def find_phrase_subj(span):
    for token in span:
        if token.dep_ == 'nsubj':
            subj = token
    chunks = [chunk for chunk in span.noun_chunks]
    for chunk in chunks:
        if subj in chunk:
            return chunk.text
        
def find_phrase_obj(span):
    dependencies = [token.dep_ for token in span]
    if 'dobj' not in dependencies:
        return ''
    for token in span:
        if token.dep_ == 'dobj':
            subj = token
    chunks = [chunk for chunk in span.noun_chunks]
    for chunk in chunks:
        if subj in chunk:
            return chunk.text

def find_verb(span):
    for token in span:
        if token.pos_ == 'VERB':
            return token.text
        
def count_adj(span):
    poses = [token.pos_ for token in span]
    return poses.count('ADJ')

def drop_adj(span):
    poses = [token.pos_ for token in span]
    index = poses.index('ADJ')
    if index == 0:
        return span[1:]
    else:
        return nlp(span[:index].text+' '+span[index+1:].text)

In [376]:
drop_adj(spans[4])

that panels generate electricity from the sun 's heat and

In [452]:
def phrase_shortener(span):
    options = []
    if span_syllables(span) < 5:
        return options
    dependencies = [token.dep_ for token in span]
    if dependencies[0] == 'aux':
        if 'compound' in dependencies:
            temp = nlp(span[1:dependencies.index('compound')].text+' '+span[dependencies.index('compound')].text)
            if span_syllables(temp) == 5 or span_syllables(temp) == 7:
                options.append(temp)
        temp = span[1:]
        if span_syllables(temp) == 5 or span_syllables(temp) == 7:
            options.append(temp)
    if 'compound' in dependencies:
        if dependencies.index('compound') == 0:
            temp = span[1:]
        else:
            temp = nlp(span[:dependencies.index('compound')].text+' '+span[dependencies.index('compound')].text)
        if span_syllables(temp) == 5 or span_syllables(temp) == 7:
            options.append(temp)
    
    temp = span
    for n in range(0,count_adj(span)):
        temp = drop_adj(temp)
        if span_syllables(temp) == 5 or span_syllables(temp) == 7:
            options.append(temp)
    try:        
        temp = nlp(find_phrase_subj(span)+' '+find_verb(span)+' '+find_phrase_obj(span))
        if span_syllables(temp) == 5 or span_syllables(temp) == 7:
            options.append(temp)
        if count_adj(temp)>0:
            for n in range(0,count_adj(temp)):
                temp = drop_adj(temp)
                if span_syllables(temp) == 5 or span_syllables(temp) == 7:
                    options.append(temp)
    
    
        temp = nlp(find_phrase_subj(span)+' '+find_verb(span))
        if span_syllables(temp) == 5 or span_syllables(temp) == 7:
            options.append(temp)
        if count_adj(temp)>0:
            for n in range(0,count_adj(temp)):
                temp = drop_adj(temp)
                if span_syllables(temp) == 5 or span_syllables(temp) == 7:
                    options.append(temp)
    
        temp = nlp(find_verb(span)+' '+find_phrase_obj(span))
        if span_syllables(temp) == 5 or span_syllables(temp) == 7:
            options.append(temp)
        if count_adj(temp)>0:
            for n in range(0,count_adj(temp)):
                temp = drop_adj(temp)
                if span_syllables(temp) == 5 or span_syllables(temp) == 7:
                    options.append(temp)
    except:
        pass
    return [option.text for option in options]

In [379]:
phrase_shortener(spans[0])

[Raman wondered whether he]

In [378]:
phrase_shortener(spans[4])

[solar panels generate, panels generate]

In [460]:
def verb_phrase_extract(text):
    phrases = []
    spans = verb_spans(text)
    for span in spans:
        if span_syllables(span) == (5 or 7):
            phrases.append(span.text)
        phrases+=phrase_shortener(span)
    return phrases

In [386]:
verb_phrase_extract(example)

[Raman wondered whether he,
 could all that darkness,
 solar panels generate,
 panels generate]

In [387]:
def find_all_phrases(text):
    phrases = []
    phrases += noun_phrase_extract(text)
    phrases += verb_phrase_extract(text)
    return phrases


In [388]:
find_all_phrases(example)

[electricity,
 the sun 's heat and light,
 Raman wondered whether he,
 could all that darkness,
 solar panels generate,
 panels generate]

Ok, now to assemble a limerick of sorts. I don't know how many possible rhyming lines we're actually going get in any given piece, so first I'm going to parse all sentences together for any usable phrases, then I'll assemble an overall rhyming list and hope that I actually have some hits. With only a handful of lines being created from any given sentence in the text, however, I don't know if I'm super sanguine...

In [465]:
def complete_text_phrases(text):
    sentences = text_sent_parser(text)
    
    phrases = []
    for sentence in sentences:
        span = nlp(sentence.strip())
        try:
            phrases += find_all_phrases(span)
        except:
            pass
    syllable_dict = {5:[],7:[]}
    for phrase in set(phrases):
        temp = nlp(phrase)
        if span_syllables(temp) == 5:
            syllable_dict[5].append(temp)
        else:
            syllable_dict[7].append(temp)
    return syllable_dict

In [466]:
complete_text_phrases(text)

{5: [research published on,
  described a device,
  the device described,
  electricity,
  a combination,
  will be improving,
  electrical grids,
  they are so much heat,
  in the journal Joule,
  no clouds are present,
  an idea came,
  panels generate,
  , an electrical,
  could all that darkness,
  at Stanford and an,
  battery storage,
  solar - powered lights,
  remote areas,
  the paper , Dr.,
  Ellen D. Williams,
  jacket - less people,
  Dr. Raman said,
  the most common type,
  about five minutes,
  of Technology,
  changing batteries,
  an alternative,
  Modern scientists,
  about three orders,
  the device described ,
  typical solar,
  a materials,
  of Earth turns away,
  the sun 's heat and light,
  us about five,
  and Afghanistan,
  magnitude than what,
  an idea came ,
  to trap warmth , objects,
  a way to go if,
  thousand years ago ,,
  the technology,
  Jeffrey C. Grossman,
  the development,
  Dr. Raman said ,
  Sierra Leone,
  realize we were],
 7: [Fan 's team 

In [402]:
sentences = text_sent_parser(text)

In [445]:
summary = summarizer(text)
sentence = nlp(summary[1].strip())
find_all_phrases(sentence)

[the technology, electricity]

In [443]:
summary[1]

'Earlier this year , they also tested an infrared photodiode , similar to the technology used in most solar cells but which uses warmth , not sunlight , to generate wisps of electricity in the darkness . '

In [444]:
def span_syllables(span):
    syllables = 0
    for n in range(0,len(span)):
        #One time fix to handle spacy breaking out the possisve 's and 
        #it adding syllables to the overall count
        if span[n].text.lower() == "'s":
            pass
        elif span[n].pos_ != 'PUNCT':
            try:
                syllables += word_pronouncer(span[n].text.lower())['syllables']
            except:
                return 0
    return syllables