In [5]:
import json
import os
import re
import string
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import requests
import spacy
from bs4 import BeautifulSoup
from newsapi import NewsApiClient
from nltk.corpus import cmudict
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from spacy import displacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm')

So, this notebook works on taking summaries made up of the indicative sentences from a text and trying to manipulate them so they can be fit into a limerick-type form. These sentences are likely to be long and made up of multiple clauses. I'm going to try to assemble some rule-based approaches to extract sub-sentences, perhaps simply throwing out the subordinate clauses or paring down sentences by removing some auxiliary words.

First, some auxiliary functions that I'm copying in from elsewhere. I will definitely need to:
1. scrape out news text
2. parse the text appropriately so that I can:
3. run it through my text summarizer and extract the key sentences

In [2]:
# API credentials are read from the environment so that real keys never get
# saved into the notebook. The original placeholder text is kept as the
# fallback, so the names stay defined even when the variables are not set.
# NOTE(review): which service alt_key belongs to is not visible here — rename
# the environment variable if a more specific name fits.
nyt_api = os.environ.get('NYT_API_KEY', 'key goes here')
alt_key = os.environ.get('ALT_API_KEY', 'key goes here')

def scrape_articles_text(url, timeout=30):
    """Download a page and return the text of its article paragraphs.

    Parameters
    ----------
    url : str
        Address of the article to fetch.
    timeout : float, optional
        Seconds to wait for the server before giving up. Without a timeout
        a stalled connection would hang the notebook indefinitely.

    Returns
    -------
    str
        The text of every matching ``<p>`` tag, each preceded by a single
        space, with curly quotes replaced by their ASCII equivalents. An
        empty string is returned when no paragraph matches either selector.

    Raises
    ------
    requests.exceptions.RequestException
        Propagated from ``requests`` when the request fails or times out.
    """
    # The context manager closes the session (and its pooled connections)
    # even if the request or the parsing raises.
    with requests.Session() as session:
        req = session.get(url, timeout=timeout)
        soup = BeautifulSoup(req.text, 'lxml')

    # First try the CSS classes; fall back to the itemprop markup when
    # nothing matches.
    paragraph_tags = soup.find_all('p', class_='css-exrw3m evys1bk0')
    if not paragraph_tags:
        paragraph_tags = soup.find_all('p', itemprop='articleBody')

    # Every paragraph is prefixed with a space, so the result starts with one
    # (the same shape the incremental concatenation produced).
    article = ''.join(' ' + p.get_text() for p in paragraph_tags)

    # Replace curly single/double quotes with their plain ASCII counterparts
    quote_map = {
        u'\u2018': u"'",
        u'\u2019': u"'",
        u'\u201c': u'"',
        u'\u201d': u'"',
    }
    for curly, plain in quote_map.items():
        article = article.replace(curly, plain)

    return article

In [22]:
def text_sent_parser(text):
    """Split raw text into sentence strings, discarding any repeated sentence.

    Parameters
    ----------
    text : str
        Raw article text; it is parsed with the module-level spaCy
        pipeline ``nlp``.

    Returns
    -------
    list of str
        One string per sentence, in document order. Each string is the
        sentence's tokens joined by single spaces, with a trailing space.
        A sentence whose string occurs more than once is removed entirely
        (every copy, including the first), which gets rid of boilerplate
        such as 'subscribe here' picked up by the scraper.
    """
    doc = nlp(text)
    # spaCy yields Span objects, so rebuild each sentence as a plain string
    temp_sentences = [
        ''.join(token.text + ' ' for token in sent)
        for sent in doc.sents
    ]
    # One counting pass replaces the pairwise comparison of every sentence
    # with every later one; the kept set is the same (count == 1).
    occurrences = Counter(temp_sentences)
    return [sentence for sentence in temp_sentences if occurrences[sentence] == 1]

def sent_comparer(sentence_list):
    """Pick the sentences that are, on average, most similar to all the others.

    Parameters
    ----------
    sentence_list : list of str
        Sentences to rank.

    Returns
    -------
    list
        Indices into ``sentence_list`` for the ``int(sqrt(len))`` sentences
        with the highest mean cosine similarity. They are ordered by
        ascending score, so the best-scoring index comes last; they are NOT
        sorted into document order.
    """
    # TF-IDF vector for every sentence
    tfidf = TfidfVectorizer().fit_transform(sentence_list)
    # Square matrix: row i holds sentence i's similarity to every sentence
    pairwise = cosine_similarity(tfidf, tfidf)
    # Collapse each row to a single score
    averages = [row.mean() for row in pairwise]
    # Summary size grows with the square root of the sentence count
    n_sents = int(len(sentence_list) ** .5)
    # argsort is ascending, so the tail holds the top scorers.
    # Wrap the result in sorted(...) to get a chronological summary instead.
    ranked = np.argsort(averages)
    return list(ranked[-n_sents:])

def summarizer(text):
    """Return the key sentences of ``text``.

    The text is split with ``text_sent_parser`` and ranked with
    ``sent_comparer``; the sentences are returned in the order of the
    indices that ``sent_comparer`` produces.
    """
    sentences = text_sent_parser(text)
    picked = []
    for position in sent_comparer(sentences):
        picked.append(sentences[position])
    return picked

Ok, here's an example article. When I first looked at the NYT homepage, the only article that seemed to be not entirely depressing was this one about a possible way to make small amounts of renewable power at night.

For this example, I'll work with the sentence identified as being most similar to the rest of the sentences in the article.

In [30]:
# Example article: fetch its text from the web (requires network access)
url = 'https://www.nytimes.com/2019/09/12/science/solar-energy-power-electricity.html'
text = scrape_articles_text(url)

# Key sentences of the article; displayed as the cell output
summary = summarizer(text)
summary


['In the paper , Dr. Raman described how the device , when connected to a voltage converter , was able to power a white LED . ',
 'Earlier this year , they also tested an infrared photodiode , similar to the technology used in most solar cells but which uses warmth , not sunlight , to generate wisps of electricity in the darkness . ',
 'The village was not equipped with electricity , and Dr. Raman , an electrical engineer at the University of California , Los Angeles , was unaware he was in a village until he heard the voices of shadowed human figures . ',
 'At its heart is an off - the shelf gadget called a thermoelectric generator , which uses the difference in temperature between opposite sides of the device to generate a current . ',
 'This is a neat combination of radiative cooling — a technique where Raman has pioneered real working devices — with thermoelectric materials that generate electricity if one side is hotter than the other side , " said Ellen D. Williams , a physics pr

In [31]:
# Parse the last summary sentence; sent_comparer ranks in ascending order,
# so the last entry is the one with the highest average similarity.
example = nlp(summary[-1])

In [32]:
# Show the parsed sentence that the rest of the notebook works on
example

Dr. Raman wondered whether he could use all that darkness to make something to light it up , not unlike the way that solar panels generate electricity from the sun 's heat and light .   

This is sort of a complicated sentence when you think about it! We have most of the actual content of the sentence in subordinate clauses - the core of the independent clause is really just "Dr. Raman wondered", which illustrates both the difficulty of compressing news-speech into short summaries and the limitations of each line of the limerick form, since those three words already take up 6 syllables.

Mapping out the parts of speech really illustrates the challenges:

In [35]:
# One row per token: text, coarse POS, fine-grained tag, dependency label,
# and spaCy's plain-English gloss for that label (None when it has no gloss).
for tok in example:
    dep_gloss = spacy.explain(tok.dep_)
    print(f'word: {tok.text:{15}} POS: {tok.pos_:{10}} {tok.tag_:{5}} {tok.dep_:{7}} {dep_gloss}')

word: Dr.             POS: PROPN      NNP   compound compound
word: Raman           POS: PROPN      NNP   nsubj   nominal subject
word: wondered        POS: VERB       VBD   ROOT    None
word: whether         POS: ADP        IN    mark    marker
word: he              POS: PRON       PRP   nsubj   nominal subject
word: could           POS: VERB       MD    aux     auxiliary
word: use             POS: VERB       VB    ccomp   clausal complement
word: all             POS: DET        DT    predet  None
word: that            POS: DET        DT    det     determiner
word: darkness        POS: NOUN       NN    dobj    direct object
word: to              POS: PART       TO    aux     auxiliary
word: make            POS: VERB       VB    xcomp   open clausal complement
word: something       POS: NOUN       NN    dobj    direct object
word: to              POS: PART       TO    aux     auxiliary
word: light           POS: VERB       VB    relcl   relative clause modifier
word: it              PO

In [33]:
# Visualise the parsed example sentence with displaCy (default render style)
displacy.render(example)

What are the different ways we might extract useful fragments that both look grammatical and contain actual content? If I were tasked with that challenge, I would think of things like "he could use all that darkness", "solar panels generate" or even just "generate electricity".

The added challenge is then making sure that each of these fragments conforms to either the 5 or 7 syllable line lengths. My thinking is that first you create a list of acceptable sub-clauses, taking each verb, say, and extracting its appropriate N-V-O phrase, and then you prune these down in various ways, first cutting extraneous adjectives, say, or chopping off the object.

You could build up a long list of these sub-parts and then simply search all the possible permutations to look for any rhymes that naturally occurred. That just might work on a sufficiently long article! If not, you could then maybe do something similar on multiple articles and build a summary limerick of the news that has one line from each of several stories.

Spacy conveniently groups noun chunks together, but doesn't do something similar for 'verb-phrases'. I think the first step is a function that will take in a noun and search for all the appropriate other parts of the phrase using the dependencies.

Spacy has a good sense of sentence structure, but unfortunately for my purposes, the tree structure into which the sentence is parsed is not super helpful, since the sub-tree for the main verb is essentially the whole sentence:

In [107]:
# Positions of the tokens whose dependency label is 'nsubj'
noun_index = [position for position, token in enumerate(example) if token.dep_ == 'nsubj']

# Print the syntactic head that each of those subjects attaches to
for position in noun_index:
    print(example[position].head)

wondered
use
generate


In [72]:
# Every token in the subtree of token 2, one per line
print(*example[2].subtree, sep='\n')

Dr.
Raman
wondered
whether
he
could
use
all
that
darkness
to
make
something
to
light
it
up
,
not
unlike
the
way
that
solar
panels
generate
electricity
from
the
sun
's
heat
and
light
.
  


I think I can build on the semi-recursive nature of the structure, however. I'll parse the sentence from the bottom up, and then I can keep the different verb groups distinct.

In [61]:
# Every token in the subtree of token 6, one per line
print(*example[6].subtree, sep='\n')

whether
he
could
use
all
that
darkness
to
make
something
to
light
it
up
,
not
unlike
the
way
that
solar
panels
generate
electricity
from
the
sun
's
heat
and
light


In [70]:
# Every token in the subtree of token 11, one per line
print(*example[11].subtree, sep='\n')

to
make
something
to
light
it
up
,


In [77]:
#Positions of every token that spaCy tagged with the coarse POS 'VERB'
verb_index = [position for position, token in enumerate(example) if token.pos_ == 'VERB']

#Print the complete subtree under each of those verbs, one per line
for position in verb_index:
    subtree_words = [token.text for token in example[position].subtree]
    print(' '.join(subtree_words))

Dr. Raman wondered whether he could use all that darkness to make something to light it up , not unlike the way that solar panels generate electricity from the sun 's heat and light .   
could
whether he could use all that darkness to make something to light it up , not unlike the way that solar panels generate electricity from the sun 's heat and light
to make something to light it up ,
to light it up
that solar panels generate electricity from the sun 's heat and light


In [132]:
#Trim each verb's subtree at a boundary taken from the verb that follows it.
#NOTE: the boundary is the next verb's right_edge (the rightmost token of its
#subtree, per spaCy's Token API) and the comparison is strict, so the cut falls
#just before the END of the next subtree rather than where it starts.
for current_verb, next_verb in zip(verb_index, verb_index[1:]):
    cutoff = example[next_verb].right_edge.i
    kept_words = [token.text for token in example[current_verb].subtree if token.i < cutoff]
    print(' '.join(kept_words))
#The final verb has no successor, so its subtree is printed whole
last_verb = verb_index[-1]
print(' '.join(token.text for token in example[last_verb].subtree))

Dr. Raman wondered whether he
could
whether he could use all that darkness to make something to light it up
to make something to light it
to light it up
that solar panels generate electricity from the sun 's heat and light


The challenge here is the nested/overlapping nature of the tree. The word "could" is a subtree unto itself, as an auxiliary in the phrase "whether he could use". The first task is probably to simply throw away any subtrees that are that short.

I also want to look for a way to break out noun phrases, not just the chunks that spacy identifies. Spacy captures adjectives and articles, but doesn't keep things joined by a conjunction together. I want to automatically pull out "the sun's heat and light" as one phrase. So far the only instance I know I want to overcome is this one, so it should be easy to fix.

First I need to bring my pronunciation parsers back in so I can also test these things for syllable length.

In [140]:
#NLTK pronunciation dictionary to search for length (and also rhymes later on)
entries = nltk.corpus.cmudict.entries()
#Index 0 of an entry is the word and index 1 its phoneme list; when a word is
#listed more than once, the last listing is the one that is kept.
pron_dict = {entry[0]: entry[1] for entry in entries}

def word_pronouncer(word):
    """Look up a word's stress pattern and syllable count in ``pron_dict``.

    Parameters
    ----------
    word : str
        Word to look up; it is lower-cased before the lookup.

    Returns
    -------
    dict
        ``'stresses'``: a string with one character per phoneme that ends in
        a digit, '0' for unstressed and '1' for stressed.
        ``'syllables'``: the length of that string.

    Raises
    ------
    KeyError
        If the lower-cased word is not a key of ``pron_dict``.
    """
    phonemes = pron_dict[word.lower()]
    marks = []
    for phoneme in phonemes:
        stress_digit = phoneme[-1]
        # Only phonemes that end in a digit carry a stress marking
        if not stress_digit.isdigit():
            continue
        # The dictionary marks 0 as unstressed and 1/2 as primary/secondary
        # stress; both stressed levels are collapsed to '1'.
        marks.append('0' if stress_digit == '0' else '1')
    stresses = ''.join(marks)
    return {'stresses': stresses, 'syllables': len(stresses)}

def span_syllables(span):
    """Return the total syllable count of the tokens in ``span``.

    Each token's lower-cased text is looked up with ``word_pronouncer``, so
    a ``KeyError`` propagates for any token missing from the dictionary.
    """
    return sum(word_pronouncer(token.text.lower())['syllables'] for token in span)

#Variant of span_syllables that also reports the stress pattern, for later metre work
def span_syllables_meter(span):
    """Return the combined stress pattern and syllable count of ``span``.

    The result has the same shape as ``word_pronouncer``'s: ``'stresses'`` is
    the concatenation of every token's stress string, in order, and
    ``'syllables'`` is the length of that string.
    """
    per_token = [word_pronouncer(token.text.lower()) for token in span]
    stresses = ''.join(item['stresses'] for item in per_token)
    return {'stresses': stresses, 'syllables': len(stresses)}

In [None]:
def test_conjuction(text,noun_chunk):
    if text[noun_chunk.end+1].tag == 'CC':
        return True

def noun_phrase_extract(text, line_lengths=(5, 7), syllable_counter=None):
    """Collect noun phrases whose syllable count fits a target line length.

    Every noun chunk of ``text`` is checked as it stands. When a chunk has
    conjuncts that lie after it (e.g. "the sun 's heat" + "and light"), the
    longer phrase running from the chunk's start through its last conjunct
    is checked as well.

    Parameters
    ----------
    text : parsed document
        Must expose ``noun_chunks`` and support slicing by token index.
    line_lengths : collection of int, optional
        Accepted syllable counts. Defaults to the 5- and 7-syllable lines
        used in this notebook.
    syllable_counter : callable, optional
        Function mapping a span to its syllable count. Defaults to
        ``span_syllables``, which raises ``KeyError`` for tokens missing
        from the pronunciation dictionary.

    Returns
    -------
    list
        The matching spans, in document order; an extended phrase directly
        follows the chunk it grew from when both match.
    """
    if syllable_counter is None:
        syllable_counter = span_syllables

    phrases = []
    for chunk in text.noun_chunks:
        # ``== (5 or 7)`` would only ever compare against 5, hence ``in``
        if syllable_counter(chunk) in line_lengths:
            phrases.append(chunk)
        if chunk.conjuncts:
            last_conjunct = max(token.i for token in chunk.conjuncts)
            # Only extend forwards: a conjunct that sits before this chunk
            # is covered when the earlier chunk is processed.
            if last_conjunct >= chunk.end:
                # Slice ends are exclusive, so +1 keeps the last conjunct
                extended = text[chunk.start:last_conjunct + 1]
                if syllable_counter(extended) in line_lengths:
                    phrases.append(extended)
    return phrases

In [148]:
# For each noun chunk: its text, its syllable count, and whether it has conjuncts
for noun_chunk in example.noun_chunks:
    has_conjuncts = noun_chunk.conjuncts != ()
    print(noun_chunk, span_syllables(noun_chunk), has_conjuncts)

Dr. Raman 4 False
he 1 False
all that darkness 4 False
something 2 False
it 1 False
the way 2 False
solar panels 4 False
electricity 5 False
the sun 's heat 4 True
light 1 True


In [143]:
# Materialise the noun chunks so they can be indexed
chunks = list(example.noun_chunks)

In [150]:
# Document index of the right-most conjunct of the second-to-last noun chunk
max(token.i for token in chunks[-2].conjuncts)

33