In [1]:
import os
from collections import Counter

import nltk
import numpy as np
import pandas as pd
import requests
import spacy
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

nlp = spacy.load('en_core_web_lg')

In [2]:
from bs4 import BeautifulSoup

In [3]:
def NPR_text(id_number):
    """Download the text-only NPR page for a story and return its body lines.

    Parameters
    ----------
    id_number : int or str
        Story id, interpolated into the ``sId`` query parameter.

    Returns
    -------
    list of str
        The page text split on newlines, with the first 15 and last 10
        lines dropped.

    Raises
    ------
    requests.HTTPError
        If the server answers with an error status.
    """
    html_page = requests.get('https://text.npr.org/s.php?sId={}'.format(id_number))
    # Fail loudly on an error response instead of summarizing an error page.
    html_page.raise_for_status()
    soup = BeautifulSoup(html_page.content, 'html.parser')
    # NOTE(review): the 15/-10 offsets presumably strip the page header and
    # footer; they depend on the current page layout -- confirm if results look off.
    return soup.text.split('\n')[15:-10]

A simple text summarizer: it compares TF-IDF sentence vectors with cosine similarity and returns the sentences that seem most similar to the others in the text. First a walk-through of the different parts, then everything wrapped in a function.

In [9]:
vectorizer = TfidfVectorizer()

In [11]:
transformed = vectorizer.fit_transform(NPR_text(745418569))

In [15]:
np.shape(transformed.toarray())

(22, 354)

In [20]:
from sklearn.metrics.pairwise import cosine_similarity

In [22]:
similars = cosine_similarity(transformed,transformed)

In [25]:
# Collapse each row of the similarity matrix into one score: the mean
# similarity of that sentence to every sentence in the text.
averages = [row.mean() for row in similars]

In [33]:
sent_indices = sorted(list(np.argsort(averages)[-4:]))

In [34]:
sent_indices

[2, 7, 12, 14]

In [37]:
text = NPR_text(745418569)
for n in sent_indices:
    print(text[n])

The fast-fashion giant pledged that by 2025, all of its eight brands will only use cotton, linen and polyester that's organic, sustainable or recycled, which is 90% of the raw materials its uses. CEO and executive chairman Pablo Isla said that renewable sources will power 80% of the energy consumed by the conglomerate's distribution centers, offices and stores. It also plans to transition to zero landfill waste.
Cline says that even if Zara is using materials that are more ethically sourced or have a lower environmental impact, the vast majority of the carbon footprint of fashion comes from the manufacturers who supply brands with their materials. When a business is built on a fast turnover of styles, making those products still swallows a lot of energy, regardless of whether it's using organic cotton or selling products in more eco-efficient stores.
"The fashion industry isn't actually just one industry, it's a whole raft of other industries that are used and exploited to deliver the 

An example text scraped from the NYT is generated here. Finding text is not really the focus of this notebook.

In [45]:
# Read the NYT API key from the environment so a real key never has to be
# pasted into the notebook; falls back to the placeholder when unset.
nyt_api = os.environ.get('NYT_API_KEY', 'key goes here')

In [46]:
nyt_url = f'https://api.nytimes.com/svc/topstories/v2/science.json?api-key={nyt_api}'

In [49]:
resp = requests.get(nyt_url)

In [58]:
def scrape_articles_text(url):
    """Fetch ``url`` and return the article body as one string.

    Paragraphs are taken from ``<p class="css-exrw3m evys1bk0">`` tags, or,
    when none are found, from ``<p itemprop="articleBody">`` tags. Each
    paragraph is prefixed with a single space, so a non-empty result starts
    with a space. Curly quotes are replaced by their ASCII equivalents.

    Parameters
    ----------
    url : str
        Address of the article page.

    Returns
    -------
    str
        The concatenated paragraph text, or ``''`` if no paragraph matched.
    """
    # The context manager releases the session's pooled connections; the
    # response body is already downloaded when ``get`` returns.
    with requests.Session() as session:
        req = session.get(url)
    soup = BeautifulSoup(req.text, 'lxml')

    paragraph_tags = soup.find_all('p', class_= 'css-exrw3m evys1bk0')
    if not paragraph_tags:
        # Fallback markup used when the class-based selector matches nothing.
        paragraph_tags = soup.find_all('p', itemprop = 'articleBody')

    article = ''.join(' ' + p.get_text() for p in paragraph_tags)

    # Clean article replacing unicode characters
    curly_to_ascii = str.maketrans({
        u'\u2018': u"'",
        u'\u2019': u"'",
        u'\u201c': u'"',
        u'\u201d': u'"',
    })
    return article.translate(curly_to_ascii)

In [87]:
url = resp.json()['results'][9]['url']

In [94]:
text = scrape_articles_text(url)

In [89]:
text = nlp(text)

In [92]:
for sent in text.sents:
    print(sent)

 The chief executive of Novartis on Wednesday defended the company's decision to delay telling the Food and Drug Administration about manipulated data involving its $2.1 million gene therapy treatment, saying that it "thoroughly, aggressively" investigated the issue and that patient safety was never threatened.
Vas Narasimhan, the chief executive, also indicated in a call with investors that the company was forcing out a small number of scientists who were involved in the manipulated data.  
The F.D.A. on Tuesday issued an unusual public rebuke of Novartis for failing to report the falsified data before its gene therapy treatment, Zolgensma, was approved in May, even though the company had known about the issue since March.
The agency said it was continuing to investigate and the company could face civil or criminal penalties.
Novartis and the F.D.A. have said the falsified data did not affect the safety, quality or efficacy of Zolgensma, a therapy that treats a rare genetic disease kn

In [113]:
def text_sent_parser(text):
    """Split ``text`` into sentence strings, dropping every repeated sentence.

    Sentence boundaries come from the module-level spaCy pipeline ``nlp``.
    Each sentence is rebuilt as a string in which every token is followed by
    one space (so punctuation ends up space-separated and the string ends
    with a trailing space).

    A sentence that occurs more than once is removed entirely -- all of its
    copies -- to get rid of boilerplate such as 'subscribe here' that shows
    up depending on where the text was scraped from.

    Parameters
    ----------
    text : str
        Raw text to segment.

    Returns
    -------
    list of str
        The sentences that occur exactly once, in document order.
    """
    doc = nlp(text)
    # Convert spaCy sentence spans back into plain strings.
    candidates = [
        ''.join(token.text + ' ' for token in sent)
        for sent in doc.sents
    ]
    # One counting pass replaces the pairwise comparison of all sentences.
    occurrences = Counter(candidates)
    return [sentence for sentence in candidates if occurrences[sentence] == 1]

In [117]:
sentence_list = text_sent_parser(text)

In [284]:
len(sentence_list)

23

In [128]:
def sent_comparer(sentence_list):
    """Pick the indices of the sentences most similar to the rest of the text.

    Sentences are vectorized with TF-IDF, every pair is compared with cosine
    similarity, and each sentence is scored by the mean of its row in the
    similarity matrix. The ``int(sqrt(len(sentence_list)))`` highest-scoring
    sentences are selected.

    Parameters
    ----------
    sentence_list : list of str
        Sentences to rank.

    Returns
    -------
    list of int
        Selected indices in ascending order, so that a summary built from
        them is chronological. Empty when ``sentence_list`` is empty.
    """
    # Nothing to rank; the vectorizer cannot be fitted on an empty corpus.
    if len(sentence_list) == 0:
        return []
    #sklearn vectorizer
    vectorizer = TfidfVectorizer()
    transformed = vectorizer.fit_transform(sentence_list)
    #Using cosine similarity on the whole list of vectors produces a matrix
    #where each sentence's vector is compared to every other sentence
    similars = cosine_similarity(transformed, transformed)
    #now we collapse each row into a single value
    averages = similars.mean(axis=1)
    #And return the indices of the highest values
    #Sorted by index, so that our summary is chronological
    n_sents = int(len(sentence_list) ** .5)
    top_indices = np.argsort(averages)[-n_sents:]
    # Plain ints keep the result readable and usable as ordinary list indices.
    return sorted(int(i) for i in top_indices)

In [129]:
sent_comparer(sentence_list)

[0, 2, 8, 16]

In [130]:
def summarizer(text):
    """Return the summary sentences of ``text`` in their original order.

    The text is split with ``text_sent_parser`` and the sentences chosen by
    ``sent_comparer`` are returned as a list of strings.
    """
    sentences = text_sent_parser(text)
    chosen = sent_comparer(sentences)
    picked = []
    for idx in chosen:
        picked.append(sentences[idx])
    return picked

In [132]:
summary = summarizer(text)
summary

['  The chief executive of Novartis on Wednesday defended the company \'s decision to delay telling the Food and Drug Administration about manipulated data involving its $ 2.1 million gene therapy treatment , saying that it " thoroughly , aggressively " investigated the issue and that patient safety was never threatened . ',
 'The F.D.A. on Tuesday issued an unusual public rebuke of Novartis for failing to report the falsified data before its gene therapy treatment , Zolgensma , was approved in May , even though the company had known about the issue since March . ',
 'The data manipulation also threatened to tarnish the image of Zolgensma , only the second gene therapy treatment to be approved by the F.D.A. , and one that is being closely watched as dozens of other gene therapies are in the works . ',
 'The company investigation , they said , showed that the falsified data was limited to experiments on mice used in early phases of the research , and that the tests in question were disc

In [133]:
test_sentence = summary[0]
test_sentence = nlp(test_sentence)

In [134]:
test_sentence

  The chief executive of Novartis on Wednesday defended the company 's decision to delay telling the Food and Drug Administration about manipulated data involving its $ 2.1 million gene therapy treatment , saying that it " thoroughly , aggressively " investigated the issue and that patient safety was never threatened . 

In [144]:
# Print one aligned row per token: text, coarse POS, fine-grained tag,
# dependency label and spaCy's explanation of that label.
for token in test_sentence:
    # Adjacent f-strings are concatenated, which keeps the statement valid
    # when it is split over two lines.
    print(f'word: {token.text:{15}} POS: {token.pos_:{10}} {token.tag_:{5}} '
          f'{token.dep_:{7}} {spacy.explain(token.dep_)}')

word:                 POS: SPACE      _SP           None
word: The             POS: DET        DT    det     determiner
word: chief           POS: ADJ        JJ    amod    adjectival modifier
word: executive       POS: NOUN       NN    nsubj   nominal subject
word: of              POS: ADP        IN    prep    prepositional modifier
word: Novartis        POS: PROPN      NNP   pobj    object of preposition
word: on              POS: ADP        IN    prep    prepositional modifier
word: Wednesday       POS: PROPN      NNP   pobj    object of preposition
word: defended        POS: VERB       VBD   ROOT    None
word: the             POS: DET        DT    det     determiner
word: company         POS: NOUN       NN    poss    possession modifier
word: 's              POS: PART       POS   case    case marking
word: decision        POS: NOUN       NN    dobj    direct object
word: to              POS: PART       TO    aux     auxiliary
word: delay           POS: VERB       VB    acl     claus

In [167]:
summary_1 = summarizer(' '.join(NPR_text(748416223)))

In [166]:
summarizer(' '.join(NPR_text(749224941)))

['NPR.org , August 7 , 2019 · Scientists are adding a new creature to a list of giant , prehistoric animals that were previously unknown : The Heracles inexpectus , a supersize parrot , estimated to have been as tall as a small human child , was discovered by Australian researchers in New Zealand , according to a study published in Biology Letters .   ',
 'Michael Archer , a paleontologist at the University of New South Wales who was also involved in the study , remarked that the bird \'s stature would make it " able to pick the lint out of your bellybutton . ',
 'Allison Shultz , associate curator of ornithology at the Natural History Museum of Los Angeles County , told NPR the Heracles \' story fits the pattern of what has happened to bird species over the ages : " They get to an island , lose the ability to fly and get really big . ',
 'Likewise there was a crocodile in the fauna , but one expects a parrot to be more clever than those and unlikely to be caught very often , " Worthy 

In [169]:
summary_1

['The panel of scientists looked at the climate change effects of agriculture , deforestation and other land use , such as harvesting peat and managing grasslands and wetlands . ',
 'At that time , the panel broadly suggested that farmland would need to shrink and forests would need to grow to keep Earth from getting more than 1.5 degrees Celsius hotter than it was in the preindustrial era . ',
 "Scientists say the only way to achieve that reduction is to significantly increase the amount of land that 's covered in trees and other vegetation and significantly reduce the amount of methane and other greenhouse gases that come from raising livestock such as cows , sheep and goats . ",
 'The U.N. panel is the latest group of experts to grapple with a global conundrum : how to reduce greenhouse gas emissions from agriculture , deforestation and other land use without creating food shortages or displacing people whose livelihoods rely on practices that are unsustainable globally . ']

In [213]:
sent_1 = nlp(summary_1[0])

In [222]:
#Once you have a good summary, finding a specific subject, or subject chunk becomes pretty easy
#Simply use spaCy to identify the grammatical subject of your summary sentence
def find_subj(nlp_text):
    """Return the first token whose dependency label is ``'nsubj'``.

    ``nlp_text`` is any iterable of tokens exposing a ``dep_`` attribute.
    Returns ``None`` when no token carries that label.
    """
    subjects = (token for token in nlp_text if token.dep_ == 'nsubj')
    return next(subjects, None)


def complete_subject_chunk(nlp_text):
    """Return the slice of ``nlp_text`` covering the whole subject phrase.

    The subject token comes from ``find_subj``; the slice runs from the
    index of its ``left_edge`` through the index of its ``right_edge``,
    inclusive. If no subject is found, ``find_subj`` returns ``None`` and
    the attribute access raises ``AttributeError``.
    """
    subject = find_subj(nlp_text)
    first, last = subject.left_edge.i, subject.right_edge.i
    return nlp_text[first:last + 1]



In [189]:
sent_1[1:subj.right_edge.i]

panel of

In [230]:
subj_chunk = complete_subject_chunk(sent_1)
subj_chunk

The panel of scientists

In [243]:
#Now to adjust my haiku generator, getting vocab only from the summary
def find_vocab(sentence_list):
    """Extract the subject and a haiku vocabulary from summary sentences.

    The grammatical subject and its full phrase are taken from the first
    sentence. The vocabulary is every adjective and noun (lowercased,
    de-duplicated) found in all sentences, excluding words that already
    appear in the subject phrase.

    Parameters
    ----------
    sentence_list : list of str
        Summary sentences; the first one supplies the subject.

    Returns
    -------
    tuple (str, str, list of str)
        Subject token text, subject phrase text, and the sorted vocabulary.
    """
    # Parse the lead sentence once and reuse it for both lookups.
    first_sentence = nlp(sentence_list[0])
    subj = find_subj(first_sentence)
    subj_chunk = complete_subject_chunk(first_sentence)

    # The subject phrase comes from a separate parse than the full text, so
    # its tokens are not the same objects; match on lowercased text instead
    # of token membership.
    subject_words = {token.text.lower() for token in subj_chunk}

    complete_text = nlp(' '.join(sentence_list))
    vocab = {
        token.text.lower()
        for token in complete_text
        if token.pos_ in ('ADJ', 'NOUN')
        and token.text.lower() not in subject_words
    }
    # Sorted so the list order is stable from run to run.
    return subj.text, subj_chunk.text, sorted(vocab)

In [244]:
subj, subj_chunk, vocab = find_vocab(summary_1)

In [254]:
from nltk.corpus import cmudict
pronounciations = cmudict.dict()

def syllable_count(word):
    """Count the syllables in ``word``, which may hold several words.

    The string is split on whitespace and each piece is looked up
    (lowercased) in ``pronounciations``; only the first listed pronunciation
    is used. A syllable is counted for every phoneme whose last character
    is a digit. A piece missing from ``pronounciations`` raises ``KeyError``.
    """
    total = 0
    for piece in word.split():
        first_pronunciation = pronounciations[piece.lower()][0]
        total += sum(1 for phoneme in first_pronunciation if phoneme[-1].isdigit())
    return total

In [250]:
subj_chunk.split()

['The', 'panel', 'of', 'scientists']

In [255]:
syllable_count(subj_chunk)

7

In [273]:
def master_dict_maker(vocab):
    """Group ``vocab`` words by syllable count.

    Parameters
    ----------
    vocab : iterable of str
        Candidate words.

    Returns
    -------
    dict
        Maps a syllable count to the list of words with that count, in the
        order they were encountered. Words whose pronunciation is unknown
        are left out.
    """
    master_dict = {}
    for word in vocab:
        try:
            count = syllable_count(word)
        except KeyError:
            # syllable_count raises KeyError for words missing from the
            # pronunciation dictionary; such words cannot be placed by
            # syllable, so skip them. Other errors are allowed to surface.
            continue
        master_dict.setdefault(count, []).append(word)

    return master_dict


def find_word_for_line(max_syllables, master_dict):
    """Pick a random word with at most ``max_syllables`` syllables.

    Parameters
    ----------
    max_syllables : int
        Largest syllable count allowed.
    master_dict : dict
        Maps a syllable count to a list of words with that count.

    Returns
    -------
    str
        A word drawn uniformly from all words whose count fits.

    Raises
    ------
    ValueError
        If no word in ``master_dict`` fits within ``max_syllables``.
    """
    word_list = [
        word
        for count, words in master_dict.items()
        if count <= max_syllables
        for word in words
    ]
    if not word_list:
        raise ValueError(
            'no word with at most {} syllable(s) available'.format(max_syllables)
        )

    # randint(n) draws from 0..n-1, so every position (including the first)
    # can be chosen and the index is always in range.
    return word_list[np.random.randint(len(word_list))]

In [275]:
syllable_dict = master_dict_maker(vocab)

In [276]:
def fill_out_line(num_syllables, syllable_dict, line=None, already_used=None):
    """Append random unused words to ``line`` until the syllables are spent.

    Parameters
    ----------
    num_syllables : int
        Syllables still to fill; nothing is added when this is <= 0.
    syllable_dict : dict
        Maps a syllable count to a list of words with that count.
    line : list, optional
        Words already on the line. Extended in place when given; a fresh
        list is created when omitted.
    already_used : list, optional
        Words used anywhere so far, so no word is repeated. Extended in
        place when given; a fresh list is created when omitted.

    Returns
    -------
    tuple (list, list)
        The line and the updated list of used words.

    Raises
    ------
    ValueError
        If no unused word fits in the remaining syllables.
    """
    # Fresh lists per call: mutable defaults would carry words over between
    # unrelated calls.
    if line is None:
        line = []
    if already_used is None:
        already_used = []

    while num_syllables > 0:
        # Offer only words that still fit and have not been used yet, so the
        # loop cannot spin forever re-drawing used words.
        available = {}
        for count, words in syllable_dict.items():
            if count > num_syllables:
                continue
            unused = [word for word in words if word not in already_used]
            if unused:
                available[count] = unused
        if not available:
            raise ValueError(
                'no unused word fits the remaining {} syllable(s)'.format(num_syllables)
            )

        new_word = find_word_for_line(num_syllables, available)
        line.append(new_word)
        already_used.append(new_word)
        num_syllables -= syllable_count(new_word)

    return line, already_used

In [278]:
fill_out_line(7, syllable_dict, line = [], already_used=[])

(['land', 'amount', 'shortages', 'cows'],
 ['land', 'amount', 'shortages', 'cows'])

In [279]:
def write_haiku(id_num):
    """Build a 5-7-5 haiku from the summary of an NPR story.

    The story is fetched and summarized, the grammatical subject of the
    first summary sentence is placed on one line, and the remaining
    syllables are filled with random adjectives and nouns from the summary.

    Subject placement:

    * the full subject phrase opens line 1 if it has at most 5 syllables;
    * otherwise it opens line 2 if it has at most 7 syllables;
    * otherwise the bare subject word opens line 2 if it has at most 7;
    * if neither can be used (too long, or containing a word missing from
      the pronunciation dictionary), all three lines are filled from the
      vocabulary alone.

    Parameters
    ----------
    id_num : int or str
        NPR story id passed to ``NPR_text``.

    Returns
    -------
    tuple (list, list, list)
        The words or phrases of line 1, line 2 and line 3.
    """
    summary = summarizer(' '.join(NPR_text(id_num)))
    subj, subj_chunk, vocab = find_vocab(summary)
    syllable_dict = master_dict_maker(vocab)

    def _syllables_or_none(phrase):
        # syllable_count raises KeyError for words missing from the
        # pronunciation dictionary; treat such a phrase as unusable.
        try:
            return syllable_count(phrase)
        except KeyError:
            return None

    targets = [5, 7, 5]
    lines = [[], [], []]

    # Try the full phrase first, then the bare subject. Only the full phrase
    # may go on line 1 (index 0); the bare subject always goes on line 2.
    for phrase, row_when_short in ((subj_chunk, 0), (subj, 1)):
        count = _syllables_or_none(phrase)
        if count is None or count > 7:
            continue
        row = row_when_short if count <= 5 else 1
        lines[row].append(phrase)
        targets[row] -= count
        break

    # Fill the lines in order, sharing one used-word list so no vocabulary
    # word appears twice in the haiku.
    already_used = []
    for row, target in enumerate(targets):
        lines[row], already_used = fill_out_line(
            target, syllable_dict, lines[row], already_used
        )

    return lines[0], lines[1], lines[2]

In [285]:
write_haiku(748416223)

(['sheep', 'global', 'need', 'food'],
 ['The panel of scientists'],
 ['new', 'effects', 'countries'])

In [287]:
write_haiku(749224941)

(['Scientists', 'small', 'lint'],
 ['bird', 'fauna', 'island', 'story'],
 ['ornithology'])