In [2]:
import os
from collections import Counter
import string 

from IPython.display import display, Markdown
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from pprint import pprint
import spacy
from spacy import displacy

from src.scraping import extract_cbc_article_info
from src.urls import cbc_urls
from src.nlp import lemmatize_words, remove_stopwords

In [3]:
cbc_urls

['https://www.cbc.ca/news/business/starbucks-greener-cup-1.5063861',
 'https://www.cbc.ca/news/business/rogers-media-magazines-1.5064054',
 'https://www.cbc.ca/news/business/budget-cmhc-home-buyers-1.5063204',
 'https://www.cbc.ca/news/business/eu-regulators-fine-google-online-ads-1.5063806',
 'https://www.cbc.ca/news/canada/nova-scotia/air-canada-max-8s-grounded-july-1-1.5062354',
 'https://www.cbc.ca/news/business/shoppers-drug-mart-superstore-self-checkout-loblaw-1.5056800',
 'https://www.cbc.ca/news/business/volkswagen-charged-with-defrauding-investors-1.5058925',
 'https://www.cbc.ca/news/technology/facebook-instagram-outage-cause-1.5056807']

### Scrape CBC Website for Articles

In [4]:
articles = []
for article_url in cbc_urls:
    _article_text_dict = extract_cbc_article_info(article_url)
    articles.append(_article_text_dict)

In [5]:
pprint(articles)

[{'article': ' Starbucks\xa0announced Wednesday\xa0it will pilot\xa0new '
             'greener to-go cups\xa0this year in Vancouver that will be both '
             'recyclable and compostable.\xa0 . Vancouver will join New York, '
             'San Francisco, Seattle and London to trial different cup options '
             'that will be\xa0chosen from the NextGen Cup Challenge winners '
             'that were announced earlier this month. . "We know how important '
             'this issue is to Canadians," said Michael Conway, executive vice '
             'president and president of Starbucks Canada in a media release. '
             '"We\'re committed to being a part of the solution.\xa0I\'m '
             'excited and proud that our customers in Vancouver will be among '
             'the first to sip coffee from a greener to-go cup." . In addition '
             'to the greener cups, the coffee company will roll out new '
             'recyclable strawless lids to stores across

In [6]:
article = articles[7]

In [7]:
pprint(article)

{'article': ' Facebook Inc. said on Thursday it had restored service to its '
            "main app and Instagram, after the world's largest social network "
            'suffered a major outage that frustrated users across the globe '
            'for about 24 hours. . The company blamed the outage on a server '
            'configuration change. . Yesterday, as a result of a server '
            'configuration change, many people had trouble accessing our apps '
            "and services. We've now resolved the issues and our systems are "
            'recovering. We’re very sorry for the inconvenience and appreciate '
            'everyone’s patience. . It also said it was considering whether to '
            'refund advertisers for lost exposure due to the problems, which '
            'internet outage trackers showed affected users in Europe, Japan, '
            'and North and South America. . "Yesterday, a server configuration '
            'issue made it difficult for people to

 # Natural Language Processing Steps

### Clean Article Text

In [9]:
# Remove unicode for clarity
cleaned_article = article['article'].replace('\xa0', ' ')

In [10]:
cleaned_article

' Facebook Inc. said on Thursday it had restored service to its main app and Instagram, after the world\'s largest social network suffered a major outage that frustrated users across the globe for about 24 hours. . The company blamed the outage on a server configuration change. . Yesterday, as a result of a server configuration change, many people had trouble accessing our apps and services. We\'ve now resolved the issues and our systems are recovering. We’re very sorry for the inconvenience and appreciate everyone’s patience. . It also said it was considering whether to refund advertisers for lost exposure due to the problems, which internet outage trackers showed affected users in Europe, Japan, and North and South America. . "Yesterday, a server configuration issue made it difficult for people to access our apps and services. We are 100 per cent back up and running and apologize for any inconvenience," a Facebook spokesperson said. . "We are still investigating the overall impact of

### NLP Model

In [11]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_lg')

nlp_data = nlp(cleaned_article)

### Named-Entity Extraction

In [13]:
displacy.render(nlp_data, style="ent", jupyter=True)

In [14]:
# Extract Entities
entities =  nlp_data.ents

In [15]:
# Obtain top 10 organizations mentioned in article
organizations = [ent.text.replace('\n','') for ent in entities if ent.label_ == 'ORG']
top_orgs = Counter(organizations).most_common(10)

top_orgs

[('Facebook', 5),
 ('Facebook Inc.', 1),
 ('Twitter', 1),
 ('DownDetector', 1),
 ('BBC', 1),
 ('Reuters', 1),
 ('the New York Times', 1),
 ('Amazon.com Inc.', 1),
 ('Apple Inc.', 1),
 ('the U.S. Federal Trade Commission', 1)]

### Document Summarization

#### Clean Text

In [16]:
# Tokenize by Sentence
article_sentence_list = nltk.tokenize.sent_tokenize(cleaned_article)

In [17]:
# Remove Punctuation
punctuation_table = str.maketrans('', '', string.punctuation)

removed_punctuation = [w.translate(punctuation_table) for w in article_sentence_list]
removed_mdash = [sentence.replace('—', '') for sentence in removed_punctuation]

In [18]:
# Convert words to lowercase
lower_sentences = [s.lower() for s in removed_mdash]

In [19]:
# Remove stopwords
stop_words =  nltk.corpus.stopwords.words('english')

removed_stopwords = [remove_stopwords(sentence) for sentence in lower_sentences]

In [20]:
# Convert words to root word (e.g., 'according' to 'accord')
lemmatize_sentences = [lemmatize_words(sentence) for sentence in removed_stopwords]

In [21]:
for lemmatize_sentence, raw_sentence in zip(lemmatize_sentences, article_sentence_list):
    if raw_sentence != '.':
        display(Markdown(f"**Original**: {raw_sentence}"))
        display(Markdown(f"**Cleaned**: {lemmatize_sentence}"))
        display(Markdown(f"---"))



**Original**:  Facebook Inc. said on Thursday it had restored service to its main app and Instagram, after the world's largest social network suffered a major outage that frustrated users across the globe for about 24 hours.

**Cleaned**: facebook inc say thursday restore service main app instagram world large social network suffer major outage frustrate user across globe 24 hour

---

**Original**: The company blamed the outage on a server configuration change.

**Cleaned**: company blame outage server configuration change

---

**Original**: Yesterday, as a result of a server configuration change, many people had trouble accessing our apps and services.

**Cleaned**: yesterday result server configuration change many people trouble access app service

---

**Original**: We've now resolved the issues and our systems are recovering.

**Cleaned**: -PRON- have resolve issue system recover

---

**Original**: We’re very sorry for the inconvenience and appreciate everyone’s patience.

**Cleaned**: -PRON- be sorry inconvenience appreciate everyone ’s patience

---

**Original**: It also said it was considering whether to refund advertisers for lost exposure due to the problems, which internet outage trackers showed affected users in Europe, Japan, and North and South America.

**Cleaned**: also say consider whether refund advertiser lose exposure due problem internet outage tracker show affect user europe japan north south america

---

**Original**: "Yesterday, a server configuration issue made it difficult for people to access our apps and services.

**Cleaned**: yesterday server configuration issue make difficult people access app service

---

**Original**: We are 100 per cent back up and running and apologize for any inconvenience," a Facebook spokesperson said.

**Cleaned**: 100 per cent back run apologize inconvenience facebook spokesperson say

---

**Original**: "We are still investigating the overall impact of this issue, including the possibility of refunds for advertisers."

**Cleaned**: still investigate overall impact issue include possibility refund advertiser

---

**Original**: Facebook makes tens of millions of dollars in advertising revenue every day.

**Cleaned**: facebook make ten million dollar advertising revenue every day

---

**Original**: Media reports earlier said millions of users were affected, and thousands took to Twitter on Wednesday and Thursday to complain under the hashtag #facebookdown.

**Cleaned**: medium report earlier say million user affect thousand take twitter wednesday thursday complain hashtag facebookdown

---

**Original**: DownDetector website — one of the internet's most used sources of numbers on outages — showed the number of complaints had peaked at more than 12,000, gradually falling to about 180 as of 11 a.m.

**Cleaned**: downdetector website one internet use source number outage show number complaint peak more 12000 gradually fall 180 11

---

**Original**: ET on Thursday.

**Cleaned**: et thursday

---

**Original**: The BBC and a handful of other media outlets said it was the platform's longest ever outage.

**Cleaned**: bbc handful medium outlet say platform long ever outage

---

**Original**: Reuters was not immediately able to verify those claims and the company declined to comment beyond the statement on resumption of services.

**Cleaned**: reuter immediately able verify claim company decline comment beyond statement resumption service

---

**Original**: Facebook's shares fell nearly two per cent in morning trading on Thursday.

**Cleaned**: facebook share fall nearly two per cent morning trading thursday

---

**Original**: Separately, the New York Times reported on Wednesday that U.S. federal prosecutors were conducting a criminal investigation into data deals Facebook struck with more than 150 technology companies such as Amazon.com Inc. and Apple Inc. .

**Cleaned**: separately new york time report wednesday -PRON- federal prosecutor conduct criminal investigation datum deal facebook strike more 150 technology company amazoncom inc apple inc

---

**Original**: Facebook is facing a slew of lawsuits and regulatory inquiries over its privacy practices, including ongoing investigations by the U.S. Federal Trade Commission, the Securities and Exchange Commission and two state agencies in New York.

**Cleaned**: facebook face slew lawsuit regulatory inquiry privacy practice include ongoing investigation -PRON- federal trade commission security exchange commission two state agency new york

---

**Original**: A spokesperson for the social network said the company was co-operating with investigators in multiple federal probes, without addressing the grand jury inquiry specifically.

**Cleaned**: spokesperson social network say company cooperate investigator multiple federal probe without address grand jury inquiry specifically

---

#### Calculate Sentence Similarity

In [None]:
word_vectors = spacy.load('en_vectors_web_lg')

In [22]:
similarity_matrix = np.zeros([len(lemmatize_sentences), 
                              len(lemmatize_sentences)])

similarity_matrix
for i, sentence_i in enumerate(lemmatize_sentences):
    nlp_i = nlp(sentence_i)
    for j, sentence_j in enumerate(lemmatize_sentences):
        if i != j:
            nlp_j = nlp(sentence_j)
            similarity_matrix[i][j] = nlp_i.similarity(nlp_j)

#### Impliment PageRank Algorithm

In [23]:
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [24]:
ranked_sentences = sorted((
        (scores[i], sentence, article_sentence_list[i]) 
        for i,sentence in enumerate(lemmatize_sentences))
        , reverse=True)

In [25]:
N_SENTENCES = 3
generated_summary = [ranked_sentence[2] 
                     for ranked_sentence 
                     in ranked_sentences[0:N_SENTENCES]]

In [26]:
generated_summary

['It also said it was considering whether to refund advertisers for lost exposure due to the problems, which internet outage trackers showed affected users in Europe, Japan, and North and South America.',
 " Facebook Inc. said on Thursday it had restored service to its main app and Instagram, after the world's largest social network suffered a major outage that frustrated users across the globe for about 24 hours.",
 'Media reports earlier said millions of users were affected, and thousands took to Twitter on Wednesday and Thursday to complain under the hashtag #facebookdown.']