In [32]:
import os
from collections import Counter
import string 

from IPython.display import display, Markdown
import gensim
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy import displacy


from src.scraping import extract_cnbc_article_info
from src.urls import article_urls
from src.nlp import lemmatize_words, remove_stopwords

In [2]:
# Show the list of article URLs imported from src.urls that will be scraped below
article_urls

['https://www.cnbc.com/2019/03/15/spielberg-and-netflixs-sarandos-meet-sparking-hope-for-oscars-truce.html',
 'https://www.cnbc.com/2019/03/15/tesla-shares-fall-5-percent-after-company-unveils-model-y.html',
 'https://www.cnbc.com/2019/03/15/arizona-will-go-after-big-tech-if-necessary-says-attorney-general.html',
 'https://www.cnbc.com/2019/03/15/facebook-stocks-falls-after-cox-exit-outage-threats-from-washington.html',
 'https://www.cnbc.com/2019/02/24/kraft-heinz-reviews-options-for-maxwell-house-coffee-including-sale.html',
 'https://www.cnbc.com/2019/03/08/jeff-bezos-to-end-secrecy-over-amazons-role-in-carbon-emissions.html',
 'https://www.cnbc.com/2019/03/15/dieselgate-sec-says-volkswagen-perpetrated-fraud-lied-to-investors.html']

### Scrape CNBC Website for Articles

In [3]:
# Scrape every URL in turn; each call yields one article record for that page
articles = [extract_cnbc_article_info(url) for url in article_urls]

In [4]:
# Pretty-print the scraped article records (one entry per URL) to inspect their structure
pprint(articles)

[{'article': '\n'
             'Steven Spielberg, a film industry titan with 58 directorial '
             'credits to his name, has long been vocal about the differences '
             'between theatrical and streaming releases.\n'
             '\n'
             'As the governor of the Academy of Motion Picture Arts and '
             'Sciences directors branch he has said that he would like films '
             'to have a longer theatrical run in order to qualify for an '
             'Academy Award. Those comments have led some to believe that '
             'Spielberg was ready to wage war on platforms like Netflix, '
             'especially after the streaming service\'s film "Roma" took home '
             'three Oscars last month.\n'
             "However, Spielberg reportedly met with Netflix's Chief Content "
             'Officer Ted Sarandos earlier this week, a sign that, perhaps, '
             'these rumors of discord are just that, rumors.\n'
             '\n'
         

In [5]:
# Use the first scraped article as the working example in the cells that follow
article = articles[0]

In [6]:
# Headline of the selected article
article['title']

"Steven Spielberg reportedly met with Netflix's Ted Sarandos, sparking hopes for Academy Awards' truce"

In [7]:
# Pretty-print the raw body text of the selected article (still contains '\n' separators)
pprint(article['article'])

('\n'
 'Steven Spielberg, a film industry titan with 58 directorial credits to his '
 'name, has long been vocal about the differences between theatrical and '
 'streaming releases.\n'
 '\n'
 'As the governor of the Academy of Motion Picture Arts and Sciences directors '
 'branch he has said that he would like films to have a longer theatrical run '
 'in order to qualify for an Academy Award. Those comments have led some to '
 'believe that Spielberg was ready to wage war on platforms like Netflix, '
 'especially after the streaming service\'s film "Roma" took home three Oscars '
 'last month.\n'
 "However, Spielberg reportedly met with Netflix's Chief Content Officer Ted "
 'Sarandos earlier this week, a sign that, perhaps, these rumors of discord '
 'are just that, rumors.\n'
 '\n'
 'The two were seen dining together at the San Vincente Bungalows, a private '
 'club in West Hollywood, according to The Hollywood Reporter.\n'
 "It's unclear what was discussed during their meeting. Repr

In [8]:
# Source URL stored with the selected article
article['url']

'https://www.cnbc.com/2019/03/15/spielberg-and-netflixs-sarandos-meet-sparking-hope-for-oscars-truce.html'

# Natural Language Processing Steps

### Clean Article Text

In [9]:
# Flatten the article body onto a single line: every '\n' becomes a single space
raw_article_text = article['article']
cleaned_article = ' '.join(raw_article_text.split('\n'))

### Named-Entity Extraction

In [10]:
# Load English tokenizer, tagger, parser, NER and word vectors
# (the 'en_core_web_lg' spaCy model package must be installed for this to succeed)
nlp = spacy.load('en_core_web_lg')

# Run the pipeline over the cleaned article text; the result is reused below
# for entity rendering and entity counting.
nlp_data = nlp(cleaned_article)

In [11]:
# Render the parsed article inline in the notebook using displaCy's entity ("ent") style
displacy.render(nlp_data, style="ent", jupyter=True)

In [12]:
# Named entities found in the parsed article
entities = nlp_data.ents

In [13]:
# Collect the text of every PERSON entity, then keep the (up to) ten most
# frequently mentioned names together with their counts.
people = [
    entity.text.replace('\n', '')
    for entity in entities
    if entity.label_ == 'PERSON'
]
person_counts = Counter(people)
top_people = person_counts.most_common(10)

top_people

[('Spielberg', 4),
 ('Steven Spielberg', 1),
 ('Ted Sarandos', 1),
 ('Sarandos', 1),
 ('Jeffrey Katzenberg', 1)]

In [14]:
# Collect the text of every ORG entity, then keep the (up to) ten most
# frequently mentioned organizations together with their counts.
organizations = [
    entity.text.replace('\n', '')
    for entity in entities
    if entity.label_ == 'ORG'
]
organization_counts = Counter(organizations)
top_orgs = organization_counts.most_common(10)

top_orgs

[('Netflix', 3),
 ('the Academy of Motion Picture Arts and Sciences', 1),
 ('Oscars', 1),
 ('The Hollywood Reporter', 1)]

### Document Summarization

#### Clean Text

In [15]:
# Tokenize by sentence: split the cleaned article text into individual sentences.
# These raw sentences are kept unchanged so they can be shown next to their
# cleaned forms and returned as the final summary text.
article_sentence_list = nltk.tokenize.sent_tokenize(cleaned_article)

In [16]:
# Delete every ASCII punctuation character from each sentence, then delete
# em dashes separately (they are not part of string.punctuation).
punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))

removed_punctuation = [
    sentence.translate(punctuation_table) for sentence in article_sentence_list
]
removed_mdash = [
    stripped_sentence.replace('—', '') for stripped_sentence in removed_punctuation
]

In [17]:
# Lowercase every punctuation-free sentence
lower_sentences = list(map(str.lower, removed_mdash))

In [18]:
# Remove stopwords
# NOTE(review): stop_words is assigned here but is not referenced in any of the
# visible cells; remove_stopwords (imported from src.nlp) is called without it.
# Confirm whether this variable is still needed.
stop_words =  nltk.corpus.stopwords.words('english')

# Apply the project helper to each lowercased sentence
removed_stopwords = [remove_stopwords(sentence) for sentence in lower_sentences]

In [19]:
# Reduce each word to its root form (e.g., 'according' -> 'accord'),
# one stopword-free sentence at a time.
lemmatize_sentences = list(map(lemmatize_words, removed_stopwords))

In [44]:
# Show each original sentence next to its cleaned counterpart, with a
# horizontal rule between consecutive pairs.
for raw_sentence, lemmatize_sentence in zip(article_sentence_list, lemmatize_sentences):
    display(Markdown(f"**Original**: {raw_sentence}"))
    display(Markdown(f"**Cleaned**: {lemmatize_sentence}"))
    display(Markdown("---"))



**Original**:  Steven Spielberg, a film industry titan with 58 directorial credits to his name, has long been vocal about the differences between theatrical and streaming releases.

**Cleaned**: steven spielberg film industry titan 58 directorial credit name long vocal difference theatrical streaming release

---

**Original**: As the governor of the Academy of Motion Picture Arts and Sciences directors branch he has said that he would like films to have a longer theatrical run in order to qualify for an Academy Award.

**Cleaned**: governor academy motion picture art science director branch say would like film longer theatrical run order qualify academy award

---

**Original**: Those comments have led some to believe that Spielberg was ready to wage war on platforms like Netflix, especially after the streaming service's film "Roma" took home three Oscars last month.

**Cleaned**: comment lead believe spielberg ready wage war platform like netflix especially stream service film roma take home three oscar last month

---

**Original**: However, Spielberg reportedly met with Netflix's Chief Content Officer Ted Sarandos earlier this week, a sign that, perhaps, these rumors of discord are just that, rumors.

**Cleaned**: however spielberg reportedly meet netflixs chief content officer ted sarando earlier week sign perhaps rumor discord rumor

---

**Original**: The two were seen dining together at the San Vincente Bungalows, a private club in West Hollywood, according to The Hollywood Reporter.

**Cleaned**: two see dining together san vincente bungalow private club west hollywood accord hollywood reporter

---

**Original**: It's unclear what was discussed during their meeting.

**Cleaned**: unclear discuss meet

---

**Original**: Representatives for Spielberg and Sarandos were not immediately available to comment.

**Cleaned**: representative spielberg sarando immediately available comment

---

**Original**: Earlier this week, media mogul Jeffrey Katzenberg said that Spielberg had no plan to campaign against Netflix, saying at the South By Southwest conference in Texas that "he is not going to the academy in April with some sort of plan," according to a report by the Hollywood Reporter.

**Cleaned**: early week medium mogul jeffrey katzenberg say spielberg plan campaign against netflix say south southwest conference texas go academy april sort plan accord report hollywood reporter

---

**Original**: Read the full report by The Hollywood Reporter.

**Cleaned**: read full report hollywood reporter

---

#### Calculate Sentence Similarity

In [21]:
# Load the 'en_vectors_web_lg' spaCy package; it is used by the word-similarity
# example in the next cell.
word_vectors = spacy.load('en_vectors_web_lg')

In [29]:
# Word similarity example: print the similarity (rounded to two decimals)
# for every ordered pair of words, including each word with itself.
X = ['cat', 'dog', 'banana', 'plantain', 'facebook', 'google']
for word_i in X:
    for word_j in X:
        pair_similarity = word_vectors(word_i).similarity(word_vectors(word_j))
        print(word_i, '-', word_j, ':', round(pair_similarity, 2))

cat - cat : 1.0
cat - dog : 0.8
cat - banana : 0.28
cat - plantain : 0.02
cat - facebook : 0.22
cat - google : 0.24
dog - cat : 0.8
dog - dog : 1.0
dog - banana : 0.24
dog - plantain : 0.0
dog - facebook : 0.17
dog - google : 0.22
banana - cat : 0.28
banana - dog : 0.24
banana - banana : 1.0
banana - plantain : 0.52
banana - facebook : 0.14
banana - google : 0.18
plantain - cat : 0.02
plantain - dog : 0.0
plantain - banana : 0.52
plantain - plantain : 1.0
plantain - facebook : -0.1
plantain - google : -0.05
facebook - cat : 0.22
facebook - dog : 0.17
facebook - banana : 0.14
facebook - plantain : -0.1
facebook - facebook : 1.0
facebook - google : 0.73
google - cat : 0.24
google - dog : 0.22
google - banana : 0.18
google - plantain : -0.05
google - facebook : 0.73
google - google : 1.0


In [23]:
# Build the pairwise sentence-similarity matrix for the cleaned sentences.
# Entry [i][j] holds the spaCy similarity between sentence i and sentence j;
# the diagonal is left at 0 because self-comparisons are skipped.
n_sentences = len(lemmatize_sentences)
similarity_matrix = np.zeros([n_sentences, n_sentences])

# Parse every sentence exactly once. Running the spaCy pipeline is the
# expensive step, and doing it inside the nested loop repeats it for every pair.
sentence_docs = [nlp(sentence) for sentence in lemmatize_sentences]

for i, nlp_i in enumerate(sentence_docs):
    for j, nlp_j in enumerate(sentence_docs):
        if i != j:
            similarity_matrix[i][j] = nlp_i.similarity(nlp_j)

In [24]:
# Inspect the filled similarity matrix (zeros on the diagonal)
similarity_matrix

array([[0.        , 0.7902801 , 0.79117085, 0.64559904, 0.58940274,
        0.42111417, 0.62151528, 0.70289835, 0.60373117],
       [0.7902801 , 0.        , 0.83430138, 0.73435571, 0.67062155,
        0.60547106, 0.68502817, 0.80599006, 0.65170639],
       [0.79117085, 0.83430138, 0.        , 0.77410935, 0.70949974,
        0.60266612, 0.69675199, 0.85659173, 0.68316482],
       [0.64559904, 0.73435571, 0.77410935, 0.        , 0.61327055,
        0.68515174, 0.69267565, 0.80578375, 0.66502025],
       [0.58940274, 0.67062155, 0.70949974, 0.61327055, 0.        ,
        0.44695763, 0.49607458, 0.81001306, 0.66927214],
       [0.42111417, 0.60547106, 0.60266612, 0.68515174, 0.44695763,
        0.        , 0.56788706, 0.60338968, 0.4947186 ],
       [0.62151528, 0.68502817, 0.69675199, 0.69267565, 0.49607458,
        0.56788706, 0.        , 0.62917975, 0.60409866],
       [0.70289835, 0.80599006, 0.85659173, 0.80578375, 0.81001306,
        0.60338968, 0.62917975, 0.        , 0.7379883 ],


#### Implement PageRank Algorithm

In [25]:
# Build a graph from the similarity matrix (one node per sentence, indexed like
# the matrix rows) and compute a PageRank score for every node.
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [26]:
# Pair each sentence with its PageRank score as
# (score, cleaned sentence, original sentence), then order from highest to lowest.
ranked_sentences = [
    (scores[i], sentence, article_sentence_list[i])
    for i, sentence in enumerate(lemmatize_sentences)
]
ranked_sentences.sort(reverse=True)

In [27]:
# Number of top-ranked sentences to keep in the summary
N_SENTENCES = 3

# The summary is the original (uncleaned) text of the highest-ranked sentences,
# in rank order.
generated_summary = [
    original_text
    for _score, _cleaned_text, original_text in ranked_sentences[:N_SENTENCES]
]

In [28]:
# Show the extracted summary sentences
generated_summary

['Earlier this week, media mogul Jeffrey Katzenberg said that Spielberg had no plan to campaign against Netflix, saying at the South By Southwest conference in Texas that "he is not going to the academy in April with some sort of plan," according to a report by the Hollywood Reporter.',
 'Those comments have led some to believe that Spielberg was ready to wage war on platforms like Netflix, especially after the streaming service\'s film "Roma" took home three Oscars last month.',
 'As the governor of the Academy of Motion Picture Arts and Sciences directors branch he has said that he would like films to have a longer theatrical run in order to qualify for an Academy Award.']