In [46]:
import os
from collections import Counter
import string 

import gensim
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import spacy

from src.scraping import extract_cnbc_article_info
from src.nlp import nlp_analysis
from src.urls import article_urls

In [76]:
article_urls

['https://www.cnbc.com/2019/03/15/spielberg-and-netflixs-sarandos-meet-sparking-hope-for-oscars-truce.html',
 'https://www.cnbc.com/2019/03/15/tesla-shares-fall-5-percent-after-company-unveils-model-y.html',
 'https://www.cnbc.com/2019/03/15/arizona-will-go-after-big-tech-if-necessary-says-attorney-general.html',
 'https://www.cnbc.com/2019/03/15/facebook-stocks-falls-after-cox-exit-outage-threats-from-washington.html',
 'https://www.cnbc.com/2019/02/24/kraft-heinz-reviews-options-for-maxwell-house-coffee-including-sale.html',
 'https://www.cnbc.com/2019/03/08/jeff-bezos-to-end-secrecy-over-amazons-role-in-carbon-emissions.html',
 'https://www.cnbc.com/2019/03/15/dieselgate-sec-says-volkswagen-perpetrated-fraud-lied-to-investors.html']

### Scrape CNBC Website for Articles

In [79]:
# Call the scraper once per URL; results keep the same order as article_urls.
articles = [extract_cnbc_article_info(article_url) for article_url in article_urls]

In [80]:
# Pick a single scraped article to analyse. `articles` is built in the same
# order as `article_urls`, so index 2 is the third URL in that list.
article = articles[2]

In [84]:
article['article'] # Change article to content

'\nArizona Attorney General Mark Brnovich told CNBC on Friday he is prepared to go after big tech companies.\n\nAnd he\'s not alone.\n"When you have these tech companies dominate the market share, they essentially are akin to the monopolies of old," Brnovich said on "Closing Bell." \n\n"We as state AGs we are taking a look at maybe whether we should do something and if so what should be done."\nBrnovich is one of several state attorneys general who spoke recently to the Washington Post about their willingness to take action against Facebook, Google and other tech giants, which they say have grown too powerful.\nBrnovich said they are "worried about this massive amount of data that is being collected, manipulated. Sometimes it\'s misleading and maybe ends up maybe compromising some of our privacy rights."\nThe states are stepping up because the federal government isn\'t, said Brnovich. "Washington D.C. has been — at least in the last decade — where good ideas go to die."\nWhat\'s being 

 # Natural Language Processing Steps

In [85]:
# Load English tokenizer, tagger, parser, NER and word vectors
nlp = spacy.load('en_core_web_lg')

In [86]:
# English stop words from NLTK; consulted by remove_stopwords() further down.
# NOTE(review): presumably requires the NLTK 'stopwords' corpus to be downloaded already — confirm.
stop_words =  nltk.corpus.stopwords.words('english')

In [87]:
# Extract word vectors -- GloVe is a pretrained word-vector model.
# Builds a {word: float32 vector} lookup from the text file, where every
# non-empty line is parsed as "<word> <value> <value> ..." split on whitespace.
word_embeddings = {}
glove_model_path = os.path.join('data', 'models', 'external', 'glove', 'glove.6B.100d.txt')
# The context manager closes the file even if a line fails to parse.
with open(glove_model_path, encoding='utf-8') as f:
    for line in f:
        values = line.split()
        if not values:
            # Blank line (e.g. a trailing newline at end of file): nothing to parse.
            continue
        word = values[0]
        word_embeddings[word] = np.asarray(values[1:], dtype='float32')

In [11]:
# Translation table for str.translate() that deletes every character listed
# in string.punctuation (no character-to-character replacements are defined).
punctuation_table = str.maketrans('', '', string.punctuation)

### Clean Article Text

In [12]:
# Replace every newline character ('\n') in the article text with a space.
cleaned_article = article['article'].replace('\n', ' ')

### Named-Entity Extraction

In [88]:
# Run the spaCy pipeline over the cleaned article and grab its named entities.
nlp_data = nlp(cleaned_article)
entities = nlp_data.ents

# Sort the entity texts (with any newline stripped) into organisations and
# people according to the entity label.
organizations = []
people = []
for ent in entities:
    ent_text = ent.text.replace('\n', '')
    if ent.label_ == 'ORG':
        organizations.append(ent_text)
    elif ent.label_ == 'PERSON':
        people.append(ent_text)

# Ten most frequently mentioned entities of each kind, as (text, count) pairs.
top_orgs = Counter(organizations).most_common(10)
top_people = Counter(people).most_common(10)

In [89]:
top_people

[('Brnovich', 5), ('Mark Brnovich', 1)]

In [90]:
top_orgs

[('Facebook', 3), ('Google', 3), ('CNBC', 2), ('the Washington Post', 2)]

In [74]:
def remove_stopwords(sentence, stopwords=None):
    """Return ``sentence`` with every stop word removed.

    The sentence is split on whitespace, tokens found in the stop-word
    collection are discarded, and the remaining tokens are re-joined with a
    single space. Matching is exact and case-sensitive, so lower-case the
    text first when the stop-word list is lower-case.

    Parameters
    ----------
    sentence : str
        Text to filter.
    stopwords : iterable of str, optional
        Words to drop. Defaults to the notebook-level ``stop_words`` list.

    Returns
    -------
    str
        The filtered sentence; an empty string if nothing survives.
    """
    if stopwords is None:
        stopwords = stop_words
    # A set gives constant-time membership tests instead of scanning the
    # whole stop-word list once per token.
    blocked = set(stopwords)
    return " ".join(word for word in sentence.split() if word not in blocked)

In [75]:
def lemmatize_words(sentence):
    """Return the lemmas of ``sentence`` joined by single spaces.

    The sentence is passed through the notebook-level spaCy pipeline ``nlp``
    and the ``lemma_`` of every resulting token is collected in order.
    """
    lemmas = []
    for token in nlp(sentence):
        lemmas.append(token.lemma_)
    return " ".join(lemmas)

### Document Summarization

#### Clean Text

In [91]:
# Split the cleaned article into sentences. These original sentences are kept
# so the summary can quote them verbatim later on.
article_sentence_list = nltk.tokenize.sent_tokenize(cleaned_article)

In [92]:
# Delete the characters covered by punctuation_table from every sentence.
removed_punctuation = []
for sentence in article_sentence_list:
    removed_punctuation.append(sentence.translate(punctuation_table))

# Em dashes are not handled by the step above, so strip them separately.
removed_mdash = []
for sentence in removed_punctuation:
    removed_mdash.append(sentence.replace('—', ''))

In [94]:
# Lower-case every sentence.
lower_sentences = list(map(str.lower, removed_mdash))

In [95]:
# Drop stop words from every lower-cased sentence.
removed_stopwords = list(map(remove_stopwords, lower_sentences))

In [96]:
# Reduce the remaining words of every sentence to their lemmas.
lemmatize_sentences = list(map(lemmatize_words, removed_stopwords))

In [97]:
# Show each original sentence next to its fully cleaned counterpart.
for raw_sentence, lemmatize_sentence in zip(article_sentence_list, lemmatize_sentences):
    print("Original: ", raw_sentence)
    print("Cleaned: ", lemmatize_sentence)
    print('---')

Original:   Arizona Attorney General Mark Brnovich told CNBC on Friday he is prepared to go after big tech companies.
Cleaned:  arizona attorney general mark brnovich tell cnbc friday prepare go big tech company
---
Original:  And he's not alone.
Cleaned:  -PRON- s alone
---
Original:  "When you have these tech companies dominate the market share, they essentially are akin to the monopolies of old," Brnovich said on "Closing Bell."
Cleaned:  tech company dominate market share essentially akin monopoly old brnovich say close bell
---
Original:  "We as state AGs we are taking a look at maybe whether we should do something and if so what should be done."
Cleaned:  state ag take look maybe whether something do
---
Original:  Brnovich is one of several state attorneys general who spoke recently to the Washington Post about their willingness to take action against Facebook, Google and other tech giants, which they say have grown too powerful.
Cleaned:  brnovich one several state attorney gen

#### Calculate Sentence Embeddings

In [98]:
# Represent every cleaned sentence by the mean of its word vectors.
EMBEDDING_DIM = 100  # width of the vectors in the glove.6B.100d.txt file loaded earlier

final_sentence = lemmatize_sentences
empty_vector = np.zeros((EMBEDDING_DIM,))
sentence_vectors = []
for sentence in final_sentence:
    word_list = sentence.split()
    if word_list:
        # Words missing from the embedding lookup contribute a zero vector
        # but still count toward the average.
        word_vector = [word_embeddings.get(word, empty_vector) for word in word_list]
        normalized_vector = sum(word_vector) / len(word_list)
    else:
        # Guard on the token list rather than len(sentence): a whitespace-only
        # sentence has no tokens and would otherwise divide by zero.
        normalized_vector = empty_vector
    sentence_vectors.append(normalized_vector)

#### Create Similarity Matrix between Sentences

In [24]:
# Pairwise cosine similarity between all sentence vectors, computed in one
# vectorised call instead of one call per (i, j) pair. The vector width is
# taken from the data rather than hard-coded.
n_sentences = len(article_sentence_list)
if n_sentences:
    sentence_matrix = np.vstack(sentence_vectors[:n_sentences]).astype(np.float64)
    sim_mat = cosine_similarity(sentence_matrix)
    # A sentence's similarity to itself is left out of the graph (stays 0).
    np.fill_diagonal(sim_mat, 0.0)
else:
    # No sentences: keep the empty 0x0 matrix the nested loops would produce.
    sim_mat = np.zeros([0, 0])

#### Implement PageRank Algorithm

In [99]:
# Build a graph from the sentence-similarity matrix (one node per sentence)
# and score the nodes with PageRank. `scores` is indexed by sentence position
# when the sentences are ranked below.
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [100]:
# (score, cleaned sentence, original sentence) triples, best score first.
ranked_sentences = [
    (scores[idx], cleaned, article_sentence_list[idx])
    for idx, cleaned in enumerate(final_sentence)
]
ranked_sentences.sort(reverse=True)

In [101]:
# Number of top-ranked sentences that make up the summary.
N_SENTENCES = 3
# Keep only the original (uncleaned) sentence from each ranked triple.
generated_summary = [
    original for _score, _cleaned, original in ranked_sentences[:N_SENTENCES]
]

In [102]:
generated_summary 

['Brnovich is one of several state attorneys general who spoke recently to the Washington Post about their willingness to take action against Facebook, Google and other tech giants, which they say have grown too powerful.',
 'What\'s being seen is the "inaction or inability of the bureaucrats in Washington D.C. to do anything about protecting individual Americans, their privacy rights, how they are being manipulated when it comes to news feeds and news coverage," he said.',
 'Google also gave the paper a statement that said, "Privacy and security are built into all of our products, and we will continue to engage constructively with state attorneys general on policy issues."']

### Topic Modeling

In [103]:
# Split every cleaned sentence into a list of word tokens.
tokenized_sentences = list(map(nltk.tokenize.word_tokenize, final_sentence))

In [104]:
# Vocabulary for the article: gensim's Dictionary assigns an integer id to
# every distinct token found in the tokenized sentences.
article_dictionary = gensim.corpora.Dictionary(tokenized_sentences)
# Bag-of-words corpus: each sentence is treated as one document and converted
# with doc2bow into (token_id, count) pairs, the input format the LDA model takes.
article_corpus = [article_dictionary.doc2bow(text) for text in tokenized_sentences]

In [105]:
NUM_TOPICS = 5
RANDOM_SEED = 42  # fixed seed so the fitted topics are reproducible between runs

# Fit an LDA topic model on the per-sentence bag-of-words corpus. Without a
# fixed random_state the topics differ on every run.
article_ldamodel = gensim.models.ldamodel.LdaModel(article_corpus,
                                                   num_topics=NUM_TOPICS,
                                                   id2word=article_dictionary,
                                                   passes=15,
                                                   random_state=RANDOM_SEED)
# Formatted description of each topic, limited to its five most probable words.
topics = article_ldamodel.print_topics(num_words=5)

In [106]:
topics

[(0,
  '0.054*"maybe" + 0.030*"privacy" + 0.030*"facebook" + 0.030*"right" + 0.030*"google"'),
 (1,
  '0.042*"company" + 0.029*"brnovich" + 0.029*"look" + 0.029*"tech" + 0.029*"say"'),
 (2,
  '0.049*"state" + 0.049*"say" + 0.026*"washington" + 0.026*"statement" + 0.026*"brnovich"'),
 (3,
  '0.035*"not" + 0.035*"individual" + 0.035*"brnovich" + 0.035*"s" + 0.035*"comment"'),
 (4,
  '0.027*"go" + 0.027*"take" + 0.027*"say" + 0.027*"washington" + 0.027*"right"')]