In [4]:
import os
from collections import Counter
import string 

import gensim
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import spacy

from src.scraping import extract_cnbc_article_info
from src.nlp import nlp_analysis
from src.urls import article_urls

### Scrape CNIB Website for Articles

In [6]:
article_list = []
for article_url in article_urls:
    _article_text_dict = extract_cnbc_article_info(article_url)
    article_list.append(_article_text_dict)

In [7]:
article = article_list[2]

 # Natural Language Processing Steps

In [8]:
# Load English tokenizer, tagger, parser, NER and word vectors
# English multi-task CNN trained on OntoNotes, with GloVe vectors trained on Common Crawl. 
# Assigns word vectors, context-specific token vectors, POS tags, dependency parse and named entities.
nlp = spacy.load('en_core_web_lg')

In [9]:
stop_words =  nltk.corpus.stopwords.words('english')

In [10]:
# Extract word vectors
# GloVe is a pretrained word-vector 
#GloVe is an unsupervised learning algorithm for obtaining vector representations for words. 
# Training is performed on aggregated global word-word co-occurrence statistics from a corpus, and 
#the resulting representations showcase interesting linear substructures of the word vector space.
word_embeddings = {}
glove_model_path = os.path.join('data', 'models', 'external', 'glove', 'glove.6B.100d.txt')
f = open(glove_model_path, encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [11]:
punctuation_table = str.maketrans('', '', string.punctuation)

### Clean Article Text

In [12]:
cleaned_article = article['article'].replace('\n', ' ')

### Named-Entity Extraction

In [13]:
nlp_data = nlp(cleaned_article)
entities =  nlp_data.ents

organizations = [ent.text.replace('\n','') for ent in entities if ent.label_ == 'ORG']
people = [ent.text.replace('\n','') for ent in entities if ent.label_ == 'PERSON']

top_orgs = Counter(organizations).most_common(10)
top_people = Counter(people).most_common(10)

In [14]:
top_people

[('Brnovich', 5), ('Mark Brnovich', 1)]

In [15]:
top_orgs

[('Facebook', 3), ('Google', 3), ('CNBC', 2), ('the Washington Post', 2)]

In [16]:
def remove_stopwords(sentence):
    word_list = sentence.split()
    _non_stopwords = " ".join([word for word in word_list if word not in stop_words])
    return _non_stopwords

In [44]:
def lemmatize_words(sentence):
    # we can probably do better than this! 
    lemma_words = " ".join([word.lemma_ for word in nlp(sentence)])
    return lemma_words
    

### Document Summarization

#### Clean Text

In [18]:
article_sentence_list = nltk.tokenize.sent_tokenize(cleaned_article)

In [19]:
removed_punctuation = [w.translate(punctuation_table) for w in article_sentence_list]
removed_mdash = [sentence.replace('—', '') for sentence in removed_punctuation]

In [20]:
lower_sentences = [s.lower() for s in removed_mdash]

In [21]:
removed_stopwords = [remove_stopwords(sentence) for sentence in lower_sentences]

In [45]:
lemmatize_sentences = [lemmatize_words(sentence) for sentence in removed_stopwords]

#### Calculate Sentence Embeddings

In [23]:
final_sentence = lemmatize_sentences
empty_vector = np.zeros((100,))
sentence_vectors = []
for sentence in final_sentence:
    if len(sentence) > 0:
        _word_list = sentence.split()
        _word_vector =  [word_embeddings.get(word, empty_vector) for word in _word_list]
        _summed_vector = sum(_word_vector)
        normalized_vector = _summed_vector/ (len(sentence.split()))
    else:
        normalized_vector = empty_vector
    sentence_vectors.append(normalized_vector)

#### Create Similarity Matrix between Sentences

In [24]:
sim_mat = np.zeros([len(article_sentence_list), len(article_sentence_list)])

for i in range(len(article_sentence_list)):
    for j in range(len(article_sentence_list)):
        if i != j:
            sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), 
                                              sentence_vectors[j].reshape(1,100))[0,0]

#### Impliment PageRank Algo

In [25]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [26]:
ranked_sentences = sorted(((scores[i], s, article_sentence_list[i]) for i,s in enumerate(final_sentence)), reverse=True)

In [27]:
N_SENTENCES = 3
generated_summary = [ranked_sentence[2] for ranked_sentence in ranked_sentences[0:N_SENTENCES]]

In [28]:
generated_summary 

['Brnovich is one of several state attorneys general who spoke recently to the Washington Post about their willingness to take action against Facebook, Google and other tech giants, which they say have grown too powerful.',
 'What\'s being seen is the "inaction or inability of the bureaucrats in Washington D.C. to do anything about protecting individual Americans, their privacy rights, how they are being manipulated when it comes to news feeds and news coverage," he said.',
 'Google also gave the paper a statement that said, "Privacy and security are built into all of our products, and we will continue to engage constructively with state attorneys general on policy issues."']

### Topic Modeling

In [29]:
tokenized_sentences = [nltk.tokenize.word_tokenize(sentence) for sentence in final_sentence]

In [33]:
final_sentence

['arizona attorney general mark brnovich tell cnbc friday prepare go big tech company',
 '-PRON- s alone',
 'tech company dominate market share essentially akin monopoly old brnovich say close bell',
 'state ag take look maybe whether something do',
 'brnovich one several state attorney general speak recently washington post willingness take action against facebook google tech giant say grow powerful',
 'brnovich say worried massive amount datum collect manipulate',
 'sometimes mislead maybe end maybe compromise privacy right',
 'state step federal government be not say brnovich',
 'washington dc least last decade good idea go die',
 'what s see inaction inability bureaucrat washington dc anything protect individual american privacy right manipulate come news feed news coverage say',
 'facebook google immediately respond cnbcs request comment',
 'however statement washington post facebook vice president state local public policy say company productive conversation state ag',
 'many off

In [30]:
article_dictionary = gensim.corpora.Dictionary(tokenized_sentences)  # This needs describing!
article_corpus = [article_dictionary.doc2bow(text) for text in tokenized_sentences] # This needs describing!

In [31]:
NUM_TOPICS = 5

article_ldamodel = gensim.models.ldamodel.LdaModel(article_corpus, 
                                                   num_topics = NUM_TOPICS, 
                                                   id2word=article_dictionary, 
                                                   passes=15)
topics = article_ldamodel.print_topics(num_words=5)

In [32]:
topics

[(0,
  '0.048*"-PRON-" + 0.026*"protect" + 0.026*"say" + 0.026*"look" + 0.026*"company"'),
 (1,
  '0.040*"say" + 0.028*"google" + 0.028*"privacy" + 0.028*"state" + 0.028*"news"'),
 (2,
  '0.039*"go" + 0.039*"take" + 0.027*"big" + 0.027*"company" + 0.027*"brnovich"'),
 (3,
  '0.037*"maybe" + 0.037*"state" + 0.037*"washington" + 0.020*"facebook" + 0.020*"ag"'),
 (4,
  '0.061*"brnovich" + 0.042*"say" + 0.042*"company" + 0.023*"comment" + 0.023*"not"')]