In [None]:
%load_ext autoreload

In [10]:
%autoreload 2

In [379]:
import os
from collections import Counter
import string 

import gensim
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from sklearn.metrics.pairwise import cosine_similarity
import spacy


from src.scraping import extract_cnbc_article_info
from src.nlp import nlp_analysis
from src.constants import article_url_list

### Scrape CNBC Website for Articles

In [34]:
article_url_list

['https://www.cnbc.com/2019/03/15/spielberg-and-netflixs-sarandos-meet-sparking-hope-for-oscars-truce.html',
 'https://www.cnbc.com/2019/03/15/tesla-shares-fall-5-percent-after-company-unveils-model-y.html',
 'https://www.cnbc.com/2019/03/15/arizona-will-go-after-big-tech-if-necessary-says-attorney-general.html',
 'https://www.cnbc.com/2019/03/15/facebook-stocks-falls-after-cox-exit-outage-threats-from-washington.html',
 'https://www.cnbc.com/2019/02/24/kraft-heinz-reviews-options-for-maxwell-house-coffee-including-sale.html',
 'https://www.cnbc.com/2019/03/08/jeff-bezos-to-end-secrecy-over-amazons-role-in-carbon-emissions.html',
 'https://www.cnbc.com/2019/03/15/dieselgate-sec-says-volkswagen-perpetrated-fraud-lied-to-investors.html']

In [204]:
# Scrape every URL and merge the scraped fields with the NLP-derived fields
# into a single dict per article (NLP keys win on a name clash).
article_list = []
for url in article_url_list:
    scraped_fields = extract_cnbc_article_info(url)
    article_list.append({**scraped_fields, **nlp_analysis(scraped_fields)})

In [336]:
article_list[3]

{'title': 'Facebook stock falls after the executive in charge of all its products steps down',
 'reporter': 'Sara Salinas',
 'summary': ['Insiders told CNBC that Chris Cox was the "heart and soul" of Facebook.',
  "His departure hangs uncertainty on Facebook's recent announcement to integrate its family of apps and prioritize private communication.",
  'Separately Friday, Facebook drew criticism related to a terrorist incident in New Zealand in which a shooter live-streamed his attack on Facebook.'],
 'article': '\nFacebook stock fell Friday, the day after Chief Product Officer Chris Cox announced he was leaving the company in light of its recent pivot to privacy.\n\nShares fell as much as 4.5 percent in early trading, sinking below $163 at its lowest point. The stock eased off by mid-morning and ended the day 2.5 percent down.\nInsiders told CNBC that Cox was the "heart and soul" of Facebook. He was among the earlier employees and reported directly to CEO Mark Zuckerberg. His departur

 # Natural Language Processing Steps

In [207]:
# Load the spaCy 'en_core_web_lg' pipeline. It is used below for named-entity
# extraction (`.ents`) and for lemmatization (`.lemma_`).
# TODO: Give a detailed explanation of what this model is.
nlp = spacy.load('en_core_web_lg')

In [289]:
# Start from NLTK's English stop-word list, but keep a few words that we want
# to survive stop-word removal.
# Filtering (rather than list.remove) means a missing word in a different
# corpus version does not raise ValueError.
# NOTE(review): several NLTK stop words contain apostrophes (e.g. "don't");
# the pipeline below strips punctuation *before* removing stop words, so those
# entries presumably never match — confirm whether that is intended.
KEPT_WORDS = {'more', 'against'}
stop_words = [word for word in nltk.corpus.stopwords.words('english')
              if word not in KEPT_WORDS]

In [292]:
# Extract word vectors
word_embeddings = {}
f = open(os.path.join('data', 'models', 'external', 'glove', 'glove.6B.100d.txt'), encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [293]:
# Translation table for str.translate that deletes every character listed in
# string.punctuation.
punctuation_table = str.maketrans('', '', string.punctuation)

In [337]:
# The rest of the notebook walks through a single article: index 3 of
# article_list.
article = article_list[3]

### Clean Article Text

In [338]:
# Flatten the article body onto one line by turning every newline into a space.
cleaned_article = article['article'].replace('\n', ' ')

### Named-Entity Extraction

In [339]:
# Run the spaCy pipeline once over the cleaned text and collect its entities.
nlp_data = nlp(cleaned_article)
entities = nlp_data.ents

# Bucket entity mentions by label in a single pass, stripping stray newlines.
organizations = []
people = []
for entity in entities:
    mention = entity.text.replace('\n', '')
    if entity.label_ == 'ORG':
        organizations.append(mention)
    elif entity.label_ == 'PERSON':
        people.append(mention)

# Ten most frequently mentioned organizations / people as (text, count) pairs.
top_orgs = Counter(organizations).most_common(10)
top_people = Counter(people).most_common(10)

In [340]:
top_people

[('Chris Cox', 1),
 ('Cox', 1),
 ('Mark Zuckerberg', 1),
 ('Elizabeth Warren', 1),
 ('Warren', 1),
 ('Obama', 1)]

In [341]:
top_orgs

[('Facebook', 10),
 ('CNBC', 3),
 ('Netflix', 2),
 ('WhatsApp', 1),
 ('FAANG', 1),
 ('Apple', 1),
 ('Amazon', 1),
 ('Google', 1),
 ('the New York Times', 1),
 ('the Department of Justice', 1)]

In [342]:
def remove_stopwords(sentence, stopwords=None):
    """Return `sentence` with stop words removed.

    The sentence is split on whitespace, words found in the stop-word
    collection are dropped, and the remaining words are re-joined with single
    spaces. Matching is exact and case-sensitive.

    Parameters
    ----------
    sentence : str
        Text to filter.
    stopwords : iterable of str, optional
        Words to drop. Defaults to the notebook-level `stop_words` list.

    Returns
    -------
    str
        The filtered sentence; an empty string if nothing remains.
    """
    if stopwords is None:
        stopwords = stop_words
    # A set gives O(1) membership tests instead of scanning the list per word.
    stopword_set = set(stopwords)
    return " ".join(word for word in sentence.split() if word not in stopword_set)

In [343]:
def lemmatize_words(sentence):
    """Return `sentence` with each token replaced by its spaCy lemma.

    The text is run through the notebook-level `nlp` pipeline and the lemmas
    of the resulting tokens are joined with single spaces.
    """
    # we can probably do better than this!
    tokens = nlp(sentence)
    lemmas = [token.lemma_ for token in tokens]
    return " ".join(lemmas)
    

### Document Summarization

#### Clean Text

In [344]:
# Split the cleaned article into sentences; these original sentences are what
# the generated summary is eventually built from.
article_sentence_list = nltk.tokenize.sent_tokenize(cleaned_article)

In [401]:
# Strip ASCII punctuation from every sentence.
removed_punctuation = [sentence.translate(punctuation_table) for sentence in article_sentence_list]
# The em dash is not in string.punctuation, so handle it separately. Replace it
# with a space (not an empty string) so that "word—word" does not get fused
# into a single token; extra whitespace is collapsed by the later split().
removed_mdash = [sentence.replace('—', ' ') for sentence in removed_punctuation]

In [402]:
# Lower-case every sentence (the stop-word match below is case-sensitive).
lower_sentences = [s.lower() for s in removed_mdash]

In [403]:
# Drop stop words from each lower-cased sentence.
removed_stopwords = [remove_stopwords(sentence) for sentence in lower_sentences]

In [404]:
# Replace every remaining word with its lemma; this is the final cleaned form
# of each sentence used for embeddings and topic modeling.
lemmatize_sentences = [lemmatize_words(sentence) for sentence in removed_stopwords]

#### Calculate Sentence Embeddings

In [405]:
# Represent each cleaned sentence as the mean of its word vectors.
final_sentence = lemmatize_sentences
# Stand-in for out-of-vocabulary words and for sentences with no words; the
# length matches the 100-dimensional vectors the similarity step expects.
empty_vector = np.zeros((100,))
sentence_vectors = []
for sentence in final_sentence:
    _word_list = sentence.split()
    if _word_list:
        # Unknown words contribute a zero vector but still count in the mean.
        _word_vector = [word_embeddings.get(word, empty_vector) for word in _word_list]
        normalized_vector = sum(_word_vector) / len(_word_list)
    else:
        # Guard on the word list (not the raw string) so a whitespace-only
        # sentence cannot cause a division by zero. Copy so entries never
        # alias the shared empty_vector object.
        normalized_vector = empty_vector.copy()
    sentence_vectors.append(normalized_vector)

#### Create Similarity Matrix between Sentences

In [406]:
# Pairwise cosine similarity between all sentence vectors, computed in one
# vectorized call instead of one call per (i, j) pair. The diagonal is zeroed
# so a sentence is never counted as similar to itself.
if sentence_vectors:
    _vector_matrix = np.vstack(sentence_vectors).astype(np.float64)
    sim_mat = cosine_similarity(_vector_matrix)
    np.fill_diagonal(sim_mat, 0.0)
else:
    # No sentences: keep the empty 0x0 matrix the per-pair loop would produce.
    sim_mat = np.zeros((0, 0))

#### Implement PageRank Algorithm

In [407]:
# Treat the similarity matrix as the adjacency matrix of a graph whose nodes
# are sentence indices, then score every node with PageRank.
nx_graph = nx.from_numpy_array(sim_mat)
# `scores` is indexed below by sentence position (scores[i]).
scores = nx.pagerank(nx_graph)

In [408]:
# (score, cleaned sentence, original sentence) triples, best score first.
ranked_sentences = [
    (scores[position], cleaned, article_sentence_list[position])
    for position, cleaned in enumerate(final_sentence)
]
ranked_sentences.sort(reverse=True)

In [409]:
# Number of top-ranked sentences that make up the extractive summary.
N_SENTENCES = 3
# Keep only the original (uncleaned) sentence text of the best-ranked entries.
generated_summary = [original for _score, _cleaned, original in ranked_sentences[:N_SENTENCES]]

In [410]:
generated_summary 

[' Facebook stock fell Friday, the day after Chief Product Officer Chris Cox announced he was leaving the company in light of its recent pivot to privacy.',
 "Meanwhile, federal prosecutors are probing Facebook's data-sharing partnerships with device makers, according to a report by the New York Times published Wednesday night.",
 'The company was alerted to the post by New Zealand police and said it has been taking down re-postings of the video and messages praising the shooter as it becomes aware of them.']

### Topic Modeling

In [411]:
# Split each cleaned sentence into word tokens; every sentence becomes one
# token list for the topic model below.
tokenized_sentences = [nltk.tokenize.word_tokenize(sentence) for sentence in final_sentence]

In [413]:
# Build the vocabulary: maps every distinct token in the article to an integer id.
article_dictionary = gensim.corpora.Dictionary(tokenized_sentences)
# Convert each tokenized sentence to bag-of-words form, i.e. a list of
# (token_id, count) pairs. Each sentence is treated as one "document" for LDA.
article_corpus = [article_dictionary.doc2bow(text) for text in tokenized_sentences]

In [414]:
NUM_TOPICS = 5
# LDA training is stochastic; fix the seed so the topics shown below are
# reproducible on Restart & Run All.
LDA_RANDOM_STATE = 42

# Fit an LDA topic model over the per-sentence bag-of-words corpus.
article_ldamodel = gensim.models.ldamodel.LdaModel(article_corpus,
                                                   num_topics=NUM_TOPICS,
                                                   id2word=article_dictionary,
                                                   passes=15,
                                                   random_state=LDA_RANDOM_STATE)
# Human-readable summary: the 5 highest-weighted words of each topic.
topics = article_ldamodel.print_topics(num_words=5)

In [415]:
topics

[(0,
  '0.037*"facebook" + 0.028*"outage" + 0.020*"thursday" + 0.020*"week" + 0.020*"day"'),
 (1,
  '0.022*"recent" + 0.022*"private" + 0.022*"departure" + 0.022*"hang" + 0.022*"integrate"'),
 (2,
  '0.021*"big" + 0.021*"tech" + 0.021*"continue" + 0.021*"netflix" + 0.011*"percent"'),
 (3,
  '0.036*"facebook" + 0.025*"stock" + 0.014*"end" + 0.014*"percent" + 0.014*"cox"'),
 (4,
  '0.034*"say" + 0.018*"new" + 0.018*"report" + 0.018*"cnbc" + 0.018*"company"')]

### Correlate with IA Holdings

In [None]:
# Load the investment-advisor holdings table from the project's internal data
# directory (path is relative to the notebook's working directory).
ia_holdings = pd.read_csv(os.path.join('data', 'internal', 'investment_advisor_holdings.csv'))

In [None]:
ia_holdings

In [None]:
# NOTE(review): this cell referenced `articles`, which is never defined in the
# notebook and raises NameError on a fresh kernel. Assuming the scraped
# `article_list` was intended — confirm.
article_list