In [58]:
import re
import urllib 
import random
import pickle

import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
import spacy
from spacy.lang.en import English
import nltk
from nltk.tokenize import sent_tokenize
from nltk.corpus import stopwords
from nltk.corpus import wordnet as wn
from nltk.stem.wordnet import WordNetLemmatizer
import gensim
from gensim import corpora
from sklearn.metrics.pairwise import cosine_similarity
import networkx as nx

from src.utils import extract_cnbc_article_info

In [27]:
nlp = spacy.load('en_core_web_lg')

In [33]:
stop_words = stopwords.words('english')

In [112]:
tesla_page = 'https://www.cnbc.com/2019/03/15/'\
             'tesla-shares-fall-5-percent-after-company-unveils-model-y'\
             '.html'

netflix_page = 'https://www.cnbc.com/2019/03/15/'\
               'spielberg-and-netflixs-sarandos-meet'\
               '-sparking-hope-for-oscars-truce.html'

google_page = 'https://www.cnbc.com/2019/03/15/arizona'\
              '-will-go-after-big-tech-if-necessary-says-'\
              'attorney-general.html'

facebook_page = 'https://www.cnbc.com/2019/03/15/facebook-stocks-falls-'\
            'after-cox-exit-outage-threats-from-washington.html?&'\
            'qsearchterm=facebook'

resturant_page = 'https://www.cnbc.com/2019/02/24/kraft-heinz-reviews-options-for-maxwell-house-coffee-including-sale.html'

amazon_page = 'https://www.cnbc.com/2019/03/08/jeff-bezos-to-end-secrecy'\
              '-over-amazons-role-in-carbon-emissions.html'

vw_page = 'https://www.cnbc.com/2019/03/15/dieselgate-sec-says-volkswagen-'\
          'perpetrated-fraud-lied-to-investors.html'

In [28]:
def tokenize(text):
    lda_tokens = []
    tokens = parser(text)
    for token in tokens:
        if token.orth_.isspace():
            continue
        elif token.like_url:
            lda_tokens.append('URL')
        elif token.orth_.startswith('@'):
            lda_tokens.append('SCREEN_NAME')
        else:
            lda_tokens.append(token.lower_)
    return lda_tokens

In [30]:
def get_lemma(word):
    lemma = wn.morphy(word)
    if lemma is None:
        return word
    else:
        return lemma

In [32]:
def get_lemma2(word):
    return WordNetLemmatizer().lemmatize(word)

In [40]:
def prepare_text_for_lda(text):
    tokens = tokenize(text)
    tokens = [token for token in tokens if len(token) > 4]
    tokens = [token for token in tokens if token not in stop_words]
    tokens = [get_lemma(token) for token in tokens]
    return tokens

### Extract Article Content

In [90]:
article_dict = extract_cnbc_article_info(facebook_page)

In [91]:
nlp_data = nlp(article_dict['article'])

### Identify Organizations

In [92]:
print(set([ent.text.replace('\n','') for ent in nlp_data.ents if ent.label_ == 'ORG']))
# print(set([ent.text.replace('\n','') for ent in nlp_data.ents if ent.label_ == 'PERSON']))

{'', 'Facebook', 'FCC', 'FAANG', 'the Department of Justice', 'Netflix', 'Google', 'Apple', 'WhatsApp', 'YouTube', 'Amazon', 'CNBC', 'the New York Times'}


In [93]:
# [(ent.text, ent.label_ ) for ent in nlp_data.ents]

### Document Summarization

In [94]:
article_sentences = sent_tokenize(article_dict['article'])
clean_sentences = pd.Series(article_sentences).str.replace("[^a-zA-Z]", " ")
lower_sentences = [s.lower() for s in clean_sentences]


In [95]:
removed_stopwords = [remove_stopwords(sentence) for sentence in lower_sentences]

In [96]:
def remove_stopwords(sentence):
    word_list = sentence.split()
    cleaned_words = " ".join([word for word in word_list if word not in stop_words])
    return cleaned_words

In [97]:
# Extract word vectors
word_embeddings = {}
f = open('glove.6B.100d.txt', encoding='utf-8')
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    word_embeddings[word] = coefs
f.close()

In [98]:
sentence_vectors = []
for i in removed_stopwords:
  if len(i) != 0:
    v = sum([word_embeddings.get(w, np.zeros((100,))) for w in i.split()])/(len(i.split())+0.001)
  else:
    v = np.zeros((100,))
  sentence_vectors.append(v)

In [99]:
sim_mat = np.zeros([len(article_sentences), len(article_sentences)])


In [100]:
for i in range(len(article_sentences)):
  for j in range(len(article_sentences)):
    if i != j:
      sim_mat[i][j] = cosine_similarity(sentence_vectors[i].reshape(1,100), sentence_vectors[j].reshape(1,100))[0,0]

In [101]:
nx_graph = nx.from_numpy_array(sim_mat)
scores = nx.pagerank(nx_graph)

In [102]:
ranked_sentences = sorted(((scores[i],s) for i,s in enumerate(article_sentences)), reverse=True)

In [103]:
# Extract top 10 sentences as the summary
for i in range(min(10, len(ranked_sentences))):
  print(i, ranked_sentences[i][1])

0 
Facebook stock fell Friday, the day after Chief Product Officer Chris Cox announced he was leaving the company in light of its recent pivot to privacy.
1 Meanwhile, federal prosecutors are probing Facebook's data-sharing partnerships with device makers, according to a report by the New York Times published Wednesday night.
2 The company was alerted to the post by New Zealand police and said it has been taking down re-postings of the video and messages praising the shooter as it becomes aware of them.
3 He was among the earlier employees and reported directly to CEO Mark Zuckerberg.
4 Separately Friday, Facebook drew criticism related to a terrorist incident in New Zealand in which a shooter live-streamed his attack on Facebook.
5 "It has already been reported that there are ongoing federal investigations, including by the Department of Justice," a spokesperson for Facebook said in a statement to CNBC.
6 The service was reinstated Thursday after a widespread outage that last nearly a

### Topic Modeling

In [104]:
text_data = []
for sentence in article_sentences:
    tokens = prepare_text_for_lda(sentence)
    if random.random() > .09:
        print(tokens)
        text_data.append(tokens)


['facebook', 'stock', 'friday', 'chief', 'product', 'officer', 'chris', 'announce', 'leaving', 'company', 'light', 'recent', 'pivot', 'privacy']
['share', 'percent', 'early', 'trading', 'sinking', 'lowest', 'point']
['stock', 'ease', 'morning', 'end', 'percent']
['insider', 'heart', 'facebook']
['among', 'earlier', 'employee', 'report', 'directly', 'zuckerberg']
['departure', 'hang', 'uncertainty', 'facebook', 'recent', 'announcement', 'integrate', 'family', 'prioritize', 'private', 'communication']
['separately', 'friday', 'facebook', 'criticism', 'relate', 'terrorist', 'incident', 'zealand', 'shooter', 'stream', 'attack', 'facebook']
['shooting', 'claim', 'least', 'life']
['company', 'alert', 'zealand', 'police', 'taking', 'posting', 'video', 'message', 'praise', 'shooter', 'become', 'aware']
['facebook', 'suffer', 'long', 'outage']
['service', 'reinstate', 'thursday', 'widespread', 'outage', 'nearly']
['statement', 'thursday', 'company', 'blame', 'server', 'configuration', 'change',

In [105]:
dictionary = corpora.Dictionary(text_data)
corpus = [dictionary.doc2bow(text) for text in text_data]

In [106]:
pickle.dump(corpus, open('corpus.pkl', 'wb'))
dictionary.save('dictionary.gensim')

In [109]:
NUM_TOPICS = 10
ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics = NUM_TOPICS, id2word=dictionary, passes=15)
ldamodel.save('model5.gensim')
topics = ldamodel.print_topics(num_words=5)
for topic in topics:
    print(topic)


(0, '0.037*"outage" + 0.037*"would" + 0.037*"major" + 0.037*"instagram" + 0.037*"whatsapp"')
(1, '0.007*"facebook" + 0.007*"stock" + 0.007*"outage" + 0.007*"percent" + 0.007*"thursday"')
(2, '0.048*"percent" + 0.048*"trading" + 0.048*"thursday" + 0.025*"outage" + 0.025*"lowest"')
(3, '0.032*"facebook" + 0.032*"report" + 0.032*"federal" + 0.017*"company" + 0.017*"privacy"')
(4, '0.007*"facebook" + 0.007*"stock" + 0.007*"percent" + 0.007*"thursday" + 0.007*"company"')
(5, '0.066*"facebook" + 0.066*"friday" + 0.035*"company" + 0.035*"stock" + 0.035*"shooter"')
(6, '0.044*"answer" + 0.044*"provide" + 0.044*"question" + 0.044*"testimony" + 0.044*"pledge"')
(7, '0.068*"facebook" + 0.046*"netflix" + 0.024*"default" + 0.024*"apple" + 0.024*"favor"')
(8, '0.032*"stock" + 0.032*"proposal" + 0.032*"warren" + 0.032*"continue" + 0.032*"raise"')
(9, '0.043*"facebook" + 0.043*"recent" + 0.043*"announcement" + 0.043*"integrate" + 0.043*"departure"')


In [111]:
dictionary = gensim.corpora.Dictionary.load('dictionary.gensim')
corpus = pickle.load(open('corpus.pkl', 'rb'))
lda = gensim.models.ldamodel.LdaModel.load('model5.gensim')
import pyLDAvis.gensim
lda_display = pyLDAvis.gensim.prepare(lda, corpus, dictionary, sort_topics=True)
pyLDAvis.display(lda_display)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  return pd.concat([default_term_info] + list(topic_dfs))


### Connect Organizations with Securities