In [None]:
import os
from collections import Counter
import string 

from IPython.display import display, Markdown
import gensim
import networkx as nx
import nltk
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn.metrics.pairwise import cosine_similarity
import spacy
from spacy import displacy


from src.scraping import extract_cnbc_article_info
from src.urls import article_urls
from src.nlp import lemmatize_words, remove_stopwords

In [None]:
# Show the article URLs imported from src.urls; the scraping cell below iterates over them.
article_urls

### Scrape CNBC Website for Articles

In [None]:
# Scrape every URL and collect one result per article, in the same order as
# article_urls. Later cells index each result by 'title', 'article' and 'url'.
articles = [extract_cnbc_article_info(url) for url in article_urls]

In [None]:
# Pretty-print every scraped record to eyeball the scraper's output.
pprint(articles)

In [None]:
# Select the first scraped article; the following cells all work on this one record.
# Raises IndexError if nothing was scraped.
article = articles[0]

In [None]:
# Inspect the value stored under the 'title' key of the selected article.
article['title'] 

In [None]:
# Inspect the raw text stored under the 'article' key (cleaned in the next section).
article['article']

In [None]:
# Inspect the value stored under the 'url' key of the selected article.
article['url']

 # Natural Language Processing Steps

### Clean Article Text

In [None]:
# Replace every newline with a single space so the article becomes one
# continuous string; this is the text passed to spaCy and to the sentence
# tokenizer in later cells.
cleaned_article = article['article'].replace('\n', ' ')

### Named-Entity Extraction

In [None]:
# Load the large English spaCy pipeline: tokenizer, tagger, parser, NER and
# word vectors. The same `nlp` object is reused later for sentence similarity.
nlp = spacy.load('en_core_web_lg')

# Run the pipeline over the cleaned article text.
nlp_data = nlp(cleaned_article)

In [None]:
# Render the article inline in the notebook using displaCy's entity ("ent") style.
displacy.render(nlp_data, style="ent", jupyter=True)

In [None]:
# Extract the named-entity spans found by the pipeline; the next cells filter
# them by label_ ('PERSON', 'ORG').
entities =  nlp_data.ents

In [None]:
# Collect the text of every PERSON entity, then keep the 10 most frequent
# (text, count) pairs.
people = [
    entity.text.replace('\n', '')
    for entity in entities
    if entity.label_ == 'PERSON'
]
top_people = Counter(people).most_common(10)

top_people

In [None]:
# Collect the text of every ORG entity, then keep the 10 most frequent
# (text, count) pairs.
organizations = [
    entity.text.replace('\n', '')
    for entity in entities
    if entity.label_ == 'ORG'
]
top_orgs = Counter(organizations).most_common(10)

top_orgs

### Document Summarization

#### Clean Text

In [None]:
# Split the cleaned article into sentences. This list of original sentences is
# kept unchanged: it is shown beside the cleaned text and supplies the summary output.
article_sentence_list = nltk.tokenize.sent_tokenize(cleaned_article)

In [None]:
# Remove punctuation from every sentence.
# string.punctuation only contains ASCII punctuation, so the em dash is
# handled in a separate step below.
punctuation_table = str.maketrans('', '', string.punctuation)

removed_punctuation = [sentence.translate(punctuation_table)
                       for sentence in article_sentence_list]

# An em dash often joins two words with no surrounding spaces
# ("markets—especially"). Replace it with a space rather than deleting it, so
# the neighbouring words are not fused into a single meaningless token.
removed_mdash = [sentence.replace('—', ' ') for sentence in removed_punctuation]

In [None]:
# Lowercase every punctuation-free sentence before stopword removal.
lower_sentences = [s.lower() for s in removed_mdash]

In [None]:
# Remove stopwords from each lowercased sentence.
# NOTE(review): `stop_words` is assigned here but never passed to
# remove_stopwords() or read by any other visible cell — confirm whether the
# helper loads its own stopword list and whether this variable can be dropped.
stop_words =  nltk.corpus.stopwords.words('english')

removed_stopwords = [remove_stopwords(sentence) for sentence in lower_sentences]

In [None]:
# Convert words to their root form (e.g., 'according' to 'accord').
# The result has one cleaned sentence per original sentence, in the same order;
# later cells rely on that index alignment with article_sentence_list.
lemmatize_sentences = [lemmatize_words(sentence) for sentence in removed_stopwords]

In [None]:
# Show each original sentence beside its cleaned form for a visual sanity check.
for lemmatize_sentence, raw_sentence in zip(lemmatize_sentences, article_sentence_list):
    # The raw sentence is interpolated into Markdown, where a pair of '$' signs
    # is rendered as a math span. Escape them so dollar amounts show literally.
    # The cleaned sentence needs no escaping: '$' was removed with the rest of
    # the ASCII punctuation.
    escaped_raw_sentence = raw_sentence.replace('$', r'\$')

    display(Markdown(f"**Original**: {escaped_raw_sentence}"))
    display(Markdown(f"**Cleaned**: {lemmatize_sentence}"))
    display(Markdown("---"))



#### Calculate Sentence Similarity

In [None]:
# Load a second spaCy model (presumably a vectors-only package, judging by its
# name) for the word-similarity example in the next cell.
# NOTE(review): the sentence-similarity matrix further down calls `nlp`
# (en_core_web_lg), not this model — confirm this extra load is still needed
# and that the package is available for the installed spaCy version.
word_vectors = spacy.load('en_vectors_web_lg')

In [None]:
# Word Similarity Example: print the similarity of every ordered pair of
# sample words (self-pairs included), rounded to two decimals.
X = ['cat', 'dog', 'banana', 'plantain', 'facebook', 'google']

# Process each word once up front instead of re-running the model on both
# words for every pair; the printed values are unchanged.
word_docs = {word: word_vectors(word) for word in X}

for word_i in X:
    for word_j in X:
        print(word_i, '-', word_j, ':',
              round(word_docs[word_i].similarity(word_docs[word_j]), 2))

In [None]:
# Build the pairwise sentence-similarity matrix.
# Entry [i, j] is the spaCy similarity between cleaned sentences i and j; the
# diagonal is left at 0 because self-similarity is skipped.
n_sentences = len(lemmatize_sentences)
similarity_matrix = np.zeros([n_sentences, n_sentences])

# Parse every cleaned sentence exactly once. Re-parsing sentence j inside the
# inner loop would run the pipeline O(n^2) times for identical results.
sentence_docs = [nlp(sentence) for sentence in lemmatize_sentences]

for i, doc_i in enumerate(sentence_docs):
    for j, doc_j in enumerate(sentence_docs):
        if i != j:
            similarity_matrix[i, j] = doc_i.similarity(doc_j)

In [None]:
# Inspect the filled-in similarity matrix (zeros on the diagonal).
similarity_matrix

#### Implement PageRank Algorithm

In [None]:
# Build a graph from the similarity matrix (one node per sentence) and run
# PageRank over it. `scores` is indexed by sentence position in the next cell.
nx_graph = nx.from_numpy_array(similarity_matrix)
scores = nx.pagerank(nx_graph)

In [None]:
# Pair every sentence with its PageRank score and order them best-first.
# Each tuple is (score, cleaned sentence, original sentence); when scores tie,
# tuple comparison falls back to the sentence text.
ranked_sentences = [
    (scores[idx], cleaned, article_sentence_list[idx])
    for idx, cleaned in enumerate(lemmatize_sentences)
]
ranked_sentences.sort(reverse=True)

In [None]:
# Number of top-ranked sentences to keep in the summary.
N_SENTENCES = 3

# Take the original (uncleaned) text of the highest-scoring sentences, in rank
# order rather than article order.
generated_summary = [
    original
    for _score, _cleaned, original in ranked_sentences[:N_SENTENCES]
]

In [None]:
# Display the extracted summary sentences.
generated_summary 