In [1]:
%matplotlib inline
import numpy as np
import pandas as pd
import scipy
import sklearn
import spacy
import matplotlib.pyplot as plt
import seaborn as sns
import re
from collections import Counter
import nltk
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import warnings
warnings.filterwarnings('ignore')
import newspaper
from newspaper import Article

In [2]:
import pickle

# Reload the previously pickled article texts and lemmatized sentences so the
# articles do not have to be downloaded and processed again.
# NOTE(security): pickle.load can execute arbitrary code while unpickling;
# only load pickle files that come from a trusted source.
# The `with` blocks close each file handle as soon as its content is loaded
# (the handles used to stay open for the lifetime of the kernel).
with open('news_corpus.pickle', 'rb') as pickle_in2:
    news_corpus = pickle.load(pickle_in2)
with open('newslist.pickle', 'rb') as pickle_in:
    newsList = pickle.load(pickle_in)

In [3]:
# #Split our urls into individual url for later read and downloading
# url_txt = open('tech_url_file.txt','r+',encoding='UTF-8')
# url_list = url_txt.read()
# urls = url_list.split('\n')
# urls = list(set(urls))
# len(urls)

In [4]:
# # Read ,download, and parsing articles, then add them to a list.
# news_corpus = []

# for url in urls:
#     if urls.index(url) % 10 == 0:
#         print(urls.index(url))         # Print numbers just making sure kernel not hang
#     toi_article  = Article(url, language="en")
#     try:
#         toi_article.download()
#     except Exception:
#         pass
#     try:
#         toi_article.parse()
#     except Exception:
#         continue
#     news_corpus.append(toi_article.text)

In [3]:
# Clean up the article text with regular expressions and collect the results
# in a new list (`news_corpus` itself is not modified).
#
# The character class below matches the same symbols as before, but it is now
# a raw string with the brackets escaped explicitly: the previous non-raw
# literal relied on invalid string escapes and on an unescaped "[" inside the
# class, both of which raise warnings on current Python versions.
pattern = r"[~^=\-+<>#{}\\/:()£—*\[\]$@]"
symbol_re = re.compile(pattern)   # compiled once instead of once per article
digit_re = re.compile(r'\d')

news_corpus2 = []
for article in news_corpus:
    news = symbol_re.sub("", article)        # drop the listed symbols
    news = digit_re.sub("", news)            # drop every digit
    news = news.replace('&', 'and')          # plain literals: no regex needed
    news = news.replace('%', ' percent')
    news_corpus2.append(news)
    
# # Save our file just in case kernel died, we don't have to download those news again, never!
# news_string = str(news_corpus2)
# with open("newscorpus.txt", "w",encoding = 'UTF-8') as text_file:
#     print(news_string, file=text_file)

In [4]:
# Concatenate all cleaned articles, then collapse every run of whitespace
# (including the newline separators just inserted) into a single space.
news_corpus3 = ' '.join('\n\n\n'.join(news_corpus2).split())

In [5]:
# Fetch the NLTK resources used by this notebook.
nltk.download('wordnet')
# The Punkt sentence model loaded below is a separate NLTK package; without it
# nltk.data.load raises LookupError on a machine where it was never downloaded.
nltk.download('punkt')

import nltk.data
from nltk.stem import WordNetLemmatizer

# Pre-trained English Punkt sentence tokenizer
tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')

# Use the NLTK tokenizer to split the whole corpus string into sentences
news_sentence = tokenizer.tokenize(news_corpus3)

[nltk_data] Downloading package wordnet to C:\Users\me-
[nltk_data]     fa\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [7]:
# Load the English model by its full package name: the 'en' shortcut link is
# not supported by spaCy v3+, and 'en_core_web_sm' is the name already used
# elsewhere in this notebook.
# NOTE(review): assumes 'en' was linked to en_core_web_sm - confirm.
# The parser and NER components are disabled; only token-level attributes
# (lemma_, is_punct, is_stop) are read below.
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

# Lemmatize each sentence, dropping punctuation and stop-word tokens.
# nlp.pipe processes the sentences as a stream in batches, which is typically
# faster than calling nlp() once per sentence and yields the docs in order.
newsList = [
    " ".join(token.lemma_ for token in doc
             if not token.is_punct and not token.is_stop)
    for doc in nlp.pipe(news_sentence)
]

In [22]:
# Display the first article text in `news_corpus` (before the regex clean-up,
# which writes to `news_corpus2`) as a quick sanity check.
news_corpus[0]

"$55 million approved for Ohio public transportation Copyright 2019 Nexstar Broadcasting, Inc. All rights reserved. This material may not be published, broadcast, rewritten, or redistributed. Video\n\nOhio (WTRF) - The state transportation budget will provide $55 million each year for public transportation.\n\nThe Republican-controlled Senate approved the funding this week, along with a six-cent increase per gallon on the state's gas tax.\n\nThis amount is more than Governor Mike DeWine's proposal for 40-million-dollars but less than the 100 million proposed by the House.\n\nState senator Matt Dolan called public transportation a priority in a statement after pushing for the 55-million-dollar figure in a last-minute amendment."

In [24]:
from spacy import displacy
# Render the named entities found by the 'en_core_web_lg' model in the second
# article. NOTE: this rebinds the global `nlp` to a pipeline loaded without
# any `disable=` argument, and rebinds the global `doc`.
nlp = spacy.load('en_core_web_lg')
doc = nlp(news_corpus[1])
displacy.render(doc, style='ent', jupyter=True)

In [25]:
# Same entity visualisation of the second article, this time with the
# 'en_core_web_sm' model. NOTE: `nlp` and `doc` are rebound again here.
nlp = spacy.load('en_core_web_sm')
doc = nlp(news_corpus[1])
displacy.render(doc, style='ent', jupyter=True)

In [1]:
# #Let's save our results, we don't want to scrape everything again
# import pickle
# pickle_out = open('newslist.pickle','wb')
# pickle_out2 = open('news_corpus.pickle','wb')
# pickle.dump(newsList,pickle_out)
# pickle.dump(news_corpus,pickle_out2)
# pickle_out.close()

In [3]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Each "document" handed to the vectorizer is one lemmatized sentence.
vectorizer = TfidfVectorizer(max_df=0.5, # drop terms that occur in more than half of the sentences
                             ngram_range = (1,3), # unigrams, bigrams and trigrams
                             min_df=2, # only keep terms that occur in at least two sentences
                             stop_words='english',
                             lowercase=True, #convert everything to lower case
                             use_idf=True,#we definitely want to use inverse document frequencies in our weighting
                             norm='l2', #Applies a correction factor so that longer and shorter sentences get treated equally
                             smooth_idf=True #Adds 1 to all document frequencies, as if an extra document existed that used every word once.  Prevents divide-by-zero errors
                            )


#Applying the vectorizer
# NOTE(review): the vectorizer is fitted on the whole corpus *before* the split,
# so the vocabulary and IDF weights also reflect the held-out sentences. That is
# fine for exploration, but fit on the training sentences only if the test split
# is ever used for evaluation.
news_sents_tfidf = vectorizer.fit_transform(newsList)
print("Number of features: %d" % news_sents_tfidf.shape[1])

#splitting into training and test sets
# Both the raw sentences and their tf-idf rows are split in ONE call, so they
# are cut with the same indices by construction instead of relying on two
# separate calls happening to share test_size and random_state.
X_train, X_test, X_train_tfidf, X_test_tfidf = train_test_split(
    newsList, news_sents_tfidf, test_size=0.4, random_state=0)


#Reshapes the vectorizer output into something people can read
X_train_tfidf_csr = X_train_tfidf.tocsr()

#number of training sentences (rows of the matrix, not features)
n = X_train_tfidf_csr.shape[0]
#A list of dictionaries, one per training sentence
tfidf_byfeature = [{} for _ in range(n)]
#List of feature names (terms); get_feature_names() was removed in
#scikit-learn 1.2, so prefer its replacement when it is available
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
#for each sentence, map every term it contains to its tf-idf score.
#Walking the COO triplets reads each stored value directly, instead of doing
#one sparse lookup X[i, j] per non-zero entry.
X_train_tfidf_coo = X_train_tfidf_csr.tocoo()
for i, j, score in zip(X_train_tfidf_coo.row,
                       X_train_tfidf_coo.col,
                       X_train_tfidf_coo.data):
    if score != 0:                           #skip explicitly stored zeros, as nonzero() did
        tfidf_byfeature[i][terms[j]] = score

#With smooth_idf and l2 normalisation every term that occurs in a sentence gets
#a strictly positive weight; terms that do not occur are simply not stored.
sample_idx = min(1000, n - 1)   #stay in range when the training split is small
print('Original sentence:', X_train[sample_idx])     #X_train and the tf-idf rows come from the
print('Tf_idf vector:', tfidf_byfeature[sample_idx]) #same split, so their indices are aligned

Number of features: 182243
Original sentence: despite these protection , the same constitution also state that nonrenewable natural resource belong to the state .
Tf_idf vector: {'belong state': 0.3950360482318546, 'nonrenewable': 0.3950360482318546, 'constitution state': 0.3771029285046033, 'natural resource': 0.29292206321542946, 'constitution': 0.26612373494674124, 'belong': 0.3059024568540876, 'resource': 0.22163770586773968, 'protection': 0.2307729783222923, 'despite': 0.225160071172665, 'natural': 0.22406576498768177, 'state': 0.3036494147897648}


In [32]:
# (rows, columns) of the training tf-idf matrix
X_train_tfidf_csr.shape

(51041, 182243)

In [34]:
# Number of entries in newsList (training + test, before the split)
len(newsList)

85069

In [5]:
from sklearn.decomposition import TruncatedSVD
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import Normalizer

#Our SVD data reducer.  We are going to reduce the tf-idf feature space to 5000 components.
#random_state is fixed because TruncatedSVD uses a randomized solver by default;
#without a seed the components (and everything printed below) change between runs.
svd = TruncatedSVD(5000, random_state=0)
#Normalizer rescales every projected row to unit length after the SVD step
lsa = make_pipeline(svd, Normalizer(copy=False))
# Run SVD on the training data, then project the training data.
X_train_lsa = lsa.fit_transform(X_train_tfidf)

variance_explained = svd.explained_variance_ratio_
total_variance = variance_explained.sum()
print("Percent variance captured by all components:",total_variance*100)

#Looking at what sorts of sentences our solution considers similar, for the first ten components
#Rows are labelled with the original sentence text so the printout is readable
paras_by_component = pd.DataFrame(X_train_lsa, index=X_train)
for i in range(10):
    print('Component {}:'.format(i))
    #ten highest-scoring sentences for this component (explicit positional slice)
    print(paras_by_component[i].sort_values(ascending=False).iloc[:10])

Percent variance captured by all components: 51.13948348326933
Component 0:
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
applause .    0.999138
Name: 0, dtype: float64
Component 1:
percent .                0.614581
percent to .             0.614581
percent .                0.614581
percent to .             0.614581
percent .                0.614581
percent .                0.614581
percent in to .          0.614581
percent to .             0.614581
percent to .             0.614581
one hundred percent .    0.614581
Name: 1, dtype: float64
Component 2:
percent to .             0.773222
percent .                0.773222
any percent .            0.773222
percent .                0.773222
percent from the .       0.773222
percent .                0.773222
percent .                0.773222
one hundred percent .    0.773222
percent 

### Looks like our first topic is about the word 'applause'; the second and third topics are both about the word 'percent'; the fourth topic is about somebody doing something; the fifth topic is about somebody saying something; the sixth topic is about the word 'laughter'; the seventh topic is about somebody knowing something; the eighth topic is about somebody thinking something; the ninth topic is about the word 'billion'; and the tenth topic is about Trump! Pretty interesting!