In [1]:
import newspaper
import feedparser
import numpy as np
import pandas as pd
import requests
import datetime 
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import joblib
import spacy

## Topic Modeling


## Inputs and outputs
Topic modeling is done in just two cells.  The first one  
specifies the inputs.  Adjust these to what the topic modeling  
needs to do, i.e. the number of topics, the top n words per  
topic to show, count vs TF-IDF vectorization, and Non-negative  
Matrix Factorization (NMF) vs Latent Dirichlet Allocation (LDA).  
  
Once done, the code below will create two dataframes:  
Main outputs are:  
 - words_in_topics_df - top_n_words per topic  
 - topic_for_doc_df - topic to which a document is identified  
  
Additional outputs of interest:  
  
 - vocab = This is the dict from which you can pull the words, eg vocab['ocean']  
 - terms = Just the list equivalent of vocab, indexed in the same order  
 - doc_term_matrix = Document term matrix   
Now doc_term_matrix is factorized as W x H.  You can get W and H:    
 - W = This matrix has docs as rows and num_topics as columns  
 - H = This matrix has num_topics as rows and vocab as columns  
  
***  



In [2]:
# Load the previously saved articles DataFrame from a joblib pickle in the working directory.
# NOTE(review): joblib.load unpickles arbitrary objects; only load files from a trusted source.
final = joblib.load('final_df3.pkl')

In [3]:
# Read custom stop words from stopwords.txt (one stop word per line).
# The context manager closes the file handle even if reading fails; the
# original left the handle open. Blank lines are dropped so that the trailing
# newline at the end of the file does not add an empty '' stop word.
with open(file="stopwords.txt", mode='r') as stopword_file:
    custom_stop_words = [word for word in stopword_file.read().split('\n') if word]


In [None]:
# # How to write to a file
# # Uncomment everything in this cell to write to a file
# file = open('stopwords.txt','w')
# for element in custom_stop_words:
#     file.write(element+'\n')

# file.close()

In [None]:
# Inputs for the topic-modeling cell below. Adjust these, then run the next cell.

# Incoming text as a list called raw_documents (the 'text' column of `final`)
raw_documents= list(final['text'])
# Alternative stop-word sources (uncomment one to replace the list read from stopwords.txt):
# custom_stop_words = joblib.load('stopwords.pkl')
# custom_stop_words = stopwords.words('english')  # NLTK list; use this if the custom file is not available
# Number of topics; passed to the model as n_components
num_topics = 10
# Number of top-weighted terms kept per topic in words_in_topics_df
top_n_words = 20
vectorizer_to_use = 1 # 1 selects CountVectorizer; any other value (e.g. 2) selects TfidfVectorizer
NMF_or_LDA = 'nmf' # 'nmf' selects NMF; any other value (e.g. 'lda') selects LDA
ngram = 2 # n-gram size, used as ngram_range=(ngram, ngram): 2 for bigrams, 3 for trigrams, etc.

# Once done, the code below will create two dataframes.
# Main outputs are:
#     words_in_topics_df - top_n_words per topic
#     topic_for_doc_df - topic to which a document is identified

# Additional outputs of interest
# vocab = the vectorizer's vocabulary dict, keyed by term, eg vocab['ocean']
# terms = just the list equivalent of vocab, indexed in the same order
# term_frequency_table = dataframe with the frequency of terms
# doc_term_matrix = document-term matrix
# doc_term_matrix is factorized as W x H
# W = this matrix has docs as rows and num_topics as columns
# H = this matrix has num_topics as rows and vocab as columns

    

In [16]:
# Build the document-term matrix from raw_documents.
# vectorizer_to_use == 1 selects raw counts; any other value selects TF-IDF
# (which additionally ignores terms appearing in more than 95% of documents).
# Both keep only n-grams of exactly `ngram` words that occur in at least 2 documents.
shared_vectorizer_options = dict(
    stop_words=custom_stop_words,
    min_df=2,
    analyzer='word',
    ngram_range=(ngram, ngram),
)
if vectorizer_to_use == 1:
    vectorizer = CountVectorizer(**shared_vectorizer_options)
else:
    vectorizer = TfidfVectorizer(max_df=0.95, **shared_vectorizer_options)

doc_term_matrix = vectorizer.fit_transform(raw_documents)
print( "Created %d X %d document-term matrix in variable doc_term_matrix\n" % (doc_term_matrix.shape[0], doc_term_matrix.shape[1]) )

# This is the dict from which you can pull the words, eg vocab['ocean']
vocab = vectorizer.vocabulary_
# List equivalent of vocab, indexed in the same order as the matrix columns.
# get_feature_names() was removed in scikit-learn 1.2, so prefer
# get_feature_names_out() and fall back only on older releases. The result is
# converted to a plain list so `terms` keeps the type it always had.
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = list(vectorizer.get_feature_names_out())
else:
    terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms, examples below " % len(terms))
print(terms[500:550], '\n')

# Column sums of the (sparse) matrix: total count / summed weight of every term.
term_totals = np.asarray(doc_term_matrix.sum(axis=0)).reshape(-1)

# DataFrame view, most frequent terms first; reset_index() keeps the original
# column position of each term in an 'index' column.
term_frequency_table = pd.DataFrame({'term': terms, 'freq': term_totals})
term_frequency_table = term_frequency_table.sort_values(by='freq', ascending=False).reset_index()

# Series view of the same totals, indexed by term and sorted descending.
# It is built from the sparse column sums: converting the whole matrix to dense
# (documents x vocabulary) just to sum its columns can require several GB of memory.
freq_df = pd.Series(term_totals, index=terms).sort_values(ascending=False)

# Create the topic model.
# NMF_or_LDA == 'nmf' selects Non-negative Matrix Factorization; any other
# value selects Latent Dirichlet Allocation. Try both and keep whichever gives
# the more interpretable topics.
# Both estimators use randomness internally, so a fixed random_state is passed
# to make the topics reproducible from one run to the next.
model_random_state = 42
if NMF_or_LDA == 'nmf':
    model = NMF(init="nndsvd", n_components=num_topics, random_state=model_random_state)
else:
    model = LatentDirichletAllocation(
        n_components=num_topics,
        learning_method='online',
        random_state=model_random_state,
    )

# Apply the model and extract the two factor matrices
W = model.fit_transform(doc_term_matrix)  # docs as rows, topics as columns
H = model.components_  # topics as rows, vocabulary terms as columns
print('Shape of W is', W.shape, 'docs as rows and', num_topics, 'topics as columns. First row below')
print(W[0].round(1))
print('\nShape of H is', H.shape, num_topics, 'topics as rows and vocab as columns. First row below')
print(H[0].round(1))

# Check which document belongs to which topic.
# For every row of W take the column with the largest weight (last position of
# an ascending argsort, the same selection as reversing the argsort and taking
# the first entry). The frame is built in one step instead of concatenating
# one-row frames in a loop, which is quadratic in the number of documents and
# starts from an empty frame; the result has a regular 0..n-1 index.
n_docs, n_topics = W.shape
doc_numbers = np.arange(n_docs)
dominant_topic = np.argsort(W, axis=1)[:, -1]
topic_for_doc_df = pd.DataFrame({
    'article': doc_numbers,
    'topic': ['Topic_' + str(topic_idx) for topic_idx in dominant_topic],
    'value': W[doc_numbers, dominant_topic],
})

# Weight of every document under every topic, one block per topic with the
# heaviest documents first. Per-topic frames are collected in a list and
# concatenated once.
per_topic_frames = []
for topic_idx in range(n_topics):
    topic_frame = pd.DataFrame({
        'topic': ['Topic_' + str(topic_idx)] * n_docs,
        'doc_number': list(range(n_docs)),
        'weight': list(W[:, topic_idx]),
    })
    topic_frame = topic_frame.sort_values(by=['topic', 'weight'], ascending=[True, False])
    per_topic_frames.append(topic_frame)
top_docs_for_topic_df = pd.concat(per_topic_frames)
# Add text to the top_docs dataframe as a new column
top_docs_for_topic_df['text'] = [raw_documents[doc_number] for doc_number in top_docs_for_topic_df['doc_number']]
# Print top two docs for each topic
print('\nTop documents for each topic')
print(top_docs_for_topic_df.groupby('topic').head(2))

# Number of documents whose dominant topic is each topic
print('\n')
print('Topic number and counts of documents against each:')
print(topic_for_doc_df.topic.value_counts())

# Create dataframe with the top `top_n_words` terms for each topic.
# Each row of H holds one topic's weight for every vocabulary term; the terms
# are ranked by descending weight. Per-topic frames are collected in a list and
# concatenated once instead of growing a DataFrame inside the loop.
per_topic_word_frames = []
for topic_idx in range(H.shape[0]):
    topic_weights = H[topic_idx]
    top_term_ids = np.argsort(topic_weights)[::-1][:top_n_words]
    per_topic_word_frames.append(pd.DataFrame({
        'topic': 'Topic_' + str(topic_idx),
        'words': [terms[term_id] for term_id in top_term_ids],
        'freq': [topic_weights[term_id] for term_id in top_term_ids],
    }))
words_in_topics_df = pd.concat(per_topic_word_frames)

print('\n')
print('Top', top_n_words, 'words dataframe with weights')
print(words_in_topics_df.head(10))

# Print the same information as one list per topic: [topic label, word, word, ...]
print('\nSame list as above as a list')
words_in_topics_list = words_in_topics_df.groupby('topic')['words'].apply(list)
lala = []
# Iterate over (label, words) pairs: the Series is indexed by topic label, so
# looking entries up with an integer key relies on a deprecated positional fallback.
for topic_label, topic_words in words_in_topics_list.items():
    topic_row = [topic_label] + topic_words
    lala.append(topic_row)
    print(topic_row)


# Top terms: displayed as the cell's output
print('\nTop 10 most numerous terms:')
term_frequency_table.head(10)


Created 5125 X 199469 document-term matrix in variable doc_term_matrix

Vocabulary has 199469 distinct terms, examples below 
['100 days', '100 dollars', '100 donation', '100 downloads', '100 eagle', '100 efficient', '100 employees', '100 engineering', '100 extraterrestrial', '100 false', '100 final', '100 follow', '100 fortune', '100 frontline', '100 fund', '100 funding', '100 games', '100 gigabits', '100 high', '100 hosts', '100 increase', '100 internet', '100 investment', '100 jobs', '100 kwh', '100 meetings', '100 mid', '100 mile', '100 miles', '100 millisecond', '100 minutes', '100 mw', '100 networks', '100 organizations', '100 participants', '100 party', '100 people', '100 percent', '100 popular', '100 private', '100 proof', '100 quarter', '100 quest', '100 reliable', '100 remote', '100 round', '100 russian', '100 security', '100 series', '100 spam'] 

Shape of W is (5125, 10) docs as rows and 10 topics as columns. First row below
[0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]

Shape of H is (1

Unnamed: 0,index,term,freq
0,47579,cve 2020,1373
1,41456,contact tracing,740
2,163607,social media,620
3,186071,united states,611
4,178819,threat actors,515
5,43422,coronavirus pandemic,508
6,44795,covid pandemic,507
7,34560,code execution,499
8,48532,dark reading,438
9,101063,law enforcement,426


In [17]:
# Display the per-term totals (Series indexed by term, sorted descending)
freq_df

cve 2020                 1373
contact tracing           740
social media              620
united states             611
threat actors             515
                         ... 
national cable              2
national broadcasting       2
national banks              2
national approach           2
lamphone practical          2
Length: 199469, dtype: int64

In [12]:
# Display the term frequency table ordered by descending frequency.
# NOTE(review): the saved output below lists single-word terms and has an earlier
# execution count than the modeling cell, so it appears to come from a previous
# run with different settings; re-run to refresh it.
term_frequency_table.sort_values(by='freq', ascending=False)

Unnamed: 0,index,term,freq
30860,24569,security,11899
30859,8062,data,8251
30858,6621,company,6059
30857,683,2020,5440
30856,27848,time,4716
...,...,...,...
3822,25086,shrunk,2
3823,25091,shuffled,2
3824,25001,shook,2
3825,29737,vodavi,2


In [6]:
# Display the loaded articles DataFrame (title, summaries, URL, published, keywords, text)
final

Unnamed: 0,title,summary_x,URL,published,keywords,summary_y,text
0,Friday Squid Blogging: Fishing for Jumbo Squid,Interesting article on the rise of the jumbo s...,https://www.schneier.com/blog/archives/2020/06...,2020-06-26 20:57:09+00:00,"post,squidinteresting,talk,posting,rise,usual,...",Friday Squid Blogging: Fishing for Jumbo Squid...,Friday Squid Blogging: Fishing for Jumbo Squid...
1,The Unintended Harms of Cybersecurity,"Interesting research: ""Identifying Unintended ...",https://www.schneier.com/blog/archives/2020/06...,2020-06-26 12:00:59+00:00,"harms,countermeasures,unintended,consequences,...","Interesting research: ""Identifying Unintended ...","Interesting research: ""Identifying Unintended ..."
2,Analyzing IoT Security Best Practices,"New research: ""Best Practices for IoT Security...",https://www.schneier.com/blog/archives/2020/06...,2020-06-25 12:09:36+00:00,"practices,follow,iot,recommendations,specific,...",Analyzing IoT Security Best PracticesNew resea...,Analyzing IoT Security Best Practices\n\nNew r...
3,COVID-19 Risks of Flying,"I fly a lot. Over the past five years, my aver...",https://www.schneier.com/blog/archives/2020/06...,2020-06-24 17:32:30+00:00,"schneier,wearing,mask,wear,think,person,lot,mo...",This is all a prelude to saying that I have be...,COVID-19 Risks of Flying\n\nI fly a lot. Over ...
4,Cryptocurrency Pump and Dump Scams,"Really interesting research: ""An examination o...",https://www.schneier.com/blog/archives/2020/06...,2020-06-24 11:30:32+00:00,"pump,schemes,schneier,scope,cryptocurrencies,u...","Really interesting research: ""An examination o...","Really interesting research: ""An examination o..."
...,...,...,...,...,...,...,...
341,Amazon confirms a major outbreak of COVID-19 a...,"<A HREF=""https://www.businessinsider.com/amazo...",http://www.techmeme.com/200423/p38#a200423p38,2020-04-23 23:55:23+00:00,"video,past,security,warehouse,outbreak,surge,z...",— Zoom Video Communications Inc. has been lamb...,— Zoom Video Communications Inc. has been lamb...
346,The coronavirus is accelerating the shift to a...,"<A HREF=""https://www.ft.com/content/990e89de-8...",http://www.techmeme.com/200423/p33#a200423p33,2020-04-23 21:30:02+00:00,"shift,financial,right,whiteboards,coronavirus,...",— Miro is a company in the right place at the ...,— Miro is a company in the right place at the ...
356,Doctors are using AI to triage covid-19 patien...,Rizwan Malik had always had an interest in AI....,https://www.technologyreview.com/2020/04/23/10...,2020-04-23 14:00:00+00:00,"pandemic,health,system,triage,using,coronaviru...","The pandemic, in other words, has turned into ...",The Royal Bolton Hospital is among a growing n...
364,The race to save the first draft of coronaviru...,"Eight years ago, Suleika Jaouad was alone in a...",https://www.technologyreview.com/2020/04/21/99...,2020-04-21 09:00:00+00:00,"save,pandemic,life,internet,history,coronaviru...",But getting the internet to archive as much as...,"Within a week, Blair’s tweet got the attention..."


In [14]:
# Show the current working directory (IPython automagic for %pwd); the data and
# stop-word files above are opened relative to this directory.
pwd

'C:\\Users\\user'

In [7]:
# Display the list returned by newspaper.hot(); the saved output shows short
# trending-style phrases. NOTE(review): not used by the topic-modeling cells above.
newspaper.hot()

['Ron Jeremy',
 'Celta Vigo vs Barcelona',
 'Andrew Toles',
 'Mike Henry',
 'Alia Shawkat',
 'Margot Robbie',
 'FA Cup',
 'Milton Glaser',
 'Pierce Brosnan',
 'Mike Pence',
 'Unilever',
 'Phil Mickelson',
 'Dr Disrespect',
 'Eurovision movie',
 'Huey. Rapper',
 'Timothee Chalamet',
 'Siya Kakkar',
 'Imperial County',
 'Washington, D.C',
 'Gap stock']