# Web scraping RSS and Topic Models

In [1]:
import newspaper
import feedparser
import numpy as np
import pandas as pd
import requests
import datetime 
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import joblib

## Web Scraping

In [2]:
# Empty template frame for the RSS entries; the column names are the entry
# fields that the feed-parsing cell further down is expected to produce.

rss_columns = [
    'title', 'summary', 'links', 'link', 'id', 'guidislink', 'published',
    'published_parsed',
    'title_detail.type', 'title_detail.language',
    'title_detail.base', 'title_detail.value',
    'summary_detail.type', 'summary_detail.language',
    'summary_detail.base', 'summary_detail.value',
    'media_content', 'feedburner_origlink',
]
rss_feeds = pd.DataFrame(columns=rss_columns)


In [3]:
# RSS / Atom feed URLs to scrape (security and technology news sources)

rss_urls = [
    "http://www.schneier.com/blog/index.rdf",
    "http://feeds.feedburner.com/darknethackers",
    "http://securityaffairs.co/wordpress/feed",
    "http://healthitsecurity.com/feed/",
    "http://blog.seanmason.com/feed/",
    "http://threatpost.com/feed",
    "http://feeds.trendmicro.com/Anti-MalwareBlog/",
    "http://www.infosecurity-magazine.com/rss/news/",
    "http://krebsonsecurity.com/feed/",
    "http://www.darkreading.com/rss/all.xml",
    "http://blog.kaspersky.com/feed/",
    "http://www.baesystems.com/page/rss?lg=en",
    "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "http://feeds.feedburner.com/scmagazinenews",
    "http://taosecurity.blogspot.com/atom.xml",
    "http://www.rms.com/blog/feed/",
    "http://iscxml.sans.org/rssfeed.xml",
    "https://community.qualys.com/blogs/securitylabs/feeds/posts",
    "http://googleonlinesecurity.blogspot.com/atom.xml",
    "http://thehackernews.com/feeds/posts/default",
    "http://www.us-cert.gov/current/index.rdf",
    "http://feeds.feedburner.com/Securityweek",
    "http://nakedsecurity.sophos.com/feed/",
    "http://feeds.arstechnica.com/arstechnica/index/",
    "http://www.csoonline.com/feed/attribute/41014",
    "http://blogs.rsa.com/feed/",
    "http://feeds.feedburner.com/Techcrunch",
    "http://recode.net/feed/",
    "http://www.techmeme.com/index.xml",
    "http://www.technologyreview.com/stream/rss/",
]

In [4]:
# Get all the feed entries.  The dataframe resulting from this has only a summary
# line per entry, not the entire text of the article.  The full text is pulled
# from each entry's URL with the newspaper library later.
#
# One frame per feed is collected in a list and concatenated once at the end.
# This avoids re-copying the growing frame on every iteration, and because the
# result is rebuilt from the empty column template (rss_feeds.iloc[0:0]),
# re-running this cell does not append the same entries a second time.

feed_frames = []
for rss in tqdm(rss_urls):
    feed = feedparser.parse(rss)
    feed_frames.append(pd.json_normalize(feed.entries))

rss_feeds = pd.concat([rss_feeds.iloc[0:0], *feed_frames], axis=0)
print(len(rss_feeds), 'items in rss_feed dataframe')

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:12<00:00,  2.34it/s]

368 items in rss_feed dataframe





In [5]:
# Remove duplicate URLs.  Entries that came without a link (NaN after
# json_normalize) are dropped too, since there is nothing to download for them.

urllist = rss_feeds['link'].dropna().unique()

In [6]:
# Get full text using scraping from the newspaper library

from newspaper import Article

article_columns = ["date", "URL", "authors", "keywords", "summary", "text"]

# Rows are collected in a plain list and turned into a DataFrame once at the end.
# DataFrame.append was removed in pandas 2.0, and appending row by row copies
# the whole frame on every iteration.
article_rows = []
for url in tqdm(urllist):
    article = Article(url)
    try:
        article.download()
        article.parse()
        article.nlp()  # fills in article.keywords and article.summary
        row = {"date": article.publish_date, "URL": url, "authors": article.authors,
               "keywords": article.keywords, "summary": article.summary, "text": article.text}
    except Exception as err:
        # Best effort: report the failing URL with the reason and carry on.
        # Catching Exception instead of using a bare except keeps a kernel
        # interrupt (KeyboardInterrupt) working during this long-running loop.
        print('Something wrong with', url, '-', repr(err))
        continue
    article_rows.append(row)

df = pd.DataFrame(article_rows, columns=article_columns)
print(len(df),'stories in dataframe df')

# sample() raises if asked for more rows than exist, so cap the sample size
df.sample(min(4, len(df)))

100%|████████████████████████████████████████████████████████████████████████████████| 368/368 [04:58<00:00,  1.23it/s]

368 stories in dataframe df





Unnamed: 0,date,URL,authors,keywords,summary,text
360,2020-04-28 00:00:00,https://www.technologyreview.com/2020/04/28/10...,[],"[disrupting, 5g, potential, industry, manufact...",A continuous stream of emerging technologies i...,A continuous stream of emerging technologies i...
312,,https://www.csoonline.com/article/2130877/the-...,[Dan Swinhoe],"[15, data, passwords, users, biggest, century,...",About 3.5 billion people saw their personal da...,"Not long ago, a breach that compromised the da..."
135,2020-04-22 00:00:00,https://www.nytimes.com/2020/04/22/technology/...,[Nathaniel Popper],"[times, stimulus, scammer, programs, pure, flo...","“It is a little relief, and then you find out ...","“It is a little relief, and then you find out ..."
167,,https://isc.sans.edu/diary/rss/26054,[Sans Internet Storm Center],"[file, simple, mvpblogdidierstevenscom, handle...","In diary entry ""Obfuscated with a Simple 0x0A""...","In diary entry ""Obfuscated with a Simple 0x0A""..."


In [7]:
# Merge the RSS dataframe with the full text obtained from the
# newspaper library.
#
# The same link can appear in more than one feed, so rss_feeds is reduced to one
# row per link first; otherwise the right merge would repeat an article for every
# feed that lists it and the count below would not be a count of unique articles.
# Both frames have a 'summary' column, so pandas names them summary_x (RSS) and
# summary_y (newspaper) in the result.

final = (
    rss_feeds
    .drop_duplicates(subset='link')
    .merge(df, how="right", left_on="link", right_on="URL")
)
print(len(final),'unique articles in file.')

368 unique articles in file.


In [8]:
# Write the merged articles to a pickle whose name carries the current date and time
run_stamp = datetime.datetime.now().strftime("date_%Y.%m.%d_time_%H.%M")
pickle_name = 'securitynews_' + run_stamp + '.pkl'
final.to_pickle(pickle_name)
print('Pickle file created')

Pickle file created


In [9]:
# Show the full text of the article at index label 3 as a sanity check
final.loc[3, 'text']

"OneZero is tracking thirty countries around the world who are implementing surveillance programs in the wake of COVID-19:\n\nThe most common form of surveillance implemented to battle the pandemic is the use of smartphone location data, which can track population-level movement down to enforcing individual quarantines. Some governments are making apps that offer coronavirus health information, while also sharing location information with authorities for a period of time. For instance, in early March, the Iranian government released an app that it pitched as a self-diagnostic tool. While the tool's efficacy was likely low, given reports of asymptomatic carriers of the virus, the app saved location data of millions of Iranians, according to a Vice report.\n\nOne of the most alarming measures being implemented is in Argentina, where those who are caught breaking quarantine are being forced to download an app that tracks their location. In Hong Kong, those arriving in the airport are give

## Topic Modeling


## Inputs and outputs
There are just two cells for doing topic modeling.  The first one  
specifies the inputs.  Adjust these to what the topic modeling  
needs to do, i.e. the number of topics, the top n words per  
topic that we wish to see, count vs TF-IDF vectorization, and Non-negative  
Matrix Factorization vs Latent Dirichlet Allocation.  
  
Once done, the code below will create two dataframes.  
Main outputs are:  
 - words_in_topics_df - top_n_words per topic  
 - topic_for_doc_df - topic to which a document is identified  
  
Additional outputs of interest:  
  
 - vocab = This is the dict from which you can pull the words, eg vocab['ocean']  
 - terms = Just the list equivalent of vocab, indexed in the same order  
 - doc_term_matrix = Document term matrix   
Now doc_term_matrix is factorized as W x H.  You can get W and H:    
 - W = This matrix has docs as rows and num_topics as columns  
 - H = This matrix has num_topics as rows and vocab as columns  
  
***  



In [53]:
# Input incoming text as a list called raw_documents

raw_documents = list(final['text'])

# Stop words: use the custom list in stopwords.pkl when it exists, otherwise fall
# back to nltk's English list (needs the nltk 'stopwords' corpus to be downloaded).
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary code;
# only load a stopwords.pkl that you created yourself or otherwise trust.
try:
    custom_stop_words = joblib.load('stopwords.pkl')
except FileNotFoundError:
    custom_stop_words = stopwords.words('english')

num_topics = 10
top_n_words = 10
# Use 1 for CountVectorizer, and 2 for TFIDF_Vectorizer.  The vectorizer cell
# treats every value other than 1 as TF-IDF, so 2 selects the same vectorizer
# as the undocumented value 3 that was set here before.
vectorizer_to_use = 2
NMF_or_LDA = 'nmf' # Use 'nmf' for NMF or 'lda' for LDA
ngram = 3 # 2 for bigrams, 3 for trigrams etc

# Once done, the code below will create two dataframes.
# Main outputs are:
#     words_in_topics_df - top_n_words per topic
#     topic_for_doc_df - topic to which a document is identified

# Additional outputs of interest
# vocab = This is the dict from which you can pull the words, eg vocab['ocean']
# terms = Just the list equivalent of vocab, indexed in the same order
# doc_term_matrix = Document term matrix
# doc_term_matrix = W x H
# W = This matrix has docs as rows and num_topics as columns
# H = This matrix has num_topics as rows and vocab as columns

    

In [54]:
# --- Vectorize the documents ---------------------------------------------------
# vectorizer_to_use == 1 selects raw counts; any other value selects TF-IDF.
if vectorizer_to_use == 1:
    vectorizer = CountVectorizer(stop_words=custom_stop_words, min_df=2, analyzer='word',
                                 ngram_range=(ngram, ngram))
else:
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2, stop_words=custom_stop_words, analyzer='word',
                                 ngram_range=(ngram, ngram))

doc_term_matrix = vectorizer.fit_transform(raw_documents)
print( "Created %d X %d document-term matrix in variable doc_term_matrix\n" % (doc_term_matrix.shape[0], doc_term_matrix.shape[1]) )

vocab = vectorizer.vocabulary_ #This is the dict from which you can pull the words, eg vocab['ocean']
# terms is the list equivalent of vocab, indexed in the same order.
# get_feature_names() was removed in scikit-learn 1.2; prefer its replacement and
# fall back to the old name only on releases that predate get_feature_names_out().
if hasattr(vectorizer, 'get_feature_names_out'):
    terms = vectorizer.get_feature_names_out().tolist()
else:
    terms = vectorizer.get_feature_names()
print("Vocabulary has %d distinct terms, examples below " % len(terms))
print(terms[500:550], '\n')


# --- Fit the topic model -------------------------------------------------------
# NMF_or_LDA == 'nmf' selects NMF; any other value selects LDA.
if NMF_or_LDA == 'nmf':
    model = NMF( init="nndsvd", n_components=num_topics )
else:
    # The seed is fixed so that repeated runs of the notebook give the same topics.
    model = LatentDirichletAllocation(n_components=num_topics, learning_method='online', random_state=0)

# apply the model and extract the two factor matrices
W = model.fit_transform( doc_term_matrix ) #This matrix has docs as rows and k-topics as columns
H = model.components_ #This matrix has k-topics as rows and vocab as columns
print('Shape of W is', W.shape, 'docs as rows and', num_topics, 'topics as columns. First row below')
print(W[0].round(1))
print('\nShape of H is', H.shape, num_topics, 'topics as rows and vocab as columns. First row below')
print(H[0].round(1))

# --- Dominant topic per document -----------------------------------------------
# Records are collected in a list and the frame is built once, instead of
# concatenating onto an empty frame inside the loop.
doc_topic_rows = []
for doc_idx, doc_weights in enumerate(W):
    best_topic = np.argsort(doc_weights)[::-1][0]
    doc_topic_rows.append({'article': doc_idx,
                           'topic': 'Topic_' + str(best_topic),
                           'value': doc_weights[best_topic]})
topic_for_doc_df = pd.DataFrame(doc_topic_rows, columns=['article', 'topic', 'value'])

# --- Documents ranked by weight within each topic ------------------------------
n_docs = W.shape[0]
topic_doc_frames = []
for topic_idx in range(W.shape[1]):
    per_topic = pd.DataFrame({'topic': ['Topic_' + str(topic_idx)] * n_docs,
                              'doc_number': list(range(n_docs)),
                              'weight': list(W[:, topic_idx])})
    topic_doc_frames.append(per_topic.sort_values(by=['topic', 'weight'], ascending=[True, False]))
top_docs_for_topic_df = pd.concat(topic_doc_frames)
# Add text to the top_docs dataframe as a new column
top_docs_for_topic_df['text'] = [raw_documents[doc] for doc in list(top_docs_for_topic_df.doc_number)]
# Print top two docs for each topic
print('\nTop documents for each topic')
print(top_docs_for_topic_df.groupby('topic').head(2))

print('\n')
print('Topic number and counts of documents against each:')
print(topic_for_doc_df.topic.value_counts())

# --- Top words per topic -------------------------------------------------------
topic_word_frames = []
for topic_idx, term_weights in enumerate(H):
    top_term_ids = np.argsort(term_weights)[::-1][:top_n_words]
    topic_word_frames.append(pd.DataFrame({'topic': 'Topic_' + str(topic_idx),
                                           'words': [terms[j] for j in top_term_ids],
                                           'freq': [term_weights[j] for j in top_term_ids]}))
words_in_topics_df = pd.concat(topic_word_frames)

print('\n')
print('Top', top_n_words, 'words dataframe with weights')
print(words_in_topics_df.head(10))



# print as list
print('\nSame list as above as a list')
words_in_topics_list = words_in_topics_df.groupby('topic')['words'].apply(list)
lala = []
# Iterate over (label, value) pairs: the Series is indexed by topic name, and
# integer [] lookups on a string-indexed Series are a deprecated positional fallback.
for topic_name, topic_words in words_in_topics_list.items():
    topic_row = [topic_name] + topic_words
    lala.append(topic_row)
    print(topic_row)
    
    
# Top docs for a topic


  'stop_words.' % sorted(inconsistent))


Created 368 X 1589 document-term matrix in variable doc_term_matrix

Vocabulary has 1589 distinct terms, examples below 
['exe boot file', 'execupharm pennsylvania based', 'executable malware compiled', 'execute additional payloads', 'execute anytime user', 'execute arbitrary code', 'execute arbitrary commands', 'execute malicious code', 'executives uber discussing', 'expected close quarter', 'experience trade journalism', 'experts threatpost free', 'explained eset researcher', 'exploit control cybersecurity', 'exploit coronavirus isolation', 'exploit crisis financial', 'exploit flaw sending', 'exploit vulnerabilities control', 'exploit vulnerabilities prevents', 'exploit vulnerability control', 'exploitation vulnerability stealthy', 'exploited remote code', 'exploited sql injection', 'exploits privacy leaks', 'exploits target vulnerabilities', 'exposed redis instances', 'extended security updates', 'external authentication systems', 'external internal threats', 'facebook twitter linke