# Web scraping RSS and Topic Models

In [1]:
import newspaper
import feedparser
import numpy as np
import pandas as pd
import requests
import datetime 
from tqdm import tqdm
import nltk
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
import joblib

## Web Scraping

In [2]:
# Empty dataframe whose columns are the feed-entry fields identified later on.
entry_columns = [
    'title',
    'summary',
    'links',
    'link',
    'id',
    'guidislink',
    'published',
    'published_parsed',
    'title_detail.type',
    'title_detail.language',
    'title_detail.base',
    'title_detail.value',
    'summary_detail.type',
    'summary_detail.language',
    'summary_detail.base',
    'summary_detail.value',
    'media_content',
    'feedburner_origlink',
]

rss_feeds = pd.DataFrame(columns=entry_columns)


In [3]:
# RSS / Atom feed URLs to scrape, one per line.
rss_urls = [
    "http://www.schneier.com/blog/index.rdf",
    "http://feeds.feedburner.com/darknethackers",
    "http://securityaffairs.co/wordpress/feed",
    "http://healthitsecurity.com/feed/",
    "http://blog.seanmason.com/feed/",
    "http://threatpost.com/feed",
    "http://feeds.trendmicro.com/Anti-MalwareBlog/",
    "http://www.infosecurity-magazine.com/rss/news/",
    "http://krebsonsecurity.com/feed/",
    "http://www.darkreading.com/rss/all.xml",
    "http://blog.kaspersky.com/feed/",
    "http://www.baesystems.com/page/rss?lg=en",
    "http://rss.nytimes.com/services/xml/rss/nyt/Technology.xml",
    "http://feeds.feedburner.com/scmagazinenews",
    "http://taosecurity.blogspot.com/atom.xml",
    "http://www.rms.com/blog/feed/",
    "http://iscxml.sans.org/rssfeed.xml",
    "https://community.qualys.com/blogs/securitylabs/feeds/posts",
    "http://googleonlinesecurity.blogspot.com/atom.xml",
    "http://thehackernews.com/feeds/posts/default",
    "http://www.us-cert.gov/current/index.rdf",
    "http://feeds.feedburner.com/Securityweek",
    "http://nakedsecurity.sophos.com/feed/",
    "http://feeds.arstechnica.com/arstechnica/index/",
    "http://www.csoonline.com/feed/attribute/41014",
    "http://blogs.rsa.com/feed/",
    "http://feeds.feedburner.com/Techcrunch",
    "http://recode.net/feed/",
    "http://www.techmeme.com/index.xml",
    "http://www.technologyreview.com/stream/rss/",
]

In [4]:
# Get all the feed entries.  The dataframe resulting from this has only a summary line
# per item, not the entire text of the article.  For that we will pull the URL in using
# the newspaper library later.

# Start from an empty slice of rss_feeds: this keeps its column layout, and it means
# re-running this cell rebuilds the frame instead of appending the same items again.
entry_frames = [rss_feeds.iloc[0:0]]
empty_feeds = []  # feeds that yielded no entries, reported after the loop

for rss in tqdm(rss_urls):
    feed = feedparser.parse(rss)
    if not feed.entries:
        empty_feeds.append(rss)
        continue
    entry_frames.append(pd.json_normalize(feed.entries))

# Concatenate once instead of growing the frame inside the loop, and renumber the rows
# so index labels are not repeated from one feed to the next.
rss_feeds = pd.concat(entry_frames, axis=0, ignore_index=True)
print(len(rss_feeds), 'items in rss_feed dataframe')

if empty_feeds:
    print(len(empty_feeds), 'feeds returned no entries:', empty_feeds)

100%|██████████████████████████████████████████████████████████████████████████████████| 30/30 [00:14<00:00,  2.02it/s]

367 items in rss_feed dataframe





In [5]:
# Remove duplicate URLs so each article is requested only once.  Entries that
# carry no link at all are dropped as well, since there is nothing to download
# for them.

urllist = rss_feeds["link"].dropna().unique()

In [6]:
# Get full text using scraping from the newspaper library

from newspaper import Article

article_columns = ["date", "URL", "authors", "keywords", "summary", "text"]
article_records = []  # one dict per successfully scraped article

for url in tqdm(urllist):
    try:
        article = Article(url)
        article.download()
        article.parse()
        article.nlp()  # fills in article.keywords and article.summary
    except Exception as exc:
        # Best effort: report the failing URL together with the reason and move on.
        # Catching Exception (not a bare except) keeps the loop interruptible.
        print('Something wrong with', url, '-', repr(exc))
        continue

    article_records.append({
        "date": article.publish_date,
        "URL": url,
        "authors": article.authors,
        "keywords": article.keywords,
        "summary": article.summary,
        "text": article.text,
    })

# Build the frame once from the collected records; DataFrame.append no longer
# exists in pandas 2.x, and appending row by row is quadratic anyway.
df = pd.DataFrame(article_records, columns=article_columns)

print(len(df),'stories in dataframe df')

# Show a few random rows; capped so the cell still runs when fewer than 4 were scraped.
df.sample(min(4, len(df)))

100%|████████████████████████████████████████████████████████████████████████████████| 367/367 [06:03<00:00,  1.01it/s]

367 stories in dataframe df





Unnamed: 0,date,URL,authors,keywords,summary,text
41,,https://threatpost.com/revil-ransomware-attack...,[Lindsey O'Donnell],"[company, attack, including, celeb, ransomware...",Cybercriminals used the REvil ransomware to at...,Cybercriminals used the REvil ransomware to at...
221,,http://feedproxy.google.com/~r/TheHackersNews/...,"[Mohit Kumar, April]","[attackers, targeted, scammers, successfully, ...","""Among these high-ranking officer victims, mor...",Have something to say about this article? Comm...
7,,https://www.schneier.com/blog/archives/2020/05...,[],"[sweden, schneier, cryptographic, secret, germ...",This paper describes a SIGINT and code-breakin...,This paper describes a SIGINT and code-breakin...
61,2020-05-13 10:30:00,https://www.infosecurity-magazine.com:443/news...,[Phil Muncaster],"[attackers, sophos, attack, data, pay, ransom,...",Organizations that decide to pay their ransomw...,Organizations that decide to pay their ransomw...


In [7]:
# Merge the RSS dataframe with the full text obtained from the
# newspaper library

# Keep one RSS row per link before merging.  If the same link occurs in several
# RSS rows, a right-merge would repeat the scraped article once per row and the
# count printed below would no longer be a count of unique articles.
rss_unique = rss_feeds.drop_duplicates(subset="link")

final = rss_unique.merge(df, how="right", left_on="link", right_on="URL")
print(len(final),'unique articles in file.')

367 unique articles in file.


In [8]:
# Write the merged frame to a pickle file named after the current date and time.
run_stamp = datetime.datetime.now().strftime("date_%Y.%m.%d_time_%H.%M")
pickle_name = 'securitynews_' + run_stamp + '.pkl'
final.to_pickle(pickle_name)
print('Pickle file created')

Pickle file created


In [9]:
# Show the full text of the article at index label 3 as a spot check.
print(final["text"][3])

Used Tesla Components Contain Personal Information

Used Tesla components, sold on eBay, still contain personal information, even after a factory reset.

This is a decades-old problem. It's a problem with used hard drives. It's a problem with used photocopiers and printers. It will be a problem with IoT devices. It'll be a problem with everything, until we decide that data deletion is a priority.

Posted on May 8, 2020 at 9:46 AM • 12 Comments
