In [2]:
#Importing libraries
import pandas as pd
import os
from bs4 import BeautifulSoup
import feedparser as fp
import requests
import json
import newspaper
import csv
from newspaper import Article
from time import mktime
from datetime import datetime

In [134]:
# Function that extracts links from rss
# uses newspaper to extract articles and writes or appends to a csv
def extract_from_rss(rss, file_name="master.csv"):
    newsPaper=[]
    d=fp.parse(rss)
    count=1
    for entry in d.entries:
        article={}
        article['link']=entry.link
        try:
            article['published']=entry.published
        except:
            article['published']=""
        try:
            content=Article(entry.link)
            content.download()
            content.parse()
            content.nlp()
            keywords= content.keywords
            summary=content.summary
        except Exception as e:
            print(e)
            print("continuing...")
            continue
        article['title'] = content.title
        article['text']= content.text
        article['summary']=summary
        keywords_together=""
        for k in keywords:
            keywords_together+=k+", "
        article["keywords"]= keywords_together
        newsPaper.append(article)
        print(count, " article scraped")
        count=count+1
    try:
        keys= newsPaper[0].keys() 
    except:
        return list()
    #If file exists we append else create new file
    if os.path.exists(file_name):
        print("Appending file ", file_name)
        with open(file_name,"a") as outfile:
            dict_writer= csv.DictWriter(outfile, keys)
            dict_writer.writerows(newsPaper)
    else:
        print("Creating file ",file_name )
        with open(file_name,"w") as outfile:
            dict_writer= csv.DictWriter(outfile, keys)
            dict_writer.writeheader()
            dict_writer.writerows(newsPaper)

    return newsPaper

# Function that extracts articles from links and wrotes to csv
def extract_from_links(links, file_name="master.csv"):
    newsPaper=[]
    count=1
    for link in links:
        article={}
        article['link']=link
        print(link)
        try:
            content=Article(link)
            content.download()
            content.parse()
            content.nlp()
            keywords= content.keywords
            summary=content.summary
        except Exception as e:
            print(e)
            print("continuing...")
            continue
        print(content.publish_date)
        try:
            article['published']="{:%A, %B %d %Y}".format(content.publish_date)
        except:
            article['published']=""
        
        article['title'] = content.title
        print(content.title)
        article['text']= content.text
        article['summary']=summary
        
        keywords_together=""
        for k in keywords:
            keywords_together+=k+", "
        article["keywords"]= keywords_together
        newsPaper.append(article)
        print(count, " article scraped")
        count=count+1
    try:
        keys= newsPaper[0].keys() 
    except:
        return list()
    #If file exists we append else create new file
    if os.path.exists(file_name):
        print("Appending file ", file_name)
        with open(file_name,"a") as outfile:
            dict_writer= csv.DictWriter(outfile, keys)
            dict_writer.writerows(newsPaper)
    else:
        print("Creating file ",file_name )
        with open(file_name,"w") as outfile:
            dict_writer= csv.DictWriter(outfile, keys)
            dict_writer.writeheader()
            dict_writer.writerows(newsPaper)

    return newsPaper
#Create BS4 parser
def get_soup(link):
    response= requests.get(link)
    content=response.content
    return BeautifulSoup(content, 'html.parser')


In [75]:
#RSS and regular links of relavant news sites
newspaper_links={
    "Google":
    {"link":"https://news.google.com/topics/CAAqJggKIiBDQkFTRWdvSUwyMHZNRGx1YlY4U0FtVnVHZ0pWVXlnQVAB?hl=en-US&gl=US&ceid=US:en",
     "rss":"https://news.google.com/news/rss/headlines/section/topic/WORLD?ned=us&hl=en"        
    },
    
    "Reddit":
    {"rss":"https://www.reddit.com/r/worldnews/.rss",
     "link":"https://www.reddit.com/r/worldnews/"
    },
    
    "BBC":
    {"link":"http://www.bbc.com/news/world",
     "rss":"http://feeds.bbci.co.uk/news/world/rss.xml"
    },
    
    "NPR":
    {"link":"http://www.npr.org/sections/world/",
     "rss":"http://www.npr.org/rss/rss.php?id=1004"
    },
    
    "Spiegel":
    {"link":"http://www.spiegel.de/international/",
     "rss":"http://www.spiegel.de/international/index.rss"
    },
    
    "The Independent":
    {"link":"http://www.independent.co.uk/news/world",
     "rss":"http://www.independent.co.uk/news/world/rss"
    },
    
    "NDTV":
    {"rss":"http://feeds.feedburner.com/ndtvnews-world-news",
     "link":"http://www.ndtv.com/world-news"
    },
    
    "Washington Post":
    {"rss":"http://feeds.washingtonpost.com/rss/world",
     "link":"https://www.washingtonpost.com/world/"
    },
    
    "Reuters":
    {"link":"https://in.reuters.com/news/world",
     "rss":"http://feeds.reuters.com/Reuters/worldNews"
    },
    
    "The Guardian":
    {"link":"https://in.reuters.com/news/world",
     "rss":"http://feeds.reuters.com/Reuters/worldNews"
    },
    
    "CNN":
    {"link":"http://edition.cnn.com/world",
     "rss":"http://rss.cnn.com/rss/edition_world.rss"
    },
    
    "Drudge Report":
    {"link":"https://www.drudgereport.com",
     "rss":"http://www.drudgereportfeed.com/rss"
    },
    
    "Fox News":
    {"link":"http://www.foxnews.com",
     "rss":"http://feeds.foxnews.com/foxnews/latest"
    },
    
    "Good News Network":
    {"link":"https://www.goodnewsnetwork.org/",
     "rss":"https://www.goodnewsnetwork.org/category/news/feed/"
    },
    
    
    "Sunny Skyz":
    {"link":"https://www.sunnyskyz.com/",
     "rss":"https://feeds.feedburner.com/SunnySkyz"
    },
    
    "Positive News":
    {"link":"https://www.positive.news/",
     "rss":"https://www.positive.news/feed"
    },
    
    "Happy News":
    {"link":"https://www.happynews.com/",
     "rss":"https://www.happynews.com/rss/.aspx"
    },
    
    "SA Good News":
    {"link":"https://www.sagoodnews.co.za",
     "rss":"http://feeds.feedburner.com/SAGoodNews"
    },
    
    "SA Good News":
    {"link":"https://www.sagoodnews.co.za",
     "rss":"http://feeds.feedburner.com/SAGoodNews"
    },
    
    "The Better India":
    {"link":"https://www.thebetterindia.com/",
     "rss":"https://www.thebetterindia.com/custom-rss-feed/"
    },
    
    "Global Positive News Network":
    {"link":"https://www.globalpositivenewsnetwork.com",
     "rss":"https://www.globalpositivenewsnetwork.com/feed/"   
    },
    
    "Huffington Post":
    {"link":"https://www.huffingtonpost.com/topic/good-news",
     "rss":"https://www.huffingtonpost.com/topic/good-news/feed"
    },
    
    
   
}

good_newspaper_links={
    "Good News Network":
    {"link":"https://www.goodnewsnetwork.org/",
     "rss":"https://www.goodnewsnetwork.org/category/news/feed/"
    },
    
    
    "Sunny Skyz":
    {"link":"https://www.sunnyskyz.com/",
     "rss":"https://feeds.feedburner.com/SunnySkyz"
    },
    
    "Positive News":
    {"link":"https://www.positive.news/",
     "rss":"https://www.positive.news/feed"
    },
    
    "Happy News":
    {"link":"https://www.happynews.com/",
     "rss":"https://www.happynews.com/rss/.aspx"
    },
    
    "SA Good News":
    {"link":"https://www.sagoodnews.co.za",
     "rss":"http://feeds.feedburner.com/SAGoodNews"
    },
    
    "SA Good News":
    {"link":"https://www.sagoodnews.co.za",
     "rss":"http://feeds.feedburner.com/SAGoodNews"
    },
    
    "The Better India":
    {"link":"https://www.thebetterindia.com/",
     "rss":"https://www.thebetterindia.com/custom-rss-feed/"
    },
    
    "Global Positive News Network":
    {"link":"https://www.globalpositivenewsnetwork.com",
     "rss":"https://www.globalpositivenewsnetwork.com/feed/"   
    },
    
    "Huffington Post":
    {"link":"https://www.huffingtonpost.com/topic/good-news",
     "rss":"https://www.huffingtonpost.com/topic/good-news/feed"
    },
    
    
}

In [None]:
#Extracting news articles from rss found in dictionary newspaper_links
newspapers={}
for key, value in newspaper_links.items():
    newspapers[key]=extract_from_rss(value["rs"],"test.csv")

## Using Beautiful Soup to Extract Articles

### 1.  ABC Good News

In [27]:
response = requests.get("http://www.abc.net.au/news/good-news/")
content =response.content
parser = BeautifulSoup(content, 'html.parser')


In [323]:
abcaugoodlinks=[]
for a in parser.find_all('a', class_="thumb"):
    abcaugoodlinks.append("http://www.abc.net.au/"+a['href'])
abcaugoodlinks

['http://www.abc.net.au//news/2018-07-01/biometrics-koalas-and-wood-glue-fascinating-fingerprint-facts/9920802',
 'http://www.abc.net.au//news/2018-07-01/melbourne-post-whisperer-turning-return-to-sender-mail-into-art/9893426',
 'http://www.abc.net.au//news/2018-06-30/food-cooking-program-teaches-students-with-community-approach/9890076',
 'http://www.abc.net.au//news/2018-06-30/arthur-boyds-hidden-art-collection-to-be-preserved/9926064',
 'http://www.abc.net.au//news/2018-06-30/bondi-hairdresser-charity-funds-new-housing-jilkminggan/9921032',
 'http://www.abc.net.au//news/2018-06-29/cheese-maker-from-tasmania-integrates-raw-milk-dairy-farm/9911714',
 'http://www.abc.net.au//news/2018-06-27/the-mother-daughter-coffee-van-making-more-than-just-coffee/9914622',
 'http://www.abc.net.au//news/2018-06-27/sanitary-pad-initiative-creates-opportunities-for-women/9910102',
 'http://www.abc.net.au//news/2018-06-26/country-boy-sells-coats-off-his-own-back-for-cancer-ward/9908376',
 'http://www.ab

In [365]:
abcaugood= extract_from_links(abcaugoodlinks,"master.csv")

http://www.abc.net.au//news/2018-07-01/biometrics-koalas-and-wood-glue-fascinating-fingerprint-facts/9920802
Koalas, wood glue and the FBI: Fascinating facts about fingerprinting
1  article scraped
http://www.abc.net.au//news/2018-07-01/melbourne-post-whisperer-turning-return-to-sender-mail-into-art/9893426
Instagram 'post whisperer' is turning return-to-sender envelopes into works of art
2  article scraped
http://www.abc.net.au//news/2018-06-30/food-cooking-program-teaches-students-with-community-approach/9890076
WA food and cooking program teaches students impact of farms on produce
3  article scraped
http://www.abc.net.au//news/2018-06-30/arthur-boyds-hidden-art-collection-to-be-preserved/9926064
Painter Arthur Boyd's precious $43m collection to be housed in new fire-proof gallery
4  article scraped
http://www.abc.net.au//news/2018-06-30/bondi-hairdresser-charity-funds-new-housing-jilkminggan/9921032
Bondi hairdresser helps fund Jilkminggan's first new housing in almost two decades


### 2. MSN Good News

In [28]:
with open("ms.txt") as f:
    html=f.read()
    

In [13]:

soup= BeautifulSoup(html, 'html.parser')

In [48]:
li=soup.find_all("li", class_="rcp")
msngoodlinks=[ "https://www.msn.com"+l.a['href'] for l in li]
len(msngoodlinks)

9

In [49]:
msngood= extract_from_links(msngoodlinks,"master.csv")

https://www.msn.com/en-us/news/good-news/anonymous-parking-offender-pays-ticket-44-years-later/ar-AAzng6u
Anonymous parking offender pays ticket 44 years later
1  article scraped
https://www.msn.com/en-us/news/good-news/wheelchair-basketball-changed-this-vets-life/ar-AAzm9GA
Wheelchair basketball changed this vet's life
2  article scraped
https://www.msn.com/en-us/news/good-news/mystery-buyer-snaps-up-dollar1m-of-toys-at-toys-r-us-to-donate/ar-AAzltsL
Mystery buyer snaps up $1M of toys at Toys "R" Us to donate
3  article scraped
https://www.msn.com/en-us/news/good-news/good-samaritan-returns-tourists-purse-along-with-dollar8000-plane-tickets-and-credit-cards/ar-AAzlJeB
Good Samaritan Returns Tourist's Purse Along With $8,000, Plane Tickets and Credit Cards
4  article scraped
https://www.msn.com/en-us/news/good-news/superhero-window-washers-surprise-kids-at-iowa-hospital/ar-AAzjPK5
"Superhero" window washers surprise kids at Iowa hospital
5  article scraped
https://www.msn.com/en-us/new

### Today Good News

In [136]:
response = requests.get("https://www.today.com/news/good-news")
content =response.content
parser = BeautifulSoup(content, 'html.parser')
parser

<!DOCTYPE html>

<!--[if lt IE 8]>      <html class="lt-ie9 lt-ie8"> <![endif]-->
<!--[if IE 8]>         <html class="lt-ie9"> <![endif]-->
<!--[if IE 9]>         <html class="lt-ie10"> <![endif]-->
<!--[if gt IE 9]><!--> <html lang="en"> <!--<![endif]-->
<head>
<meta charset="utf-8"/>
<meta content="width=device-width initial-scale=1 maximum-scale=1 user-scalable=no" name="viewport"/>
<meta content="43434ADDF3F60D7897F819F388E2FADF" name="msvalidate.01"/>
<title>Good News: Inspirational, Hopeful and Uplifting News Stories - TODAY.com</title>
<meta content="Find the latest inspirational, uplifting, funny and happy news, photos, videos and more on TODAY.com." property="og:description"/>
<meta content="Find the latest inspirational, uplifting, funny and happy news, photos, videos and more on TODAY.com." name="description"/>
<meta content="Good News: Inspirational, Hopeful and Uplifting News Stories - TODAY.com" property="og:title"/>
<meta content="Good News: Inspirational, Hopeful and Up

Pattern: div block has class *block-art*

In [55]:
block_art=parser.find_all("div", class_="block-art")

In [66]:
todaygoodlinks= ["https:"+l.a['href'] for l in block_art]
len(todaygoodlinks)

42

In [69]:
todaygoodnews= extract_from_links(todaygoodlinks,"master.csv")

https://www.today.com/pets/former-president-george-h-w-bush-welcomes-new-family-member-t131844
None
Former President George H.W. Bush welcomes new family member: Sully the dog
1  article scraped
https://www.today.com/news/world-cup-japanese-soccer-fans-applauded-cleaning-stadium-after-win-t131353
None
Japanese soccer fans applauded for cleaning up stadium after World Cup win
2  article scraped
https://www.today.com/popculture/chris-pratt-surprised-young-fans-jurassic-world-screening-it-was-t130498
None
Chris Pratt surprised young fans at a 'Jurassic World' screening — and it was so sweet
3  article scraped
https://www.today.com/food/houston-police-officer-s-kind-deed-replaces-man-s-stolen-t131047
None
Houston police officer's kind deed replaces man's stolen groceries
4  article scraped
https://www.today.com/news/after-72-years-two-women-switched-birth-finally-come-face-t130930
None
After 72 years, two women switched at birth finally come face-to-face
5  article scraped
https://www.toda

### Telegraph Good News

In [135]:
soup=get_soup("https://www.telegraph.co.uk/good-news")
soup

<!DOCTYPE html>

<html class="hubRendererHTML no-js" lang="en-GB" xmlns:fb="https://www.facebook.com/2008/fbml" xmlns:og="http://opengraphprotocol.org/schema/">
<head>
<script data-pagespeed-no-defer="">//<![CDATA[
(function(){function d(b){var a=window;if(a.addEventListener)a.addEventListener("load",b,!1);else if(a.attachEvent)a.attachEvent("onload",b);else{var c=a.onload;a.onload=function(){b.call(this);c&&c.call(this)}}}var p=Date.now||function(){return+new Date};window.pagespeed=window.pagespeed||{};var q=window.pagespeed;function r(){this.a=!0}r.prototype.c=function(b){b=parseInt(b.substring(0,b.indexOf(" ")),10);return!isNaN(b)&&b<=p()};r.prototype.hasExpired=r.prototype.c;r.prototype.b=function(b){return b.substring(b.indexOf(" ",b.indexOf(" ")+1)+1)};r.prototype.getData=r.prototype.b;r.prototype.f=function(b){var a=document.getElementsByTagName("script"),a=a[a.length-1];a.parentNode.replaceChild(b,a)};r.prototype.replaceLastScript=r.prototype.f;
r.prototype.g=function(b){var a=

Pattern: h3 has class *list-of-entities\__item-body-headline* . We delete last 4 links as they are clearly sponsored.

In [113]:
telegraphlinks=soup.find_all("h3", class_="list-of-entities__item-body-headline")
telegraphlinks

[]

In [107]:
telegraphlinks= telegraphlinks[:-4]

In [108]:
telegraphlinks= ["https://www.telegraph.co.uk"+l.a['href'] for l in telegraphlinks]


In [99]:
extract_from_links(telegraphlinks, "master.csv")

https://www.telegraph.co.uk/news/2018/01/13/liberal-churches-boosted-lgbt-weddings-couples-join-congregations/
2018-01-13 00:00:00
Liberal churches boosted by LGBT weddings as couples join their congregations
1  article scraped
https://www.telegraph.co.uk/news/2018/01/13/uk-quartet-celebrate-breaking-atlantic-rowing-record/
2018-01-13 00:00:00
UK quartet celebrate with a cup of tea after breaking Atlantic rowing record
2  article scraped
https://www.telegraph.co.uk/good-news/2016/06/28/love-at-first-sight-its-not-as-unrealistic-as-you-might-think/
2016-06-28 00:00:00
Love at first sight? It's not as unrealistic as you might think
3  article scraped
https://www.telegraph.co.uk/good-news/2016/09/16/boy-grows-hair-for-two-years-then-donates-it-for-child-cancer-pa/
2016-09-16 00:00:00
Boy donates hair to make wigs for child cancer patients
4  article scraped
https://www.telegraph.co.uk/travel/news/move-over-melbourne-this-is-really-the-worlds-most-liveable-city/
None
Move over Melbourne – 

[{'keywords': 'church, positive, market, worship, churches, couples, place, boosted, paper, known, liberal, samesex, congregations, join, marriage, weddings, lgbt, ',
  'link': 'https://www.telegraph.co.uk/news/2018/01/13/liberal-churches-boosted-lgbt-weddings-couples-join-congregations/',
  'published': 'Saturday, January 13 2018',
  'summary': 'Churches should hold gay weddings in order to \'market\' themselves to potential new worshippers, a study has found.\nThe commitment to same-sex marriage "can provide a vehicle for articulating the broader values of the place of worship to a wide audience," the paper says.\nChurches can even benefit from becoming known as "the gay church", it suggests.\nThe paper said: "As one Unitarian church reported, the commitment to same-sex marriage ‘gives us something distinctive to promote’.\n"Being known for solemnizing same-sex marriage may therefore be a positive ‘brand’ for a place of worship and not, as some members of some congregations experienc

### 9News Good News

In [114]:
soup=get_soup("https://www.9news.com.au/good-news/94")

In [116]:
soup.find_all("a", class_="story__headline__link")

[<a class="story__headline__link" data-tracking-link-name="na_section_good-news_0_na_na" href="https://www.9news.com.au/good-news/2016/07/17/03/38/disneys-first-latina-princess-to-debut-this-month"><span class="story__headline__text">Disney's first Latina princess to debut this month</span></a>,
 <a class="story__headline__link" data-tracking-link-name="na_section_good-news_1_na_na" href="https://www.9news.com.au/good-news/2016/07/14/03/07/missouri-man-converts-truck-into-mobile-shower-for-the-homeless"><span class="story__headline__text">Missouri man converts truck into mobile shower for the homeless</span></a>,
 <a class="story__headline__link" data-tracking-link-name="na_section_good-news_2_na_na" href="https://www.9news.com.au/good-news/2016/07/12/12/17/black-lives-matter-protesters-embrace-counter-protest-group-in-touching-display-of-solidarity"><span class="story__headline__text">‘Black Lives Matter’ protesters embrace counter-protest group in touching display of solidarity</span

a tag has class *story\__headline\__link*. Also, we have 94 pages of good news to scrape from. Therefore, we use for loop.

In [130]:
ninenewslinks=[]
for x in range(2,95):
    link= "https://www.9news.com.au/good-news/"+str(x)
    soup=get_soup(link)
    headline_class= soup.find_all("a",class_="story__headline__link")
    print(headline_class[0]['href'])
    articlelinks=[l['href'] for l in headline_class]
    ninenewslinks.extend(articlelinks)
    

https://www.9news.com.au/national/2018/06/21/10/55/melbourne-zoo-welcomes-endangered-spider-monkey
https://www.9news.com.au/national/2018/06/13/13/45/a-current-affair-louise-and-cherish-lavelle-tenants-trash-home-while-daughter-battles-cancer
https://www.9news.com.au/good-news/2018/05/11/07/47/sonia-burton-home-renovation-surprise-today-show
https://www.9news.com.au/good-news/2018/04/20/21/34/police-dogs-nsw-named-kids-children-s-hospital-westmead
https://www.9news.com.au/national/2018/04/07/20/32/longest-serving-nsw-air-ambulance-nurse-retires
https://www.9news.com.au/good-news/2018/03/27/17/21/identical-townsville-triplets-a-one-in-300000-chance
https://www.9news.com.au/good-news/2018/03/13/21/03/teenager-queensland-loses-half-her-body-weight
https://www.9news.com.au/national/2018/03/06/09/08/uss-lexington-wreckage-video-paul-allen-finds-lost-world-war-ii-aircraft-carrier
https://www.9news.com.au/good-news/2018/02/15/02/16/frances-abbott-marries-fiance-in-surprise-valentine-s-day-cer

https://www.9news.com.au/good-news/2016/09/15/06/54/woman-thanks-retailer-for-employing-shop-assistant-with-autism-after-perfect-encounter
https://www.9news.com.au/good-news/2016/09/11/08/56/father-thanks-store-employee-for-calming-cranky-son-with-sticker
https://www.9news.com.au/good-news/2016/09/09/06/40/motorist-who-gave-lift-to-wandering-93-year-old-woman-urges-people-to-reach-out
https://www.9news.com.au/good-news/2016/09/07/08/08/aussie-volunteers-knit-coats-to-help-keep-shelter-dogs-warm
https://www.9news.com.au/good-news/2016/09/03/10/47/canberra-boy-who-learned-sign-language-to-help-deaf-friend-recognised-with-award
https://www.9news.com.au/good-news/2016/08/27/14/10/woman-with-down-syndrome-announces-retirement-after-32-years-of-work
https://www.9news.com.au/good-news/2016/08/25/11/14/touching-tribute-to-wwii-veteran-goes-viral
https://www.9news.com.au/good-news/2016/08/23/14/39/australian-parents-cycling-5000km-across-america-with-autistic-son
https://www.9news.com.au/good-n

In [133]:
#Now, that we have a master list of links we can extract all approx 923 articles
ninenews=extract_from_links(ninenewslinks,"master.csv")

https://www.9news.com.au/national/2018/06/21/10/55/melbourne-zoo-welcomes-endangered-spider-monkey
2018-06-21 00:00:00
Melbourne Zoo spider monkey baby born Victoria
1  article scraped
https://www.9news.com.au/good-news/2018/06/20/23/31/royal-family-prince-louis-christening-st-james-palace
2018-06-20 00:00:00
Prince Louis to be christened next month Kensington Palace confirms
2  article scraped
https://www.9news.com.au/good-news/2018/06/20/04/18/royal-ascot-meghan-markle-stuns-at-debut


KeyboardInterrupt: 