# Scrape Latest News from Webhose.io

### Get Started with [Webhose](https://github.com/Webhose/webhoseio-python)

In [None]:
!pip install webhoseio

### Configure Crawls for NYTimes / Fox / Reuters

These crawls are configured to extract English language texts published in the US over the last thirty days from select media organizations.

In [2]:
import webhoseio
webhoseio.config(token=SECRET_TOKEN)

### Conservative Sites

In [3]:
# FoxNews Crawl
fox_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:foxnews.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
fox_output = webhoseio.query("filterWebContent", fox_query_params)

In [4]:
print('Total Results: ', str(fox_output['totalResults']))
#print(fox_output['posts'][0]['text']) # Print the text of the first post
#print(fox_output['posts'][0]['published']) # Print the text of the first post publication date

Total Results:  4304


In [5]:
# Breitbart Crawl
breitbart_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:breitbart.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
breitbart_output = webhoseio.query("filterWebContent", breitbart_query_params)

In [6]:
print('Total Results: ', str(breitbart_output['totalResults']))

Total Results:  2036


In [7]:
# Washington Times Crawl
washtimes_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:washingtontimes.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
washtimes_output = webhoseio.query("filterWebContent", washtimes_query_params)

In [9]:
print('Total Results: ', str(washtimes_output['totalResults']))

Total Results:  7524


### Liberal Sites

In [91]:
# NY Times Post Crawl
nytimes_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:nytimes.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
nytimes_output = webhoseio.query("filterWebContent", nytimes_query_params)

In [None]:
print('Total Results: ', str(nytimes_output['totalResults']))

In [14]:
# Huffington Post Crawl
huffpo_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:huffingtonpost.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
huffpo_output = webhoseio.query("filterWebContent", huffpo_query_params)

In [16]:
print('Total Results: ', str(huffpo_output['totalResults']))

Total Results:  2535


In [60]:
# CNN Crawl
cnn_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:cnn.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
cnn_output = webhoseio.query("filterWebContent", cnn_query_params)

In [61]:
print('Total Results: ', str(cnn_output['totalResults']))

Total Results:  7629


### Centrist Sites

In [45]:
# Reuters Crawl
reuters_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:reuters.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
reuters_output = webhoseio.query("filterWebContent", reuters_query_params)

In [46]:
print('Total Results: ', str(reuters_output['totalResults']))

Total Results:  26156


In [27]:
# NPR Crawl
npr_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:npr.org",
"ts": "1528084091530",
"sort": "crawled"
    }
    
npr_output = webhoseio.query("filterWebContent", npr_query_params)

In [28]:
print('Total Results: ', str(npr_output['totalResults']))

Total Results:  2908


In [29]:
# USA Today Crawl
usa_query_params = {
"q": "language:english thread.country:US site_type:news site_category:media is_first:true site:usatoday.com",
"ts": "1528084091530",
"sort": "crawled"
    }
    
usa_output = webhoseio.query("filterWebContent", usa_query_params)

In [30]:
print('Total Results: ', str(usa_output['totalResults']))

Total Results:  9672


### Save Crawls

In [36]:
import pandas as pd
import datetime

def save_crawl(media, media_bias, query_params):
  data = {'bias': [], 'source':[], 'headline':[], 'text':[], 'date':[]}
  output = webhoseio.query("filterWebContent", query_params)

  # set scrape range
  cap = min(40, int(output['totalResults']/100))
  
  for i in range(0,cap):
    output = webhoseio.get_next()
    
    # info to extract from posts
    for i in range(0, len(output['posts'])):
      data['bias'].append(media_bias)
      data['source'].append(output['posts'][i]['thread']['site'])
      data['headline'].append(output['posts'][i]['title'])
      data['text'].append(output['posts'][i]['text'])
      data['date'].append(output['posts'][i]['published'])
      
  # write crawled info to pandas df
  df = pd.DataFrame(data, columns = ['bias', 'source', 'headline', 'text', 'date'])
  
  # add date information to file name
  date = datetime.datetime.now()
  today = date.strftime("%Y-%m-%d")
  
  #save to csv file
  df.to_csv(media+'_crawl_'+'today', sep='\t', encoding='utf-8')
  
  #return df for inspection
  return df

In [42]:
fox = save_crawl('foxnews', 'Right', fox_query_params)

In [43]:
breitbart = save_crawl('breitbart', 'Right', breitbart_query_params)

In [41]:
washtimes = save_crawl('washtimes', 'Right', washtimes_query_params)

In [47]:
reuters = save_crawl('reuters', 'Center', reuters_query_params)

In [37]:
npr = save_crawl('npr', 'Center', npr_query_params)

In [40]:
usa = save_crawl('usa', 'Center', usa_query_params)

In [63]:
nytimes = save_crawl('nytimes', 'Left', nytimes_query_params)

In [56]:
huffpo = save_crawl('huffpo', 'Left', huffpo_query_params)

In [62]:
cnn = save_crawl('cnn', 'Left', cnn_query_params)

### Prepare Dataframe

In [86]:
combine = pd.concat([fox, breitbart, washtimes, reuters, npr, usa, nytimes, huffpo, cnn])
combine = combine.reset_index(drop=True)

In [87]:
# rename sources
combine['source'] = combine['source'].replace({'nytimes.com': 'New York Times', 'cnn.com': 'CNN', 'huffingtonpost.com':'Huffington Post', 
                                              'reuters.com': 'Reuters', 'npr.org':'NPR', 'usatoday.com':'USA Today',
                                               'foxnews.com': 'Fox News', 'breitbart.com':'Breitbart', 'washingtontimes.com':'Washington Times'})

In [88]:
combine['date'] = combine['date'].str[:10]

In [90]:
combine.tail()

Unnamed: 0,bias,source,headline,text,date
27179,Left,CNN,Couple holds mid-air wedding aboard plane,MUST WATCH Couple holds mid-air wedding aboard...,2018-06-19
27180,Left,CNN,New Nordstrom markdowns to shop,We've found some of the best Nordstrom markdow...,2018-06-18
27181,Left,CNN,World Cup fortune-telling animals: Achilles th...,(CNN) Everything is falling into place.\nLast ...,2018-06-19
27182,Left,CNN,Bob Goodlatte on family separations at the bor...,Washington (CNN) The Republican chairman of th...,2018-06-19
27183,Left,CNN,Trump ramps up rhetoric: Dems want 'illegal im...,Washington (CNN) President Donald Trump amplif...,2018-06-19


### Save Crawl to CSV

In [93]:
# add date information to file name
date = datetime.datetime.now()
today = date.strftime("%Y-%m-%d")
  
#save to csv file
combine.to_csv('webhose_crawl_'+today+'.csv', sep='\t', encoding='utf-8')