In [2]:
import datetime
import json
import numpy as np
import scipy as sp
import pandas as pd
import re
# notebook display settings: wide console output, up to 100 visible columns, HTML tables
pd.options.display.width = 500
pd.options.display.max_columns = 100
pd.options.display.notebook_repr_html = True

# CS 182: Data Collection and Cleaning

We decided to use two data sets.

# GitHub Data

Our primary data set is from Peter Downs via GitHub.

Link: https://github.com/peterldowns/clickbait-classifier

In [3]:
# collect data: one JSON dump per publication, all served from the same repository
# directory, so the shared URL prefix is written once instead of four times
GH_DATA_URL = 'https://raw.githubusercontent.com/peterldowns/clickbait-classifier/master/data/{}.json'

df1 = pd.read_json(GH_DATA_URL.format('buzzfeed'))
df2 = pd.read_json(GH_DATA_URL.format('clickhole'))
df3 = pd.read_json(GH_DATA_URL.format('dose'))
df4 = pd.read_json(GH_DATA_URL.format('nytimes'))

In [4]:
# report how many articles each publication contributes.
# A single pre-formatted string inside print(...) produces the same text on
# Python 2 and Python 3 (the bare print statement is a SyntaxError on Python 3).
print('There are {} Buzzfeed, {} NYT, {} Clickhole, and {} Daily Dose articles in the Github data set.'.format(
    len(df1), len(df4), len(df2), len(df3)))

There are 367 Buzzfeed, 3104 NYT, 547 Clickhole, and 2182 Daily Dose articles in the Github data set.


In [5]:
# tag every frame with its publication name so rows stay identifiable after concatenation
for frame, publication in ((df1, 'Buzzfeed'),
                           (df2, 'Clickhole'),
                           (df3, 'Daily Dose'),
                           (df4, 'NY Times')):
    frame['source'] = publication

In [6]:
# stack the four publications into one frame.
# Buzzfeed is narrowed to the four shared columns; the NY Times selection has no
# article_url column, so pandas fills that column with NaN for its rows.
buzzfeed_rows = df1[['article_title', 'article_url', 'clickbait', 'source']]
nytimes_rows = df4[['article_title', 'clickbait', 'source']]
gh_data = pd.concat([buzzfeed_rows, df2, df3, nytimes_rows])
gh_data.head()

Unnamed: 0,article_title,article_url,clickbait,source
0,23 Life Lessons Cosmo Kramer Taught Us,/javiermoreno/ife-lessons-you-learned-from-cos...,1,Buzzfeed
1,32 Men On TV Who Made You Thirsty In 2014,/erinlarosa/32-tv-men-who-made-you-thirsty-in-...,1,Buzzfeed
2,Hilary Duff Was The Walking Queen Of 2014,/lyapalater/hilary-duff-was-the-walking-queen-...,1,Buzzfeed
3,25 Reasons Wine Is Definitely Your Soulmate,/emleschh/25-reasons-why-wine-is-your-soulmate...,1,Buzzfeed
4,This Master Carver Making Pliers From One Stic...,/norbertobriceno/ernest-macguyver-warther,1,Buzzfeed


In [8]:
# persist the combined GitHub headlines to gh_data.csv in the working directory
# (the frame's index is written as the first, unnamed column)
gh_data.to_csv('gh_data.csv')

# Kaggle UCI Data

Our supplementary data set is the UCI News Aggregator data set (hosted on Kaggle), which contains headlines, URLs, and categories for 422,937 news stories collected by a web aggregator between March 10th, 2014 and August 10th, 2014. According to Wikipedia (https://en.wikipedia.org/wiki/Clickbait), clickbait was already ubiquitous on the web by 2014, so the period covered by this data set is appropriate for our purposes.

Link: https://www.kaggle.com/uciml/news-aggregator-dataset

In [15]:
# load the UCI news aggregator export; the CSV is expected in the working directory
df = pd.read_csv('uci-news-aggregator.csv')

In [16]:
# peek at the first row to see which columns the raw file provides
df.head(1)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698


In [17]:
# convert unix time (milliseconds since the epoch) to a readable YYYY-MM-DD date.
# pd.to_datetime interprets the values as UTC, so the result no longer depends on the
# local timezone of the machine running the notebook (datetime.fromtimestamp does),
# and the conversion is vectorized instead of a per-row Python loop.
df['DATE'] = pd.to_datetime(df['TIMESTAMP'], unit='ms').dt.strftime('%Y-%m-%d')

In [18]:
# confirm the new DATE column was added next to TIMESTAMP
df.head(1)

Unnamed: 0,ID,TITLE,URL,PUBLISHER,CATEGORY,STORY,HOSTNAME,TIMESTAMP,DATE
0,1,"Fed official says weak data caused by weather,...",http://www.latimes.com/business/money/la-fi-mo...,Los Angeles Times,b,ddUyU0VZz0BRneMioxUPQVP6sIxvM,www.latimes.com,1394470370698,2014-03-10


In [19]:
# top 10 publications represented in the data set:
# count rows per hostname, order by that count (largest first), keep the first ten.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
(df[['HOSTNAME', 'ID']]
    .groupby('HOSTNAME')
    .agg('count')
    .sort_values('ID', ascending=False)
    .reset_index()
    .head(10))

Unnamed: 0,HOSTNAME,ID
0,in.reuters.com,2877
1,www.huffingtonpost.com,2603
2,www.businessweek.com,2420
3,www.contactmusic.com,2334
4,www.dailymail.co.uk,2258
5,www.nasdaq.com,2228
6,www.examiner.com,2085
7,www.globalpost.com,1975
8,www.latimes.com,1913
9,www.bizjournals.com,1882


In [20]:
# hostnames of the ten most frequent publications, as a plain Python list.
# DataFrame.sort was removed from pandas; sort_values is its replacement.
list(
    df[['HOSTNAME', 'ID']]
    .groupby('HOSTNAME')
    .agg('count')
    .sort_values('ID', ascending=False)
    .reset_index()
    .head(10)['HOSTNAME']
)

['in.reuters.com',
 'www.huffingtonpost.com',
 'www.businessweek.com',
 'www.contactmusic.com',
 'www.dailymail.co.uk',
 'www.nasdaq.com',
 'www.examiner.com',
 'www.globalpost.com',
 'www.latimes.com',
 'www.bizjournals.com']

In [21]:
import sys

# Python 2 only: switch the interpreter's default string encoding to UTF-8.
# `reload` is not a builtin on Python 3 and sys.setdefaultencoding does not exist there,
# so the hack is guarded to keep the cell runnable on both.
if sys.version_info[0] == 2:
    reload(sys)
    sys.setdefaultencoding('utf-8')

# clickbait column: hostname -> label (1 = clickbait, 0 = not clickbait).
# One mapping drives both the labelling and the row selection below, so the two
# can no longer drift apart.
clickbait_by_host = {
    'www.aceshowbiz.com': 1,
    'www.buzzfeed.com': 1,
    'www.uproxx.com': 1,
    'www.people.com': 1,
    'www.inquisitr.com': 0,
    'www.streetinsider.com': 0,
    'www.moneynews.com': 0,
    'www.bostonglobe.com': 0,
}

# hosts missing from the mapping keep the default label 0, as before
df['CLICKBAIT'] = df['HOSTNAME'].map(clickbait_by_host).fillna(0).astype(int)

# add these rows to github data, renamed to the GitHub frame's column names.
# NOTE(review): 'source' is filled from PUBLISHER here, whose spelling may differ from
# the hand-written labels used for the GitHub rows -- confirm before grouping by source.
extra_df = (
    df.loc[df['HOSTNAME'].isin(list(clickbait_by_host)),
           ['TITLE', 'URL', 'CLICKBAIT', 'PUBLISHER']]
    .rename(columns={
        'TITLE': 'article_title',
        'URL': 'article_url',
        'CLICKBAIT': 'clickbait',
        'PUBLISHER': 'source',
    })
)

# merge dataframes
gh_data2 = pd.concat([gh_data, extra_df])

gh_data2.to_csv('augmented.csv')

# last expression in the cell so the preview is actually displayed
gh_data2.head()

In [1]:
# total number of rows in the augmented data set.
# The original `"...", + len(...)` built a tuple (the `+` was a stray unary plus);
# print a formatted message instead, valid on both Python 2 and Python 3.
print("Number of observations {}".format(len(gh_data2)))

NameError: name 'gh_data2' is not defined

In [31]:
# how many rows are labelled clickbait, and what share of the data set that is.
# The count stays an integer (it was printed as a float), the share is scaled to an
# actual percentage (it was printed as a 0-1 fraction), and the mask is computed once.
n_clickbait = int((gh_data2['clickbait'] == 1).sum())
print("Number clickbait {}".format(n_clickbait))
print("Percentage clickbait {:.2f}%".format(100.0 * n_clickbait / len(gh_data2)))

In [115]:
# bias label per hostname: 1 = conservative, 0 = liberal.
# A single mapping drives both the labelling and the export filter. Previously the two
# hand-written lists disagreed: nypost.com was labelled but never exported, and
# www.drudge.com was exported but never labelled (so it silently got the liberal default).
bias_by_host = {
    # conservative
    'www.breitbart.com': 1,
    'www.theblaze.com': 1,
    'www.rushlimbaugh.com': 1,
    'www.foxnews.com': 1,
    'nypost.com': 1,
    # NOTE(review): listed among the conservative hosts in the original export filter;
    # confirm this hostname and its label.
    'www.drudge.com': 1,
    # liberal
    'www.theguardian.com': 0,
    'www.nytimes.com': 0,
    'www.slate.com': 0,
    'www.newyorker.com': 0,
    'www.aljazeera.com': 0,
}

# hosts outside the mapping keep the default 0, as before; they are not exported below
df['BIAS'] = df['HOSTNAME'].map(bias_by_host).fillna(0).astype(int)

# political csv: only rows from the hosts labelled above
polit_df = df[df['HOSTNAME'].isin(list(bias_by_host))]

polit_df.to_csv('politics.csv')