In [1]:
import sys
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import datetime

%matplotlib inline

In [4]:
# Load the full ad-airing dataset; judging by the preview below, each row is one airing of an ad.
# NOTE(review): the file name is spelled "poltical" -- presumably this matches the file on disk; confirm before renaming.
ads_df = pd.read_csv('data/poltical_tv_ad_archive_entire_dataset.csv')

In [5]:
# Load the ad-level metadata (air/reference/market counts, transcript).
# NOTE(review): presumably one row per archive_id, as the file name suggests -- confirm.
ads_metadata = pd.read_csv("data/unique_ad_metadata.csv")

In [8]:
# Preview the first two rows of the airing data to see which columns are available.
ads_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,sponsors,sponsor_types,race,cycle,subjects,candidates,type,message,date_created
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59 UTC,2016-09-09 00:13:29 UTC,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25 UTC,2016-09-06 21:58:55 UTC,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,Hillary for America,Multiple,PRES,2016.0,"Energy, China, Jobs",Hillary Clinton,campaign,pro,2016-09-12 14:49:32


In [9]:
# Preview the first five rows of the ad metadata.
ads_metadata.head()

Unnamed: 0,wp_identifier,archive_id,embed_url,sponsors,sponsor_types,subjects,candidates,type,race,cycle,message,air_count,reference_count,market_count,transcript,date_ingested
0,8403,PolAd_CatherineCortezMasto_jbah8,https://archive.org/embed/PolAd_CatherineCorte...,Catherine Cortez Masto for Senate,Candidate Committee,"Candidate Biography, Bipartisanship, Criminal ...",Catherine Cortez Masto,campaign,NVS2,2016.0,pro,185,0,1,,2016/11/17 5:03:54 UTC
1,8404,PolAd_DonaldTrump_kc0en,https://archive.org/embed/PolAd_DonaldTrump_kc0en,Hillary for America,Multiple,"Nuclear, Candidate Biography, Military, Foreig...",Donald Trump,campaign,PRES,2016.0,con,139,0,1,,2016/11/17 5:02:52 UTC
2,8358,PolAd_DonaldTrump_ncck5,https://archive.org/embed/PolAd_DonaldTrump_ncck5,Hillary for America,Multiple,"Bankruptcy, Candidate Biography, Workers",Donald Trump,campaign,PRES,2016.0,con,195,0,2,,2016/11/14 10:44:42 UTC
3,8359,PolAd_Guns_gv0de,https://archive.org/embed/PolAd_Guns_gv0de,National Rifle Assn,PAC,Guns,,campaign,,,unknown,87,0,3,,2016/11/14 10:44:15 UTC
4,8361,PolAd_Guns_Veterans_Military_z2bvd,https://archive.org/embed/PolAd_Guns_Veterans_...,National Rifle Assn,PAC,"Guns, Veterans, Military",,campaign,,,unknown,54,0,3,,2016/11/14 10:43:51 UTC


# Initial Cleaning

TODO: filter rows here first, before adding columns or converting column types, to reduce the data size.

In [53]:
# TODO: filter on dates and campaign types, and potentially remove nulls.
# For now, count the missing values in each column.
# NOTE(review): the saved output lists ad_duration_sec and state, which are only
# created in later cells, so this cell was last run out of order.
ads_df.isnull().sum()

id                     0
wp_identifier          0
network                0
location              25
program               15
program_type           0
start_time             0
end_time               0
archive_id             0
embed_url              0
sponsors            1162
sponsor_types       1162
race               29872
cycle              29101
subjects           22562
candidates         25127
type                   0
message                0
date_created           0
ad_duration_sec        0
state                  0
dtype: int64

Variables of interest

Look into and clean the following columns: 
* location
* program_type
* start_time
* end_time
* sponsors
* candidates
* message

In [11]:
# Most columns are stored with the generic `object` dtype, so inspect the dtypes
# to see which of the columns we need must be converted before use.
ads_df.dtypes

id                 int64
wp_identifier      int64
network           object
location          object
program           object
program_type      object
start_time        object
end_time          object
archive_id        object
embed_url         object
sponsors          object
sponsor_types     object
race              object
cycle            float64
subjects          object
candidates        object
type              object
message           object
date_created      object
dtype: object

### Add Ad Duration column

In [21]:
# Parse the start/end timestamps (read in as plain objects) into pandas datetimes
for time_col in ('start_time', 'end_time'):
    ads_df[time_col] = pd.to_datetime(ads_df[time_col])

# Length of each airing in seconds: end time minus start time
ads_df['ad_duration_sec'] = ads_df['end_time'].sub(ads_df['start_time']).dt.total_seconds()

In [24]:
# The five most frequent ad durations (in seconds) and how many rows have each.
ads_df.ad_duration_sec.value_counts().head()

30.0    285854
60.0     20621
31.0     18143
29.0     16814
32.0     16761
Name: ad_duration_sec, dtype: int64

### Add State column

In [59]:
# Cast the location column to pandas' string dtype, then count rows per market
ads_df['location'] = ads_df['location'].astype('string')
ads_df['location'].value_counts()

Boston, MA/Manchester, NH                            50920
Philadelphia, PA                                     44326
Las Vegas, NV                                        42131
Ceder Rapids-Waterloo-Iowa City-Dublin, Iowa         24580
Tampa-St. Petersburg, FL                             24207
Cleveland, Ohio                                      23156
San Francisco-Oakland-San Jose, CA                   20944
Des Moines-Ames, Iowa                                17726
Raleigh-Durham-Fayetteville,  NC                     17437
Milwaukee, WI                                        15423
Sioux City, Iowa                                     12906
Phoenix-Prescott, AZ                                 12900
Washington, DC/Hagerstown, MD                        12325
Denver, CO                                           10901
Columbia, SC                                         10870
Greenville-Spartanburg, SC/Asheville-Anderson, NC     8565
Reno, NV                                              60

In [57]:
# Keep only the state part of each location: the text after the last comma,
# with spaces removed. Anything that is not a str (missing values) becomes 'None'.
# NOTE(review): multi-state markets such as "Boston, MA/Manchester, NH" are
# attributed to the last state listed only.
states = ads_df['location'].apply(
    lambda loc: 'None' if type(loc) != str else loc.split(',')[-1].replace(" ", "")
)
print(states.unique())
print(len(states.unique()))
# Ohio and Iowa show up spelled out instead of as two-letter abbreviations

['NV' 'NC' 'FL' 'Ohio' 'Iowa' 'PA' 'CO' 'WI' 'CA' 'AZ' 'NH' 'MD' 'NY' 'OH'
 'SC' 'VA' 'None']
17


In [35]:
def correct_state(st):
    '''Map the spelled-out names 'Ohio' and 'Iowa' to 'OH' and 'IA'.

    Any other value is returned unchanged.
    '''
    # Compare in the same order as before: Ohio first, then Iowa.
    for full_name, abbreviation in (('Ohio', 'OH'), ('Iowa', 'IA')):
        if st == full_name:
            return abbreviation
    return st

In [58]:
# Replace the spelled-out state names with their two-letter abbreviations
states = states.apply(correct_state)
print(states.unique())
print(len(states.unique()))

['NV' 'NC' 'FL' 'OH' 'IA' 'PA' 'CO' 'WI' 'CA' 'AZ' 'NH' 'MD' 'NY' 'SC'
 'VA' 'None']
16


In [50]:
# Attach the cleaned state codes as a new column and show the five most common
ads_df['state'] = states
ads_df['state'].value_counts().head()

IA    55212
NH    50920
NV    48163
PA    44326
FL    31654
Name: state, dtype: int64

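* Iowa is the state with the most ad airings in this dataset. Note that multi-state markets are counted under the last state listed (e.g. all 50,920 "Boston, MA/Manchester, NH" airings count toward NH).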

### Join Ads Metadata to identify fact checked ads

Descriptions of the metadata columns can be found here: http://politicaladarchive.org/data/

Important columns to add: 
* reference_count - how many partner orgs fact-checked the ad
* air_count - how many times the ad aired in total in the states the Internet Archive is tracking (includes all airings, not just paid airings)
* market_count - how many different markets the ad aired in 
* transcript - in case we want to do some NLP on the ad message

In [83]:
# Columns to bring over from the ad metadata; archive_id is the join key.
cols = ['archive_id', 'reference_count', 'air_count', 'market_count', 'transcript']
# The left join keeps every row of ads_df. validate='many_to_one' makes pandas raise
# a MergeError if archive_id is repeated in the metadata, instead of silently
# duplicating airings and inflating every count computed from merge_df.
merge_df = ads_df.merge(ads_metadata[cols], how='left', on='archive_id', validate='many_to_one')
# Binary flag: 1 when reference_count is above zero (the ad was fact-checked), else 0.
# A missing reference_count (no metadata match) compares False and so yields 0,
# matching the previous row-wise lambda.
merge_df['fact_checked'] = (merge_df['reference_count'] > 0).astype(int)

In [79]:
# Preview the merged frame.
# NOTE(review): the saved output shows a `flag_fact_checked` column, but the merge
# cell names the flag `fact_checked`; this output is stale and needs a re-run.
merge_df.head(2)

Unnamed: 0,id,wp_identifier,network,location,program,program_type,start_time,end_time,archive_id,embed_url,...,type,message,date_created,ad_duration_sec,state,reference_count,air_count,market_count,transcript,flag_fact_checked
0,1,5643,KLAS,"Las Vegas, NV",8 News Now at 5 PM,news,2016-09-09 00:12:59+00:00,2016-09-09 00:13:29+00:00,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,...,campaign,pro,2016-09-12 14:49:32,30.0,NV,0,1916,6,IThere's a race going on right approve tnow.me...,0
1,2,5643,WNCN,"Raleigh-Durham-Fayetteville, NC",North Carolina News at 500PM,news,2016-09-06 21:58:25+00:00,2016-09-06 21:58:55+00:00,PolAd_HillaryClinton_f1h3j,https://archive.org/embed/PolAd_HillaryClinton...,...,campaign,pro,2016-09-12 14:49:32,30.0,NC,0,1916,6,IThere's a race going on right approve tnow.me...,0


In [91]:
# Count rows (ad airings) for each (candidates, fact_checked) pair, then show the
# five candidates values with the most fact-checked airings.
# NOTE(review): `candidates` looks like who the ad is about rather than who
# sponsored it -- confirm before reading this as "ads run by" a candidate.
fc = (
    merge_df
    .groupby(['candidates', 'fact_checked'])['id']
    .count()
    .reset_index()
)
fc.loc[fc['fact_checked'] == 1].sort_values('id', ascending=False).head()

Unnamed: 0,candidates,fact_checked,id
46,Donald Trump,1,13826
95,Hillary Clinton,1,11764
51,"Donald Trump, Hillary Clinton",1,9100
12,Bernie Sanders,1,6960
139,Marco Rubio,1,2558


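* `Donald Trump` is the `candidates` value with the most fact-checked ad airings (13,826).
* Caveat: `candidates` appears to list who an ad is about, not who paid for it. In the metadata preview above, ads with `candidates = Donald Trump` are sponsored by Hillary for America, so this table does not show that Trump *ran* the most fact-checked ads; group by `sponsors` to answer that question.
* Some `candidates` values include both Trump and Clinton ("Donald Trump, Hillary Clinton"). Does this need cleaning? Check the ads metadata.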

In [None]:
# Break the fact-checked airings down further by message type (pro / con / mixed / unknown)
# and show the five largest (candidates, message) groups.
fc = (
    merge_df
    .groupby(['candidates', 'fact_checked', 'message'])['id']
    .count()
    .reset_index()
)
fc.loc[fc['fact_checked'] == 1].sort_values('id', ascending=False).head()

Unnamed: 0,candidates,fact_checked,message,id
61,Donald Trump,1,con,10676
71,"Donald Trump, Hillary Clinton",1,mixed,8954
130,Hillary Clinton,1,pro,6551
18,Bernie Sanders,1,unknown,5083
62,Donald Trump,1,pro,3150


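* Most fact-checked airings of ads about Trump had negative (`con`) messaging: 10,676 `con` airings versus 3,150 `pro`. Since `candidates` appears to name the ad's subject rather than its sponsor, these are largely ads *against* Trump, not ads run by him.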