In [1]:
import numpy as np
import pandas as pd

First, we load our dataset

In [2]:
# Read the 2016 presidential general-election polls from the CSV file.
polls_csv = 'data/president_general_polls_2016.csv'
df = pd.read_csv(polls_csv)
df.head(n=5)  # preview the first five rows of the dataframe

Unnamed: 0,cycle,branch,type,matchup,forecastdate,state,startdate,enddate,pollster,grade,...,adjpoll_clinton,adjpoll_trump,adjpoll_johnson,adjpoll_mcmullin,multiversions,url,poll_id,question_id,createddate,timestamp
0,2016,President,polls-plus,Clinton vs. Trump vs. Johnson,11/8/16,U.S.,11/3/2016,11/6/2016,ABC News/Washington Post,A+,...,45.20163,41.7243,4.626221,,,https://www.washingtonpost.com/news/the-fix/wp...,48630,76192,11/7/16,09:35:33 8 Nov 2016
1,2016,President,polls-plus,Clinton vs. Trump vs. Johnson,11/8/16,U.S.,11/1/2016,11/7/2016,Google Consumer Surveys,B,...,43.34557,41.21439,5.175792,,,https://datastudio.google.com/u/0/#/org//repor...,48847,76443,11/7/16,09:35:33 8 Nov 2016
2,2016,President,polls-plus,Clinton vs. Trump vs. Johnson,11/8/16,U.S.,11/2/2016,11/6/2016,Ipsos,A-,...,42.02638,38.8162,6.844734,,,http://projects.fivethirtyeight.com/polls/2016...,48922,76636,11/8/16,09:35:33 8 Nov 2016
3,2016,President,polls-plus,Clinton vs. Trump vs. Johnson,11/8/16,U.S.,11/4/2016,11/7/2016,YouGov,B,...,45.65676,40.92004,6.069454,,,https://d25d2506sfb94s.cloudfront.net/cumulus_...,48687,76262,11/7/16,09:35:33 8 Nov 2016
4,2016,President,polls-plus,Clinton vs. Trump vs. Johnson,11/8/16,U.S.,11/3/2016,11/6/2016,Gravis Marketing,B-,...,46.84089,42.33184,3.726098,,,http://www.gravispolls.com/2016/11/final-natio...,48848,76444,11/7/16,09:35:33 8 Nov 2016


Now, we take a look at the properties of our dataset

In [3]:
# Report how many rows (polls) were loaded and which columns are available.
n_polls = len(df)
column_index = df.columns
print("Number of polls: " + str(n_polls))
print("Columns: " + str(column_index))

Number of polls: 12624
Columns: Index(['cycle', 'branch', 'type', 'matchup', 'forecastdate', 'state',
       'startdate', 'enddate', 'pollster', 'grade', 'samplesize', 'population',
       'poll_wt', 'rawpoll_clinton', 'rawpoll_trump', 'rawpoll_johnson',
       'rawpoll_mcmullin', 'adjpoll_clinton', 'adjpoll_trump',
       'adjpoll_johnson', 'adjpoll_mcmullin', 'multiversions', 'url',
       'poll_id', 'question_id', 'createddate', 'timestamp'],
      dtype='object')


Now, we reduce our dataframe to only the columns relevant to our analysis
(we use only the adjusted, not the raw, poll numbers, and we drop the other candidates along with several other columns)

In [4]:
# Columns kept for the analysis: poll metadata plus the adjusted
# Clinton/Trump numbers.
categories = [
    'type',
    'state',
    'enddate',
    'pollster',
    'grade',
    'samplesize',
    'population',
    'adjpoll_clinton',
    'adjpoll_trump',
    'poll_id',
]
our_df = df.loc[:, categories]
our_df.head(n=5)

Unnamed: 0,type,state,enddate,pollster,grade,samplesize,population,adjpoll_clinton,adjpoll_trump,poll_id
0,polls-plus,U.S.,11/6/2016,ABC News/Washington Post,A+,2220.0,lv,45.20163,41.7243,48630
1,polls-plus,U.S.,11/7/2016,Google Consumer Surveys,B,26574.0,lv,43.34557,41.21439,48847
2,polls-plus,U.S.,11/6/2016,Ipsos,A-,2195.0,lv,42.02638,38.8162,48922
3,polls-plus,U.S.,11/7/2016,YouGov,B,3677.0,lv,45.65676,40.92004,48687
4,polls-plus,U.S.,11/6/2016,Gravis Marketing,B-,16639.0,rv,46.84089,42.33184,48848


We verify that there are 3 different types of polls:

In [5]:
# List the distinct values found in the 'type' column.
poll_types = our_df['type'].unique()
print(poll_types)

['polls-plus' 'now-cast' 'polls-only']


We only consider the polls of type "polls-only". (TODO: explain why.)

In [6]:
# Keep only the 'polls-only' rows and renumber them from 0 so that later
# cells can address rows by position-like labels.
is_polls_only = our_df['type'] == 'polls-only'
p_only = our_df.loc[is_polls_only].reset_index(drop=True)
p_only.head(n=5)

Unnamed: 0,type,state,enddate,pollster,grade,samplesize,population,adjpoll_clinton,adjpoll_trump,poll_id
0,polls-only,U.S.,11/6/2016,ABC News/Washington Post,A+,2220.0,lv,45.21947,41.70754,48630
1,polls-only,U.S.,11/7/2016,Google Consumer Surveys,B,26574.0,lv,43.40083,41.14659,48847
2,polls-only,U.S.,11/6/2016,Ipsos,A-,2195.0,lv,42.01984,38.74365,48922
3,polls-only,U.S.,11/7/2016,YouGov,B,3677.0,lv,45.68214,40.90047,48687
4,polls-only,U.S.,11/6/2016,Gravis Marketing,B-,16639.0,rv,46.83107,42.27754,48848


Next, we convert the `enddate` strings into datetime values. (TODO: confirm this is needed for the later analysis.)

In [7]:
# Parse the 'enddate' strings into datetime values.
# The column is replaced with plain [] assignment on purpose: on pandas >= 2.0,
# `p_only.loc[:, 'enddate'] = ...` writes into the existing object-dtype column
# in place, so the dtype would stay `object` instead of becoming datetime64.
p_only['enddate'] = pd.to_datetime(p_only['enddate'])
p_only.head()

Unnamed: 0,type,state,enddate,pollster,grade,samplesize,population,adjpoll_clinton,adjpoll_trump,poll_id
0,polls-only,U.S.,2016-11-06,ABC News/Washington Post,A+,2220.0,lv,45.21947,41.70754,48630
1,polls-only,U.S.,2016-11-07,Google Consumer Surveys,B,26574.0,lv,43.40083,41.14659,48847
2,polls-only,U.S.,2016-11-06,Ipsos,A-,2195.0,lv,42.01984,38.74365,48922
3,polls-only,U.S.,2016-11-07,YouGov,B,3677.0,lv,45.68214,40.90047,48687
4,polls-only,U.S.,2016-11-06,Gravis Marketing,B-,16639.0,rv,46.83107,42.27754,48848


We see if there are any gaps in our data:

In [8]:
# Count the missing entries in every column of the polls-only frame.
missing_per_column = p_only.isna().sum()
print("Number of empty values for each column:")
print(missing_per_column)

Number of empty values for each column:
type                 0
state                0
enddate              0
pollster             0
grade              429
samplesize           1
population           0
adjpoll_clinton      0
adjpoll_trump        0
poll_id              0
dtype: int64


There are 429 polls that are not assigned a grade. Let's see the different grade values:

In [9]:
# Distinct pollster grades, including NaN for ungraded polls.
p_only.loc[:, 'grade'].unique()

array(['A+', 'B', 'A-', 'B-', 'A', nan, 'B+', 'C+', 'C-', 'C', 'D'], dtype=object)

For simplicity, we conflate the "-" and "+" designations, resulting in 4 unique grades: A, B, C, and D. We also label the polls with "nan" grades as "No grade".

In [10]:
# Collapse the "+"/"-" modifiers onto their base letter and label ungraded polls.
# This is done in one vectorized pass instead of a row-by-row Python loop: it is
# much faster on thousands of rows and does not depend on the index being 0..n-1.
grade_map = {
    "A+": "A", "A-": "A",
    "B+": "B", "B-": "B",
    "C+": "C", "C-": "C",
}
p_only["grade"] = (
    p_only["grade"]
    .replace(grade_map)     # exact-value mapping; other grades pass through unchanged
    .fillna("No grade")     # change empty grades to "No grade"
)

p_only["grade"].unique()

array(['A', 'B', 'No grade', 'C', 'D'], dtype=object)

In [11]:
# Preview the first five rows after the grade clean-up.
p_only.head(n=5)

Unnamed: 0,type,state,enddate,pollster,grade,samplesize,population,adjpoll_clinton,adjpoll_trump,poll_id
0,polls-only,U.S.,2016-11-06,ABC News/Washington Post,A,2220.0,lv,45.21947,41.70754,48630
1,polls-only,U.S.,2016-11-07,Google Consumer Surveys,B,26574.0,lv,43.40083,41.14659,48847
2,polls-only,U.S.,2016-11-06,Ipsos,A,2195.0,lv,42.01984,38.74365,48922
3,polls-only,U.S.,2016-11-07,YouGov,B,3677.0,lv,45.68214,40.90047,48687
4,polls-only,U.S.,2016-11-06,Gravis Marketing,B,16639.0,rv,46.83107,42.27754,48848


One poll has an empty `samplesize` value. Let's look at it:

In [12]:
# Show the row(s) whose sample size is missing.
samplesize_missing = p_only['samplesize'].isna()
p_only[samplesize_missing]

Unnamed: 0,type,state,enddate,pollster,grade,samplesize,population,adjpoll_clinton,adjpoll_trump,poll_id
1801,polls-only,Illinois,2016-07-12,Basswood Research,C,,lv,49.63946,36.39749,44748


Dataframe with the results:
https://docs.google.com/spreadsheets/d/133Eb4qQmOxNvtesw2hdVns073R68EZx4SfCnP4IGQf8/htmlview

TODO: create a dataframe from these results.

Save the dataframe(s) to be used later

In [13]:
# Persist the cleaned polls-only frame so later notebooks can load it.
# `key` is passed by keyword: passing it positionally is deprecated in
# pandas 2.x and is keyword-only from pandas 3.0.
p_only.to_hdf('results/df1.h5', key='p_only')
# TODO: also save the results dataframe once it has been created:
#results.to_hdf('results/df2.h5', key='results')