In [1]:
import numpy as np
import pandas as pd

Importing the data from the previous notebook:

In [2]:
# Read the table stored under the "p_only" key of the HDF5 file written by
# the previous notebook, then preview its first rows.
p_only = pd.read_hdf("results/df1.h5", key="p_only")
p_only.head()

Unnamed: 0,type,state,enddate,pollster,grade,samplesize,population,adjpoll_clinton,adjpoll_trump,poll_id
0,polls-only,U.S.,2016-11-06,ABC News/Washington Post,A+,2220.0,lv,45.21947,41.70754,48630
1,polls-only,U.S.,2016-11-07,Google Consumer Surveys,B,26574.0,lv,43.40083,41.14659,48847
2,polls-only,U.S.,2016-11-06,Ipsos,A,2195.0,lv,42.01984,38.74365,48922
3,polls-only,U.S.,2016-11-07,YouGov,B,3677.0,lv,45.68214,40.90047,48687
4,polls-only,U.S.,2016-11-06,Gravis Marketing,B,16639.0,rv,46.83107,42.27754,48848


Let's do some bootstrap resampling on the poll data:

In [3]:
def bootstrap_resampling(data, n, seed=5003):
    """ Bootstrap resample an array of data points.

    Draws ``n`` positions uniformly at random, with replacement, from
    ``data`` and returns the elements found at those positions.

    Parameters
    ----------
    data : array-like
      data points to resample (NumPy array, pandas Series, or any
      sequence that ``np.asanyarray`` can convert)
    n : int
      desired length of resampled array
    seed : int or None, optional
      seed for the private random generator used for the draw. The
      default (5003) reproduces the draws this function produced when it
      seeded the global generator with that value. Pass a different seed
      per call, or None for fresh entropy, to get independent resamples.

    Returns
    -------
    array
      resampled data of length n; pandas input is returned as the same
      pandas type, selected by position
    """
    # A private RandomState(seed) yields the same stream as
    # np.random.seed(seed) followed by np.random.rand, but it leaves the
    # global NumPy generator untouched for the rest of the notebook.
    rng = np.random.RandomState(seed)
    resample_i = np.floor(rng.rand(n) * len(data)).astype(int)

    if hasattr(data, "iloc"):
        # pandas objects: plain [] with integers is label-based, which is
        # wrong (or raises) when the index is not 0..len-1, so index by
        # position instead.
        return data.iloc[resample_i]
    return np.asanyarray(data)[resample_i]

Let's test this function to make sure it works:

In [4]:
def test_bootstrap_resampling():
    """Check that bootstrap_resampling returns the requested number of
    draws and that every drawn value comes from the input array."""
    # 50,000 distinct data points; since this is np.arange, each value
    # equals its own position, so valid draws lie in [0, len(test_array)).
    test_array = np.arange(50000)

    # first the same length as the input, then a different length
    for n in (50000, 37389):
        test_array_resample = bootstrap_resampling(test_array, n)
        assert test_array_resample.shape == (n,)
        assert test_array_resample.min() >= 0
        assert test_array_resample.max() < len(test_array)

test_bootstrap_resampling()

Great! Now, let's use it with our Clinton and Trump poll data:

In [5]:
# Select the "adjpoll_clinton" column and show its mean as the baseline
# to compare the resampled mean against.
clinton_data = p_only.loc[:, "adjpoll_clinton"]
clinton_data.mean()

43.32251749524702

In [6]:
# Draw a bootstrap sample with as many points as the original Clinton
# series and show its mean.
clinton_resampled = bootstrap_resampling(data=clinton_data, n=len(clinton_data))
clinton_resampled.mean()

43.27061350047525

In [7]:
# Select the "adjpoll_trump" column and show its mean as the baseline
# to compare the resampled mean against.
trump_data = p_only.loc[:, "adjpoll_trump"]
trump_data.mean()

42.65442501116911

In [8]:
# Draw a bootstrap sample with as many points as the original Trump
# series and show its mean.
trump_resampled = bootstrap_resampling(data=trump_data, n=len(trump_data))
trump_resampled.mean()

42.62368282913505