In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random as r
from datetime import datetime
from scipy import stats

In [2]:
data = pd.read_csv('../input/update/president_polls_general_election.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [3]:
data.columns

Index(['question_id', 'poll_id', 'cycle', 'state', 'pollster_id', 'pollster',
       'sponsor_ids', 'sponsors', 'display_name', 'pollster_rating_id',
       'pollster_rating_name', 'fte_grade', 'sample_size', 'population',
       'population_full', 'methodology', 'office_type', 'seat_number',
       'seat_name', 'start_date', 'end_date', 'election_date',
       'sponsor_candidate', 'internal', 'partisan', 'tracking',
       'nationwide_batch', 'ranked_choice_reallocated', 'created_at', 'notes',
       'url', 'stage', 'race_id', 'answer', 'candidate_id', 'candidate_name',
       'candidate_party', 'pct'],
      dtype='object')

We keep only the columns that matter to our purposes here

In [4]:
col_interest = ['state', 'pollster','fte_grade', 'sample_size','methodology','start_date', 'end_date',
       'answer', 'pct']

In [5]:
data_interest = data[col_interest]

In [6]:
data_interest.head()

Unnamed: 0,state,pollster,fte_grade,sample_size,methodology,start_date,end_date,answer,pct
0,Iowa,Public Policy Polling,B,871.0,IVR/Text,11/1/20,11/2/20,Biden,49.0
1,Iowa,Public Policy Polling,B,871.0,IVR/Text,11/1/20,11/2/20,Trump,48.0
2,Pennsylvania,Susquehanna Polling & Research Inc.,C,499.0,Live Phone,11/1/20,11/2/20,Biden,48.4
3,Pennsylvania,Susquehanna Polling & Research Inc.,C,499.0,Live Phone,11/1/20,11/2/20,Trump,49.2
4,Pennsylvania,Susquehanna Polling & Research Inc.,C,499.0,Live Phone,11/1/20,11/2/20,Jorgensen,1.4


Let's see which states districts we have, and check is anything is missing

In [7]:
len(data_interest['state'].unique())

56

In [8]:
data_interest['state'].unique()

array(['Iowa', 'Pennsylvania', 'Florida', nan, 'Nebraska CD-2', 'Montana',
       'Maine', 'Maine CD-2', 'Maine CD-1', 'Arizona', 'North Carolina',
       'Texas', 'Georgia', 'Illinois', 'Michigan', 'Minnesota',
       'New Jersey', 'New York', 'Ohio', 'Wisconsin', 'Colorado',
       'California', 'Virginia', 'Alabama', 'Nevada', 'Kansas',
       'South Carolina', 'Mississippi', 'Indiana', 'Connecticut',
       'Kentucky', 'Louisiana', 'Maryland', 'Missouri', 'Oregon',
       'Tennessee', 'Washington', 'Wyoming', 'West Virginia', 'Vermont',
       'Utah', 'South Dakota', 'Rhode Island', 'Oklahoma', 'New Mexico',
       'New Hampshire', 'Nebraska', 'North Dakota', 'Massachusetts',
       'Idaho', 'Hawaii', 'Delaware', 'District of Columbia', 'Arkansas',
       'Alaska', 'Nebraska CD-1'], dtype=object)

51 + a nan (missing) + the districts of Maine and Nebraska

Let's take a look at the names of those who ran for the U.S. presidency

In [9]:
data_interest['answer'].unique()

array(['Biden', 'Trump', 'Jorgensen', 'Hawkins', 'West', 'Blankenship',
       'De La Fuente', 'Simmons', 'Pierce', 'Pence', 'Harris', 'La Riva',
       'Kennedy', 'Hornberger', 'Cuomo', 'Clinton', 'Obama', 'Amash',
       'Sanders', 'Warren', 'Bloomberg', 'Buttigieg', 'Klobuchar',
       'Gabbard', 'Steyer', 'Yang', 'Booker', 'Castro', "O'Rourke",
       'Haley', 'Bullock', 'Delaney', 'Gillibrand', 'Williamson',
       'Messam', 'Bennet', 'de Blasio', 'Winfrey', 'Inslee',
       'Hickenlooper', 'Gravel', 'Moulton', 'Rapinoe', 'Swalwell', 'Ryan',
       'Schultz', 'Brown', 'Pelosi', 'Schumer', 'Ocasio-Cortez'],
      dtype=object)

So many names. We will just focus on Biden and Trump

In [10]:
data_interest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16438 entries, 0 to 16437
Data columns (total 9 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   state        11385 non-null  object 
 1   pollster     16438 non-null  object 
 2   fte_grade    15253 non-null  object 
 3   sample_size  16436 non-null  float64
 4   methodology  16138 non-null  object 
 5   start_date   16438 non-null  object 
 6   end_date     16438 non-null  object 
 7   answer       16438 non-null  object 
 8   pct          16438 non-null  float64
dtypes: float64(2), object(7)
memory usage: 1.1+ MB


We want to make dates into date format, not just an object string

In [11]:
data_interest['end_date'] = pd.to_datetime(data_interest['end_date'])
data_interest['start_date'] = pd.to_datetime(data_interest['start_date'])
#these are the dates of interest in my analysis

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [12]:
data_interest['end_date'] - data_interest['start_date']

0       1 days
1       1 days
2       1 days
3       1 days
4       1 days
         ...  
16433   1 days
16434   1 days
16435   1 days
16436   1 days
16437   1 days
Length: 16438, dtype: timedelta64[ns]

Let's assume they are all for 1 day, so only one date of them matters, let's keep the end date

In [13]:
data_interest = data_interest.drop(columns = ['start_date'])

In [14]:
data_interest['end_date'].min()

Timestamp('2018-11-13 00:00:00')

As early as Nov 2018! We will consider the polls from mid July until first week of October. Not too early neither too close to the election date.

In [15]:
period = pd.date_range(start='7/15/2020', end='10/7/2020')
data_period = data_interest[data_interest['end_date'].isin(period)]

Let's make sure that period didn't exclude some states

In [16]:
len(data_period['state'].unique())

56

Drop rows with empty values, particularly will get rid of state nans

In [17]:
data_period = data_period.dropna()

Run this if want to ignore congressional districts

data_period = data_period.replace('Nebraska CD-1', 'Nebraska')
data_period = data_period.replace('Nebraska CD-2', 'Nebraska')
data_period = data_period.replace('Maine CD-1', 'Maine')
data_period = data_period.replace('Maine CD-2', 'Maine')

In [18]:
len(data_period['state'].unique())

55

In [19]:
# let's look at pollster grades in common

data_period['fte_grade'].unique()

array(['B', 'B-', 'A-', 'B/C', 'C+', 'C-', 'A/B', 'A+', 'A', 'B+', 'C/D',
       'D-', 'C'], dtype=object)

We will keep only the best rated which allow all states to show. As we see below, A+ alone isn't enough

In [20]:
len(data_period[data_period['fte_grade'] == 'A+'].state.unique())

17

In fact after trying, we need all those ratings present. Some states were not collected by better than D-. Here is the result if D- alone is missing

In [21]:
len(data_period[data_period['fte_grade'].isin(['A+','A','A-','A/B','B+','B','B-','B/C',
                           'C+','C','C-','C/D'])].state.unique())

47

In [22]:
len(data_period[data_period['fte_grade'].isin(['A+','A','A-','A/B','B+','B','B-','B/C',
                           'C+','C','C-','C/D','D-'])].state.unique())

55

My idea is, when investigating a state, I use the best avaliable quality pollster for it

In what follows, we build a function which when given a state and condadate, it returns a percentage averaging over the best pollsters over the time period.

In [23]:
def poll_best(state, date, candidate): 
#returns a table of data at best pollster grade at the date input for the candidate input
    B = data_period[data_period['answer'] == candidate].drop(columns=['answer'])
    for rating in ['A+', 'A', 'A-','A/B', 'B+', 'B', 'B-', 'B/C', 'C+', 'C', 'C-', 'C/D',
        'D-']: # we ordered them
        if state in B[B['fte_grade'] == rating].state.unique():
            B_rating = B[B['fte_grade'] == rating]
            B_rating_state = B_rating[B_rating['state'] == state]
            return B_rating_state[B_rating_state['end_date'] == date]
            break
    
def prob(state, candidate):
    Sample_size = []
    Pct = []
    for date in period:
        subtable = poll_best(state, date, candidate)
        Sample_size = np.append(Sample_size,subtable['sample_size'])
        Pct = np.append(Pct, subtable['pct'])
    return np.sum(Sample_size), np.sum(np.multiply(Sample_size,Pct))/np.sum(Sample_size)

Test Example

In [24]:
prob('Florida','Biden')

(5185.0, 47.955641272902604)

To clarify how the value was obtained, take a look at the table below. Those are all the highest grade polls for Florida over the period of time. You can see how for the same pollster, same method, and at the same time,  we do sometimes have multiple values for the percentage and the sample size. Hence, we are taking average by weighting each percentage with the size of the corresponding same, i.e., $$\frac{samplesize^T pct}{\sum samplesize}$$

In [25]:
for date in period:
    if len(poll_best('Florida', date,'Biden'))!=0:
        display(poll_best('Florida', date,'Biden'))

Unnamed: 0,state,pollster,fte_grade,sample_size,methodology,end_date,pct
8547,Florida,Marist College,A+,1047.0,Live Phone,2020-09-06,47.0
8549,Florida,Marist College,A+,766.0,Live Phone,2020-09-06,48.0


Unnamed: 0,state,pollster,fte_grade,sample_size,methodology,end_date,pct
8218,Florida,Monmouth University,A+,428.0,Live Phone,2020-09-13,50.0
8222,Florida,Monmouth University,A+,428.0,Live Phone,2020-09-13,50.0
8224,Florida,Monmouth University,A+,428.0,Live Phone,2020-09-13,49.0


Unnamed: 0,state,pollster,fte_grade,sample_size,methodology,end_date,pct
7877,Florida,ABC News/The Washington Post,A+,765.0,Live Phone,2020-09-20,48.0
7879,Florida,ABC News/The Washington Post,A+,613.0,Live Phone,2020-09-20,47.0


Unnamed: 0,state,pollster,fte_grade,sample_size,methodology,end_date,pct
6987,Florida,Siena College/The New York Times Upshot,A+,710.0,Live Phone,2020-10-01,47.0


Now we are able to creat a vector of probabilities for all states for every candiadate. We record only Biden vs Trump, and lump the rest as Other.

In [26]:
for state in data_period['state'].unique():
    print(state, prob(state,'Biden')[1])

Georgia 45.9587734241908
Florida 47.955641272902604
Michigan 52.0
Montana 42.0
Hawaii 58.51655629139073
Arizona 48.69341035515618
Missouri 43.2
Texas 43.0
Louisiana 42.57142857142857
Iowa 45.81866591928251
Ohio 45.0
Nevada 47.1764705882353
Maine 55.0
Maine CD-1 54.870000000000005
Maine CD-2 47.0
Minnesota 54.329896907216494
Colorado 50.0
West Virginia 39.0
North Carolina 48.10445580715851
Pennsylvania 51.51556548009983
New Jersey 57.89
Wisconsin 51.20160213618158
Nebraska CD-2 48.0
Utah 35.64405724953329
Alaska 44.0
New Hampshire 45.0
New Mexico 54.0
Maryland 61.4732872407291
Kansas 41.0
Wyoming 29.0463924963925
Washington 62.0
Vermont 55.73
Virginia 51.5
Tennessee 40.17987208598244
South Dakota 37.209625121713735
South Carolina 44.83499667332003
Rhode Island 59.138214285714284
Oregon 54.26417704011065
Oklahoma 35.38656924147529
New York 61.0
Nebraska 44.09947553988541
North Dakota 37.0
Mississippi 41.5
Massachusetts 69.44
Kentucky 39.31548480463097
Indiana 38.0
Illinois 52.9
Idaho 34.

**Beauty!**

In [27]:
Biden_probs = [prob(state,'Biden')[1] for state in data_period['state'].unique()]
Trump_probs = [prob(state,'Trump')[1] for state in data_period['state'].unique()]
Samp_size = [prob(state,'Trump')[0] for state in data_period['state'].unique()]

In [28]:
Bp = pd.DataFrame(Biden_probs, index = None)
Tp = pd.DataFrame(Trump_probs, index = None)
Sz = pd.DataFrame(Samp_size, index = None)

Build a pandas data frame to capture the outcomes and save it as csv in case we want to use it in another notebook

In [29]:
data = [pd.DataFrame(data_period['state'].unique()), Sz, Bp, Tp]
headers = ['state', 'sample_size', 'Biden', 'Trump' ]
Table =  pd.concat(data , axis=1, keys=headers)

In [30]:
Table.head()

Unnamed: 0_level_0,state,sample_size,Biden,Trump
Unnamed: 0_level_1,0,0,0,0
0,Georgia,2935.0,45.958773,47.602385
1,Florida,5185.0,47.955641,46.725169
2,Michigan,1881.0,52.0,43.424774
3,Montana,625.0,42.0,49.0
4,Hawaii,1963.0,58.516556,28.496689


In [31]:
np.shape(Table)

(55, 4)

Table.to_csv('probs_wthSampleSize.csv', index = False)

Now we build a function that returns state-win probabilities for Biden using bootstrapping.

In [32]:
def Return_prob_Biden(num_sim, sample_size, percs):
    #num_sim is number of scenarios
    #percs is the vector of probabilities 0: p_Biden 0, 1: p_Trump, 2:p_Other based on the percentages
    #we obtained earlier
    sim_polls = np.random.choice([0,1,2], (num_sim, sample_size),percs)
    return np.mean(np.sum(sim_polls == np.zeros((num_sim, sample_size)),axis = 1)> 
                   np.sum(sim_polls == np.ones((num_sim, sample_size)),axis = 1))

Now let's use it to calculate the probability of a Biden win at each state

In [33]:
#builds a table that combines states and corresponding Biden win probability
#predicted using our simulation function

num_sim = 20000
Biden_win_probs = []

for i in range(55):
    precs = [Biden_probs[i]/100, Trump_probs[i]/100, 1-(Biden_probs[i]+ Trump_probs[i])/100]
    Biden_win_probs=np.append(Biden_win_probs,Return_prob_Biden(num_sim, int(Samp_size[i]), precs))

In [34]:
Biden_win_probs

array([0.49645, 0.4957 , 0.49345, 0.48795, 0.498  , 0.49565, 0.49555,
       0.49485, 0.49635, 0.4936 , 0.49485, 0.48705, 0.491  , 0.4894 ,
       0.4905 , 0.49325, 0.4894 , 0.4935 , 0.497  , 0.4979 , 0.48805,
       0.49385, 0.4915 , 0.5007 , 0.4917 , 0.48115, 0.4951 , 0.49315,
       0.49275, 0.4963 , 0.4923 , 0.48945, 0.4896 , 0.49555, 0.49685,
       0.49825, 0.4915 , 0.4967 , 0.4957 , 0.48955, 0.48835, 0.49455,
       0.4934 , 0.4911 , 0.49505, 0.4906 , 0.4929 , 0.488  , 0.48475,
       0.50075, 0.4976 , 0.49365, 0.4977 , 0.4943 , 0.4842 ])

Beauty

Biden_pandas = pd.DataFrame(Biden_win_probs, index = None)
Biden_pandas.to_csv('Biden_vector.csv', index = False)

We filled the table with a electoral-votes column on Excel. Now we are ready to mimic the Eelectoral College as 55 biased coins

In [35]:
Table_Biden = pd.read_csv('../input/biden-win-probs/Biden_vector.csv', header = None).values

Now we find a Monte Carlo estimate of the probability of Biden win

In [36]:
n = 10000 # number of simulations
u = np.random.uniform(0,1,(n, 55))
Probss = np.repeat(Table_Biden[:,0].reshape(1,55),n, axis = 0)
Votess = np.repeat(Table_Biden[:,2].reshape(1,55),n, axis = 0)
eVotesSample = np.sum(np.multiply(Votess,(Probss - u)>0), axis = 1)
Probability = np.mean(eVotesSample >= 270)

In [37]:
Probability

0.4619