In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import random as r
from datetime import datetime
from scipy import stats

In [3]:
data = pd.read_csv('../input/update/president_polls_general_election.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [4]:
data.columns

Index(['question_id', 'poll_id', 'cycle', 'state', 'pollster_id', 'pollster',
       'sponsor_ids', 'sponsors', 'display_name', 'pollster_rating_id',
       'pollster_rating_name', 'fte_grade', 'sample_size', 'population',
       'population_full', 'methodology', 'office_type', 'seat_number',
       'seat_name', 'start_date', 'end_date', 'election_date',
       'sponsor_candidate', 'internal', 'partisan', 'tracking',
       'nationwide_batch', 'ranked_choice_reallocated', 'created_at', 'notes',
       'url', 'stage', 'race_id', 'answer', 'candidate_id', 'candidate_name',
       'candidate_party', 'pct'],
      dtype='object')

We keep only the columns that matter to our purposes here

In [4]:
col_interest = ['state', 'pollster','fte_grade', 'sample_size','start_date', 'end_date',
       'answer', 'pct']

In [5]:
data_interest = data[col_interest]

In [7]:
data_interest.head()

Unnamed: 0,state,pollster,fte_grade,sample_size,start_date,end_date,answer,pct
0,Iowa,Public Policy Polling,B,871.0,11/1/20,11/2/20,Biden,49.0
1,Iowa,Public Policy Polling,B,871.0,11/1/20,11/2/20,Trump,48.0
2,Pennsylvania,Susquehanna Polling & Research Inc.,C,499.0,11/1/20,11/2/20,Biden,48.4
3,Pennsylvania,Susquehanna Polling & Research Inc.,C,499.0,11/1/20,11/2/20,Trump,49.2
4,Pennsylvania,Susquehanna Polling & Research Inc.,C,499.0,11/1/20,11/2/20,Jorgensen,1.4


Let's see which states districts we have, and check is anything is missing

In [8]:
len(data_interest['state'].unique())

56

In [9]:
data_interest['state'].unique()

array(['Iowa', 'Pennsylvania', 'Florida', nan, 'Nebraska CD-2', 'Montana',
       'Maine', 'Maine CD-2', 'Maine CD-1', 'Arizona', 'North Carolina',
       'Texas', 'Georgia', 'Illinois', 'Michigan', 'Minnesota',
       'New Jersey', 'New York', 'Ohio', 'Wisconsin', 'Colorado',
       'California', 'Virginia', 'Alabama', 'Nevada', 'Kansas',
       'South Carolina', 'Mississippi', 'Indiana', 'Connecticut',
       'Kentucky', 'Louisiana', 'Maryland', 'Missouri', 'Oregon',
       'Tennessee', 'Washington', 'Wyoming', 'West Virginia', 'Vermont',
       'Utah', 'South Dakota', 'Rhode Island', 'Oklahoma', 'New Mexico',
       'New Hampshire', 'Nebraska', 'North Dakota', 'Massachusetts',
       'Idaho', 'Hawaii', 'Delaware', 'District of Columbia', 'Arkansas',
       'Alaska', 'Nebraska CD-1'], dtype=object)

51 + a nan (missing) + the districts of Maine and Nebraska

Let's take a look at the names of those who ran for the U.S. presidency

In [10]:
data_interest['answer'].unique()

array(['Biden', 'Trump', 'Jorgensen', 'Hawkins', 'West', 'Blankenship',
       'De La Fuente', 'Simmons', 'Pierce', 'Pence', 'Harris', 'La Riva',
       'Kennedy', 'Hornberger', 'Cuomo', 'Clinton', 'Obama', 'Amash',
       'Sanders', 'Warren', 'Bloomberg', 'Buttigieg', 'Klobuchar',
       'Gabbard', 'Steyer', 'Yang', 'Booker', 'Castro', "O'Rourke",
       'Haley', 'Bullock', 'Delaney', 'Gillibrand', 'Williamson',
       'Messam', 'Bennet', 'de Blasio', 'Winfrey', 'Inslee',
       'Hickenlooper', 'Gravel', 'Moulton', 'Rapinoe', 'Swalwell', 'Ryan',
       'Schultz', 'Brown', 'Pelosi', 'Schumer', 'Ocasio-Cortez'],
      dtype=object)

So many names. We will just focus on Biden and Trump

In [11]:
data_interest.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16438 entries, 0 to 16437
Data columns (total 8 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   state        11385 non-null  object 
 1   pollster     16438 non-null  object 
 2   fte_grade    15253 non-null  object 
 3   sample_size  16436 non-null  float64
 4   start_date   16438 non-null  object 
 5   end_date     16438 non-null  object 
 6   answer       16438 non-null  object 
 7   pct          16438 non-null  float64
dtypes: float64(2), object(6)
memory usage: 1.0+ MB


We want to make dates into date format, not just an object string

In [6]:
data_interest['end_date'] = pd.to_datetime(data_interest['end_date'])
data_interest['start_date'] = pd.to_datetime(data_interest['start_date'])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  """Entry point for launching an IPython kernel.
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [7]:
data_interest['end_date'] - data_interest['start_date']

0       1 days
1       1 days
2       1 days
3       1 days
4       1 days
         ...  
16433   1 days
16434   1 days
16435   1 days
16436   1 days
16437   1 days
Length: 16438, dtype: timedelta64[ns]

Let's assume they are all for 1 day, so only one date of them matters, let's keep the end date

In [8]:
data_interest = data_interest.drop(columns = ['start_date'])

In [9]:
data_interest['end_date'].min()

Timestamp('2018-11-13 00:00:00')

As early as Nov 2018! We will consider the polls from Oct 12, 2020, until Nov 2

In [10]:
period = pd.date_range(start='7/15/2020', end='11/2/2020')
data_period = data_interest[data_interest['end_date'].isin(period)]

In [11]:
data_period['end_date'].max()-data_period['end_date'].min()

Timedelta('110 days 00:00:00')

Let's make sure that period didn't exclude some states

In [30]:
len(data_period['state'].unique())

56

Drop rows with empty values, particularly will get rid of state nans

In [12]:
data_period = data_period.dropna()

Run this if want to ignore congressional districts

data_period = data_period.replace('Nebraska CD-1', 'Nebraska')
data_period = data_period.replace('Nebraska CD-2', 'Nebraska')
data_period = data_period.replace('Maine CD-1', 'Maine')
data_period = data_period.replace('Maine CD-2', 'Maine')

In [19]:
len(data_period['state'].unique())

55

In [20]:
# let's look at pollster grades in common

data_period['fte_grade'].unique()

array(['B', 'C', 'B-', 'C-', 'D-', 'C+', 'B/C', 'A+', 'B+', 'A-', 'A/B',
       'C/D', 'A'], dtype=object)

In what follows, we build two functions that give weights to pollsters, and to dates. Bigger weights for best pollsters and recent dates.

Before we proceed, we want to clarify to the reader how the data looks like.

In [21]:
def poll_best(state, date, candidate): 
#returns a table of data at best pollster grade at the date input for the candidate input
    B = data_period[data_period['answer'] == candidate].drop(columns=['answer'])
    for rating in ['A+', 'A', 'A-','A/B', 'B+', 'B', 'B-', 'B/C', 'C+', 'C', 'C-', 'C/D',
        'D-']: # we ordered them
        if state in B[B['fte_grade'] == rating].state.unique():
            B_rating = B[B['fte_grade'] == rating]
            B_rating_state = B_rating[B_rating['state'] == state]
            return B_rating_state[B_rating_state['end_date'] == date]
            break

In [22]:
for date in period:
    if len(poll_best('Florida', date,'Biden'))!=0:
        display(poll_best('Florida', date,'Biden'))

Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
8547,Florida,Marist College,A+,1047.0,2020-09-06,47.0
8549,Florida,Marist College,A+,766.0,2020-09-06,48.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
8218,Florida,Monmouth University,A+,428.0,2020-09-13,50.0
8222,Florida,Monmouth University,A+,428.0,2020-09-13,50.0
8224,Florida,Monmouth University,A+,428.0,2020-09-13,49.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
7877,Florida,ABC News/The Washington Post,A+,765.0,2020-09-20,48.0
7879,Florida,ABC News/The Washington Post,A+,613.0,2020-09-20,47.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
6987,Florida,Siena College/The New York Times Upshot,A+,710.0,2020-10-01,47.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
1849,Florida,Marist College,A+,743.0,2020-10-27,51.0
1851,Florida,Marist College,A+,1001.0,2020-10-27,51.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
1586,Florida,Monmouth University,A+,509.0,2020-10-28,50.0
1590,Florida,Monmouth University,A+,509.0,2020-10-28,51.0
1593,Florida,Monmouth University,A+,509.0,2020-10-28,50.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
1281,Florida,ABC News/The Washington Post,A+,915.0,2020-10-29,47.0
1285,Florida,ABC News/The Washington Post,A+,824.0,2020-10-29,48.0


Unnamed: 0,state,pollster,fte_grade,sample_size,end_date,pct
656,Florida,Siena College/The New York Times Upshot,A+,1451.0,2020-10-31,47.0


The table above shows the highest grade polls for Florida over the period of time. You can see how for the same pollster, same method, and at the same time, we do sometimes have multiple values for the percentage and the sample size. Hence, we are taking average by weighting each percentage with the size of the corresponding sample and according to the dates as will be made clear below. In essence,
$$\frac{weight^Tpct}{\sum weight}$$

Now we proceed to our data assimilation funtions. First, it will be easier if we replace grade symbols by numerical values. Also this will help defining the functions in a way that works vectorially

In [13]:
data_period = data_period.replace('A+', 0)
data_period = data_period.replace('A', 1)
data_period = data_period.replace('A-', 2)
data_period = data_period.replace('A/B', 3)
data_period = data_period.replace('B+', 4)
data_period = data_period.replace('B', 5)
data_period = data_period.replace('B-', 6)
data_period = data_period.replace('B/C', 7)
data_period = data_period.replace('C+', 8)
data_period = data_period.replace('C', 9)
data_period = data_period.replace('C-', 10)
data_period = data_period.replace('C/D', 11)
data_period = data_period.replace('D-', 12)

In [14]:
def w_poll(grade):
    return 1 - grade/15  # just a suitable denominator as grades take numbers up to 12

# similarly for the dates
end_p = period.max()
d_date = 4+len(period)
def w_date(date):
    return 1 - (end_p - date).dt.days/ d_date

# Remark: we defined the variable end_p, d_date to make the code run faster and avoid computations within funcitons

In what follows, we build a function which when given a state and condidate, it returns a percentage based on a weighted average over the pollsters and previous dates within the time period according.

In [25]:
def Assimilate1(state, candidate):
    # look at the following subtable
    C = data_period[data_period['answer'] == candidate]
    C = C[C['state'] == state]
    wd = w_date(C['end_date']) # weights of dates
    wg = w_poll(C['fte_grade']) # wights of grades
    wt = np.multiply(C['sample_size'], np.multiply(wd,wg)) # total weight
    return np.sum(C['sample_size']), np.dot(C['pct'],wt)/np.sum(wt)
# we return total sample size to use it when bootstrapping

In [26]:
Assimilate1('Florida','Biden')

(701592.0, 49.224292359706354)

Actually, the following alternative is more helpful

In [27]:
def Assimilate2(state):
    # look at the following subtable
    C = data_period[data_period['state'] == state]
    CB = C[C['answer'] == 'Biden']
    wd = w_date(CB['end_date']) # weights of dates
    wg = w_poll(CB['fte_grade']) # wights of grades
    wt = np.multiply(CB['sample_size'], np.multiply(wd,wg)) # total weight
    pB = 0.01*np.dot(CB['pct'],wt)/np.sum(wt)
    
    CT = C[C['answer'] == 'Trump']
    pT = 0.01*np.dot(CT['pct'],wt)/np.sum(wt)
    
    return np.sum(CB['sample_size']), pB,pT, 1-pT-pB
# we return total sample size to use it when bootstrapping

In [28]:
Assimilate2('Florida')

(701592.0, 0.4922429235970635, 0.4765304785633113, 0.03122659783962517)

Gorgeous!

Now let's build a function that runs simulations and predicts Biden win probability using MC method

In [29]:
def Return_prob_Biden(num_sim, state):
    #num_sim is number of scenarios
    #percs is the vector of probabilities 0: p_Biden 0, 1: p_Trump, 2:p_Other based on the assimilated percentages
    sample_size,p0,p1,p2 = Assimilate2(state)
    sz = int(sample_size)
    sim_polls = np.random.choice([0,1,2], (num_sim, sz),[p0,p1,p2])
    return np.mean(np.sum(sim_polls == np.zeros((num_sim, sz)),axis = 1)> 
                   np.sum(sim_polls == np.ones((num_sim, sz)),axis = 1))

In [41]:
%%timeit
Return_prob_Biden(1000, 'Florida')

18.1 s ± 94.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


This takes forever, maybe we should abandon some grades? Another thing to think about, does that weighting process affect big states? This can happen if there is a big state for which the only available pollsters have low grade, hence low weight. Let's take a look which states are available when we consider only top quality pollsters.

In [30]:
data_period[data_period['fte_grade'] < 5].state.unique() # states getting weights above 0.7

array(['Arizona', 'Pennsylvania', 'Ohio', 'Florida', 'Michigan', 'Iowa',
       'Nevada', 'Maine', 'Maine CD-1', 'Maine CD-2', 'North Carolina',
       'Georgia', 'Texas', 'Wisconsin', 'Nebraska CD-2', 'Massachusetts',
       'New Mexico', 'Minnesota', 'New Hampshire', 'South Dakota',
       'Kansas', 'Montana', 'Virginia', 'California', 'Kentucky',
       'South Carolina', 'Alaska', 'Indiana', 'New Jersey', 'Washington',
       'Colorado', 'New York'], dtype=object)

Alright, the heavy states all there. Nothing to worry about using that weighting system

Now to our invented strategy which picks the best pollster

In [31]:
# a slightly modified date weighting function to work on non-vectors of date
def w_date1(date):
    return 1 - (end_p - date).days/ d_date

def Assimilate3(state): 
# returns assimilated probability
    Sample_size = 0 # to total sample size 
    W = 0 # to total the weights
    PB = 0 # to total weight^T*perc for Biden
    PT = 0
    for date in period:
        for grade in range(13):
            # check if the state and date have a pollster with quality=grade
            if state in data_period[data_period['fte_grade'] == grade].state.unique():
                    if date in data_period[data_period['fte_grade'] == grade].end_date.unique():
                        Best = data_period[data_period['fte_grade'] == grade]
                        Best = Best[Best['state'] == state]
                        Best = Best[Best['end_date'] == date] # subtable at date, state, best available grade
                        pB = Best[Best['answer'] == 'Biden']['pct'] # vector of Biden percentages
                        pT = Best[Best['answer'] == 'Trump']['pct'] # vector of Trump percentages
                        Sz = Best[Best['answer'] == 'Biden']['sample_size'] # corresponding sample sizes

                        SZ = np.sum(Sz)
                        Sample_size += SZ
                        W += w_date1(date)*SZ # we use both sample size and date to set a weight
                        PB += w_date1(date)*0.01*np.dot(Sz,pB)
                        PT += w_date1(date)*0.01*np.dot(Sz,pT)
                        break
    return Sample_size, PB/W, PT/W

In [95]:
Assimilate3('Florida')

(26698.0, 0.48882099810265045, 0.4658498926833317)

Sample size is still huge

Let's consider maximum 3 weeks ahead of the last available date

In [32]:
from datetime import timedelta

In [33]:
# a slightly modified date weighting function to suit more the short time period
def w_date2(date):
    return 1 - (end_p - date).days/ 7

delta = timedelta(days = 3)
def Assimilate4(state): 
# returns assimilated probability
    Sample_size = 0 # to total sample size 
    W = 0 # to total the weights
    PB = 0 # to total weight^T*perc for Biden
    PT = 0
    for dd in period:
        if state in data_period[data_period['end_date'] == dd].state.unique():
            for date in pd.date_range(start=dd-delta, end=dd):
                for grade in range(13):
                    # check if the state and date have a pollster with quality=grade
                    if state in data_period[data_period['fte_grade'] == grade].state.unique():
                            if date in data_period[data_period['fte_grade'] == grade].end_date.unique():
                                Best = data_period[data_period['fte_grade'] == grade]
                                Best = Best[Best['state'] == state]
                                Best = Best[Best['end_date'] == date] # subtable at date, state, best available grade
                                pB = Best[Best['answer'] == 'Biden']['pct'] # vector of Biden percentages
                                pT = Best[Best['answer'] == 'Trump']['pct'] # vector of Trump percentages
                                Sz = Best[Best['answer'] == 'Biden']['sample_size'] # corresponding sample sizes

                                SZ = np.sum(Sz)
                                Sample_size += SZ
                                W += w_date2(date)*SZ # we use both sample size and date to set a weight
                                PB += w_date2(date)*0.01*np.dot(Sz,pB)
                                PT += w_date2(date)*0.01*np.dot(Sz,pT)
                                break
                break
    return Sample_size, PB/W, PT/W

In [117]:
Assimilate4('Florida')

(9720.0, 0.49416968715648124, 0.4519308357348704)

Let's re-define our bootstrapping fn to implement Assimilate4

In [34]:
def Return_prob_Biden1(num_sim, state):
    #num_sim is number of scenarios
    #percs is the vector of probabilities 0: p_Biden 0, 1: p_Trump, 2:p_Other based on the assimilated percentages
    sample_size,p0,p1 = Assimilate4(state)
    sz = int(sample_size)
    sim_polls = np.random.choice([0,1,2], (num_sim, sz),[p0,p1,1-p0-p1])
    return np.mean(np.sum(sim_polls == np.zeros((num_sim, sz)),axis = 1)> 
                   np.sum(sim_polls == np.ones((num_sim, sz)),axis = 1))

In [128]:
%%timeit
Return_prob_Biden1(50000, 'Florida')

14.8 s ± 2.8 s per loop (mean ± std. dev. of 7 runs, 1 loop each)


Now we assimilate only based on the most recent date available, and best grade available for that date

In [15]:
def Assimilate5(state): 
    date = data_period[data_period['state'] == state].end_date.max() # last date for the state
    stata = data_period[data_period['state'] == state]
    grade = stata[stata['end_date'] == date].fte_grade.min() # best grade available
    stata = stata[stata['fte_grade'] == grade]
    stataB = stata[stata['answer']=='Biden']
    stataT = stata[stata['answer']=='Trump']
    sz = stataB.sample_size
    szz = np.sum(sz)
    pB = 0.01*stataB.pct
    pT = 0.01*stataT.pct
    return szz, np.dot(pB,sz)/szz, np.dot(pT,sz)/szz

In [148]:
Assimilate5('Florida')

(9264.0, 0.49012629533678764, 0.46509067357512957)

In [16]:
def Return_prob_Biden2(num_sim, state):
    #num_sim is number of scenarios
    #percs is the vector of probabilities 0: p_Biden 0, 1: p_Trump, 2:p_Other based on the assimilated percentages
    sample_size,p0,p1 = Assimilate5(state)
    sz = int(sample_size)
    sim_polls = np.random.choice([0,1,2], (num_sim, sz),[p0,p1,1-p0-p1])
    return np.mean(np.sum(sim_polls == np.zeros((num_sim, sz)),axis = 1)> 
                   np.sum(sim_polls == np.ones((num_sim, sz)),axis = 1))

In [153]:
Return_prob_Biden2(10000, 'Florida')

0.497

Now we are able to creat a vector of probabilities for all states for every candiadate. We record only Biden vs Trump, and lump the rest as Other.

In [None]:
for state in data_period['state'].unique():
    print(state, Return_prob_Biden2(10000, state))

Iowa 0.4945
Pennsylvania 0.5049
Florida 0.4906
Nebraska CD-2 0.4852
Montana 0.4873
Maine 0.4903
Maine CD-2 0.4954
Maine CD-1 0.4958
Arizona 0.4944


Thinking of doing it 5 states at a time

In [37]:
SS = data_period['state'].unique()

In [38]:
for s in range(5):
    print(SS[s], Return_prob_Biden2(10000, SS[s]))

Iowa 0.4965
Pennsylvania 0.4991
Florida 0.4985
Nebraska CD-2 0.493
Montana 0.4862


In [None]:
for s in range(5,10):
    print(SS[s], Return_prob_Biden2(10000, SS[s]))

Maine 0.4877
Maine CD-2 0.4844
Maine CD-1 0.4931
Arizona 0.4859


Crashed again. Ok, let's do 5000 simulations

In [None]:
for state in data_period['state'].unique():
    print(state, Return_prob_Biden2(5000, state))

Iowa 0.494
Pennsylvania 0.4862
Florida 0.4892
Nebraska CD-2 0.4884
Montana 0.4974
Maine 0.4888
Maine CD-2 0.4864
Maine CD-1 0.477
Arizona 0.5012
