# Data Aggregation by Day

FiveThreeEight methodology:
- cap sample sizes at 5000
- if sample size isn't reported, use median sample size of polls from that polster (if no other info, use median sample size of all other polls
- sample size weighting - square root of poll's sample size / square root for median sample size for group

See `notes-p1.md` for details.

In [1]:
import pandas as pd

In [6]:
df = pd.read_csv("data-p1/polls_us_election_2016.csv")

df['startdate'] = pd.to_datetime(df['startdate'])
df['enddate'] = pd.to_datetime(df['enddate'])

df

Unnamed: 0,state,startdate,enddate,pollster,grade,samplesize,population,rawpoll_clinton,rawpoll_trump,rawpoll_johnson,rawpoll_mcmullin,adjpoll_clinton,adjpoll_trump,adjpoll_johnson,adjpoll_mcmullin
0,U.S.,2016-11-03,2016-11-06,ABC News/Washington Post,A+,2220.0,lv,47.00,43.00,4.00,,45.20163,41.72430,4.626221,
1,U.S.,2016-11-01,2016-11-07,Google Consumer Surveys,B,26574.0,lv,38.03,35.69,5.46,,43.34557,41.21439,5.175792,
2,U.S.,2016-11-02,2016-11-06,Ipsos,A-,2195.0,lv,42.00,39.00,6.00,,42.02638,38.81620,6.844734,
3,U.S.,2016-11-04,2016-11-07,YouGov,B,3677.0,lv,45.00,41.00,5.00,,45.65676,40.92004,6.069454,
4,U.S.,2016-11-03,2016-11-06,Gravis Marketing,B-,16639.0,rv,47.00,43.00,3.00,,46.84089,42.33184,3.726098,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4203,Virginia,2016-09-16,2016-09-22,Ipsos,A-,452.0,lv,46.54,40.04,,,46.47852,40.48017,,
4204,Wisconsin,2016-08-04,2016-08-07,Marquette University,A,683.0,lv,47.00,34.00,9.00,,48.74781,39.07778,4.705020,
4205,Utah,2016-11-01,2016-11-07,Google Consumer Surveys,B,286.0,lv,21.33,35.05,9.99,,26.65200,40.57738,9.705791,
4206,Oregon,2016-10-21,2016-11-02,Ipsos,A-,446.0,lv,46.46,37.41,,,45.12949,37.10720,,


In [7]:
df["grade"].value_counts()

grade
A-    1085
B     1011
C-     693
C+     329
B+     204
A      159
B-     142
A+      84
C       58
D       14
Name: count, dtype: int64

In [8]:
grade_order = ["F","D","D+","C-","C","C+","B-","B","B+","A-","A","A+"] # still included missing grades
df['grade'] = pd.Categorical(df['grade'], categories=grade_order, ordered=True)

In [9]:
df["population"].value_counts() # will probably just use likely voters, but keeping all for now in case

population
lv    3727
rv     418
v       42
a       21
Name: count, dtype: int64

In [23]:
df['state'].value_counts()

state
U.S.                    1106
Florida                  148
North Carolina           125
Pennsylvania             125
Ohio                     115
New Hampshire            112
Nevada                    93
Virginia                  91
Michigan                  86
Wisconsin                 80
Colorado                  80
Georgia                   80
Arizona                   79
California                71
Iowa                      70
Missouri                  68
Utah                      66
New York                  63
Illinois                  60
Texas                     58
Indiana                   56
Oregon                    54
New Jersey                54
Maine                     53
South Carolina            52
Washington                51
New Mexico                51
Kansas                    51
Massachusetts             51
Louisiana                 50
Kentucky                  49
Minnesota                 48
Idaho                     48
West Virginia             47
Maryland

In [10]:
df.isna().sum()

state                  0
startdate              0
enddate                0
pollster               0
grade                429
samplesize             1
population             0
rawpoll_clinton        0
rawpoll_trump          0
rawpoll_johnson     1409
rawpoll_mcmullin    4178
adjpoll_clinton        0
adjpoll_trump          0
adjpoll_johnson     1409
adjpoll_mcmullin    4178
dtype: int64

In [11]:
df = df.drop(columns=["rawpoll_johnson","rawpoll_mcmullin","adjpoll_johnson","adjpoll_mcmullin"]) # just looking at Clinton and Trump for now

In [12]:
pollster_medians = df.groupby('pollster')['samplesize'].median()
overall_median = float(df['samplesize'].median())

# imputation for sample size based on FiveThirtyEight methodology
def fill_na_with_median(row):

    if pd.isna(row['samplesize']):
        pollster_median = pollster_medians[row['pollster']]
        
        if pd.isna(pollster_median):
            return overall_median
        
        return pollster_median
    
    return row['samplesize']

df['samplesize'] = df.apply(fill_na_with_median, axis=1)

In [13]:
df['samplesizeadj'] = df['samplesize'].apply(lambda x: min(x, 5000)) # cap sample sizes at 5000

In [15]:
df.to_hdf('data-p1/polls-clean.h5', key='df', mode='w', format='t')
df.to_csv('data-p1/polls_clean.csv',index=False)

In [20]:
def split_to_individual_days(df):
    expanded_rows = []
    
    for idx, row in df.iterrows():
        day_range = pd.date_range(start=row['startdate'], end=row['enddate'])
        
        for day in day_range:
            expanded_row = row.copy()
            expanded_row['day'] = day
            expanded_rows.append(expanded_row)
    
    return pd.DataFrame(expanded_rows)

# formula from FiveThirtyEight
def weighted_average(group, value_column):
    sqrt_sample_sizes = group['samplesizeadj'] ** 0.5 
    median_sqrt_sample_size = group['samplesizeadj'].median() ** 0.5
    weights = sqrt_sample_sizes / median_sqrt_sample_size
    return (group[value_column] * weights).sum() / weights.sum()

def aggregate_by_day(df):
    agg_df = df.groupby(['pollster', 'state', 'population', 'day','grade'],dropna=False).apply(
        lambda x: pd.Series({
            'weighted_clinton': weighted_average(x, 'rawpoll_clinton'),
            'weighted_trump': weighted_average(x, 'rawpoll_trump'),
            'samplesize_total': x['samplesizeadj'].sum()
        })
    ).reset_index()
    
    agg_df["c-minus-t"] = agg_df["weighted_clinton"] - agg_df["weighted_trump"]
    return agg_df

expanded_df = split_to_individual_days(df)

df_agg = aggregate_by_day(expanded_df)

df_agg
 #20297

  agg_df = df.groupby(['pollster', 'state', 'population', 'day','grade'],dropna=False).apply(


Unnamed: 0,pollster,state,population,day,grade,weighted_clinton,weighted_trump,samplesize_total,c-minus-t
0,ABC News/Washington Post,Maryland,lv,2016-09-27,A+,63.0,27.0,706.0,36.0
1,ABC News/Washington Post,Maryland,lv,2016-09-28,A+,63.0,27.0,706.0,36.0
2,ABC News/Washington Post,Maryland,lv,2016-09-29,A+,63.0,27.0,706.0,36.0
3,ABC News/Washington Post,Maryland,lv,2016-09-30,A+,63.0,27.0,706.0,36.0
4,ABC News/Washington Post,Maryland,rv,2016-03-30,A+,63.0,28.0,752.0,35.0
...,...,...,...,...,...,...,...,...,...
20292,icitizen,U.S.,rv,2016-09-15,,42.0,37.0,1000.0,5.0
20293,icitizen,U.S.,rv,2016-09-16,,42.0,37.0,1000.0,5.0
20294,icitizen,U.S.,rv,2016-09-17,,42.0,37.0,1000.0,5.0
20295,icitizen,U.S.,rv,2016-09-18,,42.0,37.0,1000.0,5.0


In [16]:
grades = pd.Series(df.grade.values,index=df.pollster).to_dict()
df_agg['grade'] = df['pollster'].map(grades)
df_agg

Unnamed: 0,pollster,state,population,day,weighted_clinton,weighted_trump,samplesize_total,c-minus-t,grade
0,ABC News/Washington Post,Maryland,lv,2016-09-27,63.0,27.0,706.0,36.0,A+
1,ABC News/Washington Post,Maryland,lv,2016-09-28,63.0,27.0,706.0,36.0,B
2,ABC News/Washington Post,Maryland,lv,2016-09-29,63.0,27.0,706.0,36.0,A-
3,ABC News/Washington Post,Maryland,lv,2016-09-30,63.0,27.0,706.0,36.0,B
4,ABC News/Washington Post,Maryland,rv,2016-03-30,63.0,28.0,752.0,35.0,B-
...,...,...,...,...,...,...,...,...,...
20292,icitizen,U.S.,rv,2016-09-15,42.0,37.0,1000.0,5.0,
20293,icitizen,U.S.,rv,2016-09-16,42.0,37.0,1000.0,5.0,
20294,icitizen,U.S.,rv,2016-09-17,42.0,37.0,1000.0,5.0,
20295,icitizen,U.S.,rv,2016-09-18,42.0,37.0,1000.0,5.0,


In [28]:
df_agg.to_hdf('data-p1/agg_polls_by_day.h5', key='df', mode='w', format='t') # h5 format to preserve data types (categorical, datetime, etc)
df_agg.to_csv('data-p1/agg_polls_by_day.csv', index=False) # csv format in case anyone wants to use

### Electoral College

In [33]:
df_mn = df[(df['state'] == 'Maine') | (df['state'] == 'Nebraska')]
df_mn['state'] = [i.split(' ')[0] for i in df_mn['state']]
df_mn['state'] = [f'{i} State' for i in df_mn['state']]
df_mn

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mn['state'] = [i.split(' ')[0] for i in df_mn['state']]
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_mn['state'] = [f'{i} State' for i in df_mn['state']]


Unnamed: 0,state,startdate,enddate,pollster,grade,samplesize,population,rawpoll_clinton,rawpoll_trump,adjpoll_clinton,adjpoll_trump,samplesizeadj
168,Nebraska State,2016-11-01,2016-11-07,SurveyMonkey,C-,988.0,lv,35.00,52.00,33.59756,51.33007,988.0
170,Maine State,2016-10-28,2016-10-30,Emerson College,B,750.0,lv,46.00,42.00,46.20038,40.92931,750.0
199,Maine State,2016-11-01,2016-11-07,SurveyMonkey,C-,779.0,lv,46.00,38.00,44.61831,37.33969,779.0
223,Maine State,2016-10-20,2016-10-25,University of New Hampshire,B+,670.0,lv,48.00,37.00,47.81695,40.65277,670.0
274,Maine State,2016-10-24,2016-10-26,Maine People's Resource Center,C,812.0,lv,42.00,37.00,43.37541,39.14096,812.0
...,...,...,...,...,...,...,...,...,...,...,...,...
3191,Nebraska State,2016-07-29,2016-08-18,Ipsos,A-,185.0,lv,34.72,49.39,33.80489,51.23142,185.0
3202,Maine State,2016-07-29,2016-08-18,Ipsos,A-,210.0,lv,32.92,37.19,31.89776,39.14135,210.0
3251,Nebraska State,2016-08-17,2016-08-23,Google Consumer Surveys,B,174.0,lv,24.50,38.25,32.18918,48.02152,174.0
3375,Nebraska State,2016-08-10,2016-08-16,Google Consumer Surveys,B,180.0,lv,32.71,30.16,39.88624,40.50210,180.0


In [34]:
expanded_df_mn = split_to_individual_days(df_mn)
df_agg_mn = aggregate_by_day(expanded_df_mn)
df_agg_mn

  agg_df = df.groupby(['pollster', 'state', 'population', 'day','grade'],dropna=False).apply(


Unnamed: 0,pollster,state,population,day,grade,weighted_clinton,weighted_trump,samplesize_total,c-minus-t
0,Critical Insights,Maine State,lv,2016-03-04,C,43.0,34.0,610.0,9.0
1,Critical Insights,Maine State,lv,2016-03-05,C,43.0,34.0,610.0,9.0
2,Critical Insights,Maine State,lv,2016-03-06,C,43.0,34.0,610.0,9.0
3,Critical Insights,Maine State,lv,2016-03-07,C,43.0,34.0,610.0,9.0
4,Critical Insights,Maine State,lv,2016-03-08,C,43.0,34.0,610.0,9.0
...,...,...,...,...,...,...,...,...,...
599,YouGov,Nebraska State,lv,2016-11-02,B,32.3,44.9,506.0,-12.6
600,YouGov,Nebraska State,lv,2016-11-03,B,32.3,44.9,506.0,-12.6
601,YouGov,Nebraska State,lv,2016-11-04,B,32.3,44.9,506.0,-12.6
602,YouGov,Nebraska State,lv,2016-11-05,B,32.3,44.9,506.0,-12.6


In [37]:
df_agg_all = pd.concat([df_agg, df_agg_mn])
df_agg_all

Unnamed: 0,pollster,state,population,day,grade,weighted_clinton,weighted_trump,samplesize_total,c-minus-t
0,ABC News/Washington Post,Maryland,lv,2016-09-27,A+,63.0,27.0,706.0,36.0
1,ABC News/Washington Post,Maryland,lv,2016-09-28,A+,63.0,27.0,706.0,36.0
2,ABC News/Washington Post,Maryland,lv,2016-09-29,A+,63.0,27.0,706.0,36.0
3,ABC News/Washington Post,Maryland,lv,2016-09-30,A+,63.0,27.0,706.0,36.0
4,ABC News/Washington Post,Maryland,rv,2016-03-30,A+,63.0,28.0,752.0,35.0
...,...,...,...,...,...,...,...,...,...
599,YouGov,Nebraska State,lv,2016-11-02,B,32.3,44.9,506.0,-12.6
600,YouGov,Nebraska State,lv,2016-11-03,B,32.3,44.9,506.0,-12.6
601,YouGov,Nebraska State,lv,2016-11-04,B,32.3,44.9,506.0,-12.6
602,YouGov,Nebraska State,lv,2016-11-05,B,32.3,44.9,506.0,-12.6


In [36]:
electoral_college = {
    "Alabama": 9,
    "Alaska": 3,
    "Arizona": 11,
    "Arkansas": 6,
    "California": 55,
    "Colorado": 9,
    "Connecticut": 7,
    "Delaware": 3,
    "District of Columbia": 3,
    "Florida": 29,
    "Georgia": 16,
    "Hawaii": 4,
    "Idaho": 4,
    "Illinois": 20,
    "Indiana": 11,
    "Iowa": 6,
    "Kansas": 6,
    "Kentucky": 8,
    "Louisiana": 8,
    "Maine State": 2,
    "Maine CD-1": 1,
    "Maine CD-2": 1,
    "Maryland": 10,
    "Massachusetts": 11,
    "Michigan": 16,
    "Minnesota": 10,
    "Mississippi": 6,
    "Missouri": 10,
    "Montana": 3,
    "Nebraska State": 2,
    "Nebraska CD-1": 1,
    "Nebraska CD-2": 1,
    "Nebraska CD-3": 1,
    "Nevada": 6,
    "New Hampshire": 4,
    "New Jersey": 14,
    "New Mexico": 5,
    "New York": 29,
    "North Carolina": 15,
    "North Dakota": 3,
    "Ohio": 18,
    "Oklahoma": 7,
    "Oregon": 7,
    "Pennsylvania": 20,
    "Rhode Island": 4,
    "South Carolina": 9,
    "South Dakota": 3,
    "Tennessee": 11,
    "Texas": 38,
    "Utah": 6,
    "Vermont": 3,
    "Virginia": 13,
    "Washington": 12,
    "West Virginia": 5,
    "Wisconsin": 10,
    "Wyoming": 3
}


In [38]:
df_agg_all['electoral_college'] = df_agg_all['state'].map(electoral_college)
df_agg_all

Unnamed: 0,pollster,state,population,day,grade,weighted_clinton,weighted_trump,samplesize_total,c-minus-t,electoral_college
0,ABC News/Washington Post,Maryland,lv,2016-09-27,A+,63.0,27.0,706.0,36.0,10.0
1,ABC News/Washington Post,Maryland,lv,2016-09-28,A+,63.0,27.0,706.0,36.0,10.0
2,ABC News/Washington Post,Maryland,lv,2016-09-29,A+,63.0,27.0,706.0,36.0,10.0
3,ABC News/Washington Post,Maryland,lv,2016-09-30,A+,63.0,27.0,706.0,36.0,10.0
4,ABC News/Washington Post,Maryland,rv,2016-03-30,A+,63.0,28.0,752.0,35.0,10.0
...,...,...,...,...,...,...,...,...,...,...
599,YouGov,Nebraska State,lv,2016-11-02,B,32.3,44.9,506.0,-12.6,2.0
600,YouGov,Nebraska State,lv,2016-11-03,B,32.3,44.9,506.0,-12.6,2.0
601,YouGov,Nebraska State,lv,2016-11-04,B,32.3,44.9,506.0,-12.6,2.0
602,YouGov,Nebraska State,lv,2016-11-05,B,32.3,44.9,506.0,-12.6,2.0


In [39]:
df_agg.to_hdf('data-p1/electoral_college_agg_polls_by_day.h5', key='df', mode='w', format='t') # h5 format to preserve data types (categorical, datetime, etc)
df_agg.to_csv('data-p1/electoral_college_agg_polls_by_day.csv', index=False) # csv format in case anyone wants to use