# Data Aggregation by Day

FiveThirtyEight methodology:
- cap sample sizes at 5000
- if sample size isn't reported, use the median sample size of polls from that pollster (if no other info, use the median sample size of all other polls)
- sample size weighting: square root of the poll's sample size / square root of the median sample size for the group

See `notes-p1.md` for details.

In [1]:
import pandas as pd

In [2]:
# Load the raw 2016 poll table and parse both date columns as datetimes in a
# single chained expression; the last line displays the resulting frame.
df = (
    pd.read_csv("data-p1/polls_us_election_2016.csv")
    .assign(
        startdate=lambda polls: pd.to_datetime(polls['startdate']),
        enddate=lambda polls: pd.to_datetime(polls['enddate']),
    )
)

df

Unnamed: 0,state,startdate,enddate,pollster,grade,samplesize,population,rawpoll_clinton,rawpoll_trump,rawpoll_johnson,rawpoll_mcmullin,adjpoll_clinton,adjpoll_trump,adjpoll_johnson,adjpoll_mcmullin
0,U.S.,2016-11-03,2016-11-06,ABC News/Washington Post,A+,2220.0,lv,47.00,43.00,4.00,,45.20163,41.72430,4.626221,
1,U.S.,2016-11-01,2016-11-07,Google Consumer Surveys,B,26574.0,lv,38.03,35.69,5.46,,43.34557,41.21439,5.175792,
2,U.S.,2016-11-02,2016-11-06,Ipsos,A-,2195.0,lv,42.00,39.00,6.00,,42.02638,38.81620,6.844734,
3,U.S.,2016-11-04,2016-11-07,YouGov,B,3677.0,lv,45.00,41.00,5.00,,45.65676,40.92004,6.069454,
4,U.S.,2016-11-03,2016-11-06,Gravis Marketing,B-,16639.0,rv,47.00,43.00,3.00,,46.84089,42.33184,3.726098,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4203,Virginia,2016-09-16,2016-09-22,Ipsos,A-,452.0,lv,46.54,40.04,,,46.47852,40.48017,,
4204,Wisconsin,2016-08-04,2016-08-07,Marquette University,A,683.0,lv,47.00,34.00,9.00,,48.74781,39.07778,4.705020,
4205,Utah,2016-11-01,2016-11-07,Google Consumer Surveys,B,286.0,lv,21.33,35.05,9.99,,26.65200,40.57738,9.705791,
4206,Oregon,2016-10-21,2016-11-02,Ipsos,A-,446.0,lv,46.46,37.41,,,45.12949,37.10720,,


In [3]:
# Number of polls per pollster grade, most frequent grade first.
df["grade"].value_counts()

grade
A-    1085
B     1011
C-     693
C+     329
B+     204
A      159
B-     142
A+      84
C       58
D       14
Name: count, dtype: int64

In [4]:
# Full grade scale, worst to best. It also lists grades that never occur in
# this dataset ("F", "D+") so the ordering covers the whole scale.
grade_order = [
    "F", "D", "D+",
    "C-", "C", "C+",
    "B-", "B", "B+",
    "A-", "A", "A+",
]
# Store grades as an ordered categorical so they can be compared and sorted.
df['grade'] = pd.Categorical(values=df['grade'], categories=grade_order, ordered=True)

In [5]:
# Number of polls per population type. All types are kept for now, although the
# analysis will probably end up using likely voters only.
df["population"].value_counts()

population
lv    3727
rv     418
v       42
a       21
Name: count, dtype: int64

In [6]:
# Missing-value count for every column.
df.isna().sum()

state                  0
startdate              0
enddate                0
pollster               0
grade                429
samplesize             1
population             0
rawpoll_clinton        0
rawpoll_trump          0
rawpoll_johnson     1409
rawpoll_mcmullin    4178
adjpoll_clinton        0
adjpoll_trump          0
adjpoll_johnson     1409
adjpoll_mcmullin    4178
dtype: int64

In [7]:
# Only Clinton and Trump are analysed for now, so remove the raw and adjusted
# Johnson and McMullin columns.
df = df.drop(
    ["rawpoll_johnson", "rawpoll_mcmullin", "adjpoll_johnson", "adjpoll_mcmullin"],
    axis="columns",
)

In [8]:
# Median reported sample size per pollster: first fallback for a missing size.
pollster_medians = df.groupby('pollster')['samplesize'].median()
# Median over all polls: used when the pollster's own median is missing too.
overall_median = float(df['samplesize'].median())

# imputation for sample size based on FiveThirtyEight methodology
def fill_na_with_median(row):
    """Return the sample size to use for one poll row.

    A reported ``samplesize`` is returned unchanged. A missing one is replaced
    by the median for the row's pollster (looked up in the module-level
    ``pollster_medians``); if that median is itself missing, the module-level
    ``overall_median`` is returned instead.
    """
    reported = row['samplesize']
    if not pd.isna(reported):
        return reported

    fallback = pollster_medians[row['pollster']]
    return overall_median if pd.isna(fallback) else fallback

# Row-wise pass: reported sample sizes are kept, missing ones are imputed.
df['samplesize'] = df.apply(fill_na_with_median, axis=1)

In [9]:
# Cap sample sizes at 5000 (FiveThirtyEight methodology). Series.clip is
# vectorized, leaves missing values as NaN, and keeps the column's float dtype
# regardless of how many values hit the cap.
df['samplesizeadj'] = df['samplesize'].clip(upper=5000)

In [15]:
# Save the cleaned poll-level table twice: as HDF5 in table format (format='t')
# and as a CSV written without the index column.
# NOTE(review): the two file names use different separators ('polls-clean' vs
# 'polls_clean') — confirm this is intended before relying on either path.
df.to_hdf('data-p1/polls-clean.h5', key='df', mode='w', format='t')
df.to_csv('data-p1/polls_clean.csv',index=False)

In [10]:
def split_to_individual_days(df):
    """Expand each poll into one row per calendar day it was in the field.

    Parameters
    ----------
    df : pandas.DataFrame
        Poll table with ``startdate`` and ``enddate`` columns (datetimes, or
        values ``pd.to_datetime`` can parse).

    Returns
    -------
    pandas.DataFrame
        A new frame with every input column plus a trailing ``day`` column.
        A poll yields one row per day from ``startdate`` to ``enddate``
        (inclusive, in one-day steps starting at ``startdate``); every such
        row keeps the poll's original index label. A poll whose ``enddate``
        precedes its ``startdate`` yields no rows. The input is not modified.

    Raises
    ------
    ValueError
        If any ``startdate`` or ``enddate`` is missing.

    Notes
    -----
    Rows are repeated positionally rather than copied one ``Series`` at a time
    through ``iterrows``. This keeps column dtypes intact (for example an
    ordered categorical), returns the expected columns even for an empty
    input, and avoids a Python-level loop over every poll-day.
    """
    starts = pd.to_datetime(df['startdate'])
    ends = pd.to_datetime(df['enddate'])
    if starts.isna().any() or ends.isna().any():
        raise ValueError("startdate and enddate must not be missing")

    # Whole days each poll covers, endpoints included; never negative.
    days_in_field = ((ends - starts).dt.days + 1).clip(lower=0).to_numpy()

    # Position of every source row, repeated once per day in the field.
    source_pos = pd.RangeIndex(len(df)).to_numpy().repeat(days_in_field)

    # 0, 1, 2, ... within each poll: the running position minus the position
    # at which that poll's block of repeated rows begins.
    block_start = (days_in_field.cumsum() - days_in_field).repeat(days_in_field)
    day_offset = pd.RangeIndex(len(source_pos)).to_numpy() - block_start

    expanded = df.iloc[source_pos].copy()
    expanded['day'] = (
        pd.DatetimeIndex(starts.iloc[source_pos])
        + pd.to_timedelta(day_offset, unit='D')
    )
    return expanded

# formula from FiveThirtyEight
def weighted_average(group, value_column):
    """Sample-size-weighted mean of ``value_column`` within ``group``.

    Each poll's weight is ``sqrt(samplesizeadj)`` divided by the square root
    of the group's median ``samplesizeadj``. Because the same divisor applies
    to every weight, it cancels in the weighted mean; it is kept so the
    weights match the stated methodology.

    Parameters
    ----------
    group : pandas.DataFrame
        Rows to average; must contain ``samplesizeadj`` and ``value_column``.
    value_column : str
        Name of the column to average.

    Returns
    -------
    float
        The weighted mean over rows whose value is present, or NaN when no
        row has both a value and a weight.
    """
    values = group[value_column]
    sqrt_sample_sizes = group['samplesizeadj'] ** 0.5
    median_sqrt_sample_size = group['samplesizeadj'].median() ** 0.5
    weights = sqrt_sample_sizes / median_sqrt_sample_size

    # A poll with no value must not add weight to the denominator; otherwise
    # the NaN-skipping sum in the numerator would pull the average toward zero.
    weights = weights.where(values.notna())

    return (values * weights).sum() / weights.sum()

def aggregate_by_day(df):
    """Collapse day-expanded polls to one row per pollster/state/population/day.

    Parameters
    ----------
    df : pandas.DataFrame
        Day-expanded polls containing the grouping columns ``pollster``,
        ``state``, ``population`` and ``day`` plus ``rawpoll_clinton``,
        ``rawpoll_trump`` and ``samplesizeadj``.

    Returns
    -------
    pandas.DataFrame
        One row per group with the grouping columns, ``weighted_clinton`` and
        ``weighted_trump`` (from ``weighted_average``), ``samplesize_total``
        (sum of ``samplesizeadj``) and ``c-minus-t`` (Clinton minus Trump).
    """
    group_keys = ['pollster', 'state', 'population', 'day']
    value_columns = ['rawpoll_clinton', 'rawpoll_trump', 'samplesizeadj']

    # Select the value columns before apply so the applied frames do not
    # contain the grouping columns. Applying to the grouping columns is
    # deprecated in pandas and emits a warning; the result is unchanged.
    agg_df = df.groupby(group_keys)[value_columns].apply(
        lambda x: pd.Series({
            'weighted_clinton': weighted_average(x, 'rawpoll_clinton'),
            'weighted_trump': weighted_average(x, 'rawpoll_trump'),
            'samplesize_total': x['samplesizeadj'].sum()
        })
    ).reset_index()

    agg_df["c-minus-t"] = agg_df["weighted_clinton"] - agg_df["weighted_trump"]
    return agg_df

# Expand every poll to one row per day in the field, then collapse those rows
# to one per pollster/state/population/day. The last line displays the result.
expanded_df = split_to_individual_days(df)

df_agg = aggregate_by_day(expanded_df)

df_agg


  agg_df = df.groupby(['pollster', 'state', 'population', 'day']).apply(


Unnamed: 0,pollster,state,population,day,weighted_clinton,weighted_trump,samplesize_total,c-minus-t
0,ABC News/Washington Post,Maryland,lv,2016-09-27,63.0,27.0,706.0,36.0
1,ABC News/Washington Post,Maryland,lv,2016-09-28,63.0,27.0,706.0,36.0
2,ABC News/Washington Post,Maryland,lv,2016-09-29,63.0,27.0,706.0,36.0
3,ABC News/Washington Post,Maryland,lv,2016-09-30,63.0,27.0,706.0,36.0
4,ABC News/Washington Post,Maryland,rv,2016-03-30,63.0,28.0,752.0,35.0
...,...,...,...,...,...,...,...,...
20292,icitizen,U.S.,rv,2016-09-15,42.0,37.0,1000.0,5.0
20293,icitizen,U.S.,rv,2016-09-16,42.0,37.0,1000.0,5.0
20294,icitizen,U.S.,rv,2016-09-17,42.0,37.0,1000.0,5.0
20295,icitizen,U.S.,rv,2016-09-18,42.0,37.0,1000.0,5.0


In [14]:
# HDF5 in table format, chosen to preserve data types (categorical, datetime, etc.)
df_agg.to_hdf('data-p1/agg_polls_by_day.h5', key='df', mode='w', format='t')
# CSV copy without the index column, for anyone who prefers plain text
df_agg.to_csv('data-p1/agg_polls_by_day.csv', index=False)