# Predictions

In [1]:
# Imports
import pandas as pd
import geopandas as gpd
import numpy as np
from sklearn.model_selection import StratifiedKFold, KFold
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression


In [2]:
# Set random state for reproducibility
random_state = 199

In [3]:
# Read in geojson
bg_data = gpd.read_file("../data/block_groups.geojson").set_index("GEOID20")
t_data = gpd.read_file("../data/tracts.geojson").set_index("GEOID20")
c_data = gpd.read_file("../data/counties.geojson").set_index("GEOID20")

In [4]:
# Group demographic columns together
registered = ['total_reg']
age = ['age_18_19', 'age_20_24', 'age_25_29','age_30_34','age_35_44', 'age_45_54', 'age_55_64', 'age_65_74','age_75_84', 'age_85over']
gender = [ 'gender_m', 'gender_f', 'gender_unknown'] 
# party = ['party_npp', 'party_dem', 'party_rep','party_lib', 'party_grn', 'party_con', 'party_ain', 'party_scl','party_oth', 'party_unk']
party = ['party_npp', 'party_dem', 'party_rep','party_lib', 'party_grn', 'party_con', 'party_ain', 'party_scl','party_oth']
ethnicity1 = ['eth1_eur', 'eth1_hisp', 'eth1_aa',
                'eth1_esa', 'eth1_oth', 'eth1_unk']
languages = ['lang_english', 'lang_spanish',
                'lang_portuguese',
                'lang_chinese', 'lang_italian',
                'lang_vietnamese', 'lang_other',
                'lang_unknown']

predictors = [*registered, *age, *gender, *party, *ethnicity1, *languages, 'mean_hh_income']


In [5]:
X = bg_data[predictors]
y = bg_data["2020_turnout_pct"]

In [6]:
X

Unnamed: 0_level_0,total_reg,age_18_19,age_20_24,age_25_29,age_30_34,age_35_44,age_45_54,age_55_64,age_65_74,age_75_84,...,eth1_unk,lang_english,lang_spanish,lang_portuguese,lang_chinese,lang_italian,lang_vietnamese,lang_other,lang_unknown,mean_hh_income
GEOID20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
250010121011,1248,19,68,81,76,171,192,224,215,164,...,124,623,15,4,1,1,0,5,601,96435.738782
250010121015,833,10,39,44,59,131,105,153,158,104,...,83,391,10,2,0,1,0,5,425,94926.003601
250010121013,720,4,34,43,45,103,85,149,141,89,...,51,390,2,2,0,2,0,1,326,96467.529167
250010121014,534,1,30,35,71,98,72,101,68,41,...,55,215,2,3,0,2,0,2,314,70610.726592
250277552022,774,14,67,34,34,104,135,151,119,88,...,96,426,2,0,0,3,0,8,338,105674.436693
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
250251601032,262,5,34,41,21,35,38,45,26,13,...,16,44,39,2,0,1,0,2,176,77150.660305
250251601024,168,6,18,15,16,36,25,26,22,3,...,10,42,20,0,0,0,2,3,102,83077.303571
250250711013,339,1,15,61,70,70,41,40,29,12,...,49,85,3,0,2,0,0,4,246,104028.268437
250251803013,603,9,43,55,23,91,111,109,88,50,...,44,328,2,2,0,11,0,0,261,153424.233831


In [7]:
y

GEOID20
250010121011    0.804348
250010121015    0.782781
250010121013    0.792793
250010121014    0.654244
250277552022    0.845833
                  ...   
250251601032    0.687783
250251601024    0.655405
250250711013    0.808511
250251803013    0.881834
250250705022    0.725268
Name: 2020_turnout_pct, Length: 5078, dtype: float64

In [8]:
bg_predictions = bg_data[["2020_turnout_pct","2020_absent_pct","g20201103_reg_all","g20201103_voted_all"]].copy()

kf = KFold(n_splits=10,shuffle=True,random_state=random_state)
X.loc[:,["total_reg","mean_hh_income"]] = StandardScaler().fit_transform(X=X[["total_reg","mean_hh_income"]])

  2.29764472]' has dtype incompatible with int32, please explicitly cast to a compatible dtype first.
  X.loc[:,["total_reg","mean_hh_income"]] = StandardScaler().fit_transform(X=X[["total_reg","mean_hh_income"]])


In [9]:
linreg = LinearRegression()

for train_idx, val_idx in kf.split(X, y):
    train = X.index[train_idx]
    val = X.index[val_idx]
    linreg.fit(X.loc[train], y.loc[train])
    bg_predictions.loc[val,"2020_turnout_pct_pred"] = linreg.predict(X.loc[val])

In [10]:
# Rename columns
bg_predictions = bg_predictions.rename(columns={"g20201103_reg_all": "2020_reg", "g20201103_voted_all": "2020_turnout"})
bg_predictions["2020_absent"] = bg_predictions["2020_reg"] - bg_predictions["2020_turnout"]

# Calculate other columns
bg_predictions["2020_absent_pct_pred"] = 1 - bg_predictions["2020_turnout_pct_pred"]
bg_predictions["2020_turnout_pred"] = (bg_predictions["2020_reg"] * bg_predictions["2020_turnout_pct_pred"]).round(decimals=0).astype(int)
bg_predictions["2020_absent_pred"] = bg_predictions["2020_reg"] - bg_predictions["2020_turnout_pred"] 

In [11]:
bg_predictions

Unnamed: 0_level_0,2020_turnout_pct,2020_absent_pct,2020_reg,2020_turnout,2020_turnout_pct_pred,2020_absent,2020_absent_pct_pred,2020_turnout_pred,2020_absent_pred
GEOID20,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
250010121011,0.804348,0.195652,1150,925,0.809855,225,0.190145,931,219
250010121015,0.782781,0.217219,755,591,0.792895,164,0.207105,599,156
250010121013,0.792793,0.207207,666,528,0.790854,138,0.209146,527,139
250010121014,0.654244,0.345756,483,316,0.728162,167,0.271838,352,131
250277552022,0.845833,0.154167,720,609,0.773375,111,0.226625,557,163
...,...,...,...,...,...,...,...,...,...
250251601032,0.687783,0.312217,221,152,0.711854,69,0.288146,157,64
250251601024,0.655405,0.344595,148,97,0.737898,51,0.262102,109,39
250250711013,0.808511,0.191489,282,228,0.763997,54,0.236003,215,67
250251803013,0.881834,0.118166,567,500,0.860131,67,0.139869,488,79


In [12]:
# Aggregate to Tract
t_predictions = bg_predictions.copy()
t_predictions["tract_id"] = bg_predictions.index.str[:11]
t_predictions = t_predictions.groupby("tract_id")[["2020_reg", "2020_turnout", "2020_absent", "2020_turnout_pred", "2020_absent_pred"]].sum()
t_predictions["2020_turnout_pct_pred"] = t_predictions["2020_turnout_pred"] / t_predictions["2020_reg"]
t_predictions["2020_absent_pct_pred"] = 1 - t_predictions["2020_turnout_pct_pred"]

# Aggregate to County
c_predictions = bg_predictions.copy()
c_predictions["county_id"] = bg_predictions.index.str[:5]
c_predictions = c_predictions.groupby("county_id")[["2020_reg", "2020_turnout", "2020_absent", "2020_turnout_pred", "2020_absent_pred"]].sum()
c_predictions["2020_turnout_pct_pred"] = c_predictions["2020_turnout_pred"] / c_predictions["2020_reg"]
c_predictions["2020_absent_pct_pred"] = 1 - c_predictions["2020_turnout_pct_pred"]

In [13]:
t_predictions

Unnamed: 0_level_0,2020_reg,2020_turnout,2020_absent,2020_turnout_pred,2020_absent_pred,2020_turnout_pct_pred,2020_absent_pct_pred
tract_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25001010100,2727,2355,372,2314,413,0.848552,0.151448
25001010206,2563,2237,326,2185,378,0.852517,0.147483
25001010208,1164,1032,132,928,236,0.797251,0.202749
25001010304,2057,1855,202,1721,336,0.836655,0.163345
25001010306,2036,1834,202,1660,376,0.815324,0.184676
...,...,...,...,...,...,...,...
25027761100,2741,2121,620,2100,641,0.766144,0.233856
25027761200,2982,2544,438,2475,507,0.829980,0.170020
25027761300,2188,1732,456,1696,492,0.775137,0.224863
25027761401,2971,2720,251,2596,375,0.873780,0.126220


In [14]:
c_predictions

Unnamed: 0_level_0,2020_reg,2020_turnout,2020_absent,2020_turnout_pred,2020_absent_pred,2020_turnout_pct_pred,2020_absent_pct_pred
county_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
25001,159131,136118,23013,133338,25793,0.837913,0.162087
25003,77637,62427,15210,62243,15394,0.801718,0.198282
25005,345402,262063,83339,260603,84799,0.754492,0.245508
25007,13054,10882,2172,10308,2746,0.789643,0.210357
25009,485123,390592,94531,391569,93554,0.807154,0.192846
25011,45168,37587,7581,36675,8493,0.811969,0.188031
25013,288984,204314,84670,207478,81506,0.717957,0.282043
25015,92040,78645,13395,78667,13373,0.854704,0.145296
25017,928344,782670,145674,777594,150750,0.837614,0.162386
25019,7482,6225,1257,6005,1477,0.802593,0.197407


In [15]:
bg_joined = bg_data.merge(bg_predictions.drop(columns=["2020_turnout_pct","2020_absent_pct"]), left_on="GEOID20", right_on="GEOID20")
t_joined = t_data.merge(t_predictions, left_on="GEOID20", right_on="tract_id")
c_joined = c_data.merge(c_predictions, left_on="GEOID20", right_on="county_id")

In [16]:
# Optionally export the data for CS1710
bg_joined.to_file('../data/block_groups_pred.geojson', driver='GeoJSON')
t_joined.to_file('../data/tracts_pred.geojson', driver='GeoJSON')
c_joined.to_file('../data/counties_pred.geojson', driver='GeoJSON')