This notebook was used to 1) test a function for splitting the training and validation splitting, and 2) to create the fraction of elections voted in feature.

In [1]:
data_path = '/Volumes/FileStorage/Insight_data/'
ohio_path = '/Volumes/FileStorage/Insight_data/Ohio_data/data_feather/'

In [2]:
import numpy as np
import pandas as pd
import pickle
import feather
from sklearn.model_selection import train_test_split

In [3]:
oh1_df = feather.read_dataframe(ohio_path+'SWVF_1_22.feather')
oh2_df = feather.read_dataframe(ohio_path+'SWVF_23_44.feather')
oh3_df = feather.read_dataframe(ohio_path+'SWVF_45_66.feather')
oh4_df = feather.read_dataframe(ohio_path+'SWVF_67_88.feather')

In [4]:
#https://stackoverflow.com/questions/38250710/how-to-split-data-into-3-sets-train-validation-and-test
def train_validate_test_split(df, train_percent=.6, validate_percent=.2, seed=None):
    np.random.seed(seed)
    perm = np.random.permutation(df.index)
    m = len(df.index)
    train_end = int(train_percent * m)
    validate_end = int(validate_percent * m) + train_end
    train = df.iloc[perm[:train_end]]
    validate = df.iloc[perm[train_end:validate_end]]
    test = df.iloc[perm[validate_end:]]
    return train, validate, test

In [5]:
oh_df = pd.concat([oh1_df,oh2_df,oh3_df,oh4_df])

In [6]:
oh_train, oh_validate, oh_test = train_validate_test_split(oh_df,seed=1)

In [7]:
oh_train.head()

Unnamed: 0,SOS_VOTERID,COUNTY_NUMBER,COUNTY_ID,LAST_NAME,FIRST_NAME,MIDDLE_NAME,SUFFIX,DATE_OF_BIRTH,REGISTRATION_DATE,VOTER_STATUS,...,GENERAL-06/07/2016,PRIMARY-09/13/2016,GENERAL-11/08/2016,PRIMARY-05/02/2017,PRIMARY-09/12/2017,GENERAL-11/07/2017,PRIMARY-05/08/2018,GENERAL-08/07/2018,GENERAL-11/06/2018,PRIMARY-05/07/2019
44919,OH0022572078,23,187017,HOUK,LYNNAE,S,,1965-10-12,2016-09-03,ACTIVE,...,,,X,,,,,,,
350613,OH0011542577,15,9907343,PARKES,LINDA,L,,1955-09-24,1998-09-26,ACTIVE,...,,,X,,,X,,,X,
1715811,OH0023206328,9,611018,BYRD,ERIC,WAYNE,,1994-07-02,2018-09-17,ACTIVE,...,,,,,,,,,X,
329574,OH0016396163,2,71069,HALL,JULIE,A,,1964-12-02,2001-10-10,ACTIVE,...,,,X,,,,R,,X,
916210,OH0011586083,10,7801452,WASEMAN,HARRY,S,,1944-09-29,1978-02-07,ACTIVE,...,,,X,,,X,R,X,X,


In [8]:
from datetime import date, datetime

In [11]:
#Calculate age at a particular election based on DOB
def age_at_election(row, election='GENERAL-11/08/2016'):
    el_date_str = election.split('-')[1]
    el_obj = datetime.strptime(el_date_str, '%m/%d/%Y')
    bd_obj = datetime.strptime(row['DATE_OF_BIRTH'], '%Y-%m-%d')
    age = el_obj.year - bd_obj.year - ((el_obj.month, el_obj.day) < (bd_obj.month, bd_obj.day))
    return age

In [12]:
age_at_election(oh_df.iloc[0])

46

In [13]:
oh_df['AGE_2016'] = oh_df.apply (lambda row: age_at_election(row), axis=1)

In [18]:
oh_df['AGE_2012'] = oh_df.apply (lambda row: age_at_election(row, election='GENERAL-11/06/2012'), axis=1)

In [19]:
#2020 election date: Nov 3 2020
oh_df['AGE_2020'] = oh_df.apply (lambda row: age_at_election(row, election='GENERAL-11/03/2020'), axis=1)

In [21]:
election_list = []
election_i = []
general_list = []
primary_list = []
for key in oh_df:
        if key[:7]=='GENERAL':
                election_list.append(key)
                general_list.append(key)
        elif key[:7]=='PRIMARY':
                election_list.append(key)
                primary_list.append(key)
        elif key[:7]=='SPECIAL':
                election_list.append(key)

In [24]:
#calculate N_voted/N_eligible
def el_to_date(election):
    el_date_str = election.split('-')[1]
    el_obj = datetime.strptime(el_date_str, '%m/%d/%Y')
    return el_obj
date_list = [el_to_date(e) for e in election_list]
date_dict = dict(zip(election_list, date_list))

In [95]:
def n_eligible_voted(row, start_date, end_date):
    reg_date = datetime.strptime(row['REGISTRATION_DATE'],'%Y-%m-%d')
    eligible_list = [l for l in election_list if date_dict[l]>reg_date and date_dict[l]>start_date and date_dict[l]<end_date]
    n_el = row[eligible_list].count()
    return n_el

In [96]:
def n_eligible(row, start_date, end_date):
    reg_date = datetime.strptime(row['REGISTRATION_DATE'],'%Y-%m-%d')
    eligible_list = [l for l in election_list if date_dict[l]>reg_date and date_dict[l]>start_date and date_dict[l]<end_date]
    return len(eligible_list)

In [97]:
date_2000=datetime(2000, 11, 7)
date_2004=datetime(2004,11,2)
date_2008=datetime(2008,11,4)
date_2012=datetime(2012,11,6)
date_2016=datetime(2016,11,8)
date_2020=datetime(2020,11,3)

In [98]:
%%capture
from tqdm import tqdm_notebook as tqdm
tqdm().pandas()

In [84]:
oh_df['N_VOTED_ELIGIBLE_2012'] = oh_df.progress_apply(lambda row: n_eligible_voted(row,date_2000,date_2012), axis=1)

HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




In [99]:
from multiprocessing import Pool

In [100]:
def parallelize_dataframe(df, func, n_cores=4):
    df_split = np.array_split(df, n_cores)
    pool = Pool(n_cores)
    df = pd.concat(pool.map(func, df_split))
    pool.close()
    pool.join()
    return df

In [101]:
def calculate_voter_el(df):
    df['N_VOTED_ELIGIBLE_2012'] = oh_df.progress_apply(lambda row: n_eligible_voted(row,date_2000,date_2012), axis=1)
    df['N_ELIGIBLE_2012'] = df.progress_apply(lambda row: n_eligible(row,date_2000,date_2012), axis=1)
    df['N_VOTED_ELIGIBLE_2016'] = df.progress_apply(lambda row: n_eligible_voted(row,date_2004,date_2016), axis=1)
    df['N_ELIGIBLE_2016'] = df.progress_apply(lambda row: n_eligible(row,date_2004,date_2016), axis=1)
    df['N_VOTED_ELIGIBLE_2020'] = df.progress_apply(lambda row: n_eligible_voted(row,date_2008,date_2020), axis=1)
    df['N_ELIGIBLE_2020'] = df.progress_apply(lambda row: n_eligible(row,date_2008,date_2020), axis=1)

In [None]:
oh_df_voter_n=parallelize_dataframe(oh_df,calculate_voter_el)

In [47]:
oh_df['N_ELIGIBLE_2012'] = oh_df.progress_apply(lambda row: n_eligible(row,date_2000,date_2012), axis=1)

HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




In [48]:
oh_df['N_VOTED_ELIGIBLE_2016'] = oh_df.progress_apply(lambda row: n_eligible_voted(row,date_2004,date_2016), axis=1)
oh_df['N_ELIGIBLE_2016'] = oh_df.progress_apply(lambda row: n_eligible(row,date_2004,date_2016), axis=1)

HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




In [49]:
oh_df['N_VOTED_ELIGIBLE_2020'] = oh_df.progress_apply(lambda row: n_eligible_voted(row,date_2008,date_2020), axis=1)
oh_df['N_ELIGIBLE_2020'] = oh_df.progress_apply(lambda row: n_eligible(row,date_2008,date_2020), axis=1)

HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




In [55]:
def row_ratio(row,year):
    eligible_column = 'N_ELIGIBLE_'+year
    voted_column = 'N_VOTED_ELIGIBLE_'+year
    voted_i= row[voted_column]
    eligible_i = row[eligible_column]
    if eligible_i != 0:
        v_ratio = voted_i/eligible_i
    else:
        v_ratio = float('Inf')
    return v_ratio

In [56]:
oh_df['RATIO_2012'] = oh_df.progress_apply(lambda row: row_ratio(row,'2012'), axis=1)

HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




In [57]:
oh_df['RATIO_2016'] = oh_df.progress_apply(lambda row: row_ratio(row,'2016'), axis=1)
oh_df['RATIO_2020'] = oh_df.progress_apply(lambda row: row_ratio(row,'2020'), axis=1)

HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




HBox(children=(IntProgress(value=0, max=7825385), HTML(value='')))




In [58]:
edited_df_name = ohio_path+'oh_df_withvotenum_new.feather'
feather.write_dataframe(oh_df,edited_df_name)

In [60]:
del oh_train, oh_validate, oh_test
oh_train, oh_validate, oh_test = train_validate_test_split(oh_df,seed=1)

feather.write_dataframe(oh_train, ohio_path+'oh_train.feather')
feather.write_dataframe(oh_validate, ohio_path+'oh_validate.feather')
feather.write_dataframe(oh_test, ohio_path+'oh_test.feather')

In [61]:
oh_train.head()

Unnamed: 0,SOS_VOTERID,COUNTY_NUMBER,COUNTY_ID,LAST_NAME,FIRST_NAME,MIDDLE_NAME,SUFFIX,DATE_OF_BIRTH,REGISTRATION_DATE,VOTER_STATUS,...,N_ELIGIBLE,N_VOTED_ELIGIBLE_2012,N_ELIGIBLE_2012,N_VOTED_ELIGIBLE_2016,N_ELIGIBLE_2016,N_VOTED_ELIGIBLE_2020,N_ELIGIBLE_2020,RATIO_2012,RATIO_2016,RATIO_2020
44919,OH0022572078,23,187017,HOUK,LYNNAE,S,,1965-10-12,2016-09-03,ACTIVE,...,9,1,9,1,9,1,9,0.111111,0.111111,0.111111
350613,OH0011542577,15,9907343,PARKES,LINDA,L,,1955-09-24,1998-09-26,ACTIVE,...,60,17,58,14,50,10,35,0.293103,0.28,0.285714
1715811,OH0023206328,9,611018,BYRD,ERIC,WAYNE,,1994-07-02,2018-09-17,ACTIVE,...,2,1,2,1,2,1,2,0.5,0.5,0.5
329574,OH0016396163,2,71069,HALL,JULIE,A,,1964-12-02,2001-10-10,ACTIVE,...,57,23,57,18,50,11,35,0.403509,0.36,0.314286
916210,OH0011586083,10,7801452,WASEMAN,HARRY,S,,1944-09-29,1978-02-07,ACTIVE,...,60,27,58,21,50,15,35,0.465517,0.42,0.428571


print(oh_train.iloc(2))

In [64]:
print(oh_train.iloc[2,:])

SOS_VOTERID                       OH0023206328
COUNTY_NUMBER                                9
COUNTY_ID                               611018
LAST_NAME                                 BYRD
FIRST_NAME                                ERIC
MIDDLE_NAME                              WAYNE
SUFFIX                                    None
DATE_OF_BIRTH                       1994-07-02
REGISTRATION_DATE                   2018-09-17
VOTER_STATUS                            ACTIVE
PARTY_AFFILIATION                         None
RESIDENTIAL_ADDRESS1          5034 ROCKDALE RD
RESIDENTIAL_SECONDARY_ADDR                None
RESIDENTIAL_CITY                      HAMILTON
RESIDENTIAL_STATE                           OH
RESIDENTIAL_ZIP                          45011
RESIDENTIAL_ZIP_PLUS4                      NaN
RESIDENTIAL_COUNTRY                        NaN
RESIDENTIAL_POSTALCODE                     NaN
MAILING_ADDRESS1                          None
MAILING_SECONDARY_ADDRESS                 None
MAILING_CITY 

In [81]:
n_eligible(oh_train.iloc[2,:], date_2000, date_2020)

['GENERAL-11/06/2018', 'PRIMARY-05/07/2019']


2