In [None]:
import warnings
# Suppress every warning category for the remainder of the kernel session.
warnings.filterwarnings("ignore")

In [None]:
import geopandas as gpd
import pandas as pd
import numpy as np
from tqdm import tqdm

In [None]:
# Remove pandas' display limits so frames are rendered with all rows and all columns.
for display_option in ("display.max_rows", "display.max_columns"):
    pd.set_option(display_option, None)

### Load Registration List

In [None]:
# Statewide registration list; the file is read with ISO-8859-1 encoding.
registration_list_path = '../data/state_of_michigan_foia/March Entire State Reg List 2025.csv'
df_registered = pd.read_csv(registration_list_path, encoding='ISO-8859-1')

# Show one random row as a quick look at the columns.
df_registered.sample()

### Join Census County Codes

In [None]:
# Lazily-built lookup: Michigan county code -> Census county code.
# Re-running this cell resets it to None, which forces the mapping CSV to be read again.
_county_code_lookup = None


def _load_county_code_lookup():
    """Read the county-code mapping CSV and return it as a dict keyed by Michigan code."""
    df_counties = pd.read_csv('../data/custom_data/county_code_mapping.csv')
    # Keep the first row per Michigan code so duplicated codes resolve to the same
    # value a positional `.values[0]` lookup on the filtered frame would return.
    df_counties = df_counties.drop_duplicates(subset='Michigan County Code', keep='first')
    return dict(zip(
        df_counties['Michigan County Code'].values,
        df_counties['Census County Code'].values,
    ))


def mapCountyCodes(mich_code):
    """Translate a Michigan county code into its Census county code.

    The mapping file is read from disk only on the first call; later calls are
    answered from an in-memory dict, so the function is cheap enough to be used
    with ``Series.apply`` over a large frame.

    Args:
        mich_code: Value to look up in the 'Michigan County Code' column of the
            mapping CSV.

    Returns:
        The 'Census County Code' value of the first mapping row whose
        'Michigan County Code' equals ``mich_code``.

    Raises:
        IndexError: If ``mich_code`` does not appear in the mapping file.
    """
    global _county_code_lookup
    if _county_code_lookup is None:
        _county_code_lookup = _load_county_code_lookup()

    try:
        return _county_code_lookup[mich_code]
    except KeyError:
        # IndexError is kept as the failure type so existing callers see the same exception class.
        raise IndexError(
            f'No Census county code found for Michigan county code {mich_code!r}'
        ) from None

In [None]:
# Enrich the registration list with Census county codes and a standardized precinct id,
# then persist it. The per-row work is done column-wise, so run time is dominated by the
# CSV export rather than by Python-level row iteration.

# Missing wards become 0 so the column can be stored as integers.
df_registered['WARD'] = df_registered['WARD'].fillna(0).astype(int)

# Resolve each distinct Michigan county code once, then broadcast the result to every row.
# An unknown code still fails inside mapCountyCodes, exactly as a row-wise apply would.
county_code_lookup = {
    code: mapCountyCodes(code) for code in df_registered['COUNTY_CODE'].unique()
}
df_registered['Census County Code'] = df_registered['COUNTY_CODE'].map(county_code_lookup)

# standardized_id_num = census county code + 5-digit zero-padded jurisdiction code + precinct,
# left-padded with zeros to 13 characters. `.map(str)` applies Python's str() to each value,
# which matches what str(row[...]) produced in a row-wise lambda.
df_registered['standardized_id_num'] = (
    df_registered['Census County Code'].map(str)
    + df_registered['JURISDICTION_CODE'].map(str).str.zfill(5)
    + df_registered['PRECINCT'].map(str)
).str.zfill(13)

df_registered.to_csv('../data/custom_data/registered_voters.csv', index=False)

### Registration Counts per Precinct

In [None]:
# Number of registered voters per standardized precinct id, largest count first
# (value_counts sorts descending by default), saved to disk and previewed.
df_unique = (
    df_registered['standardized_id_num']
    .value_counts()
    .rename_axis('standardized_id_num')
    .reset_index(name='count')
)

df_unique.to_csv('../data/custom_data/registered_voters_count.csv', index=False)
df_unique.head()

### Load Voter History

In [None]:
# The voter history is stored as two CSV parts; each part is loaded into its own frame.
history_part1_path = '../data/state_of_michigan_foia/March Entire State Voter History 2025 Part 1.csv'
history_part2_path = '../data/state_of_michigan_foia/March Entire State Voter History 2025 Part 2.csv'

hist1 = pd.read_csv(history_part1_path)
hist2 = pd.read_csv(history_part2_path)

### Join Registration List and Voter History

In [None]:
# Columns that exist in both frames would collide in the merge, so they are removed from
# the history side first. The voter identification number is exempt: it is the join key.
shared_cols = df_registered.columns.intersection(hist1.columns)
overlap_cols = shared_cols.difference(['VOTER_IDENTIFICATION_NUMBER'])

hist1_without_overlap = hist1.drop(columns=overlap_cols)
# Inner join: only history rows whose voter id appears in the registration list are kept.
hist1 = pd.merge(hist1_without_overlap, df_registered, on='VOTER_IDENTIFICATION_NUMBER', how='inner')

In [None]:
# Columns that exist in both frames would collide in the merge, so they are removed from
# the history side first. The voter identification number is exempt: it is the join key.
shared_cols = df_registered.columns.intersection(hist2.columns)
overlap_cols = shared_cols.difference(['VOTER_IDENTIFICATION_NUMBER'])

hist2_without_overlap = hist2.drop(columns=overlap_cols)
# Inner join: only history rows whose voter id appears in the registration list are kept.
hist2 = pd.merge(hist2_without_overlap, df_registered, on='VOTER_IDENTIFICATION_NUMBER', how='inner')

# Keep only the first occurrence of any repeated column label.
is_repeated_label = hist2.columns.duplicated()
hist2 = hist2.loc[:, ~is_repeated_label]

In [None]:
# Stack both history parts into a single frame. ignore_index=True gives the result a fresh
# 0..n-1 index; without it the row labels of hist1 and hist2 repeat, which makes
# label-based selection on the combined frame ambiguous.
df_registration_history = pd.concat([hist1, hist2], axis=0, ignore_index=True)

# 'Unnamed: 8' is removed here; presumably it is an empty column pandas created from a
# trailing delimiter in the history CSVs (TODO confirm against the raw files).
df_registration_history = df_registration_history.drop(columns=['Unnamed: 8'])

In [None]:
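# NOTE: the chunked-loading cells further down read this CSV from disk, so this export
# has to be run (uncommented) at least once before those cells can execute.
# df_registration_history.to_csv('../data/custom_data/df_registration_history.csv', index=False)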

### Features

In [None]:
# df_registration_history['ELECTION_YEAR'] = df_registration_history['ELECTION_DATE'].apply(lambda x: x[:4])
# df_registration_history['ELECTION_MONTH'] = df_registration_history['ELECTION_DATE'].apply(lambda x: x[5:7])
# df_registration_history['ELECTION_DAY'] = df_registration_history['ELECTION_DATE'].apply(lambda x: x[8:10])

In [None]:
# df_registration_history[(df_registration_history['LAST_NAME'] == 'KETCHUM') & (df_registration_history['FIRST_NAME'] == 'NICHOLAS') & (df_registration_history['MIDDLE_NAME'] == 'RYAN')]

In [None]:
# Registration-list columns that are carried forward into the merged history.
cols_to_keep = [
    'YEAR_OF_BIRTH', 'GENDER', 'REGISTRATION_DATE', 'ZIP_CODE', 'VOTER_IDENTIFICATION_NUMBER',
    'COUNTY_CODE', 'JURISDICTION_CODE', 'JURISDICTION_NAME', 'IS_PERM_AV_BALLOT_VOTER',
    'VOTER_STATUS_TYPE_CODE', 'UOCAVA_STATUS_CODE', 'IS_PERM_AV_APP_VOTER',
    'Census County Code', 'standardized_id_num'
]

# Reload the enriched registration list written earlier in this notebook.
#   usecols: parse only the columns that are kept, instead of loading every column and
#            discarding most of them afterwards.
#   dtype:   standardized_id_num is a zero-padded 13-character id. Without forcing str,
#            an all-digit id is parsed as an integer and its leading zeros are lost.
df_registered = pd.read_csv(
    '../data/custom_data/registered_voters.csv',
    usecols=cols_to_keep,
    dtype={'standardized_id_num': str},
)
# usecols yields columns in file order; re-select to get the cols_to_keep order.
df_registered = df_registered[cols_to_keep]

df_registered.to_csv('df_registered.csv', index=False)
print(f'Num rows: {len(df_registered)}')
df_registered.sample()

In [None]:
# Load huge df by chunking
filename = '../data/custom_data/df_registration_history.csv'
chunksize = 100000

# Set bar length via line count. The file is opened in binary mode so the count does not
# depend on the platform's default text encoding. (The count is approximate if any quoted
# field contains an embedded newline.)
with open(filename, 'rb') as f:
    total_lines = sum(1 for _ in f)

# The first line is the header, so data rows = total_lines - 1. Ceiling division gives the
# number of chunks read_csv will yield, including a final partial chunk.
data_rows = max(total_lines - 1, 0)
total_chunks = -(-data_rows // chunksize)

dfs = []
for chunk in tqdm(pd.read_csv(filename, chunksize=chunksize), total=total_chunks):
    dfs.append(chunk)

# Build the full frame once from the collected chunks, with a fresh 0..n-1 index.
df_registration_history = pd.concat(dfs, ignore_index=True)

print(f'Row count: {len(df_registration_history)}')

In [None]:
df_registration_history_filename = '../data/custom_data/df_registration_history.csv'
chunksize = 100_000
merged_chunks = []

# Calc progress bar size: data rows = physical lines minus the header row. Binary mode keeps
# the count independent of the platform's default text encoding.
with open(df_registration_history_filename, 'rb') as f:
    total_lines = sum(1 for _ in f) - 1

# Ceiling division: one tick per chunk, including a final partial chunk, and no spare tick
# when the row count is an exact multiple of chunksize.
total_chunks = -(-max(total_lines, 0) // chunksize)

# Create df from chunks: each chunk is inner-joined with the trimmed registration list so
# the full history never has to be merged in a single operation.
# NOTE(review): if this CSV was exported after the earlier join with the registration list,
# each chunk already carries the registration columns and this merge will produce
# _x/_y suffixed duplicates of them. Confirm what the file on disk contains.
with tqdm(total=total_chunks) as pbar:
    for chunk in pd.read_csv(df_registration_history_filename, chunksize=chunksize):
        merged = chunk.merge(df_registered, on='VOTER_IDENTIFICATION_NUMBER', how='inner')
        merged_chunks.append(merged)
        pbar.update(1)

df_merged = pd.concat(merged_chunks, ignore_index=True)

print(f'Shape: {df_merged.shape}')
print(f'Columns: {df_merged.columns}')
print(f'Row count: {len(df_merged)}')

# Row count of the unmerged history frame loaded in the previous cell, printed next to the
# merged row count above.
print(f'Row count: {len(df_registration_history)}')