#### Load packages & read data

In [140]:
# Load packages

import geopandas as gpd
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import os

In [None]:
# Read the district-level CSV (2023 file) from the local data folder into `df`.
raw_csv_path = './data2/Ohio_District_all_2023.csv'
df = pd.read_csv(raw_csv_path)

#### Alter data format
(To match the format of the 2018 dataset)

In [142]:
# Reshape so that each original data column becomes a row and each `Title`
# entry becomes a column.

# Step 1: long format -- one (Title, original column label, value) record per cell
df_long = df.melt(id_vars=['Title'], var_name='CD119FP', value_name='Value')

# Step 2: spread the Title values out into columns; CD119FP returns as a regular column
df_wide = df_long.pivot(index='CD119FP', columns='Title', values='Value')
df_wide = df_wide.reset_index()

# Keep the first run of digits found in each original column label as the integer key
district_digits = df_wide['CD119FP'].str.extract(r'(\d+)', expand=False)
df_wide['CD119FP'] = district_digits.astype(int)

# Remove commas and convert every other column to numeric;
# values that still cannot be parsed become NaN (errors='coerce')
value_columns = [name for name in df_wide.columns if name != 'CD119FP']
for name in value_columns:
    as_text = df_wide[name].astype(str)
    df_wide[name] = pd.to_numeric(as_text.str.replace(',', ''), errors='coerce')

#### Calculate sex, poverty, & education variables

In [143]:
# Derived indicators: 'Women' (female share of Female + Male, in percent) and
# 'Did not finish high school' (100 minus the high-school-graduate percentage).
pct_women = df_wide['Female'] / (df_wide['Female'] + df_wide['Male']) * 100
pct_no_diploma = 100 - df_wide['Percent high school graduate or higher']

# Source columns that are not carried forward once the indicators exist
unused_columns = [
    'Female',
    'Male',
    'Percent high school graduate or higher',
    'Less than 9th grade',
    'Total Population 25 years and over',
    '9th to 12th grade, no diploma',
]

df_wide = (
    df_wide
    .assign(**{'Women': pct_women, 'Did not finish high school': pct_no_diploma})
    # Shorter labels for the poverty and bachelor's-degree columns
    .rename(columns={
        'Poverty Rate': 'In Poverty',
        'Percent bachelor\'s degree or higher': 'Bachelors or more',
    })
    .drop(columns=unused_columns)
)

#### Calculate age variables

In [144]:
# Work on a copy of just the age-related columns
age_source_columns = [
    'CD119FP', 'Total population',
    'Under 5 years', '5 to 9 years', '10 to 14 years', '15 to 19 years',
    '20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years',
    '55 to 59 years', '60 to 64 years',
    '18 years and over', '65 years and over',
]
age_df = df_wide[age_source_columns].copy()

# Denominator for all three age shares: the population aged 18 and over
voting_population = age_df['18 years and over']

# There is no 18-19 bracket in the source, so derive it:
# (everyone under 20) - (everyone under 18), where under 18 = total - 18 and over
under_20 = (age_df['Under 5 years'] + age_df['5 to 9 years']
            + age_df['10 to 14 years'] + age_df['15 to 19 years'])
aged_18_19 = under_20 - age_df['Total population'] + voting_population

# Age brackets as a percentage of the 18-and-over population
aged_18_44 = (aged_18_19 + age_df['20 to 24 years']
              + age_df['25 to 34 years'] + age_df['35 to 44 years'])
aged_45_64 = (age_df['45 to 54 years'] + age_df['55 to 59 years']
              + age_df['60 to 64 years'])

age_df['18-44'] = aged_18_44 / voting_population * 100
age_df['45-64'] = aged_45_64 / voting_population * 100
age_df['65 and older'] = age_df['65 years and over'] / voting_population * 100

# Keep only the key and the three computed shares
age_df = age_df.loc[:, ['CD119FP', '18-44', '45-64', '65 and older']]

age_df.head()

Title,CD119FP,18-44,45-64,65 and older
0,1,49.171973,30.472706,20.35532
1,2,41.771184,33.585636,24.64318
2,3,54.78656,27.912352,17.301088
3,4,43.30588,32.972827,23.721293
4,5,42.629982,32.041819,25.328198


#### Calculate race variables

In [145]:
# Race/ethnicity columns expressed as a percentage of 'Total population'
race_source_columns = [
    'CD119FP', 'Total population', 'White', 'Black or African American',
    'Asian', 'Hispanic or Latino (of any race)',
]
race_df = df_wide[race_source_columns].copy()

# Shorten the two long labels
race_df = race_df.rename(columns={
    'Black or African American': 'Black',
    'Hispanic or Latino (of any race)': 'Hispanic',
})

# Replace each count with its share of the total population, in percent
population = race_df['Total population']
for group in ('White', 'Black', 'Asian', 'Hispanic'):
    race_df[group] = race_df[group] / population * 100

race_df.head()



Title,CD119FP,Total population,White,Black,Asian,Hispanic
0,1,796831,68.499343,16.859158,4.196122,4.962784
1,2,787945,92.384367,1.639201,0.481125,1.472945
2,3,784597,54.12868,27.976656,5.800175,7.252768
3,4,803311,85.732176,3.879942,3.370924,2.821821
4,5,790574,85.769833,3.397658,1.041522,7.575255


#### Bring in urbanization data

In [146]:
# Read the urbanization table; its 'urbanization_pct' column is merged in later
urbanization_path = './data2/urbanization_2023.csv'
urbanization_df = pd.read_csv(urbanization_path)
urbanization_df.head()

Unnamed: 0,CD119FP,urbanization_pct
0,6,14.273753
1,13,60.409173
2,2,10.159529
3,9,19.871398
4,15,19.879884


#### Merge final dataset

In [147]:


# Start from the wide table and drop the raw age and race columns; the
# percentage versions computed above are merged back in below.
raw_age_race_columns = [
    'Under 5 years', '5 to 9 years', '10 to 14 years', '15 to 19 years',
    '20 to 24 years', '25 to 34 years', '35 to 44 years', '45 to 54 years',
    '55 to 59 years', '60 to 64 years', '18 years and over', '65 years and over',
    'White', 'Black or African American', 'Asian', 'Hispanic or Latino (of any race)',
]
cd = df_wide.drop(columns=raw_age_race_columns)

# Every merge is keyed on CD119FP and is expected to be one row per key on both
# sides. validate='one_to_one' makes pandas raise a MergeError if a key repeats,
# instead of silently duplicating rows in the final dataset.

# Merge back age df
cd = cd.merge(
    age_df[['CD119FP', '18-44', '45-64', '65 and older']],
    on='CD119FP',
    how='left',
    validate='one_to_one',
)

# Merge back race df
cd = cd.merge(
    race_df[['CD119FP', 'White', 'Black', 'Asian', 'Hispanic']],
    on='CD119FP',
    how='left',
    validate='one_to_one',
)

# Merge urbanization df
cd = cd.merge(
    urbanization_df[['CD119FP', 'urbanization_pct']],
    on='CD119FP',
    how='left',
    validate='one_to_one',
)

cd.head()

Unnamed: 0,CD119FP,Bachelors or more,In Poverty,Total population,Women,Did not finish high school,18-44,45-64,65 and older,White,Black,Asian,Hispanic,urbanization_pct
0,1,48.7,12.2,796831,50.656789,6.2,49.171973,30.472706,20.35532,68.499343,16.859158,4.196122,4.962784,46.389193
1,2,21.4,14.6,787945,50.119742,9.3,41.771184,33.585636,24.64318,92.384367,1.639201,0.481125,1.472945,10.159529
2,3,46.0,16.6,784597,50.540596,8.2,54.78656,27.912352,17.301088,54.12868,27.976656,5.800175,7.252768,85.82613
3,4,28.9,10.8,803311,49.205351,7.2,43.30588,32.972827,23.721293,85.732176,3.879942,3.370924,2.821821,13.708126
4,5,26.3,10.9,790574,50.183411,6.4,42.629982,32.041819,25.328198,85.769833,3.397658,1.041522,7.575255,13.282808


In [None]:
# Final columns included are same as 2018 dataset
final_columns = [
    'CD119FP', '18-44', '45-64', '65 and older',
    'Women', 'In Poverty', 'Did not finish high school',
    'Bachelors or more', 'White', 'Black', 'Asian', 'Hispanic',
    'urbanization_pct',
]

# Select the final columns and round every numeric value to one decimal place
cd = cd[final_columns].round(1)

# Write the file first: a notebook cell only displays its last expression, so
# the preview has to come after the export to actually be shown.
cd.to_csv('./data2/cd_2023.csv', index=False)

cd.head(20)