In [1]:
import pandas as pd
import geopandas as gpd
import matplotlib.pyplot as plt
import numpy as np

from geopy.geocoders import Nominatim
from shapely.geometry import Point

from geopy.extra.rate_limiter import RateLimiter
import numpy as np  # Import NumPy

from geopy.exc import GeocoderTimedOut
from geopy.geocoders import Nominatim  # Replace with your geocoding service if different
import time

In [2]:
# Adjust the display settings to see the full content of the column
pd.set_option('display.max_colwidth', None)

# Set display option to show all rows
pd.set_option('display.max_rows', None)

In [3]:
# print columns function
def col_printing(df):
    for i, col in enumerate(df):
        print(i, col)

# Loading in the Census Tract shapefiles

In [None]:
# path to shapefile
shapefile_census = 'C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/WI_Tract/tl_2023_55_tract.shp'

# load the shapefile into geodataframe
tract_gdf = gpd.read_file(shapefile_census)

tract_gdf.to_csv('tract_gdf.csv', index = False)

In [182]:
col_printing(tract_gdf)

0 STATEFP
1 COUNTYFP
2 TRACTCE
3 GEOID
4 GEOIDFQ
5 NAME
6 NAMELSAD
7 MTFCC
8 FUNCSTAT
9 ALAND
10 AWATER
11 INTPTLAT
12 INTPTLON
13 geometry


In [5]:
# rename the GEOIDFQ to ID
tract_gdf = tract_gdf.rename(columns = {'GEOIDFQ' : 'ID'})

# start by reprojecting to EPSG: 26916, this allows us to calculate the area
tract_gdf = tract_gdf.to_crs(epsg=26916) 

# Add a new column for the area in square meters
tract_gdf['tract_area_sq_m'] = tract_gdf.geometry.area

# Loading in the Neighborhood shapefiles

In [None]:
# path to shapefile
shapefile_neigh = 'C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/DCDNeighborhoods.shp'

# load the shapefile into geodataframe
neighbor_gdf = gpd.read_file(shapefile_neigh)

# display the first few rows of dataframe
print(neighbor_gdf.head())

# start by reprojecting to EPSG: 3857, this allows us to calculate the area
neighbor_gdf = neighbor_gdf.to_crs(epsg=26916)  
# Add a new column for the area in square meters
neighbor_gdf['neigh_area_sq_m'] = neighbor_gdf.geometry.area

neighbor_gdf.to_csv('neighbor_gdf.csv', index = False)


   OBJECTID    Neighborho  SYMBOL    Neighbor_1    SHAPE_STAr    SHAPE_STLe  \
0         3     NEW COELN       1     New Coeln  5.723546e+06   9615.009330   
1         4    SOUTHPOINT       4    Southpoint  1.073536e+07  14415.135317   
2         5  TOWN OF LAKE       2  Town Of Lake  2.737191e+07  25967.630731   
3         6    TIPPECANOE       4    Tippecanoe  2.645802e+07  33503.757162   
4         7    MORGANDALE       2    Morgandale  2.952580e+07  22330.698849   

                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                             

# Load in Demographic Details
S2301 - population, labor, unemployment, employment rate
S0101 - age x gender
B03002 - not hispanic
B02001 - race

## 2023 files
merged_23_df is the merged file for all of the 2023 data

In [None]:
# read the files for 2023
df_S2301_emp = pd.read_csv("C:/Users/blah/blah/Data/Misc/dcdneighborhoods/ACSDT5Y2023.S2301-Transposed.csv")
df_S0101_age = pd.read_csv("C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/ACSDT5Y2023.S0101-Transposed.csv")
df_B03002_nothis = pd.read_csv("C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/ACSDT5Y2023.B03002-Data.csv")
df_B02001_race = pd.read_csv("C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/ACSDT5Y2023.B02001-Data.csv")

# rename totalpop in df_S0101_age to totalpop_agedf
df_S0101_age = df_S0101_age.rename(columns = {'totalpop' : 'totalpop_agedf'})

# rename totalpop in df_B02001_race to totalpop_racedf
df_B02001_race = df_B02001_race.rename(columns = {'totalpop' : 'totalpop_racedf'})

merge1_df = pd.merge(df_B02001_race, df_S0101_age, on = 'ID', how = 'inner')
merged_23_df = pd.merge(merge1_df, df_B03002_nothis, on = 'ID', how = 'inner')
merged_23_df = pd.merge(merged_23_df, df_S2301_emp, on = 'ID', how = 'inner')

## 2022 files (acs5yr_2022_merged.csv)

In [None]:
merged_22_df = pd.read_csv("C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/acs5yr_2022_merged.csv")

# rename id to ID
merged_22_df = merged_22_df.rename(columns = {'id' : 'ID', 'totalpop' : 'totalpop_2022'})

### Combine 2022 and 2023 dataframes together

In [9]:
# Merge DataFrames on the 'ID' column
combined_df = pd.merge(merged_22_df, merged_23_df, on='ID', suffixes=('_2022', '_2023'))

# totalpop_agedf and totalpop_racedf are the same so we drop totalpop_racedf and rename totalpop_agedf into totalpop2023
combined_df = combined_df.drop(columns=['totalpop_racedf'])
combined_df = combined_df.rename(columns = {'totalpop_agedf' : 'totalpop_2023'})

combined_df.to_csv('combined_nocrime.csv', index = False)

In [120]:
col_printing(combined_df)

0 ID
1 tractfips
2 geographicareaname
3 totalpop_2022
4 white_2022
5 black_2022
6 ai_an_2022
7 asian_2022
8 nh_opi_2022
9 otherrace_2022
10 two_or_more_race_2022
11 total_under5_2022
12 total_5to9_2022
13 total10to14_2022
14 total_15to19_2022
15 total_20to24_2022
16 total_25to29_2022
17 total_30to34_2022
18 total_35to39_2022
19 total_40to44_2022
20 total_45to49_2022
21 total_50to54_2022
22 total_55to59_2022
23 total_60to64_2022
24 total_65to69_2022
25 total_70to74_2022
26 total_75to79_2022
27 total_80to84_2022
28 total_85andolder_2022
29 total_under18_2022
30 total_16andover_2022
31 total_over18_2022
32 totalmale_2022
33 male_under5_2022
34 male_5to9_2022
35 male_10to14_2022
36 male_15to19_2022
37 male_20to24_2022
38 male_25to29_2022
39 male_30to34_2022
40 male_35to39_2022
41 male_40to44_2022
42 male_45to49_2022
43 male_50to54_2022
44 male_55to59_2022
45 male_60to64_2022
46 male_65to69_2022
47 male_70to74_2022
48 male_75to79_2022
49 male_80to84_2022
50 male_85andover_2022
51 male_under

### Calculating the Average Scores for Demographic Details 2022 and 2023

In [10]:
# Identify columns ending with '_2022' and '_2023'
columns_2022 = [col for col in combined_df.columns if col.endswith('_2022')]
columns_2023 = [col for col in combined_df.columns if col.endswith('_2023')]

# Ensure the same base columns exist in both years
common_bases = [col.replace('_2022', '') for col in columns_2022 if col.replace('_2022', '_2023') in columns_2023]

for base in common_bases:
    col_2022 = f'{base}_2022'
    col_2023 = f'{base}_2023'

    # Get the data types of the actual columns from the DataFrame
    column_types_2022 = combined_df[col_2022].dtype
    column_types_2023 = combined_df[col_2023].dtype

    print(f"Data type of {col_2022}: {column_types_2022}")
    print(f"Data type of {col_2023}: {column_types_2023}")

Data type of totalpop_2022: float64
Data type of totalpop_2023: int64
Data type of white_2022: float64
Data type of white_2023: object
Data type of black_2022: float64
Data type of black_2023: object
Data type of ai_an_2022: float64
Data type of ai_an_2023: int64
Data type of asian_2022: float64
Data type of asian_2023: int64
Data type of nh_opi_2022: float64
Data type of nh_opi_2023: int64
Data type of otherrace_2022: float64
Data type of otherrace_2023: object
Data type of two_or_more_race_2022: float64
Data type of two_or_more_race_2023: object
Data type of total_under5_2022: float64
Data type of total_under5_2023: int64
Data type of total_5to9_2022: float64
Data type of total_5to9_2023: int64
Data type of total10to14_2022: float64
Data type of total10to14_2023: int64
Data type of total_15to19_2022: float64
Data type of total_15to19_2023: int64
Data type of total_20to24_2022: float64
Data type of total_20to24_2023: int64
Data type of total_25to29_2022: float64
Data type of total_25t

In [11]:
# some columns are not in numeric format and here, we are converting them to numeric
colstoconvert = ['white_2023', 'black_2023', 'otherrace_2023', 
                 'two_or_more_race_2023', 'total_nothispanic_2023',
                 'white_not_hisp_2023', 'black_not_hisp_2023', 'total_hispanic_2023',
                 'hispanic_white_2023', 'hispanic_otherrace_2023',
                 'hispanic_two_or_more_race_2023'
                 ]

# loop through each column in the list
for col in colstoconvert:
    # remove commas and convert to numeric
    combined_df[col] = combined_df[col].str.replace(',', '', regex=True)
    combined_df[col] = pd.to_numeric(combined_df[col], errors='coerce')

In [12]:
def calculate_yearly_averages(df, base_columns):
    """
    Calculate averages between 2022 and 2023 values and add them to the original DataFrame.
    
    Parameters:
    df (pandas.DataFrame): Input DataFrame containing *_2022 and *_2023 columns
    base_columns (list): List of base column names without year suffixes
    
    Returns:
    pandas.DataFrame: Original DataFrame with new averaged columns added
    """
    # Create a copy of the original DataFrame to avoid modifying it directly
    result_df = df.copy()
    
    for col in base_columns:
        col_2022 = f"{col}_2022"
        col_2023 = f"{col}_2023"
        
        # Calculate average and store in new column
        # The new column will be named {col}_avg
        result_df[f"{col}_avg"] = df[[col_2022, col_2023]].mean(axis=1)
    
    return result_df

# Use the function
combined_df = calculate_yearly_averages(combined_df, common_bases)

In [13]:
# rename combined_df['tractfips'] to GEOID
combined_df = combined_df.rename(columns = {'tractfips' : 'GEOID'})

In [14]:
combined_df.to_csv("combined_df.csv", index = False)

### Add the geometry columns of our census tract file

In [None]:
# CENSUS TRACT SHAPEFILE
shapefile_pathtract = 'C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/WI_Tract/tl_2023_55_tract.shp'
tract_gdf = gpd.read_file(shapefile_pathtract)
tract_gdf.to_csv('tract_gdf.csv', index = False)
print(tract_gdf.crs)

# NEIGHBORHOOD TRACT SHAPEFILE
shapefile_pathneigh = 'C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/DCDNeighborhoods.shp'
neighborhood_gdf = gpd.read_file(shapefile_pathneigh)
neighborhood_gdf.to_csv('neighborhood_gdf.csv', index = False)
print(tract_gdf.crs)

neighborhood_gdf = neighborhood_gdf.to_crs("EPSG:26916")
tract_gdf = tract_gdf.to_crs("EPSG:26916")

EPSG:4269
EPSG:4269


#### Merging tract_gdf with combined_df

In [16]:
tract_gdf['GEOID'] = tract_gdf['GEOID'].astype('int64')

combined_acs5yr_map_df = pd.merge(combined_df, tract_gdf, on = 'GEOID', how = 'inner')

acs5yr_2022_2023_avgonly = combined_acs5yr_map_df.copy()

acs5yr_2022_2023_avgonly.to_csv("acs5yr_2022_2023_avgonly.csv", index = False)

##### Retain specific columns for acs5yr_2022_2023_avgonly (remove the _2022 and _2023 columns) and calling this new dataframe, map_acs5yr

In [17]:
# specify columns to keep
column_indices = list(range(0, 3)) + list(range(92, 98)) + list(range(191, 298))

# Subset the DataFrame to keep only the specified columns
map_acs5yr = acs5yr_2022_2023_avgonly.iloc[:, column_indices]

map_acs5yr.to_csv('map_acs5yr.csv', index = False)

## Merging our map_acs5yr file with neighborhood shapefile (neighborhood_gdf) to find tract and neighborhood intersections

In [301]:
print(neighborhood_gdf.crs)

EPSG:32054


In [18]:
# Create GeoDataFrame with proper CRS
map_acs5yr = gpd.GeoDataFrame(
    map_acs5yr,
    geometry='geometry',  # Ensure you're using the correct column for geometry
    crs="EPSG:26916"  # Set the CRS to EPSG:26916 (NAD83 / Wisconsin South)
)

neighborhood_gdf = neighborhood_gdf.to_crs("EPSG:26916")

In [19]:
# Perform spatial intersection to find where tracts and neighborhoods overlap
intersection = gpd.overlay(map_acs5yr, neighborhood_gdf, how='intersection')

# Calculate the area of intersection for each tract-neighborhood pair
intersection['tractneigh_intersection_area'] = intersection.geometry.area

# Group by neighborhood to see which tracts are in each neighborhood
neighborhood_tracts = intersection.groupby('Neighborho').agg({
    'GEOID': list,  # List of tract IDs
    'NAME': list,   # List of tract names
    'tractneigh_intersection_area': list  # Areas of intersection
}).reset_index()

neighborhood_tracts.to_csv('neighborhood_tracts.csv', index = False)

# Figuring out the Neighborhood Demographic Details

In [20]:
# clean up the three columns that are in percentages
# Replace '-' with NaN
for col in ['Labor Force Participation Rate', 'Employment/Population Ratio', 'Unemployment rate']:
    map_acs5yr[col] = map_acs5yr[col].replace('-', np.nan)

# Remove '%' and convert columns to numeric
for col in ['Labor Force Participation Rate', 'Employment/Population Ratio', 'Unemployment rate']:
    map_acs5yr[col] = map_acs5yr[col].str.rstrip('%').astype(float)

    # fill in NaN for empty elements
    map_acs5yr = map_acs5yr.replace('', np.nan)

avg_cols = ['Labor Force Participation Rate', 'Employment/Population Ratio', 'Unemployment rate']
# Replace '-' with NaN and then convert to float
map_acs5yr[avg_cols] = map_acs5yr[avg_cols].replace('-', np.nan).replace('%', '', regex=True).astype(float)

In [448]:
print(map_acs5yr.crs)

EPSG:26916


In [21]:
def estimate_neighborhood_demographics(tract_gdf, neighborhood_gdf, demographic_columns, rate_columns):
    """
    Estimate neighborhood demographics based on overlapping census tracts.
    
    Parameters:
    tract_gdf: GeoDataFrame with census tract data including demographics and geometry
    neighborhood_gdf: GeoDataFrame with neighborhood boundaries
    demographic_columns: List of demographic column names to estimate (raw demographic data)
    rate_columns: List of rate column names to estimate (percentage columns like Labor Force Participation Rate, etc.)
    
    Returns:
    GeoDataFrame with estimated neighborhood demographics
    """
    # Create copies to avoid modifying original data
    tracts = tract_gdf.copy()
    neighborhoods = neighborhood_gdf.copy()
    
    # Calculate tract areas for later proportion calculations
    tracts['tract_area'] = tracts.geometry.area
    
    # Perform spatial intersection
    intersections = gpd.overlay(tracts, neighborhoods, how = 'intersection')
    
    # Calculate the area of each intersection
    intersections['overlap_area'] = intersections.geometry.area
    
    # Calculate what proportion of each tract falls in each neighborhood
    intersections['tract_proportion'] = intersections['overlap_area'] / intersections['tract_area']
    
    # Initialize a dictionary to store results
    demographic_estimates = {}
    
    # Calculate weighted demographics for each neighborhood
    for col in demographic_columns:
        # Multiply demographic values by the proportion of tract area
        intersections[f'{col}_weighted'] = intersections[col] * intersections['tract_proportion']
        
        # Sum up the weighted values for each neighborhood
        demographic_estimates[col] = intersections.groupby('Neighborho')[f'{col}_weighted'].sum()
    
    # Handle rate columns differently (calculate average of rates within each neighborhood)
    for rate_col in rate_columns:
        # Take the weighted average of the rate columns (not the raw values)
        intersections[f'{rate_col}_avg'] = intersections[rate_col]
        
        # Calculate the average rate for each neighborhood (sum of rates / number of tracts in that neighborhood)
        demographic_estimates[rate_col] = intersections.groupby('Neighborho')[f'{rate_col}_avg'].mean()
    
    # Create results DataFrame
    results = pd.DataFrame(demographic_estimates)
    
    # Merge results back with original neighborhood data
    final_results = neighborhoods.merge(results, left_on='Neighborho', right_index=True)
    
    return final_results

def verify_estimates(original_tracts, estimated_neighborhoods, demographic_columns, rate_columns):
    """
    Verify that total populations roughly match between tracts and neighborhoods, 
    and handle verification for columns with percentage values by checking averages.
    """
    print("Verification of estimates:")
    print("-" * 50)
    
    # Verify demographic columns (non-percentage columns)
    for col in demographic_columns:
        tract_total = original_tracts[col].sum()
        neighborhood_total = estimated_neighborhoods[col].sum()
        difference_pct = ((neighborhood_total - tract_total) / tract_total) * 100
        
        print(f"\n{col}:")
        print(f"Total from tracts: {tract_total:,.0f}")
        print(f"Total from neighborhoods: {neighborhood_total:,.0f}")
        print(f"Difference: {difference_pct:,.2f}%")
    
    # Verify rate columns (percentage columns like Labor Force Participation Rate, etc.)
    for rate_col in rate_columns:
        tract_avg = original_tracts[rate_col].mean()
        neighborhood_avg = estimated_neighborhoods[rate_col].mean()
        difference_pct = ((neighborhood_avg - tract_avg) / tract_avg) * 100
        
        print(f"\n{rate_col}:")
        print(f"Average from tracts: {tract_avg:,.2f}%")
        print(f"Average from neighborhoods: {neighborhood_avg:,.2f}%")
        print(f"Difference: {difference_pct:,.2f}%")


In [22]:
demographic_cols = ['Population', 'totalpop_avg', 'white_avg', 'black_avg', 'asian_avg', 'ai_an_avg', 'nh_opi_avg', 'otherrace_avg', 
    'two_or_more_race_avg', 'total_under5_avg', 'total_5to9_avg', 'total10to14_avg', 
    'total_15to19_avg', 'total_20to24_avg', 'total_25to29_avg', 'total_30to34_avg', 
    'total_35to39_avg', 'total_40to44_avg', 'total_45to49_avg', 'total_50to54_avg', 
    'total_55to59_avg', 'total_60to64_avg', 'total_65to69_avg', 'total_70to74_avg', 
    'total_75to79_avg', 'total_80to84_avg', 'total_85andolder_avg', 'total_under18_avg', 
    'total_16andover_avg', 'total_over18_avg', 'totalmale_avg', 'male_under5_avg', 
    'male_5to9_avg', 'male_10to14_avg', 'male_15to19_avg', 'male_20to24_avg', 'male_25to29_avg',
    'male_30to34_avg', 'male_35to39_avg', 'male_40to44_avg', 'male_45to49_avg', 'male_50to54_avg',
    'male_55to59_avg', 'male_60to64_avg', 'male_65to69_avg', 'male_70to74_avg', 'male_75to79_avg',
    'male_80to84_avg', 'male_85andover_avg', 'male_under18_avg', 'male_16andover_avg', 'male_over18_avg',
    'totalfemale_avg', 'female_under5_avg', 'female_5to9_avg', 'female_10to14_avg', 'female_15to19_avg',
    'female_20to24_avg', 'female_25to29_avg', 'female_30to34_avg', 'female_35to39_avg', 'female_40to44_avg',
    'female_45to49_avg', 'female_50to54_avg', 'female_55to59_avg', 'female_60to64_avg', 'female_65to69_avg',
    'female_70to74_avg', 'female_75to79_avg', 'female_80to84_avg', 'female_85andover_avg', 'female_under18_avg',
    'female_16andover_avg', 'female_over18_avg', 'total_nothispanic_avg', 'white_not_hisp_avg', 'black_not_hisp_avg',
    'ai_an_not_hisp_avg', 'asian_not_hisp_avg', 'nh_opi_not_hisp_avg', 'otherrace_not_hisp_avg', 'two_or_more_race_not_hisp_avg',
    'total_hispanic_avg', 'hispanic_white_avg', 'hispanic_black_avg', 'hispanic_ai_an_avg', 'hispanic_asian_avg',
    'hispanic_nh_opi_avg', 'hispanic_otherrace_avg', 'hispanic_two_or_more_race_avg']

rate_cols = ['Labor Force Participation Rate', 'Employment/Population Ratio', 'Unemployment rate']

# Calculate estimates
neighborhood_estimates = estimate_neighborhood_demographics(
    map_acs5yr,    # your tract GeoDataFrame
    neighborhood_gdf,  # your neighborhood GeoDataFrame
    demographic_cols, rate_cols   # list of columns to estimate
)

# Verifying estimates
verified_numbers = verify_estimates(map_acs5yr, neighborhood_estimates, demographic_cols, rate_cols) 

Verification of estimates:
--------------------------------------------------

Population:
Total from tracts: 692,047
Total from neighborhoods: 551,126
Difference: -20.36%

totalpop_avg:
Total from tracts: 884,260
Total from neighborhoods: 713,014
Difference: -19.37%

white_avg:
Total from tracts: 384,864
Total from neighborhoods: 260,910
Difference: -32.21%

black_avg:
Total from tracts: 306,655
Total from neighborhoods: 290,155
Difference: -5.38%

asian_avg:
Total from tracts: 39,757
Total from neighborhoods: 31,292
Difference: -21.29%

ai_an_avg:
Total from tracts: 5,400
Total from neighborhoods: 4,489
Difference: -16.88%

nh_opi_avg:
Total from tracts: 141
Total from neighborhoods: 110
Difference: -21.94%

otherrace_avg:
Total from tracts: 56,418
Total from neighborhoods: 48,492
Difference: -14.05%

two_or_more_race_avg:
Total from tracts: 91,026
Total from neighborhoods: 77,566
Difference: -14.79%

total_under5_avg:
Total from tracts: 59,192
Total from neighborhoods: 49,536
Differ

In [341]:
# # Define the demographic columns you want to estimate for neighborhoods
# demographic_cols = ['Population', 'totalpop_avg', 'white_avg', 'black_avg', 'asian_avg', 'ai_an_avg', 'nh_opi_avg', 'otherrace_avg', 
#     'two_or_more_race_avg', 'total_under5_avg', 'total_5to9_avg', 'total10to14_avg', 
#     'total_15to19_avg', 'total_20to24_avg', 'total_25to29_avg', 'total_30to34_avg', 
#     'total_35to39_avg', 'total_40to44_avg', 'total_45to49_avg', 'total_50to54_avg', 
#     'total_55to59_avg', 'total_60to64_avg', 'total_65to69_avg', 'total_70to74_avg', 
#     'total_75to79_avg', 'total_80to84_avg', 'total_85andolder_avg', 'total_under18_avg', 
#     'total_16andover_avg', 'total_over18_avg', 'totalmale_avg', 'male_under5_avg', 
#     'male_5to9_avg', 'male_10to14_avg', 'male_15to19_avg', 'male_20to24_avg', 'male_25to29_avg',
#     'male_30to34_avg', 'male_35to39_avg', 'male_40to44_avg', 'male_45to49_avg', 'male_50to54_avg',
#     'male_55to59_avg', 'male_60to64_avg', 'male_65to69_avg', 'male_70to74_avg', 'male_75to79_avg',
#     'male_80to84_avg', 'male_85andover_avg', 'male_under18_avg', 'male_16andover_avg', 'male_over18_avg',
#     'totalfemale_avg', 'female_under5_avg', 'female_5to9_avg', 'female_10to14_avg', 'female_15to19_avg',
#     'female_20to24_avg', 'female_25to29_avg', 'female_30to34_avg', 'female_35to39_avg', 'female_40to44_avg',
#     'female_45to49_avg', 'female_50to54_avg', 'female_55to59_avg', 'female_60to64_avg', 'female_65to69_avg',
#     'female_70to74_avg', 'female_75to79_avg', 'female_80to84_avg', 'female_85andover_avg', 'female_under18_avg',
#     'female_16andover_avg', 'female_over18_avg', 'total_nothispanic_avg', 'white_not_hisp_avg', 'black_not_hisp_avg',
#     'ai_an_not_hisp_avg', 'asian_not_hisp_avg', 'nh_opi_not_hisp_avg', 'otherrace_not_hisp_avg', 'two_or_more_race_not_hisp_avg',
#     'total_hispanic_avg', 'hispanic_white_avg', 'hispanic_black_avg', 'hispanic_ai_an_avg', 'hispanic_asian_avg',
#     'hispanic_nh_opi_avg', 'hispanic_otherrace_avg', 'hispanic_two_or_more_race_avg']


# def estimate_neighborhood_demographics(tract_df, neighborhood_df, demographic_columns):
#     """
#     Estimate neighborhood demographics based on overlapping census tracts.
#     """
#     # Create copies to avoid modifying original data
#     tracts = tract_df.copy()
#     neighborhoods = neighborhood_df.copy()
    
#     # Calculate tract areas for later proportion calculations
#     tracts['tract_area'] = tracts.geometry.area
    
#     # Perform spatial intersection
#     intersections = gpd.overlay(tracts, neighborhoods, how='intersection')
    
#     # Calculate the area of each intersection
#     intersections['overlap_area'] = intersections.geometry.area
    
#     # Calculate what proportion of each tract falls in each neighborhood
#     intersections['tract_proportion'] = intersections['overlap_area'] / intersections['tract_area']
    
#     # Initialize a dictionary to store results
#     demographic_estimates = {}
    
#     # Calculate weighted demographics for each neighborhood
#     for col in demographic_columns:
#         # Multiply demographic values by the proportion of tract area
#         intersections[f'{col}_weighted'] = intersections[col] * intersections['tract_proportion']
        
#         # Sum up the weighted values for each neighborhood
#         demographic_estimates[col] = intersections.groupby('Neighborho')[f'{col}_weighted'].sum()
    
#     # Create results DataFrame
#     results = pd.DataFrame(demographic_estimates)
    
#     # Merge results back with original neighborhood data
#     final_results = neighborhoods.merge(results, left_on='Neighborho', right_index=True)
    
#     return final_results

# # Calculate the estimates
# try:
#     neighborhood_estimates = estimate_neighborhood_demographics(
#         map_acs5yr,  # your census tract data
#         neighborhood_gdf,  # your neighborhood data
#         demographic_cols
#     )
    
#     # Print summary of results
#     print("\nNeighborhood Demographic Estimates:")
#     print("-" * 50)
#     for idx, row in neighborhood_estimates.iterrows():
#         print(f"\nNeighborhood: {row['Neighborho']}")
#         for col in demographic_cols[:5]:  # Print first 5 demographic columns as example
#             print(f"{col}: {row[col]:,.2f}")
            
#     # Basic verification
#     print("\nVerification of total population:")
#     print(f"Sum of tract populations: {map_acs5yr['totalpop_avg'].sum():,.2f}")
#     print(f"Sum of estimated neighborhood populations: {neighborhood_estimates['totalpop_avg'].sum():,.2f}")
    
# except Exception as e:
#     print(f"An error occurred: {str(e)}")
#     print("\nAvailable columns in tract data:")
#     print(map_acs5yr.columns)
#     print("\nAvailable columns in neighborhood data:")
#     print(neighborhood_gdf.columns)


Neighborhood Demographic Estimates:
--------------------------------------------------

Neighborhood: NEW COELN
Population: 213.06
totalpop_avg: 256.88
white_avg: 206.05
black_avg: 2.01
asian_avg: 10.51

Neighborhood: SOUTHPOINT
Population: 2,335.45
totalpop_avg: 3,277.21
white_avg: 1,719.46
black_avg: 168.00
asian_avg: 340.44

Neighborhood: TOWN OF LAKE
Population: 3,649.39
totalpop_avg: 4,565.12
white_avg: 3,526.61
black_avg: 218.54
asian_avg: 128.48

Neighborhood: TIPPECANOE
Population: 4,799.79
totalpop_avg: 5,783.84
white_avg: 4,859.68
black_avg: 226.08
asian_avg: 26.16

Neighborhood: MORGANDALE
Population: 7,474.98
totalpop_avg: 10,195.09
white_avg: 4,560.56
black_avg: 540.65
asian_avg: 748.04

Neighborhood: SOUTHGATE
Population: 3,646.93
totalpop_avg: 4,941.15
white_avg: 2,250.55
black_avg: 346.18
asian_avg: 404.83

Neighborhood: BAY VIEW
Population: 15,051.40
totalpop_avg: 17,802.06
white_avg: 14,183.48
black_avg: 671.11
asian_avg: 251.95

Neighborhood: WEDGEWOOD
Population: 1

### neighborhood_estimates contains demographic details by each neighborhood

In [23]:
# Rename the Neighborho column to Neighborhood
neighborhood_estimates = neighborhood_estimates.rename(columns = {'Neighborho' : 'Neighborhood'})

# Lower case the Neighborhood values
neighborhood_estimates['Neighborhood'] = neighborhood_estimates['Neighborhood'].fillna('').apply(str)
neighborhood_estimates['Neighborhood'] = neighborhood_estimates['Neighborhood'].str.strip().str.lower()

# Export to CSV
neighborhood_estimates.to_csv('neighborhood_estimates.csv', index = False)

# Bringing in the Victims and Suspects dataframes

In [None]:
# read the suspect and victims dataframe
df_victims = pd.read_csv("C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/2022-2023 - Part I Violent Crime and Part II Crime Against Person - Victim.csv")
df_suspects = pd.read_csv("C:/Users/blah/blah/Documents/Data/Misc/dcdneighborhoods/2022-2023 - Part I Violent Crime and Part II Crime Against Person - Suspect.csv")

# Cleaning the Neighborhood column for df_victims
# Replace 'nan' (string) with np.nan
df_victims['Neighborhood'] = df_victims['Neighborhood'].replace('nan', np.nan)
# Convert the 'Neighborhood' column to string values and then apply string methods
df_victims['Neighborhood'] = df_victims['Neighborhood'].fillna('').apply(str)
df_victims['Neighborhood'] = df_victims['Neighborhood'].str.strip().str.lower()

# Drop rows where 'Neighborhood' is either NaN or an empty string
df_victims = df_victims[df_victims['Neighborhood'].str.strip() != '']

# Cleaning the Neighborhood column for df_suspects
# Replace 'nan' (string) with np.nan
df_suspects['Neighborhood'] = df_suspects['Neighborhood'].replace('nan', np.nan)
# Convert the 'Neighborhood' column to string values and then apply string methods
df_suspects['Neighborhood'] = df_suspects['Neighborhood'].fillna('').apply(str)
df_suspects['Neighborhood'] = df_suspects['Neighborhood'].str.strip().str.lower()

# Drop rows where 'Neighborhood' is either NaN or an empty string
df_suspects = df_suspects[df_suspects['Neighborhood'].str.strip() != '']

# Renaming the race-ethnicity column
race_dict = {"Black / African American" : "black",
             "Hispanic / Latino" : "hispanic",
             "Native Hawaiian / Other Pacific Islander" : "NH_PI",
             "Asian" : "Asian",
             "White" : "white",
             "American Indian / Alaskan Native" : "AI_AN",
             "Unknown / Other" : "other"}

df_victims['Race-Ethnicity'] = df_victims['Race-Ethnicity'].replace(race_dict)
df_suspects['Race-Ethnicity'] = df_suspects['Race-Ethnicity'].replace(race_dict)

# Renaming the offenses
offense_dict = {"Aggravated Assault" : 'total_aggravated_assault',
                                                  "Crime Against Person" : "total_partIIperson",
                                                  'Homicide' : 'total_homicide',
                                                  'Human Trafficking' : 'total_human_trafficking',
                                                  'Rape' : 'total_rape',
                                                  'Robbery' : 'total_robbery'}

df_suspects['Offense Type'] = df_suspects['Offense Type'].replace(offense_dict)
df_victims['Offense Type'] = df_victims['Offense Type'].replace(offense_dict)

In [25]:
# Creating a new dataframe to count crime by neighborhood and count crime by race by neighborhood
# FOR df_victims (Offense in general)
# Pivot the table to count occurrences
pivot_vic_offtyp = df_victims.groupby(['Neighborhood', 'Offense Type']).size().unstack(fill_value = 0)
# Create a column for Race-Ethnicity and Offense Type
df_victims['Race_Offense'] = df_victims['Race-Ethnicity'] + '_' + df_victims['Offense Type']
# Create a new column counting the number of occurrence of offense type x race
df_victims['Counts'] = df_victims.groupby(['Neighborhood', 'Race_Offense']).transform('size')
pivot_vic_offrace = df_victims.pivot_table(index = 'Neighborhood', columns = 'Race_Offense', 
                                           values = 'Counts', aggfunc = 'sum',
                                           fill_value = 0)

# FOR df_victims (Part I offense)
# Create a column that crosses 'Race-Ethnicity' and 'Offense Category'
df_victims['Race_OffenseCat'] = df_victims['Race-Ethnicity'] + '_' + df_victims['Offense Category']
# Count occurence of offense category x race
df_victims['Off_Cat_Counts'] = df_victims.groupby(['Neighborhood', 'Race-Ethnicity']).transform('size')
# pivot into table
pivot_vic_offcat_race = df_victims.pivot_table(index = 'Neighborhood', columns = 'Race_OffenseCat', 
                                               values = 'Off_Cat_Counts', aggfunc = 'sum',
                                               fill_value = 0)
df_victims_new = pd.merge(pivot_vic_offcat_race, pivot_vic_offrace, left_index=True, right_index=True)
# Reset index to make 'Neighborhood' a column again
df_victims_new = df_victims_new.reset_index()

df_victims_new = pd.merge(df_victims_new, pivot_vic_offtyp, on = 'Neighborhood', how = 'outer')
# Reset index to make 'Neighborhood' a column again
df_victims_new = df_victims_new.reset_index()

# FOR df_suspects (Offense in general)
# Group by 'Neighborhood' and 'Offense Type' to count occurrences
pivot_off_type = df_suspects.groupby(['Neighborhood', 'Offense Type']).size().unstack(fill_value=0)
# Create a new column combining 'Race-Ethnicity' and 'Offense Type'
df_suspects['Race_Offense'] = df_suspects['Race-Ethnicity'] + '_' + df_suspects['Offense Type']
# Create a new column counting the number of occurrence of crime x race
df_suspects['Counts'] = df_suspects.groupby(['Neighborhood', 'Race_Offense']).transform('size')
pivot_off_race = df_suspects.pivot_table(index = 'Neighborhood', columns = 'Race_Offense', values = 'Counts', aggfunc = 'sum', fill_value = 0)

# FOR df_suspects (Part I offense)
# Create a new column combining 'Race-Ethnicity' and 'Offense Category'
df_suspects['Race_OffenseCat'] = df_suspects['Race-Ethnicity'] + '_' + df_suspects['Offense Category']
# Create a new column counting the number of occurrence of crime category x race
df_suspects['Off_Cat_Counts'] = df_suspects.groupby(['Neighborhood', 'Race_OffenseCat']).transform('size')
pivot_offcat_race = df_suspects.pivot_table(index = 'Neighborhood', columns = 'Race_OffenseCat', values = 'Off_Cat_Counts', aggfunc = 'sum', fill_value = 0)
df_suspects_new = pd.merge(pivot_offcat_race, pivot_off_race, 
                     left_index=True, right_index=True)

# Reset index to make 'Neighborhood' a column again
df_suspects_new = df_suspects_new.reset_index()
df_suspects_new = pd.merge(df_suspects_new, pivot_off_type, on = 'Neighborhood', how = 'outer')

# COMBINE VICTIMS AND SUSPECTS DATA INTO ONE DATAFRAME
combined_crime_df = pd.merge(df_victims_new, df_suspects_new, on = 'Neighborhood', suffixes = ('_vic', '_sus'))
# Export to CSV
combined_crime_df.to_csv('combined_crime_df.csv', index = False)

In [28]:
pivot_vic_offrace.head()

Race_Offense,AI_AN_total_aggravated_assault,AI_AN_total_homicide,AI_AN_total_partIIperson,AI_AN_total_rape,AI_AN_total_robbery,Asian_total_aggravated_assault,Asian_total_homicide,Asian_total_partIIperson,Asian_total_rape,Asian_total_robbery,...,other_total_human_trafficking,other_total_partIIperson,other_total_rape,other_total_robbery,white_total_aggravated_assault,white_total_homicide,white_total_human_trafficking,white_total_partIIperson,white_total_rape,white_total_robbery
Neighborhood,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
alcott park,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,9,0,0
alverno,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,16,0,0,64,0,0
amani,0,0,4,0,0,0,0,0,0,0,...,0,0,0,0,25,0,0,25,0,0
arlington gardens,0,0,0,0,0,0,0,1,0,0,...,0,0,0,0,49,1,0,121,0,16
arlington heights,1,0,1,0,0,9,0,0,0,4,...,0,0,0,0,324,0,1,361,16,16


##### Merge neighborhood_estimates with victims and suspects data

In [26]:
neighborhood_vicsus_est = pd.merge(neighborhood_estimates, combined_crime_df, on = 'Neighborhood', how = 'inner')
neighborhood_vicsus_est.to_csv('neighborhood_vicsus_est.csv', index = False)

# Bringing in the Pedestrian Dataframe

In [None]:
df_ped = pd.read_stata("C:/Users/blah/blah/Documents/Data/WI_RAD/persons_fionly_geolocated_foranalysis_2022-2023.dta")

# Rename the neighborhood column to Neighborhood
df_ped = df_ped.rename(columns = {'neighborhood':'Neighborhood'})
# Replace 'nan' (string) with np.nan
df_ped['Neighborhood'] = df_ped['Neighborhood'].replace('nan', np.nan)
# Convert the 'Neighborhood' column to string values and then apply string methods
df_ped['Neighborhood'] = df_ped['Neighborhood'].fillna('').apply(str)
df_ped['Neighborhood'] = df_ped['Neighborhood'].str.strip().str.lower()
# Drop rows where 'Neighborhood' is either NaN or an empty string
df_ped = df_ped[df_ped['Neighborhood'].str.strip() != '']
df_ped = df_ped[df_ped['Neighborhood'] != '1_could not locate']

In [30]:
## Figuring out the count of pedestrian encounters in each neighborhood
PedNeigh_counts = df_ped.groupby('Neighborhood').size().reset_index(name='ped_count')

# Bringing in the Vehicles/Encounter Dataframe

In [None]:
df_enc = pd.read_stata("C:/Users/blah/blah/Documents/Data/WI_RAD/ts_encounters_analysis_updated key variables_2022-2023.dta")

# Rename the neighborhood column to Neighborhood
df_enc = df_enc.rename(columns = {'neighborhood':'Neighborhood'})
# Replace 'nan' (string) with np.nan
df_enc['Neighborhood'] = df_enc['Neighborhood'].replace('nan', np.nan)
# Convert the 'Neighborhood' column to string values and then apply string methods
df_enc['Neighborhood'] = df_enc['Neighborhood'].fillna('').apply(str)
df_enc['Neighborhood'] = df_enc['Neighborhood'].str.strip().str.lower()
# Drop rows where 'Neighborhood' is either NaN or an empty string
df_enc = df_enc[df_enc['Neighborhood'].str.strip() != '']
df_enc = df_enc[df_enc['Neighborhood'] != '1_could not locate']

In [32]:
## Figuring out the count of vehicle encounters in each neighborhood
EncNeigh_counts = df_enc.groupby('Neighborhood').size().reset_index(name='enc_count')

##### Creating a dataframe with neighborhood estimates, victims and suspects, and pedestrian and vehicle counts

In [33]:
neighborhood_vicsus_pedenc_est = pd.merge(neighborhood_vicsus_est, EncNeigh_counts, on = 'Neighborhood', how = 'inner')
neighborhood_vicsus_pedenc_est = pd.merge(neighborhood_vicsus_pedenc_est, PedNeigh_counts, on = 'Neighborhood', how = 'inner')
neighborhood_vicsus_pedenc_est.to_csv('neighborhood_vicsus_pedenc_est.csv', index = False)

# New Dataframe with Pedestrians as the Unit of Analysis (df_ped)

### Start with creating a count of victims by race in the df_victims dataframe

In [34]:
# race of victims count by neighborhoods
race_vic_neigh = df_victims.groupby(['Neighborhood', 'Race-Ethnicity']).size().unstack(fill_value=0).reset_index()

race_vic_neigh = race_vic_neigh.rename(columns = {'AI_AN' : 'AI_AN_viccnt',
                                                  'Asian': 'Asian_viccnt',
                                                  'NH_PI': 'NH_PI_viccnt',
                                                  'black':'black_viccnt',
                                                  'white' : 'white_viccnt',
                                                  'other' : 'other_viccnt',
                                                  'hispanic':'hispanic_viccnt'})

### Creating a count of pedestrian stops by race in the df_ped dataframe

In [35]:
# Use cat.rename_categories to rename 'missing' to NaN
df_ped['racecat'] = df_ped['racecat'].cat.remove_categories('missing')

race_count_neigh = df_ped.pivot_table(index = 'Neighborhood', columns = 'racecat', aggfunc = 'size', fill_value = 0, observed = True)

race_count_neigh = race_count_neigh.rename(columns = {'white' : 'white_peds', 'black' : 'black_peds',
                                                      'latinx' : 'hispanic_peds', 
                                                      'other' : 'other_peds'})

# Reset the index so that 'neighborhood' becomes a column
race_count_neigh = race_count_neigh.reset_index()

In [428]:
col_printing(race_count_neigh)

0 Neighborhood
1 white_peds
2 black_peds
3 hispanic_peds
4 other_peds


##### Merge the race_count_neigh and race_vic_neigh dataframe together

In [36]:
vic_ped_count = pd.merge(race_vic_neigh, race_count_neigh, on = 'Neighborhood', how = 'inner')

In [429]:
col_printing(vic_ped_count)

0 Neighborhood
1 AI_AN_viccnt
2 Asian_viccnt
3 NH_PI_viccnt
4 black_viccnt
5 hispanic_viccnt
6 other_viccnt
7 white_viccnt
8 white_peds
9 black_peds
10 hispanic_peds
11 other_peds


## Forming the pedestrian df with victim counts and pedestrian counts

In [37]:
df_ped_vic_count = pd.merge(df_ped, vic_ped_count, on = 'Neighborhood', how = 'inner')
df_ped_vic_count.to_csv("df_ped_vic_count.csv", index = False)

## Forming the pedestrian df (unit of analysis) with victim counts, pedestrian counts, and neighborhood details

In [38]:
df_ped_vic_count_neigh = pd.merge(df_ped_vic_count, neighborhood_estimates, on = 'Neighborhood', how = 'inner')
df_ped_vic_count_neigh.to_csv("df_ped_vic_count_neigh.csv", index = False)

In [39]:
col_printing(df_ped_vic_count_neigh)

0 top10neighborhood
1 Neighborhood
2 searchperformed
3 pcsearch
4 incidentarrestsearch
5 contactdesc1a
6 stop_justificationa
7 stop_justificationb
8 search_justificationa
9 search_justificationb
10 uniqueperson
11 top20neighborhood
12 call_no
13 sexcat
14 racecat
15 agecat
16 agecontinuous
17 young_old
18 timeofday
19 morning
20 afternoon
21 firstevening
22 secondevening
23 latitudea1
24 longitudea1
25 street
26 city
27 state
28 district
29 stoptype
30 violentstop
31 violent_weapon
32 violent_other
33 propertystop
34 drugdealingstop
35 roadsafetystop
36 equipmentstop
37 publicdisorderstop
38 wantedorwarrantstop
39 trafficstop
40 possweaponstop
41 possdrugstop
42 otherstop
43 missingstop
44 violation_typea
45 searcharrest
46 searchnofrisk
47 friskperformed
48 searchandfrisk
49 frisktype
50 friskarrest
51 contrabandretrieved
52 stopoutcome
53 top20officer
54 top20unit
55 officers
56 officers2
57 officers3
58 officername1a
59 officername2a
60 officername3a
61 officername4a
62 officername5

In [40]:
def interest_summing(df, colnames):
    results = {}  

    for col in colnames:
        
        results[f"{col}_num"] = df.groupby('Neighborhood')[col].sum().reset_index()
    
    return results  

#### Tidy up the df_ped_vic_count_neigh dataframe into ped_vic_neigh_count

In [48]:
col_keep = list(range(0, 23)) + list(range(28, 55)) + list(range(668, 671)) + list(range(1613, 1619)) + list(range(1626, 1760)) 

ped_vic_neigh_count = df_ped_vic_count_neigh.iloc[:, col_keep]
ped_vic_neigh_count.to_csv('ped_vic_neigh_count.csv', index = False)

In [43]:
col_printing(ped_vic_neigh_count)

0 top10neighborhood
1 Neighborhood
2 searchperformed
3 pcsearch
4 incidentarrestsearch
5 contactdesc1a
6 stop_justificationa
7 stop_justificationb
8 search_justificationa
9 search_justificationb
10 uniqueperson
11 top20neighborhood
12 call_no
13 sexcat
14 racecat
15 agecat
16 agecontinuous
17 young_old
18 timeofday
19 morning
20 afternoon
21 firstevening
22 secondevening
23 district
24 stoptype
25 violentstop
26 violent_weapon
27 violent_other
28 propertystop
29 drugdealingstop
30 roadsafetystop
31 equipmentstop
32 publicdisorderstop
33 wantedorwarrantstop
34 trafficstop
35 possweaponstop
36 possdrugstop
37 otherstop
38 missingstop
39 violation_typea
40 searcharrest
41 searchnofrisk
42 friskperformed
43 searchandfrisk
44 frisktype
45 friskarrest
46 contrabandretrieved
47 stopoutcome
48 top20officer
49 top20unit
50 anysearch
51 search_nofrisk
52 friskandarrest
53 female
54 white
55 black
56 latino
57 otherrace
58 old
59 nonviolentstop
60 violent_stop
61 nosearch
62 noincidentarrestsearc

### Adding More Variables into ped_vic_neigh_count
Variables like: 
- totalvictims - total_viccnt - (AI_AN_viccnt, Asian_viccnt, NH_PI_viccnt, black_viccnt, hispanic_viccnt, other_viccnt, white_viccnt)
- totalblackwhite_pedstops (white_peds + black_peds)
- black stop rate (black stops/black+white stops)
- white stop rate (white stops/black+white stops)
- population proportion of black individuals (black population/totalpopulation)
- population proportion of white individuals (white population/totalpopulation)
- black_ped_DI (black stop rate/population proportion of black individuals)
- white_ped_DI (white stop rate/population proportion of white individuals)


##### Calculating total victims

In [52]:
victimgrps = ['AI_AN_viccnt', 'Asian_viccnt', 'NH_PI_viccnt', 'black_viccnt', 'hispanic_viccnt', 'other_viccnt', 'white_viccnt']
race_vic_neigh['total_viccnt'] = race_vic_neigh[victimgrps].sum(axis = 1)
race_vic_neigh.head()

Race-Ethnicity,Neighborhood,AI_AN_viccnt,Asian_viccnt,NH_PI_viccnt,black_viccnt,hispanic_viccnt,other_viccnt,white_viccnt,total_viccnt
0,alcott park,0,0,0,3,0,0,3,6
1,alverno,0,0,0,4,7,0,12,23
2,amani,2,0,0,262,6,0,10,280
3,arlington gardens,0,1,0,114,6,0,23,144
4,arlington heights,2,5,0,539,14,0,46,606


##### Calculating black and white victimization rates

In [53]:
race_vic_neigh['black_vicrate'] = race_vic_neigh['black_viccnt'] / race_vic_neigh['total_viccnt']
race_vic_neigh['white_vicrate'] = race_vic_neigh['white_viccnt'] / race_vic_neigh['total_viccnt']

In [66]:
# subset race_vic_neigh to a smaller df for merging later on
vic_info = race_vic_neigh[['black_vicrate', 'white_vicrate', 'total_viccnt', 'Neighborhood']]

##### Calculating stop rates for Black and White pedestrians

In [None]:
# recode racecat so that missing = NA
racecat_missing = {'missing' : np.nan}
df_enc['racecat'] = df_enc['racecat'].replace(racecat_missing)
df_ped['racecat'] = df_ped['racecat'].replace(racecat_missing)

# Creating a pivot table that lets us see the counts of vehicle stops by race by neighborhood
encneighrace_pivot = df_enc.pivot_table(
    index='Neighborhood',   # Rows (neighborhoods)
    columns='racecat',      # Columns (races)
    aggfunc='size',         # Count occurrences
    fill_value=0            # Replace NaNs with 0s
)

encneighrace_pivot = encneighrace_pivot.reset_index()

# Creating a pivot table that lets us see the counts of pedestrian stops by race by neighborhood
pedneighrace_pivot = df_ped.pivot_table(
    index='Neighborhood',   # Rows (neighborhoods)
    columns='racecat',      # Columns (races)
    aggfunc='size',         # Count occurrences
    fill_value=0            # Replace NaNs with 0s
)

pedneighrace_pivot = pedneighrace_pivot.reset_index()

# merge pedneighrace_pivot into neighborhood_vicsus_est
neighborhood_vicsus_pedenc_final = pd.merge(neighborhood_vicsus_est, pedneighrace_pivot, on = 'Neighborhood', how = 'inner')

# rename the pedestrian columns
neighborhood_vicsus_pedenc_final = neighborhood_vicsus_pedenc_final.rename(columns = {'white' : 'white_pedstops',
                                                                                      'black' : 'black_pedstops',
                                                                                      'latinx' : 'latinx_pedstops',
                                                                                      'other' : 'other_pedstops'})

# merge encneighrace_pivot into neighborhood_vicsus_pedenc_final
neighborhood_vicsus_pedenc_final = pd.merge(neighborhood_vicsus_pedenc_final, encneighrace_pivot, on = 'Neighborhood', how = 'inner')

# rename the vehicle columns
neighborhood_vicsus_pedenc_final = neighborhood_vicsus_pedenc_final.rename(columns = {'white' : 'white_vehstops',
                                                                                      'black' : 'black_vehstops',
                                                                                      'latinx' : 'latinx_vehstops',
                                                                                      'other' : 'other_vehstops'})


  encneighrace_pivot = df_enc.pivot_table(
  pedneighrace_pivot = df_ped.pivot_table(


In [62]:
new_ped_df = neighborhood_vicsus_pedenc_final.copy()

pedgrps = ['white_pedstops', 'black_pedstops']
# creating a variable that has the total pedestrian stops for black and white people
new_ped_df['black_white_total_pedstops'] = new_ped_df[pedgrps].sum(axis = 1)

# calculating white and black stop rate
new_ped_df['white_ped_stoprate'] = new_ped_df['white_pedstops']/new_ped_df['black_white_total_pedstops']
new_ped_df['black_ped_stoprate'] = new_ped_df['black_pedstops']/new_ped_df['black_white_total_pedstops']

# subset this dataframe for merging later
new_ped_df = new_ped_df[['white_ped_stoprate', 'black_ped_stoprate', 'black_white_total_pedstops', 'Neighborhood']]

##### Calculating Population Proportion for Black and White folks by neighborhood

In [65]:
new_pop_df = neighborhood_vicsus_pedenc_final.copy()

popgrps = ['white_avg', 'black_avg', 'asian_avg', 'ai_an_avg', 'nh_opi_avg', 'otherrace_avg', 'two_or_more_race_avg']
# creating a variable that has the total population count for each neighborhood
new_pop_df['totalpopulation_count'] = new_pop_df[popgrps].sum(axis = 1)

# calculating population proportion for white and black individuals in different neighborhood
new_pop_df['blackpop_prop'] = new_pop_df['black_avg']/new_pop_df['totalpopulation_count']
new_pop_df['whitepop_prop'] = new_pop_df['white_avg']/new_pop_df['totalpopulation_count']

# subset this dataframe for merging later
new_pop_df = new_pop_df[['totalpopulation_count', 'blackpop_prop', 'whitepop_prop', 'Neighborhood']]

### Calculating the non-RAD Disparity Index for Pedestrian Stops
This is stop rate of a given group divided population proportion of that given group

In [76]:
disparity_ped_df = pd.merge(new_ped_df, vic_info, on = 'Neighborhood', how = 'inner')
disparity_ped_df = pd.merge(disparity_ped_df, new_pop_df, on = 'Neighborhood', how = 'inner')

In [77]:
col_printing(disparity_ped_df)

0 white_ped_stoprate
1 black_ped_stoprate
2 black_white_total_pedstops
3 Neighborhood
4 black_vicrate
5 white_vicrate
6 total_viccnt
7 totalpopulation_count
8 blackpop_prop
9 whitepop_prop


In [78]:
disparity_ped_df['black_ped_DI'] = disparity_ped_df['black_ped_stoprate'] / disparity_ped_df['blackpop_prop']
disparity_ped_df['white_ped_DI'] = disparity_ped_df['white_ped_stoprate'] / disparity_ped_df['whitepop_prop']

## Merging disparity information into our pedestrian dataframe (ped_vic_neigh_count)

In [79]:
PedestrianRegression_df = pd.merge(ped_vic_neigh_count, disparity_ped_df, on = 'Neighborhood', how = 'inner')
PedestrianRegression_df.to_csv("PedestrianRegression_df.csv", index = False)

# RAD Index Dataframe Construction

In [89]:
# from neighborhood_vicsus_pedenc_est, pull 'white_not_hisp_avg', 'black_not_hisp_avg', 'totalpop_avg'
popcount = neighborhood_vicsus_pedenc_est[['Neighborhood', 'white_not_hisp_avg', 'black_not_hisp_avg', 'totalpop_avg']]

In [90]:
# pull victim counts for black and white
race_vic_neigh2 = race_vic_neigh.copy()

total_interest = ['AI_AN_viccnt', 'Asian_viccnt', 'NH_PI_viccnt', 'black_viccnt', 'hispanic_viccnt', 'other_viccnt', 'white_viccnt']

# Calculate total_viccnt and add it to the DataFrame
race_vic_neigh2['total_viccnt'] = race_vic_neigh2[total_interest].sum(axis = 1)
vic_interest2 = ['black_viccnt', 'white_viccnt', 'total_viccnt']

results = interest(race_vic_neigh2, vic_interest2)

# Access the results by column name
white_df = results['white_viccnt_num']
black_df = results['black_viccnt_num']
total_df = results['total_viccnt_num']

vic_df2 = pd.merge(white_df, black_df, on = 'Neighborhood', how = 'outer')
vic_df2 = pd.merge(vic_df2, total_df, on = 'Neighborhood', how = 'outer')

In [91]:
# pull pedestrian counts for black and white
ped_interest = ['white_peds', 'black_peds']
results = interest(race_count_neigh, ped_interest)

# Access the results by column name
whiteped_df = results['white_peds_num']
blackped_df = results['black_peds_num']

pedinterest_df2 = pd.merge(whiteped_df, blackped_df, on = 'Neighborhood', how = 'outer')

#### Merge pedinterest_df2 with vic_df2 with popcount

In [None]:
result_RAD = pd.merge(popcount, vic_df2, on = 'Neighborhood', how = 'inner')
result_RAD= pd.merge(result_RAD, pedinterest_df2, on = 'Neighborhood', how = 'outer')


In [93]:
# Creating a percentage column for Total White Population and Total Black population
result_RAD['percent_whitepop'] = (result_RAD['white_not_hisp_avg'] / result_RAD['totalpop_avg']) * 100
result_RAD['percent_blackpop'] = (result_RAD['black_not_hisp_avg'] / result_RAD['totalpop_avg']) * 100

In [95]:
result_RAD.to_csv('RAD_Ped_df.csv', index = False)

# New Dataframe with Vehicles as the Unit of Analysis (df_enc)

### Creating a count of vehicle stops by race in the df_enc dataframe

In [85]:
# Use cat.rename_categories to rename 'missing' to NaN
# df_enc['racecat'] = df_enc['racecat'].cat.remove_categories('missing')

race_count_enc = df_enc.pivot_table(index = 'Neighborhood', columns = 'racecat', aggfunc = 'size', fill_value = 0, observed = True)

race_count_enc = race_count_enc.rename(columns = {'white' : 'vehlvl_white', 'black' : 'vehlvl_black',
                                                      'latinx' : 'vehlvl_hispanic', 
                                                      'other' : 'vehlvl_other'})

# Reset the index so that 'neighborhood' becomes a column
race_count_enc = race_count_enc.reset_index()

##### Merge the race_count_enc and race_vic_neigh dataframe together

In [86]:
vic_enc_count = pd.merge(race_vic_neigh, race_count_enc, on = 'Neighborhood', how = 'inner')

In [87]:
print(len(vic_enc_count))
print(len(df_enc))

184
56226


In [474]:
vic_enc_count['Neighborhood']

0                     alcott park
1                         alverno
2                           amani
3               arlington gardens
4               arlington heights
5                    avenues west
6                      baran park
7                        bay view
8               bluemound heights
9                  borchert field
10                bradley estates
11                  brewer's hill
12                       brynwood
13                   burnham park
14                  calumet farms
15              cambridge heights
16                    cannon park
17                capitol heights
18                   castle manor
19                  clarke square
20                  clayton crest
21              clock tower acres
22               cold spring park
23                college heights
24                  columbus park
25                      concordia
26                    cooper park
27                copernicus park
28                    dineen park
29            

## Forming the vehicles df with victim counts and pedestrian counts

In [88]:
df_enc_vic_count = pd.merge(df_enc, vic_enc_count, on = 'Neighborhood', how = 'inner')
df_enc_vic_count.to_csv("df_enc_vic_count.csv", index = False)

## Forming the vehicles df (unit of analysis) with victim counts, pedestrian counts, and neighborhood details

In [89]:
neighborhood_estimates = neighborhood_estimates.rename(columns = {'Neighborho' : 'Neighborhood'})
# Convert the 'Neighborhood' column to string values and then apply string methods
neighborhood_estimates['Neighborhood'] = neighborhood_estimates['Neighborhood'].fillna('').apply(str)
neighborhood_estimates['Neighborhood'] = neighborhood_estimates['Neighborhood'].str.strip().str.lower()

In [90]:
neighborhood_vicsus_est = neighborhood_vicsus_est.rename(columns = {'Neighborho' : 'Neighborhood'})
# Convert the 'Neighborhood' column to string values and then apply string methods
neighborhood_vicsus_est['Neighborhood'] = neighborhood_vicsus_est['Neighborhood'].fillna('').apply(str)
neighborhood_vicsus_est['Neighborhood'] = neighborhood_vicsus_est['Neighborhood'].str.strip().str.lower()

df_enc_vic_count_neigh = pd.merge(df_enc_vic_count, neighborhood_vicsus_est, on = 'Neighborhood', how = 'inner')
df_enc_vic_count_neigh.to_csv("enc_vic_count_neigh.csv", index = False)

#### Tidy up df_enc_vic_count into enc_vic_neigh_count

In [91]:
col_keep = list(range(0, 2)) + list(range(43, 45)) + list(range(55, 65)) + list(range(109, 117)) + list(range(130, 134)) + list(range(139, 146)) + list(range(1555, 1560)) + list(range(1812, 1822)) + list(range(1831, 1842)) + list(range(1849, 2074))
enc_vic_count_neigh = df_enc_vic_count_neigh.iloc[:, col_keep]
enc_vic_count_neigh.to_csv('enc_vic_count_neigh.csv', index = False)

### Adding More Variables into enc_vic_count_neigh
Variables like: 
- totalvictims - total_viccnt - (AI_AN_viccnt, Asian_viccnt, NH_PI_viccnt, black_viccnt, hispanic_viccnt, other_viccnt, white_viccnt)
- totalblackwhite_vehstops (white_vehs + black_vehs)
- black stop rate (black stops/black+white stops)
- white stop rate (white stops/black+white stops)
- population proportion of black individuals (black population/totalpopulation)
- population proportion of white individuals (white population/totalpopulation)
- black_veh_DI (black stop rate/population proportion of black individuals)
- white_veh_DI (white stop rate/population proportion of white individuals)

total victim count and black and white victimization rates are available in vic_info from calculations above

##### Calculating stop rates for Black and White driven vehicles

In [92]:
new_veh_df = neighborhood_vicsus_pedenc_final.copy()

vehgrps = ['white_vehstops', 'black_vehstops']
# creating a variable that has the total pedestrian stops for black and white people
new_veh_df['black_white_total_vehstops'] = new_veh_df[vehgrps].sum(axis = 1)

# calculating white and black stop rate
new_veh_df['white_veh_stoprate'] = new_veh_df['white_vehstops']/new_veh_df['black_white_total_vehstops']
new_veh_df['black_veh_stoprate'] = new_veh_df['black_vehstops']/new_veh_df['black_white_total_vehstops']

# subset this dataframe for merging later
new_veh_df = new_veh_df[['white_veh_stoprate', 'black_veh_stoprate', 'black_white_total_vehstops', 'Neighborhood']]

##### Calculating Population Proportion for Black and White folks by neighborhood
This was already done above. It is in the df, new_pop_df

### Calculating the non-RAD Disparity Index for Vehicle Stops
This is stop rate of a given group divided population proportion of that given group

In [93]:
disparity_veh_df = pd.merge(new_veh_df, vic_info, on = 'Neighborhood', how = 'inner')
disparity_veh_df = pd.merge(disparity_veh_df, new_pop_df, on = 'Neighborhood', how = 'inner')

## Merging disparity information into our pedestrian dataframe (enc_vic_neigh_count)

In [94]:
VehicleRegression_df = pd.merge(enc_vic_count_neigh, disparity_veh_df, on = 'Neighborhood', how = 'inner')
VehicleRegression_df.to_csv("VehicleRegression_df.csv", index = False)