# Spatial Cluster Analysis Pre-Processing Pipeline
## SOCI 20519 Final Research Project 
### Michelangelo Pagan

## Step 1: Loading the US Hospitals Shapefile and Selecting for Chicago Hospitals

In [3]:
import geopandas as gpd

hospitals = gpd.read_file("Hospitals/Hospitals.shp")
hospitals.head(2)

Unnamed: 0,OBJECTID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,TELEPHONE,TYPE,...,WEBSITE,STATE_ID,ALT_NAME,ST_FIPS,OWNER,TTL_STAFF,BEDS,TRAUMA,HELIPAD,geometry
0,8237,1336420,ANDALUSIA HEALTH,"849 SOUTH THREE NOTCH STREET, PO BOX 760",ANDALUSIA,AL,36420,NOT AVAILABLE,(334) 222-8466,GENERAL ACUTE CARE,...,http://www.andalusiaregionalhospital.com,H2001,NOT AVAILABLE,1,PROPRIETARY,-999,88,LEVEL III,Y,POINT (-9628529.163 3671223.674)
1,8238,1535611,ATHENS LIMESTONE HOSPITAL,700 WEST MARKET STREET,ATHENS,AL,35611,NOT AVAILABLE,(256) 262-6468,GENERAL ACUTE CARE,...,http://www.athenslimestonehospital.com,H4201,NOT AVAILABLE,1,PROPRIETARY,-999,71,LEVEL III,N,POINT (-9682392.001 4137299.549)


In [6]:
chicago_hospitals = hospitals.loc[hospitals.CITY == 'CHICAGO']
chicago_hospitals.head(5)

Unnamed: 0,OBJECTID,ID,NAME,ADDRESS,CITY,STATE,ZIP,ZIP4,TELEPHONE,TYPE,...,WEBSITE,STATE_ID,ALT_NAME,ST_FIPS,OWNER,TTL_STAFF,BEDS,TRAUMA,HELIPAD,geometry
1821,10058,97460611,ANN AND ROBERT H LURIE CHILDREN'S HOSPITAL OF ...,"225 EAST CHICAGO AVENUE, PO BOX 140",CHICAGO,IL,60611,NOT AVAILABLE,(312) 227-4000,CHILDREN,...,NOT AVAILABLE,5843,NOT AVAILABLE,17,NOT AVAILABLE,-999,364,LEVEL I PEDIATRIC,Y,POINT (-9754005.088 5145480.827)
1823,10060,260640,"AURORA CHICAGO LAKESHORE HOSPITAL, LLC",4840 N MARINE DRIVE,CHICAGO,IL,60640,NOT AVAILABLE,(773) 878-9700,PSYCHIATRIC,...,https://www.chicagolakeshorehospital.com/,5207,AURORA CHICAGO LAKESHORE HOSPITAL,17,PROPRIETARY,-999,161,NOT AVAILABLE,N,POINT (-9757220.163 5156531.459)
1848,10085,96360624,"GARFIELD PARK HOSPITAL, LLC",520 N RIDGEWAY AVE,CHICAGO,IL,60624,NOT AVAILABLE,(773) 265-3700,PSYCHIATRIC,...,https://garfieldparkhospital.com/,5918,GARFIELD PARK HOSPITAL,17,NOT AVAILABLE,-999,88,NOT AVAILABLE,N,POINT (-9764966.085 5144622.643)
1862,10099,6660629,HOLY CROSS HOSPITAL,2701 W 68TH ST,CHICAGO,IL,60629,NOT AVAILABLE,(773) 884-9000,GENERAL ACUTE CARE,...,NOT AVAILABLE,992,NOT AVAILABLE,17,PROPRIETARY,-999,248,NOT AVAILABLE,N,POINT (-9761871.125 5126500.027)
1867,10104,10060622,HUMBOLDT PARK HEALTH,1044 N FRANCISCO AVE,CHICAGO,IL,60622,NOT AVAILABLE,(773) 292-8200,GENERAL ACUTE CARE,...,NOT AVAILABLE,1727,NOT AVAILABLE,17,NON-PROFIT,-999,210,NOT AVAILABLE,N,POINT (-9762618.652 5146131.304)


## Step 2: Manually Assigining Academic Status and Community Area # To Hospitals

In [7]:
hospital_academic_status = {
    "ANN AND ROBERT H LURIE CHILDREN'S HOSPITAL OF CHICAGO": 1,  # Affiliated with Northwestern
    "AURORA CHICAGO LAKESHORE HOSPITAL, LLC": 0,
    "GARFIELD PARK HOSPITAL, LLC": 0,
    "HOLY CROSS HOSPITAL": 0,
    "HUMBOLDT PARK HEALTH": 0,
    "INSIGHT HOSPITAL AND MEDICAL CENTER": 0,
    "JACKSON PARK HOSPITAL": 0,
    "JOHN H. STROGER JR. HOSPITAL OF COOK COUNTY": 1,  # Teaching hospital, Cook County
    "KINDRED CHICAGO NORTHLAKE, LLC": 0,
    "LA RABIDA CHILDREN'S HOSPITAL": 0,
    "LORETTO HOSPITAL": 0,
    "MT. SINAI HOSPITAL MEDICAL CENTER": 1,  # Affiliated with Sinai Chicago, some academic programs
    "NORTHWESTERN MEMORIAL HOSPITAL": 1,  # Major academic hospital (Feinberg)
    "PIPELINE - WEISS MEMORIAL HOSPITAL, LLC": 0,
    "PRESENCE CHICAGO HOSPITALS NETWORK": 0,
    "PRESENCE ST MARY AND ELIZABETH": 0,
    "PROVIDENT HOSPITAL OF COOK COUNTY": 1,  # Teaching hospital (Cook County)
    "REHABILITATION INSTITUTE OF CHICAGO": 1,  # Now Shirley Ryan AbilityLab, research-focused
    "RML HEALTH PROVIDERS LIMITED PARTNERSHIP": 0,
    "ROSELAND COMMUNITY HOSPITAL": 0,
    "RUSH UNIVERSITY MEDICAL CENTER": 1,  # Rush University teaching hospital
    "SAINT ANTHONY HOSPITAL": 0,
    "SCHWAB REHABILITATION HOSPITAL AND CARE NETWORK": 0,
    "SHRINERS HOSPITAL FOR CHILDREN - CHICAGO": 0,
    "SOUTH SHORE HOSPITAL": 0,
    "ST. BERNARD HOSPITAL": 0,
    "SWEDISH COVENANT HEALTH": 0,
    "THOREK MEMORIAL HOSPITAL": 0,
    "UHS HARTGROVE HOSPITAL": 0,
    "UNIVERSITY OF ILLINOIS HOSPITAL AT CHICAGO": 1,  # UIC Medical School Affiliated
    "JESSE BROWN DEPARTMENT OF VETERANS AFFAIRS MEDICAL CENTER": 1,  # UIC-affiliated VA
    "SACRED HEART HOSPITAL": 0,
    "KINDRED CHICAGO CENTRAL HOSPITAL": 0,
    "UI HEALTH": 1,  # Academic medical center
    "UCHICAGO MEDICINE": 1,  # Major academic hospital (Pritzker)
    "MOUNT SINAI HOSPITAL": 1,  # Affiliated with Sinai Chicago, teaching programs
    "COMMUNITY FIRST MEDICAL CENTER": 0,
    "ASCENSION SAINTS MARY AND ELIZABETH": 0,
    "ASCENSION RESURRECTION": 0,
    "ADVOCATE ILLINOIS MASONIC MEDICAL CENTER": 1,  # Some affiliations with UIC
    "ADVOCATE TRINITY HOSPITAL": 0,
    "ASCENSION SAINT JOSEPH - CHICAGO": 0
}

hospital_CA_number = {
    "ANN AND ROBERT H LURIE CHILDREN'S HOSPITAL OF CHICAGO": 8, 
    "AURORA CHICAGO LAKESHORE HOSPITAL, LLC": 3,
    "GARFIELD PARK HOSPITAL, LLC": 23,
    "HOLY CROSS HOSPITAL": 66,
    "HUMBOLDT PARK HEALTH": 24,
    "INSIGHT HOSPITAL AND MEDICAL CENTER": 33,
    "JACKSON PARK HOSPITAL": 43,
    "JOHN H. STROGER JR. HOSPITAL OF COOK COUNTY": 28,  
    "KINDRED CHICAGO NORTHLAKE, LLC": 4,
    "LA RABIDA CHILDREN'S HOSPITAL": 42,
    "LORETTO HOSPITAL": 25,
    "MT. SINAI HOSPITAL MEDICAL CENTER": 29, 
    "NORTHWESTERN MEMORIAL HOSPITAL": 8,  
    "PIPELINE - WEISS MEMORIAL HOSPITAL, LLC": 3,
    "PRESENCE CHICAGO HOSPITALS NETWORK": 10,
    "PRESENCE ST MARY AND ELIZABETH": 24,
    "PROVIDENT HOSPITAL OF COOK COUNTY": 38, 
    "REHABILITATION INSTITUTE OF CHICAGO": 8, 
    "RML HEALTH PROVIDERS LIMITED PARTNERSHIP": 27,
    "ROSELAND COMMUNITY HOSPITAL": 49,
    "RUSH UNIVERSITY MEDICAL CENTER": 28, 
    "SAINT ANTHONY HOSPITAL": 30,
    "SCHWAB REHABILITATION HOSPITAL AND CARE NETWORK": 29,
    "SHRINERS HOSPITAL FOR CHILDREN - CHICAGO": 18,
    "SOUTH SHORE HOSPITAL": 46,
    "ST. BERNARD HOSPITAL": 68,
    "SWEDISH COVENANT HEALTH": 4,
    "THOREK MEMORIAL HOSPITAL": 3,
    "UHS HARTGROVE HOSPITAL": 25,
    "UNIVERSITY OF ILLINOIS HOSPITAL AT CHICAGO": 28, 
    "JESSE BROWN DEPARTMENT OF VETERANS AFFAIRS MEDICAL CENTER": 28, 
    "SACRED HEART HOSPITAL": 23,
    "KINDRED CHICAGO CENTRAL HOSPITAL": 16,
    "UI HEALTH": 28,  
    "UCHICAGO MEDICINE": 41, 
    "MOUNT SINAI HOSPITAL": 29, 
    "COMMUNITY FIRST MEDICAL CENTER": 15,
    "ASCENSION SAINTS MARY AND ELIZABETH": 24,
    "ASCENSION RESURRECTION": 10,
    "ADVOCATE ILLINOIS MASONIC MEDICAL CENTER": 6, 
    "ADVOCATE TRINITY HOSPITAL": 48,
    "ASCENSION SAINT JOSEPH - CHICAGO": 6
}

import pandas as pd

df_academic_status = pd.DataFrame(
    list(hospital_academic_status.items()), columns=["NAME", "Academic_Status"]
)
df_CA = pd.DataFrame(
    list(hospital_CA_number.items()), columns=["NAME", "CommunityArea"]
)

df_hospitals = chicago_hospitals.merge(df_academic_status, on="NAME", how="left")
df_hospitals_ca = df_hospitals.merge(df_CA, on="NAME", how='left')

df_hospitals_ca.drop(columns = ["ADDRESS", "ZIP4","TELEPHONE", "ST_FIPS", "TTL_STAFF", "COUNTRY", "ALT_NAME", "WEBSITE", "SOURCE", "SOURCEDATE", "VAL_METHOD", "VAL_DATE", "STATE_ID"])

Unnamed: 0,OBJECTID,ID,NAME,CITY,STATE,ZIP,TYPE,STATUS,POPULATION,COUNTY,...,LONGITUDE,NAICS_CODE,NAICS_DESC,OWNER,BEDS,TRAUMA,HELIPAD,geometry,Academic_Status,CommunityArea
0,10058,97460611,ANN AND ROBERT H LURIE CHILDREN'S HOSPITAL OF ...,CHICAGO,IL,60611,CHILDREN,OPEN,364,COOK,...,-87.621719,622110,"CHILDREN'S HOSPITALS, GENERAL",NOT AVAILABLE,364,LEVEL I PEDIATRIC,Y,POINT (-9754005.088 5145480.827),1,8
1,10060,260640,"AURORA CHICAGO LAKESHORE HOSPITAL, LLC",CHICAGO,IL,60640,PSYCHIATRIC,OPEN,161,COOK,...,-87.6506,622210,PSYCHIATRIC AND SUBSTANCE ABUSE HOSPITALS,PROPRIETARY,161,NOT AVAILABLE,N,POINT (-9757220.163 5156531.459),0,3
2,10085,96360624,"GARFIELD PARK HOSPITAL, LLC",CHICAGO,IL,60624,PSYCHIATRIC,OPEN,88,COOK,...,-87.720183,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,NOT AVAILABLE,88,NOT AVAILABLE,N,POINT (-9764966.085 5144622.643),0,23
3,10099,6660629,HOLY CROSS HOSPITAL,CHICAGO,IL,60629,GENERAL ACUTE CARE,OPEN,248,COOK,...,-87.69238,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,PROPRIETARY,248,NOT AVAILABLE,N,POINT (-9761871.125 5126500.027),0,66
4,10104,10060622,HUMBOLDT PARK HEALTH,CHICAGO,IL,60622,GENERAL ACUTE CARE,OPEN,210,COOK,...,-87.699095,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,NON-PROFIT,210,NOT AVAILABLE,N,POINT (-9762618.652 5146131.304),0,24
5,10106,7660616,INSIGHT HOSPITAL AND MEDICAL CENTER,CHICAGO,IL,60616,GENERAL ACUTE CARE,OPEN,412,COOK,...,-87.621492,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,NON-PROFIT,412,NOT AVAILABLE,N,POINT (-9753979.827 5138071.767),0,33
6,10108,8660649,JACKSON PARK HOSPITAL,CHICAGO,IL,60649,GENERAL ACUTE CARE,OPEN,239,COOK,...,-87.585235,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,NON-PROFIT,239,NOT AVAILABLE,N,POINT (-9749943.768 5124757.466),0,43
7,10111,6260612,JOHN H. STROGER JR. HOSPITAL OF COOK COUNTY,CHICAGO,IL,60612,GENERAL ACUTE CARE,OPEN,450,COOK,...,-87.674398,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,GOVERNMENT - LOCAL,450,"LEVEL I ADULT, LEVEL I PEDIATRIC",Y,POINT (-9759869.392 5141874.969),1,28
8,10113,97160618,"KINDRED CHICAGO NORTHLAKE, LLC",CHICAGO,IL,60618,LONG TERM CARE,OPEN,94,COOK,...,-87.693297,622110,GENERAL MEDICAL AND SURGICAL HOSPITALS,NOT AVAILABLE,94,NOT AVAILABLE,N,POINT (-9761973.188 5155214.46),0,4
9,10117,1360649,LA RABIDA CHILDREN'S HOSPITAL,CHICAGO,IL,60649,CHILDREN,OPEN,49,COOK,...,-87.571312,622110,"CHILDREN'S HOSPITALS, GENERAL",NON-PROFIT,49,NOT AVAILABLE,N,POINT (-9748393.908 5127677.225),0,42


## Step 3: Saving Updated Shapefiles (with Separate Files for Academic and Non-Academic)

In [8]:
df_hospitals.to_file("/Users/michelangelopagan/Desktop/DATA/Data20519/Spatial Analysis RQ Data/Filtered/ChiHospitals.shp")

hospitals_academic = df_hospitals_ca.loc[df_hospitals_ca.Academic_Status == 1]
hospitals_nonacademic = df_hospitals_ca.loc[df_hospitals_ca.Academic_Status == 0]

hospitals_academic.to_file("/Users/michelangelopagan/Desktop/DATA/Data20519/Spatial Analysis RQ Data/Filtered/ChiAcademicHospitals.shp")
hospitals_nonacademic.to_file("/Users/michelangelopagan/Desktop/DATA/Data20519/Spatial Analysis RQ Data/Filtered/ChiNonAcademicHospitals.shp")

## Step 4: Creating EPA Metrics for Academic and Non-Academic Hospitals in Chicago

In [14]:
academic_counts = df_hospitals["Academic_Status"].value_counts()
total_hospitals = len(df_hospitals)

print("Hospital Count by Academic Status:")
for status, count in academic_counts.items():
    percentage = (count / total_hospitals) * 100
    print(f"{status}: {count} hospitals ({percentage:.2f}%)")

# Sum the total number of beds in academic and non-academic hospitals
bed_sums = df_hospitals.groupby("Academic_Status")["BEDS"].sum()

total_beds = bed_sums.sum()

print("\nTotal Beds by Academic Status:")
for status, bed_count in bed_sums.items():
    percentage = (bed_count / total_beds) * 100
    print(f"{status}: {bed_count} beds ({percentage:.2f}%)")

Hospital Count by Academic Status:
0: 31 hospitals (68.89%)
1: 14 hospitals (31.11%)

Total Beds by Academic Status:
0: 6427 beds (52.16%)
1: 5895 beds (47.84%)


## Step 5: Mapping Community Areas to Zip Codes Using Census Tract Walk Files and Coverage Ratios

The files below contain **census tract data** and their 'total_ratio' (in terms of area) in each ZIP code. The tract has census tracts in all the US, which I shrink to just Cook county.

In [None]:
tract_CA = pd.read_csv("Spatial Analysis RQ Data/Census_Tracts_in_Chicago_Community_Areas.csv")
tract_ZIP = pd.read_csv("Spatial Analysis RQ Data/TRACT_ZIP_032010.csv")

tract_ZIP.head(2)

Unnamed: 0,TRACT,ZIP,RES_RATIO,BUS_RATIO,OTH_RATIO,TOT_RATIO
0,1001020100,36067,1.0,1.0,1.0,1.0
1,1001020200,36008,0.027254,0.003378,0.027778,0.021773


In [None]:
cook_tracts = tract_CA.Tract.to_list()

In [None]:
cook_tract_zip = tract_ZIP[tract_ZIP['TRACT'].isin(cook_tracts)].drop(columns = ["RES_RATIO", "BUS_RATIO", "OTH_RATIO"]).reset_index(drop=True).rename(columns = {"TRACT":"Tract"})
cook_tract_zip.head(2)

Unnamed: 0,Tract,ZIP,TOT_RATIO
0,17031010100,60202,0.0003
1,17031010100,60626,0.9997


In [None]:
tract_CA

Unnamed: 0,Tract,Label,CommunityAreaNumber,CommunityAreaName
0,17031010100,"Census Tract 101, Cook County, Illinois",1,Rogers Park
1,17031010201,"Census Tract 102.01, Cook County, Illinois",1,Rogers Park
2,17031010202,"Census Tract 102.02, Cook County, Illinois",1,Rogers Park
3,17031010300,"Census Tract 103, Cook County, Illinois",1,Rogers Park
4,17031010400,"Census Tract 104, Cook County, Illinois",1,Rogers Park
...,...,...,...,...
789,17031843500,"Census Tract 8435, Cook County, Illinois",30,South Lawndale
790,17031843600,"Census Tract 8436, Cook County, Illinois",38,Grand Boulevard
791,17031843700,"Census Tract 8437, Cook County, Illinois",5,North Center
792,17031843800,"Census Tract 8438, Cook County, Illinois",61,New City


Now that I have tract data for Cook County mapped to Community Areas, I can merge the dataframes to include the tract, ZIP it belongs to, ratio within the ZIP, and Community Area Name. This is useful, because we can now take hospital data that is at a ZIP code level, use proportions to assign those into census tracts, and sum them for putting them in Community Areas.

In [None]:
merged_CA_ZIP_df = cook_tract_zip.merge(tract_CA, on="Tract", how="inner")
merged_CA_ZIP_df.head(2)

Unnamed: 0,Tract,ZIP,TOT_RATIO,Label,CommunityAreaNumber,CommunityAreaName
0,17031010100,60202,0.0003,"Census Tract 101, Cook County, Illinois",1,Rogers Park
1,17031010100,60626,0.9997,"Census Tract 101, Cook County, Illinois",1,Rogers Park


## Step 6: Integrating Readmission Data for Heart Failure, Pneumonia, and Heart Attacks
- for the readmission dataframes, the unit is per 100k (by ZIP)
- for the preventable and non preventable ER hospitalizations, the unit is percentage (by ZIP)

In [1]:
HA_readmit = pd.read_csv("Spatial Analysis RQ Data/HA_Readmissions.csv", header =1 )
HF_readmit = pd.read_csv("Spatial Analysis RQ Data/HF_Readmissions.csv", header =1 )
non_preventable = pd.read_csv("Spatial Analysis RQ Data/non_preventable.csv", header =1 )
PN_readmit = pd.read_csv("Spatial Analysis RQ Data/PN_readmissions.csv", header =1 )
preventable = pd.read_csv("Spatial Analysis RQ Data/preventable.csv", header =1 )

NameError: name 'pd' is not defined

In [None]:
HA_readmit = HA_readmit[HA_readmit["region"].astype(str).str.isdigit()]
HA_readmit = HA_readmit[["region", "value", "date_start", "date_end"]]
chicago_zip_codes = [
    "60601", "60602", "60603", "60604", "60605", "60606", "60607", "60608", "60609", "60610",
    "60611", "60612", "60613", "60614", "60615", "60616", "60617", "60618", "60619", "60620",
    "60621", "60622", "60623", "60624", "60625", "60626", "60628", "60629", "60630", "60631",
    "60632", "60633", "60634", "60636", "60637", "60638", "60639", "60640", "60641", "60642",
    "60643", "60644", "60645", "60646", "60647", "60649", "60651", "60652", "60653", "60654",
    "60655", "60656", "60657", "60659", "60660", "60661", "60664", "60666", "60680", "60681",
    "60690", "60691", "60701", "60706", "60707", "60803", "60804", "60805", "60827"
]

HA_chicago = HA_readmit[HA_readmit["region"].astype(str).isin(chicago_zip_codes)]
HA_chicago = HA_chicago.copy()
HA_chicago.columns = HA_chicago.columns.str.strip()
HA_chicago.loc[:, "value"] = pd.to_numeric(HA_chicago["value"], errors="coerce")
HA_chicago = HA_chicago.dropna(subset=["value"])
HA_chicago["region"] = HA_chicago["region"].astype(str)
HA_chicago_avg = HA_chicago.groupby("region", as_index=False).agg({"value": "mean"})
HA_chicago_avg.rename(columns={"region": "ZIP", "value": "avg_readmission_rate"}, inplace=True)

In [None]:
def process_readmission_data(df, chicago_zip_codes):
    """
    Cleans and processes a readmission DataFrame to filter only Chicago ZIP codes
    and compute the average readmission rate per ZIP code.

    Parameters:
    df (pd.DataFrame): The raw readmission dataset.
    chicago_zip_codes (list): List of valid Chicago ZIP codes.

    Returns:
    pd.DataFrame: Processed DataFrame with ZIP and avg_readmission_rate.
    """
    df = df[df["region"].astype(str).str.isdigit()]

    df = df[["region", "value", "date_start", "date_end"]]

    df = df[df["region"].astype(str).isin(chicago_zip_codes)].copy()

    df.columns = df.columns.str.strip()

    df.loc[:, "value"] = pd.to_numeric(df["value"], errors="coerce")

    df = df.dropna(subset=["value"])

    df["region"] = df["region"].astype(str)

    df_avg = df.groupby("region", as_index=False).agg({"value": "mean"})

    df_avg.rename(columns={"region": "ZIP", "value": "avg_readmission_per_100k"}, inplace=True)

    return df_avg

chicago_zip_codes = [
    "60601", "60602", "60603", "60604", "60605", "60606", "60607", "60608", "60609", "60610",
    "60611", "60612", "60613", "60614", "60615", "60616", "60617", "60618", "60619", "60620",
    "60621", "60622", "60623", "60624", "60625", "60626", "60628", "60629", "60630", "60631",
    "60632", "60633", "60634", "60636", "60637", "60638", "60639", "60640", "60641", "60642",
    "60643", "60644", "60645", "60646", "60647", "60649", "60651", "60652", "60653", "60654",
    "60655", "60656", "60657", "60659", "60660", "60661", "60664", "60666", "60680", "60681",
    "60690", "60691", "60701", "60706", "60707", "60803", "60804", "60805", "60827"
]

HA_chicago_avg = process_readmission_data(HA_readmit, chicago_zip_codes).rename(columns = {"avg_readmission_per_100k": "avg_HA_readmission_per_100k"})
HF_chicago_avg = process_readmission_data(HF_readmit, chicago_zip_codes).rename(columns = {"avg_readmission_per_100k": "avg_HF_readmission_per_100k"})
non_preventable_avg = process_readmission_data(non_preventable, chicago_zip_codes).rename(columns = {"avg_readmission_per_100k": "non_preventable_em_percent"})
PN_chicago_avg = process_readmission_data(PN_readmit, chicago_zip_codes).rename(columns = {"avg_readmission_per_100k": "avg_PN_readmission_per_100k"})
preventable_avg = process_readmission_data(preventable, chicago_zip_codes).rename(columns = {"avg_readmission_per_100k": "preventable_em_percent"})

Finding the data frame that includes the largest amount of zip code data to perform left joins:

In [None]:
largest_df = max(
    [HA_chicago_avg, HF_chicago_avg, non_preventable_avg, PN_chicago_avg, preventable_avg], 
    key=lambda df: df["ZIP"].nunique()
)
largest_df

Unnamed: 0,ZIP,non_preventable_em_percent
0,60601,13.2450
1,60602,11.2275
2,60603,11.5500
3,60604,10.5575
4,60605,13.2075
...,...,...
57,60707,11.9000
58,60803,12.1350
59,60804,10.7725
60,60805,12.1325


In [None]:
merged_df = largest_df.copy()

# Merge each dataset one by one, keeping all ZIPs from the largest DataFrame
merged_df = merged_df.merge(HA_chicago_avg, on="ZIP", how="left")
merged_df = merged_df.merge(HF_chicago_avg, on="ZIP", how="left")
merged_df = merged_df.merge(non_preventable_avg, on="ZIP", how="left")
merged_df = merged_df.merge(PN_chicago_avg, on="ZIP", how="left")
merged_df = merged_df.merge(preventable_avg, on="ZIP", how="left")

In [None]:
merged_admission_df = merged_df.copy()

### We now have these readmission metrics per zip code where data was available. Next step is to go back and merge them with the other DF to start doing the spatial weight .

In [None]:
merged_admission_df["ZIP"] = [int(x) for x in merged_admission_df["ZIP"]]
merged_admission_df

Unnamed: 0,ZIP,non_preventable_em_percent_x,avg_HA_readmission_per_100k,avg_HF_readmission_per_100k,non_preventable_em_percent_y,avg_PN_readmission_per_100k,preventable_em_percent
0,60601,13.2450,,,13.2450,,3.7450
1,60602,11.2275,,,11.2275,,3.9900
2,60603,11.5500,,,11.5500,,4.6375
3,60604,10.5575,,,10.5575,,4.6725
4,60605,13.2075,,78.035000,13.2075,,4.2475
...,...,...,...,...,...,...,...
57,60707,11.9000,60.280,97.780000,11.9000,42.970000,5.1375
58,60803,12.1350,,127.823333,12.1350,,5.2500
59,60804,10.7725,22.375,58.226667,10.7725,27.436667,5.2000
60,60805,12.1325,,162.046667,12.1325,103.170000,4.8725


In [None]:
merged_CA_ZIP_df.loc[merged_CA_ZIP_df.ZIP == 60637]

Unnamed: 0,Tract,ZIP,TOT_RATIO,Label,CommunityAreaNumber,CommunityAreaName
468,17031400300,60637,0.004902,"Census Tract 4003, Cook County, Illinois",40,Washington Park
469,17031400400,60637,1.0,"Census Tract 4004, Cook County, Illinois",40,Washington Park
471,17031400500,60637,0.982784,"Census Tract 4005, Cook County, Illinois",40,Washington Park
473,17031400800,60637,0.990352,"Census Tract 4008, Cook County, Illinois",40,Washington Park
482,17031411000,60637,0.911901,"Census Tract 4110, Cook County, Illinois",41,Hyde Park
484,17031411100,60637,0.826358,"Census Tract 4111, Cook County, Illinois",41,Hyde Park
486,17031411200,60637,0.970884,"Census Tract 4112, Cook County, Illinois",41,Hyde Park
487,17031420100,60637,1.0,"Census Tract 4201, Cook County, Illinois",42,Woodlawn
488,17031420200,60637,1.0,"Census Tract 4202, Cook County, Illinois",42,Woodlawn
489,17031420300,60637,1.0,"Census Tract 4203, Cook County, Illinois",42,Woodlawn


In [None]:
# Merge ZIP-level readmission data onto Tract-to-Community mapping
weighted_data = merged_CA_ZIP_df.merge(merged_admission_df, on="ZIP", how="left")

# Compute weighted readmission rates by multiplying with TOT_RATIO
for col in [
    "avg_HA_readmission_per_100k", 
    "avg_HF_readmission_per_100k", 
    "avg_PN_readmission_per_100k", 
    "non_preventable_em_percent_x", 
    "preventable_em_percent"
]:
    weighted_data[f"{col}_weighted"] = weighted_data[col] * weighted_data["TOT_RATIO"]

weighted_data

Unnamed: 0,Tract,ZIP,TOT_RATIO,Label,CommunityAreaNumber,CommunityAreaName,non_preventable_em_percent_x,avg_HA_readmission_per_100k,avg_HF_readmission_per_100k,non_preventable_em_percent_y,avg_PN_readmission_per_100k,preventable_em_percent,avg_HA_readmission_per_100k_weighted,avg_HF_readmission_per_100k_weighted,avg_PN_readmission_per_100k_weighted,non_preventable_em_percent_x_weighted,preventable_em_percent_weighted
0,17031010100,60202,0.000300,"Census Tract 101, Cook County, Illinois",1,Rogers Park,,,,,,,,,,,
1,17031010100,60626,0.999700,"Census Tract 101, Cook County, Illinois",1,Rogers Park,10.970,23.903333,58.663333,10.970,45.733333,5.7200,23.896162,58.645734,45.719613,10.966709,5.718284
2,17031010300,60626,1.000000,"Census Tract 103, Cook County, Illinois",1,Rogers Park,10.970,23.903333,58.663333,10.970,45.733333,5.7200,23.903333,58.663333,45.733333,10.970000,5.720000
3,17031010400,60626,0.997992,"Census Tract 104, Cook County, Illinois",1,Rogers Park,10.970,23.903333,58.663333,10.970,45.733333,5.7200,23.855335,58.545537,45.641501,10.947972,5.708514
4,17031010400,60660,0.002008,"Census Tract 104, Cook County, Illinois",1,Rogers Park,11.755,38.216667,86.233333,11.755,28.160000,4.9300,0.076740,0.173159,0.056546,0.023604,0.009900
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
752,17031750400,60643,0.108309,"Census Tract 7504, Cook County, Illinois",75,Morgan Park,11.170,43.373333,175.660000,11.170,52.633333,6.3425,4.697722,19.025559,5.700664,1.209812,0.686950
753,17031750400,60655,0.891691,"Census Tract 7504, Cook County, Illinois",75,Morgan Park,12.180,,74.703333,12.180,,4.0975,,66.612290,,10.860796,3.653704
754,17031750500,60643,1.000000,"Census Tract 7505, Cook County, Illinois",75,Morgan Park,11.170,43.373333,175.660000,11.170,52.633333,6.3425,43.373333,175.660000,52.633333,11.170000,6.342500
755,17031750600,60628,0.043450,"Census Tract 7506, Cook County, Illinois",75,Morgan Park,10.250,60.360000,211.486667,10.250,49.286667,7.5950,2.622648,9.189117,2.141511,0.445364,0.330004


In [None]:
# Group by Community Area
community_readmission = weighted_data.groupby(["CommunityAreaNumber", "CommunityAreaName"]).agg({
    "avg_HA_readmission_per_100k_weighted": "sum",
    "avg_HF_readmission_per_100k_weighted": "sum",
    "avg_PN_readmission_per_100k_weighted": "sum",
    "non_preventable_em_percent_x_weighted": "sum",
    "preventable_em_percent_weighted": "sum",
    "TOT_RATIO": "sum"  # Used for normalization
}).reset_index()

# Convert sums to weighted averages
for col in [
    "avg_HA_readmission_per_100k_weighted", 
    "avg_HF_readmission_per_100k_weighted", 
    "avg_PN_readmission_per_100k_weighted", 
    "non_preventable_em_percent_x_weighted", 
    "preventable_em_percent_weighted"
]:
    community_readmission[col] = community_readmission[col] / community_readmission["TOT_RATIO"]

# Rename columns for clarity
community_readmission.rename(columns={
    "avg_HA_readmission_per_100k_weighted": "HA_readmit_per_100k",
    "avg_HF_readmission_per_100k_weighted": "HF_readmit_per_100k",
    "avg_PN_readmission_per_100k_weighted": "PN_readmit_per_100k",
    "non_preventable_em_percent_x_weighted": "NonPreventable_readmit_pct",
    "preventable_em_percent_weighted": "Preventable_readmit_pct"
}, inplace=True)

# Drop TOT_RATIO as it is no longer needed
community_readmission = community_readmission.drop(columns=["TOT_RATIO"])

In [None]:
community_readmission

Unnamed: 0,CommunityAreaNumber,CommunityAreaName,HA_readmit_per_100k,HF_readmit_per_100k,PN_readmit_per_100k,NonPreventable_readmit_pct,Preventable_readmit_pct
0,1,Rogers Park,23.929434,58.712661,45.695651,10.970706,5.718031
1,2,West Ridge,40.979930,82.588839,44.950413,11.460243,5.430824
2,3,Uptown,33.381359,71.612356,39.319865,11.157251,5.162442
3,4,Lincoln Square,29.753350,57.172124,39.940210,11.320239,5.439591
4,5,North Center,33.209274,65.029675,17.378078,11.615685,4.971361
...,...,...,...,...,...,...,...
69,72,Beverly,32.052750,149.346675,38.910664,11.433137,5.757955
70,73,Washington Heights,45.395734,187.284662,50.449703,10.874660,6.931355
71,74,Mount Greenwood,0.000000,74.750124,0.055270,12.179975,4.097915
72,75,Morgan Park,30.403794,145.429947,36.624483,11.466093,5.675410


## Step 7: Integrating Readmission Data with the Existing Chicago Community Area Shapefile

In [None]:
import geopandas as gpd
community_areas = gpd.read_file("data/Chi-CCA/Chicago_2020.shp")
community_areas['area_numbe'] = community_areas['area_numbe'].astype(int)
community_areas.rename(columns = {'area_numbe': 'CommunityAreaNumber'}, inplace=True)
community_areas

Unnamed: 0,area_num_1,CommunityAreaNumber,community,shape_area,shape_len,districtno,district,GEOID,GEOG,2000_POP,...,KOREAN,OTHASIAN,OTHER_EURO,OTHUNSPEC,2000_WHITE,2000_HISP,2000_BLACK,2000_ASIAN,2000_OTHER,geometry
30,3.0,3,UPTOWN,65095640.0,46972.794555,1.0,Far North,3.0,Uptown,63551.0,...,625.0,1833.0,3456.0,3403.0,26784.0,12674.0,13415.0,8206.0,2472.0,"POLYGON ((-87.64102 41.9548, -87.644 41.95465,..."


In [None]:
community_readmission

Unnamed: 0,CommunityAreaNumber,CommunityAreaName,HA_readmit_per_100k,HF_readmit_per_100k,PN_readmit_per_100k,NonPreventable_readmit_pct,Preventable_readmit_pct
0,1,Rogers Park,23.929434,58.712661,45.695651,10.970706,5.718031
1,2,West Ridge,40.979930,82.588839,44.950413,11.460243,5.430824
2,3,Uptown,33.381359,71.612356,39.319865,11.157251,5.162442
3,4,Lincoln Square,29.753350,57.172124,39.940210,11.320239,5.439591
4,5,North Center,33.209274,65.029675,17.378078,11.615685,4.971361
...,...,...,...,...,...,...,...
69,72,Beverly,32.052750,149.346675,38.910664,11.433137,5.757955
70,73,Washington Heights,45.395734,187.284662,50.449703,10.874660,6.931355
71,74,Mount Greenwood,0.000000,74.750124,0.055270,12.179975,4.097915
72,75,Morgan Park,30.403794,145.429947,36.624483,11.466093,5.675410


In [None]:
final_CA_df = community_areas.merge(community_readmission, on = "CommunityAreaNumber", how='left')

final_CA_df.to_file("/Users/michelangelopagan/Desktop/DATA/Data20519/Spatial Analysis RQ Data/CA_withparams/comm_areas_processed.shp")


  final_CA_df.to_file("/Users/michelangelopagan/Desktop/DATA/Data20519/Spatial Analysis RQ Data/CA_withparams/comm_areas_processed.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


## Step 8: Creating Distance Metrics
1. `distance_to_academic` : The distance (in meters) to the nearest academic hopsital from the community area centroid.
2. `distance_to_nonacademic` : The distance (in meters) to the nearest non-academic hopsital from the community area centroid.

In [2]:
from shapely.geometry import Point
from geopandas.tools import sjoin
import geopandas as gpd

community_areas = gpd.read_file("CA_withparams/comm_areas_processed.shp")  # Polygon layer
academic_hospitals = gpd.read_file("Filtered/ChiAcademicHospitals.shp")  # Point layer
non_academic_hospitals = gpd.read_file("Filtered/ChiNonAcademicHospitals.shp")  # Point layer

print(community_areas.crs)

GEOGCS["unknown",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433],AXIS["Longitude",EAST],AXIS["Latitude",NORTH]]


In [2]:
community_areas["centroid"] = community_areas.geometry.centroid

# Find nearest academic hospital
community_areas["nearest_academic"] = community_areas["centroid"].apply(
    lambda x: academic_hospitals.distance(x).min()
)

# Find nearest non-academic hospital
community_areas["nearest_non_academic"] = community_areas["centroid"].apply(
    lambda x: non_academic_hospitals.distance(x).min()
)

# Save updated shapefile
#community_areas = community_areas.drop(columns=["centroid"])
#community_areas.to_file("CA_withparams/updated_community_areas.shp")


  community_areas["centroid"] = community_areas.geometry.centroid
  community_areas.to_file("CA_withparams/updated_community_areas.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [3]:
import geopandas as gpd
from shapely.ops import nearest_points

community_areas = gpd.read_file("CA_withparams/comm_areas_processed.shp")  # Polygon layer
academic_hospitals = gpd.read_file("Filtered/ChiAcademicHospitals.shp")  # Point layer
non_academic_hospitals = gpd.read_file("Filtered/ChiNonAcademicHospitals.shp")  # Point layer

community_areas = community_areas.to_crs(epsg=32616)  # UTM Zone 16N (for Chicago)
academic_hospitals = academic_hospitals.to_crs(epsg=32616)
non_academic_hospitals = non_academic_hospitals.to_crs(epsg=32616)

GEOGCS["unknown",DATUM["WGS_1984",SPHEROID["WGS 84",6378137,298.257223563,AUTHORITY["EPSG","7030"]],AUTHORITY["EPSG","6326"]],PRIMEM["Greenwich",0],UNIT["Degree",0.0174532925199433],AXIS["Longitude",EAST],AXIS["Latitude",NORTH]]
EPSG:3857
EPSG:3857


In [5]:
community_areas["centroid"] = community_areas.geometry.centroid

def nearest_geom(row, target_gdf):
    nearest = target_gdf.geometry.apply(lambda x: row.distance(x))
    return target_gdf.loc[nearest.idxmin(), "geometry"].distance(row)

community_areas["distance_to_academic"] = community_areas["centroid"].apply(
    lambda x: nearest_geom(x, academic_hospitals)
)
community_areas["distance_to_non_academic"] = community_areas["centroid"].apply(
    lambda x: nearest_geom(x, non_academic_hospitals)
)

In [20]:
community_areas = community_areas.drop(columns = ['centroid'])
community_areas.to_file("CA_withparams/updated_with_distance.shp")

  community_areas.to_file("CA_withparams/updated_with_distance.shp")
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(
  ogr_write(


In [21]:
community_areas.head(5)

Unnamed: 0,area_num_1,CommunityA,community,shape_area,shape_len,districtno,district,GEOID,GEOG,2000_POP,...,CATEGORIES,card,cpval,Ac_Density,Hospi_ount,Centroid_X,Centroid_Y,geometry,distance_to_academic,distance_to_non_academic
0,35.0,35,DOUGLAS,46004620.0,31027.05451,7.0,South Side,35.0,Douglas,26470.0,...,3.0,2.0,0.030937,0.0,0.0,-87.618678,41.835118,"POLYGON ((449429.813 4632712.182, 449429.135 4...",3629.450884,1327.708351
1,36.0,36,OAKLAND,16913960.0,19565.506153,7.0,South Side,36.0,Oakland,6110.0,...,3.0,0.0,0.0,0.0,0.0,-87.603216,41.82375,"POLYGON ((450818.899 4629619.879, 450806.01 46...",2486.008507,2983.833559
2,37.0,37,FULLER PARK,19916700.0,25339.08975,7.0,South Side,37.0,Fuller Park,3420.0,...,1.0,0.0,0.0,0.0,0.0,-87.632425,41.809085,"POLYGON ((447763.097 4627972.082, 447763.324 4...",1738.993887,3408.857018
3,38.0,38,GRAND BOULEVARD,48492500.0,28196.837157,7.0,South Side,38.0,Grand Boulevard,28006.0,...,2.0,2.0,0.030937,1.0,1.0,-87.61786,41.812949,"POLYGON ((449609.917 4629615.48, 449610.009 46...",1201.318651,3780.450836
4,39.0,39,KENWOOD,29071740.0,23325.167906,7.0,South Side,39.0,Kenwood,18363.0,...,2.0,2.0,0.030937,0.0,0.0,-87.596184,41.808916,"POLYGON ((450818.899 4629619.879, 450819.11 46...",1583.556456,4074.022567
