# Chicago Dataset

In [17]:
# Import dependencies
import os
import csv
import pandas as pd
import json
from pathlib import Path


## Demographic Metric

**Measuring**: ACS Median Age, ACS Racial Demographics, Education Level

Data was pulled from the 2020 Chicago Census Database and the American Community Survey (ACS) by the U.S. Census Bureau using various coded tables to ensure unified data for each metropolitan city. Two tables used the same code and were seperated to ensure proper measurement. 
* ACS Median Age: Table DP05
* Racial Demograpics: Table DP05
* Education Level: S1501

#### ACS Median Age (Table DP05)

In [2]:
# Import csv file age data (ACS 2020) 
chicago_age_data = '../chicago_data/chi_raw_csv/chi_age_demo.csv'

# Read csv file into DataFrame
chicago_age_df = pd.read_csv(chicago_age_data)
chicago_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,...,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years),SEX AND AGE!!Total population!!Under 18 years,SEX AND AGE!!Total population!!16 years and over,SEX AND AGE!!Total population!!18 years and over,SEX AND AGE!!Total population!!21 years and over,SEX AND AGE!!Total population!!62 years and over,SEX AND AGE!!Total population!!65 years and over,SEX AND AGE!!Total population!!18 years and over.1
0,Illinois,,,,,,,,,,...,,,,,,,,,,
1,Estimate,,12716164.0,6247083,6469081,755518,768804,829779,831088,844275,...,573017,270768,38.3,2855433,10193604,9860731,9363657,2443504,1990426,9860731.0
2,Percent,,12716164.0,49.1%,50.9%,5.9%,6.0%,6.5%,6.5%,6.6%,...,4.5%,2.1%,(X),22.5%,80.2%,77.5%,73.6%,19.2%,15.7%,9860731.0
3,"Cook County, Illinois",,,,,,,,,,...,,,,,,,,,,
4,Estimate,,5169517.0,2509686,2659831,315368,302232,322415,310116,334453,...,219634,104191,37.0,1128625,4167295,4040892,3859321,931306,757688,4040892.0


In [3]:
# Drop all rows where 'Label (Grouping)' column contains 'Illinois'
chicago_age_df = chicago_age_df[chicago_age_df['Label (Grouping)'] != 'Illinois']
chicago_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,...,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years),SEX AND AGE!!Total population!!Under 18 years,SEX AND AGE!!Total population!!16 years and over,SEX AND AGE!!Total population!!18 years and over,SEX AND AGE!!Total population!!21 years and over,SEX AND AGE!!Total population!!62 years and over,SEX AND AGE!!Total population!!65 years and over,SEX AND AGE!!Total population!!18 years and over.1
1,Estimate,,12716164.0,6247083,6469081,755518,768804,829779,831088,844275,...,573017,270768,38.3,2855433,10193604,9860731,9363657,2443504,1990426,9860731.0
2,Percent,,12716164.0,49.1%,50.9%,5.9%,6.0%,6.5%,6.5%,6.6%,...,4.5%,2.1%,(X),22.5%,80.2%,77.5%,73.6%,19.2%,15.7%,9860731.0
3,"Cook County, Illinois",,,,,,,,,,...,,,,,,,,,,
4,Estimate,,5169517.0,2509686,2659831,315368,302232,322415,310116,334453,...,219634,104191,37.0,1128625,4167295,4040892,3859321,931306,757688,4040892.0
5,Percent,,5169517.0,48.5%,51.5%,6.1%,5.8%,6.2%,6.0%,6.5%,...,4.2%,2.0%,(X),21.8%,80.6%,78.2%,74.7%,18.0%,14.7%,4040892.0


In [4]:
# Drop first 2 rows (contain unnecessary Illinois data)
chicago_age_df = chicago_age_df.drop(index=[1, 2])
chicago_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE,SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Male,SEX AND AGE!!Total population!!Female,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,...,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years),SEX AND AGE!!Total population!!Under 18 years,SEX AND AGE!!Total population!!16 years and over,SEX AND AGE!!Total population!!18 years and over,SEX AND AGE!!Total population!!21 years and over,SEX AND AGE!!Total population!!62 years and over,SEX AND AGE!!Total population!!65 years and over,SEX AND AGE!!Total population!!18 years and over.1
3,"Cook County, Illinois",,,,,,,,,,...,,,,,,,,,,
4,Estimate,,5169517.0,2509686,2659831,315368,302232,322415,310116,334453,...,219634,104191,37.0,1128625,4167295,4040892,3859321,931306,757688,4040892.0
5,Percent,,5169517.0,48.5%,51.5%,6.1%,5.8%,6.2%,6.0%,6.5%,...,4.2%,2.0%,(X),21.8%,80.6%,78.2%,74.7%,18.0%,14.7%,4040892.0


In [5]:
# Display columns and data types
chicago_age_df.dtypes

Label (Grouping)                                       object
SEX AND AGE                                           float64
SEX AND AGE!!Total population                          object
SEX AND AGE!!Total population!!Male                    object
SEX AND AGE!!Total population!!Female                  object
SEX AND AGE!!Total population!!Under 5 years           object
SEX AND AGE!!Total population!!5 to 9 years            object
SEX AND AGE!!Total population!!10 to 14 years          object
SEX AND AGE!!Total population!!15 to 19 years          object
SEX AND AGE!!Total population!!20 to 24 years          object
SEX AND AGE!!Total population!!25 to 34 years          object
SEX AND AGE!!Total population!!35 to 44 years          object
SEX AND AGE!!Total population!!45 to 54 years          object
SEX AND AGE!!Total population!!55 to 59 years          object
SEX AND AGE!!Total population!!60 to 64 years          object
SEX AND AGE!!Total population!!65 to 74 years          object
SEX AND 

In [6]:
# Rename columns to remove 'SEX AND AGE' from the column names
chicago_age_df.columns = chicago_age_df.columns.str.replace('SEX AND AGE!!', '')

# Drop unnecessary columns
chicago_age_df = chicago_age_df.drop(columns=['Total population!!16 years and over',
                                              'Total population!!18 years and over.1',
                                              'Total population!!62 years and over',
                                              'Total population!!21 years and over',
                                              'Total population!!Under 18 years',
                                              'Total population!!65 years and over',
                                              'Total population!!Male',
                                              'Total population!!Female',
                                              'SEX AND AGE',
                                              'Total population!!18 years and over'
                                            ])


In [7]:
chicago_age_df.dtypes

Label (Grouping)                        object
Total population                        object
Total population!!Under 5 years         object
Total population!!5 to 9 years          object
Total population!!10 to 14 years        object
Total population!!15 to 19 years        object
Total population!!20 to 24 years        object
Total population!!25 to 34 years        object
Total population!!35 to 44 years        object
Total population!!45 to 54 years        object
Total population!!55 to 59 years        object
Total population!!60 to 64 years        object
Total population!!65 to 74 years        object
Total population!!75 to 84 years        object
Total population!!85 years and over     object
Total population!!Median age (years)    object
dtype: object

In [17]:
# Rename columns for clarity
chicago_age_df.columns = ['',
  'Total Population',
  'Under 5',
  '5 to 9',
  '10 to 14',
  '15 to 19',
  '20 to 24',
  '25 to 34',
  '35 to 44',
  '45 to 54',
  '55 to 59',
  '60 to 64',
  '65 to 74',
  '75 to 84',
  '85 and Over',
  'Median Age'
]

chicago_age_df.head()

Unnamed: 0,Unnamed: 1,Total Population,Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,55 to 59,60 to 64,65 to 74,75 to 84,85 and Over,Median Age
4,Estimate,5169517,315368,302232,322415,310116,334453,844827,699725,647738,330348,304607,433863,219634,104191,37.0
5,Percent,5169517,6.1%,5.8%,6.2%,6.0%,6.5%,16.3%,13.5%,12.5%,6.4%,5.9%,8.4%,4.2%,2.0%,(X)


In [11]:
# Drop NaN values from the DataFrame
chicago_age_df = chicago_age_df.dropna()

In [18]:
# Save the cleaned DataFrame to a new CSV file
chicago_age_df.to_csv('../chicago_data/cleaned_chi_age.csv', index=False)

#### Racial Demographics (Table DP05)

In [13]:
# Import csv file for race data
chicago_race_data = '../chicago_data/chi_raw_csv/chi_race.csv'

# Read csv file into DataFrame
chicago_race_df = pd.read_csv(chicago_race_data)
chicago_race_df.head()

Unnamed: 0,Label (Grouping),RACE,Race alone or in combination with one or more other races!!Total population,Race alone or in combination with one or more other races!!Total population!!White,Race alone or in combination with one or more other races!!Total population!!Black or African American,Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native,Race alone or in combination with one or more other races!!Total population!!Asian,Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander,Race alone or in combination with one or more other races!!Total population!!Some other race
0,Illinois,,,,,,,,
1,Estimate,,12716164.0,9367474,1957092,104386,828847,16535,1013633
2,Percent,,12716164.0,73.7%,15.4%,0.8%,6.5%,0.1%,8.0%
3,"Cook County, Illinois",,,,,,,,
4,Estimate,,5169517.0,3041776,1263902,43551,445203,6818,640553


In [14]:
# Delete the first row (contains unnecessary Illinois data)
chicago_race_df = chicago_race_df[chicago_race_df['Label (Grouping)'] != 'Illinois']

In [15]:
# Delete first 2 rows
chicago_race_df = chicago_race_df.drop(index=[1, 2])
chicago_race_df.head()

Unnamed: 0,Label (Grouping),RACE,Race alone or in combination with one or more other races!!Total population,Race alone or in combination with one or more other races!!Total population!!White,Race alone or in combination with one or more other races!!Total population!!Black or African American,Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native,Race alone or in combination with one or more other races!!Total population!!Asian,Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander,Race alone or in combination with one or more other races!!Total population!!Some other race
3,"Cook County, Illinois",,,,,,,,
4,Estimate,,5169517.0,3041776,1263902,43551,445203,6818,640553
5,Percent,,5169517.0,58.8%,24.4%,0.8%,8.6%,0.1%,12.4%


In [16]:
# Rename columns to remove 'Race alone or in combination with one or more other races!!Total population!!' from the column names
chicago_race_df.columns = chicago_race_df.columns.str.replace('Race alone or in combination with one or more other races!!Total population!!', '')

# Delete 'Race' column
chicago_race_df = chicago_race_df.drop(columns=['RACE'])

# Display the cleaned DataFrame
chicago_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races!!Total population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some other race
3,"Cook County, Illinois",,,,,,,
4,Estimate,5169517.0,3041776,1263902,43551,445203,6818,640553
5,Percent,5169517.0,58.8%,24.4%,0.8%,8.6%,0.1%,12.4%


In [20]:
# Delete NaN values
chicago_race_df = chicago_race_df.dropna()

In [21]:
# Rename columns for clarity 
chicago_race_df.columns = ['', 
                           'Total Population', 
                           'White', 
                           'Black or African American', 
                           'American Indian and Alaska Native', 
                           'Asian', 
                           'Native Hawaiian and Other Pacific Islander', 
                           'Other']

# Display
chicago_race_df.head()

Unnamed: 0,Unnamed: 1,Total Population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Other
4,Estimate,5169517,3041776,1263902,43551,445203,6818,640553
5,Percent,5169517,58.8%,24.4%,0.8%,8.6%,0.1%,12.4%


In [22]:
# Save the cleaned DataFrame to a new CSV file
chicago_race_df.to_csv('../chicago_data/cleaned_chi_race.csv', index=False)

#### Education Level (Table S1501)

In [27]:
# Import csv file for education data 
chicago_edu_data = '../chicago_data/chi_raw_csv/chicago_edu_level.csv'

# Read the CSV file into a DataFrame
chicago_edu_df = pd.read_csv(chicago_edu_data)
chicago_edu_df.head()


Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Total!!Estimate","Cook County, Illinois!!Male!!Estimate","Cook County, Illinois!!Female!!Estimate"
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,455959.0,226496.0,229463.0
2,Less than high school graduate,53361.0,30020.0,23341.0
3,High school graduate (includes equival...,128409.0,69900.0,58509.0
4,Some college or associate's degree,191084.0,90133.0,100951.0


In [28]:
# Source data (filter only what is needed out of dataset)
chicago_edu_df = chicago_edu_df.iloc[:28]
chicago_edu_df.head()

Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Total!!Estimate","Cook County, Illinois!!Male!!Estimate","Cook County, Illinois!!Female!!Estimate"
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,455959.0,226496.0,229463.0
2,Less than high school graduate,53361.0,30020.0,23341.0
3,High school graduate (includes equival...,128409.0,69900.0,58509.0
4,Some college or associate's degree,191084.0,90133.0,100951.0


In [29]:
# Drop NaN values
chicago_edu_df = chicago_edu_df.dropna()

In [30]:
# Rename columns for clarity
chicago_edu_df.columns = ['Age by Educational Attainment',
                          'Total', 
                            'Male',
                            'Female']

# Display the cleaned DataFrame
chicago_edu_df.head()

Unnamed: 0,Age by Educational Attainment,Total,Male,Female
1,Population 18 to 24 years,455959,226496,229463
2,Less than high school graduate,53361,30020,23341
3,High school graduate (includes equival...,128409,69900,58509
4,Some college or associate's degree,191084,90133,100951
5,Bachelor's degree or higher,83105,36443,46662


In [33]:
# save the cleaned DataFrame to a new CSV file
chicago_edu_df.to_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_edu.csv', index=False)

#### Found: Population Count by Age and Zipcode

In [34]:
# Import new csv file
chicago_pop_counts = '../chicago_data/chi_raw_csv/Chicago_Population_Counts.csv'

# Read the csv file
chicago_pop_counts_df = pd.read_csv(chicago_pop_counts)
chicago_pop_counts_df.head()

Unnamed: 0,Geography Type,Year,Geography,Population - Total,Population - Age 0-17,Population - Age 18-29,Population - Age 30-39,Population - Age 40-49,Population - Age 50-59,Population - Age 60-69,...,Population - Age 18+,Population - Age 65+,Population - Female,Population - Male,Population - Latinx,Population - Asian Non-Latinx,Population - Black Non-Latinx,Population - White Non-Latinx,Population - Other Race Non-Latinx,Record ID
0,Citywide,2018,Chicago,2705988,548999,552935,456321,336457,312965,262991,...,2156989,349712,1386113,1319875,776661,179841.0,784266.0,899980,119467.0,Citywide-Chicago-2018
1,ZIP Code,2018,60601,14675,820,4606,2792,2190,1333,1340,...,13855,2075,7484,7191,1274,,,9677,,ZIP_Code-60601-2018
2,ZIP Code,2018,60602,1244,149,435,462,135,53,10,...,1095,5,551,693,81,,,788,,ZIP_Code-60602-2018
3,ZIP Code,2018,60603,1174,56,561,101,97,197,97,...,1118,112,601,573,115,,,707,,ZIP_Code-60603-2018
4,ZIP Code,2018,60604,782,38,303,104,51,101,130,...,744,93,413,369,34,,,479,,ZIP_Code-60604-2018


In [35]:
# Clean data by droping any rows that do not have 2020 in the 'Year' column
chicago_pop_counts_2020_df = chicago_pop_counts_df[chicago_pop_counts_df['Year'] == 2020]
chicago_pop_counts_2020_df.head()

Unnamed: 0,Geography Type,Year,Geography,Population - Total,Population - Age 0-17,Population - Age 18-29,Population - Age 30-39,Population - Age 40-49,Population - Age 50-59,Population - Age 60-69,...,Population - Age 18+,Population - Age 65+,Population - Female,Population - Male,Population - Latinx,Population - Asian Non-Latinx,Population - Black Non-Latinx,Population - White Non-Latinx,Population - Other Race Non-Latinx,Record ID
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,...,2146679,342174,1388469,1310878,772791,182251.0,776470.0,900055,67780.0,Citywide-Chicago-2020
121,ZIP Code,2020,60601,14513,825,4696,3048,1815,809,1974,...,13688,2605,7894,6619,1242,3528.0,679.0,8614,450.0,ZIP_CODE-60601-2020
122,ZIP Code,2020,60602,1596,115,332,860,191,81,17,...,1481,4,744,852,120,435.0,37.0,794,210.0,ZIP_CODE-60602-2020
123,ZIP Code,2020,60603,1186,15,423,248,68,176,146,...,1171,198,560,626,62,397.0,29.0,692,6.0,ZIP_CODE-60603-2020
124,ZIP Code,2020,60604,729,5,313,171,30,77,89,...,724,65,441,288,23,127.0,72.0,507,0.0,ZIP_CODE-60604-2020


In [36]:
# list all columns in the dataframe
chicago_pop_counts_2020_df.columns

Index(['Geography Type', 'Year', 'Geography', 'Population - Total',
       'Population - Age 0-17', 'Population - Age 18-29',
       'Population - Age 30-39', 'Population - Age 40-49',
       'Population - Age 50-59', 'Population - Age 60-69',
       'Population - Age 70-79', 'Population - Age 80+',
       'Population - Age 0-4', 'Population - Age 5-11',
       'Population - Age 12-17', 'Population - Age 5+', 'Population - Age 18+',
       'Population - Age 65+', 'Population - Female', 'Population - Male',
       'Population - Latinx', 'Population - Asian Non-Latinx',
       'Population - Black Non-Latinx', 'Population - White Non-Latinx',
       'Population - Other Race Non-Latinx', 'Record ID'],
      dtype='object')

In [37]:
# Rename columns to get rid of population in the title. 
chicago_pop_counts_2020_df = chicago_pop_counts_2020_df.rename(columns={'Population - Total': 'Total', 
                                                                        'Population - Age 0-17': 'Age 0-17', 
                                                                        'Population - Age 18-29': 'Age 18-29',
                                                                        'Population - Age 30-39': 'Age 30-39',
                                                                        'Population - Age 40-49': 'Age 40-49',
                                                                        'Population - Age 50-59': 'Age 50-59',
                                                                        'Population - Age 60-69': 'Age 60-69',
                                                                        'Population - Age 70-79': 'Age 70-79',
                                                                        'Population - Age 80+': 'Age 80+',
                                                                        'Population - Female': 'Female',
                                                                        'Population - Male': 'Male',
                                                                        'Population - Latinx': 'Latinx',
                                                                        'Population - Asian Non-Latinx': 'Asian Non-Latinx',
                                                                        'Population - Black Non-Latinx': 'Black Non-Latinx',
                                                                        'Population - White Non-Latinx': 'White Non-Latinx',
                                                                        'Population - Other Non-Latinx': 'Other Non-Latinx',
                                                                })
chicago_pop_counts_2020_df.head()

Unnamed: 0,Geography Type,Year,Geography,Total,Age 0-17,Age 18-29,Age 30-39,Age 40-49,Age 50-59,Age 60-69,...,Population - Age 18+,Population - Age 65+,Female,Male,Latinx,Asian Non-Latinx,Black Non-Latinx,White Non-Latinx,Population - Other Race Non-Latinx,Record ID
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,...,2146679,342174,1388469,1310878,772791,182251.0,776470.0,900055,67780.0,Citywide-Chicago-2020
121,ZIP Code,2020,60601,14513,825,4696,3048,1815,809,1974,...,13688,2605,7894,6619,1242,3528.0,679.0,8614,450.0,ZIP_CODE-60601-2020
122,ZIP Code,2020,60602,1596,115,332,860,191,81,17,...,1481,4,744,852,120,435.0,37.0,794,210.0,ZIP_CODE-60602-2020
123,ZIP Code,2020,60603,1186,15,423,248,68,176,146,...,1171,198,560,626,62,397.0,29.0,692,6.0,ZIP_CODE-60603-2020
124,ZIP Code,2020,60604,729,5,313,171,30,77,89,...,724,65,441,288,23,127.0,72.0,507,0.0,ZIP_CODE-60604-2020


In [38]:
# Drop columns that are not needed ('Population - Age 0-4', 'Population - Age 5-11', 'Population - Age 12-17', 'Population - Age 5+', 'Population - Age 18+', 'Population - Age 65+', "Record ID")

chicago_pop_counts_2020_df = chicago_pop_counts_2020_df.drop(columns=['Population - Age 0-4', 
                                                                      'Population - Age 5-11', 
                                                                      'Population - Age 12-17', 
                                                                      'Population - Age 5+', 
                                                                      'Population - Age 18+', 
                                                                      'Population - Age 65+', 
                                                                      'Record ID'])
chicago_pop_counts_2020_df.head()

Unnamed: 0,Geography Type,Year,Geography,Total,Age 0-17,Age 18-29,Age 30-39,Age 40-49,Age 50-59,Age 60-69,Age 70-79,Age 80+,Female,Male,Latinx,Asian Non-Latinx,Black Non-Latinx,White Non-Latinx,Population - Other Race Non-Latinx
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,145426,83472,1388469,1310878,772791,182251.0,776470.0,900055,67780.0
121,ZIP Code,2020,60601,14513,825,4696,3048,1815,809,1974,1070,276,7894,6619,1242,3528.0,679.0,8614,450.0
122,ZIP Code,2020,60602,1596,115,332,860,191,81,17,0,0,744,852,120,435.0,37.0,794,210.0
123,ZIP Code,2020,60603,1186,15,423,248,68,176,146,91,19,560,626,62,397.0,29.0,692,6.0
124,ZIP Code,2020,60604,729,5,313,171,30,77,89,36,8,441,288,23,127.0,72.0,507,0.0


In [39]:
# Save new cleaned data to a new csv file
chicago_pop_counts_2020_df.to_csv('../chicago_data/chi_cleaned_csv/chi_pop_counts_2020', index=False)

## Economic Metric

**Measuring**: Median Household Income, Unemployment Rate.

Data for this section was pulled from the Chicago 2020 Census Database using various coded tables:
* Median Household Income: Table S1901
* Unemployment Rate: Table S2301

##### Median Household Income (Table S1901)

In [40]:
# Import new csv file for Median Household Income
chicago_income_data = '../chicago_data/chi_raw_csv/chicago_income.csv'

# Read the csv file
chicago_income_df = pd.read_csv(chicago_income_data)
chicago_income_df.head()

Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Households!!Estimate","Cook County, Illinois!!Families!!Estimate","Cook County, Illinois!!Married-couple families!!Estimate","Cook County, Illinois!!Nonfamily households!!Estimate"
0,Total,1991474,1183425,807194,808049
1,"Less than $10,000",7.1%,4.0%,1.4%,12.2%
2,"$10,000 to $14,999",3.8%,2.2%,1.1%,6.5%
3,"$15,000 to $24,999",8.3%,6.1%,3.5%,12.0%
4,"$25,000 to $34,999",8.1%,7.1%,5.0%,9.9%


In [41]:
# Droping unnecessary rows and columns
chicago_income_df = chicago_income_df.iloc[:13]

In [43]:
# Rewrite for clarity
chicago_income_df.columns = ['',
                             'Household', 
                              'Families', 
                              'Married-couple Households', 
                              'Nonfamily Households']
chicago_income_df.head()

Unnamed: 0,Unnamed: 1,Household,Families,Married-couple Households,Nonfamily Households
0,Total,1991474,1183425,807194,808049
1,"Less than $10,000",7.1%,4.0%,1.4%,12.2%
2,"$10,000 to $14,999",3.8%,2.2%,1.1%,6.5%
3,"$15,000 to $24,999",8.3%,6.1%,3.5%,12.0%
4,"$25,000 to $34,999",8.1%,7.1%,5.0%,9.9%


In [44]:
# Save the cleaned DataFrame to a new CSV file
chicago_income_df.to_csv('../chicago_data/cleaned_chi_income.csv', index=False)

##### Unemployment Rate (Table S2301)

In [46]:
# Import new csv file for Unemployment Rate
chicago_employment_data = '../chicago_data/chi_raw_csv/chicago_employment_status.csv'

# Read the csv file
chicago_employment_df = pd.read_csv(chicago_employment_data)
chicago_employment_df.head()

Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Total!!Estimate","Cook County, Illinois!!Labor Force Participation Rate!!Estimate","Cook County, Illinois!!Employment/Population Ratio!!Estimate","Cook County, Illinois!!Unemployment rate!!Estimate"
0,Population 16 years and over,4167295.0,66.1%,61.5%,7.0%
1,AGE,,,,
2,16 to 19 years,247909.0,31.9%,24.0%,24.6%
3,20 to 24 years,334453.0,74.0%,63.5%,14.0%
4,25 to 29 years,432731.0,85.7%,79.3%,7.5%


In [47]:
# Filter the DataFrame to only include rows 0-11
chicago_employment_df = chicago_employment_df.iloc[0:12]

# Display the filtered DataFrame
chicago_employment_df.head()

Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Total!!Estimate","Cook County, Illinois!!Labor Force Participation Rate!!Estimate","Cook County, Illinois!!Employment/Population Ratio!!Estimate","Cook County, Illinois!!Unemployment rate!!Estimate"
0,Population 16 years and over,4167295.0,66.1%,61.5%,7.0%
1,AGE,,,,
2,16 to 19 years,247909.0,31.9%,24.0%,24.6%
3,20 to 24 years,334453.0,74.0%,63.5%,14.0%
4,25 to 29 years,432731.0,85.7%,79.3%,7.5%


In [49]:
# Drop any NaN values
chicago_employment_df = chicago_employment_df.dropna()

In [50]:
# Rename columns for clarity
chicago_employment_df.columns = ['Age Group', 
                                   'Total', 
                                   'Labor Force Participation', 
                                   'Employment Population Ratio', 
                                   'Unemployment Rate']
chicago_employment_df.head()

Unnamed: 0,Age Group,Total,Labor Force Participation,Employment Population Ratio,Unemployment Rate
0,Population 16 years and over,4167295,66.1%,61.5%,7.0%
2,16 to 19 years,247909,31.9%,24.0%,24.6%
3,20 to 24 years,334453,74.0%,63.5%,14.0%
4,25 to 29 years,432731,85.7%,79.3%,7.5%
5,30 to 34 years,412096,86.6%,81.4%,5.8%


In [51]:
# Save the cleaned DataFrame to a new CSV file
chicago_income_df.to_csv('../chicago_data/cleaned_chi_employment.csv', index=False)

## Housing  Metric 

**Measuring**: Median Rent

Data for this section was pulled from the Chicago 2020 Census Database using the same table code from the American Community Survey by the U.S. Census Bureau (ACS). 
* Median Rent: Table DP04

In [61]:
# Import new csv file
chicago_rent_data = '../chicago_data/chi_raw_csv/chicago_gross_rent.csv'

# Read the csv file
chicago_rent_df = pd.read_csv(chicago_rent_data)
chicago_rent_df.head()


Unnamed: 0,Label (Grouping),HOUSING OCCUPANCY!!Total housing units,GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Less than $500,GROSS RENT!!Occupied units paying rent!!$500 to $999,"GROSS RENT!!Occupied units paying rent!!$1,000 to $1,499","GROSS RENT!!Occupied units paying rent!!$1,500 to $1,999","GROSS RENT!!Occupied units paying rent!!$2,000 to $2,499","GROSS RENT!!Occupied units paying rent!!$2,500 to $2,999","GROSS RENT!!Occupied units paying rent!!$3,000 or more",GROSS RENT!!Occupied units paying rent!!Median (dollars),GROSS RENT!!Occupied units paying rent!!No rent paid
0,"Cook County, Illinois",,,,,,,,,,,
1,Estimate,2198489.0,826287.0,63894,239710,284235,136924,59665,23888,17971,1160,25249
2,Percent,2198489.0,826287.0,7.7%,29.0%,34.4%,16.6%,7.2%,2.9%,2.2%,(X),(X)


In [62]:
# Drop NaN values
chicago_rent_df = chicago_rent_df.dropna()

In [63]:
# Rename columns for clarity
chicago_rent_df.columns = ['',
                             'Total Housing Units',
                             'Occupied Housing Units Paying Rent',
                             'Less than $500',
                             '$500 to $999',
                             '$1,000 to $1,499',
                             '$1,500 to $1,999',
                             '$2,000 to $2,499',
                             '$2,500 to $2,999',
                             '$3,000 or more',
                             'Median Rent Price',
                             'No rent paid'
]
chicago_rent_df

Unnamed: 0,Unnamed: 1,Total Housing Units,Occupied Housing Units Paying Rent,Less than $500,$500 to $999,"$1,000 to $1,499","$1,500 to $1,999","$2,000 to $2,499","$2,500 to $2,999","$3,000 or more",Median Rent Price,No rent paid
1,Estimate,2198489,826287,63894,239710,284235,136924,59665,23888,17971,1160,25249
2,Percent,2198489,826287,7.7%,29.0%,34.4%,16.6%,7.2%,2.9%,2.2%,(X),(X)


In [64]:
# Save the cleaned DataFrame to a new CSV file
chicago_income_df.to_csv('../chicago_data/cleaned_chi_rent.csv', index=False)

## Quality of Life Metric

**Measuring**: Commute Time 

Data was pulled from the Census Database using the following survey by ASC in 2020. 
* Commute Time: Table S0801 
* Health Outcomes

In [65]:
# Import new csv file for Commute Time
chicago_commute_data = '../chicago_data/chi_raw_csv/chicago_commute.csv'

# Read the csv file
chicago_commute_df = pd.read_csv(chicago_commute_data)
chicago_commute_df.head()

Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Total!!Estimate","Cook County, Illinois!!Male!!Estimate","Cook County, Illinois!!Female!!Estimate"
0,Workers 16 years and over,2508211,1305661,1202550
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",67.7%,68.9%,66.4%
3,Drove alone,59.9%,61.3%,58.4%
4,Carpooled,7.8%,7.6%,8.0%


In [66]:
chicago_commute_df = chicago_commute_df.iloc[:14]
chicago_commute_df.head()

Unnamed: 0,Label (Grouping),"Cook County, Illinois!!Total!!Estimate","Cook County, Illinois!!Male!!Estimate","Cook County, Illinois!!Female!!Estimate"
0,Workers 16 years and over,2508211,1305661,1202550
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",67.7%,68.9%,66.4%
3,Drove alone,59.9%,61.3%,58.4%
4,Carpooled,7.8%,7.6%,8.0%


In [67]:
# Delete NaN values
chicago_commute_df = chicago_commute_df.dropna()

In [68]:
# Rewrite for clarity 
chicago_commute_df.columns = ['',
                              'Total Estimate',
                              'Male',
                              'Female'
                              ]
chicago_commute_df.head()

Unnamed: 0,Unnamed: 1,Total Estimate,Male,Female
0,Workers 16 years and over,2508211,1305661,1202550
2,"Car, truck, or van",67.7%,68.9%,66.4%
3,Drove alone,59.9%,61.3%,58.4%
4,Carpooled,7.8%,7.6%,8.0%
5,In 2-person carpool,5.9%,5.7%,6.1%


In [69]:
# Save the cleaned DataFrame to a new CSV file
chicago_commute_df.to_csv('../chicago_data/cleaned_chi_commute.csv', index=False)

#### Health Behaviors & Outcomes 

In [9]:
# Import file (chicago_health_behavior.csv)
chicago_health_data = '../chicago_data/chi_raw_csv/chicago_health_behaviors.csv'

# Read file 
chicago_health_df = pd.read_csv(chicago_health_data)
chicago_health_df.head()

Unnamed: 0,Entity DCID,Entity properties isoCode,Entity properties name,Variable DCID,Variable observation date,Variable observation metadata importName,Variable observation metadata provenanceUrl,Variable observation metadata scalingFactor,Variable observation metadata unit,Variable observation metadata unitDisplayName,Variable observation value,Variable properties name
0,geoId/1714000,,Chicago,Percent_Person_SleepLessThan7Hours,2020,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,35.2,Percentage of Adult Population That Sleeps Les...
1,geoId/1714000,,Chicago,Percent_Person_SleepLessThan7Hours,2022,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,37.9,Percentage of Adult Population That Sleeps Les...
2,geoId/1714000,,Chicago,Percent_Person_Obesity,2020,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,32.2,Percentage of Adult Population That Is Obese
3,geoId/1714000,,Chicago,Percent_Person_Obesity,2021,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,34.1,Percentage of Adult Population That Is Obese
4,geoId/1714000,,Chicago,Percent_Person_Obesity,2022,CDC500,https://www.cdc.gov/places/index.html,100,Percent,%,35.2,Percentage of Adult Population That Is Obese


In [10]:
# Delete unnecessary columns 
chicago_health_df = chicago_health_df.drop(columns=['Entity DCID', 'Entity properties isoCode','Variable observation metadata importName', 'Variable DCID', 'Variable observation metadata importName','Variable observation metadata provenanceUrl','Variable observation metadata scalingFactor', 'Variable observation metadata unit', 'Variable observation metadata unitDisplayName'])
chicago_health_df.head()

Unnamed: 0,Entity properties name,Variable observation date,Variable observation value,Variable properties name
0,Chicago,2020,35.2,Percentage of Adult Population That Sleeps Les...
1,Chicago,2022,37.9,Percentage of Adult Population That Sleeps Les...
2,Chicago,2020,32.2,Percentage of Adult Population That Is Obese
3,Chicago,2021,34.1,Percentage of Adult Population That Is Obese
4,Chicago,2022,35.2,Percentage of Adult Population That Is Obese


In [11]:
# Filter the DataFrame to only include rows where 'Variable observation date' is 2020
chicago_health_df = chicago_health_df[chicago_health_df['Variable observation date'] == 2020]

# Display the filtered DataFrame
chicago_health_df.head()

Unnamed: 0,Entity properties name,Variable observation date,Variable observation value,Variable properties name
0,Chicago,2020,35.2,Percentage of Adult Population That Sleeps Les...
2,Chicago,2020,32.2,Percentage of Adult Population That Is Obese
5,Chicago,2020,13.4,Percentage of Adult Population Who Binge Drink
8,Chicago,2020,28.6,Percentage of Adult Population Who Are Physica...
11,Chicago,2020,15.0,Percentage of Adult Population That Smokes


In [12]:
# Save the cleaned DataFrame to a new CSV file
chicago_health_df.to_csv('../chicago_data/cleaned_chi_health.csv', index=False)

### Creating JSON file 

In [None]:
# Load the datasets from Chicago
chicago_commute_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_commute.csv')
chicago_unemployment_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_employment.csv')
chicago_rent_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_rent.csv')
chicago_income_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_income.csv')
chicago_edu_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_edu.csv')
chicago_age_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_age.csv')
chicago_race_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_race.csv')
chicago_health_df = pd.read_csv('../chicago_data/chi_cleaned_csv/cleaned_chi_health.csv')

In [None]:
# Combine all data into a dictionary
chicago_data = {
    "commute": chicago_commute_df.to_dict(orient='records'),
    "unemployment": chicago_unemployment_df.to_dict(orient='records'),
    "rent": chicago_rent_df.to_dict(orient='records'),
    "income": chicago_income_df.to_dict(orient='records'),
    "education": chicago_edu_df.to_dict(orient='records'),
    "age": chicago_age_df.to_dict(orient='records'),
    "race": chicago_race_df.to_dict(orient='records'),
    "health": chicago_health_df.to_dict(orient='records')
}

In [10]:
# Save the dictionary as a JSON file
with open('../chicago_data/chicago_data.json', 'w') as json_file:
    json.dump(chicago_data, json_file)

Sources: 
* [chicago_age_demo.csv]()
* [chicago_housing.csv]()
* [chicago_race.csv]()
* [chicago_commute.csv]()
* [chicago_edu_level.csv]()
* [chicago_gross_rent.csv]()
* [chicago_income.csv]()
* [chicago_health_behaviors.csv](https://datacommons.org/place/geoId/1714000?category=Health)