In [12]:
# Import dependencies
import os
import csv
import pandas as pd

### Demographic Data Metric

##### Census Population Data

In [13]:
# Relative path to the Chicago demographics csv file
chicago_data = 'data_files/chicago_data/chicago_demographics.csv'

# Load the csv into a DataFrame and preview its first five rows
chicago_data_df = pd.read_csv(filepath_or_buffer=chicago_data)
chicago_data_df.head(n=5)

Unnamed: 0,Fact,Fact Note,"Chicago city, Illinois","Value Note for Chicago city, Illinois"
0,"Population estimates, July 1, 2024, (V2024)",,,
1,"Population estimates, July 1, 2023, (V2023)",,2664452.0,
2,"Population estimates base, April 1, 2020, (V2024)",,,
3,"Population estimates base, April 1, 2020, (V2023)",,2746352.0,
4,"Population, percent change - April 1, 2020 (es...",,,


In [14]:
# Filter the dataframe to only include rows with '2020' in the 'Fact' column.
# regex=False treats '2020' as a plain substring rather than a pattern, and
# na=False maps a missing 'Fact' value to False so the boolean mask never
# contains NaN (a mask with NaN makes the row selection raise).
mentions_2020 = chicago_data_df['Fact'].str.contains('2020', regex=False, na=False)
chicago_data_2020_df = chicago_data_df[mentions_2020]
chicago_data_2020_df

Unnamed: 0,Fact,Fact Note,"Chicago city, Illinois","Value Note for Chicago city, Illinois"
2,"Population estimates base, April 1, 2020, (V2024)",,,
3,"Population estimates base, April 1, 2020, (V2023)",,2746352,
4,"Population, percent change - April 1, 2020 (es...",,,
5,"Population, percent change - April 1, 2020 (es...",,-3.0%,
6,"Population, Census, April 1, 2020",,2746388,
62,"Population per square mile, 2020",,12059.8,
64,"Land area in square miles, 2020",,227.73,


In [15]:
# Remove the two annotation columns that are not needed:
# 'Fact Note' and 'Value Note for Chicago city, Illinois'
chicago_data_2020_df = chicago_data_2020_df.drop(
    labels=['Fact Note', 'Value Note for Chicago city, Illinois'],
    axis='columns',
)
chicago_data_2020_df.head(n=5)

Unnamed: 0,Fact,"Chicago city, Illinois"
2,"Population estimates base, April 1, 2020, (V2024)",
3,"Population estimates base, April 1, 2020, (V2023)",2746352
4,"Population, percent change - April 1, 2020 (es...",
5,"Population, percent change - April 1, 2020 (es...",-3.0%
6,"Population, Census, April 1, 2020",2746388


In [16]:
# Discard every row that has at least one missing (NaN) value
chicago_data_2020_df = chicago_data_2020_df.dropna(axis='index', how='any')

In [17]:
# Preview the first five rows of the cleaned 2020 dataframe
chicago_data_2020_df.head(n=5)

Unnamed: 0,Fact,"Chicago city, Illinois"
3,"Population estimates base, April 1, 2020, (V2023)",2746352
5,"Population, percent change - April 1, 2020 (es...",-3.0%
6,"Population, Census, April 1, 2020",2746388
62,"Population per square mile, 2020",12059.8
64,"Land area in square miles, 2020",227.73


In [18]:
# Save the cleaned data to a new csv file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True makes this a no-op when it already does.
os.makedirs('cleaned_data', exist_ok=True)
chicago_data_2020_df.to_csv('cleaned_data/chicago_data_2020.csv', index=False)

##### Population counts by age csv file

In [19]:
# Relative path to the Chicago population counts csv file
chicago_pop_counts = 'data_files/chicago_data/Chicago_Population_Counts.csv'

# Load the csv into a DataFrame and preview its first five rows
chicago_pop_counts_df = pd.read_csv(filepath_or_buffer=chicago_pop_counts)
chicago_pop_counts_df.head(n=5)

Unnamed: 0,Geography Type,Year,Geography,Population - Total,Population - Age 0-17,Population - Age 18-29,Population - Age 30-39,Population - Age 40-49,Population - Age 50-59,Population - Age 60-69,...,Population - Age 18+,Population - Age 65+,Population - Female,Population - Male,Population - Latinx,Population - Asian Non-Latinx,Population - Black Non-Latinx,Population - White Non-Latinx,Population - Other Race Non-Latinx,Record ID
0,Citywide,2018,Chicago,2705988,548999,552935,456321,336457,312965,262991,...,2156989,349712,1386113,1319875,776661,179841.0,784266.0,899980,119467.0,Citywide-Chicago-2018
1,ZIP Code,2018,60601,14675,820,4606,2792,2190,1333,1340,...,13855,2075,7484,7191,1274,,,9677,,ZIP_Code-60601-2018
2,ZIP Code,2018,60602,1244,149,435,462,135,53,10,...,1095,5,551,693,81,,,788,,ZIP_Code-60602-2018
3,ZIP Code,2018,60603,1174,56,561,101,97,197,97,...,1118,112,601,573,115,,,707,,ZIP_Code-60603-2018
4,ZIP Code,2018,60604,782,38,303,104,51,101,130,...,744,93,413,369,34,,,479,,ZIP_Code-60604-2018


In [20]:
# Clean data by keeping only the rows whose 'Year' column equals 2020
chicago_pop_counts_2020_df = chicago_pop_counts_df.loc[
    chicago_pop_counts_df['Year'].eq(2020)
]
chicago_pop_counts_2020_df.head(n=5)

Unnamed: 0,Geography Type,Year,Geography,Population - Total,Population - Age 0-17,Population - Age 18-29,Population - Age 30-39,Population - Age 40-49,Population - Age 50-59,Population - Age 60-69,...,Population - Age 18+,Population - Age 65+,Population - Female,Population - Male,Population - Latinx,Population - Asian Non-Latinx,Population - Black Non-Latinx,Population - White Non-Latinx,Population - Other Race Non-Latinx,Record ID
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,...,2146679,342174,1388469,1310878,772791,182251.0,776470.0,900055,67780.0,Citywide-Chicago-2020
121,ZIP Code,2020,60601,14513,825,4696,3048,1815,809,1974,...,13688,2605,7894,6619,1242,3528.0,679.0,8614,450.0,ZIP_CODE-60601-2020
122,ZIP Code,2020,60602,1596,115,332,860,191,81,17,...,1481,4,744,852,120,435.0,37.0,794,210.0,ZIP_CODE-60602-2020
123,ZIP Code,2020,60603,1186,15,423,248,68,176,146,...,1171,198,560,626,62,397.0,29.0,692,6.0,ZIP_CODE-60603-2020
124,ZIP Code,2020,60604,729,5,313,171,30,77,89,...,724,65,441,288,23,127.0,72.0,507,0.0,ZIP_CODE-60604-2020


In [21]:
# Display every column label of the 2020 population-counts dataframe
chicago_pop_counts_2020_df.columns

Index(['Geography Type', 'Year', 'Geography', 'Population - Total',
       'Population - Age 0-17', 'Population - Age 18-29',
       'Population - Age 30-39', 'Population - Age 40-49',
       'Population - Age 50-59', 'Population - Age 60-69',
       'Population - Age 70-79', 'Population - Age 80+',
       'Population - Age 0-4', 'Population - Age 5-11',
       'Population - Age 12-17', 'Population - Age 5+', 'Population - Age 18+',
       'Population - Age 65+', 'Population - Female', 'Population - Male',
       'Population - Latinx', 'Population - Asian Non-Latinx',
       'Population - Black Non-Latinx', 'Population - White Non-Latinx',
       'Population - Other Race Non-Latinx', 'Record ID'],
      dtype='object')

In [22]:
# Rename columns to drop the "Population - " prefix from the headers listed below.
# The last key has to be 'Population - Other Race Non-Latinx', which is the header
# actually present in the data. By default rename() silently skips a key that
# matches no column, so errors='raise' is passed to surface any such mismatch.
chicago_pop_counts_2020_df = chicago_pop_counts_2020_df.rename(
    columns={
        'Population - Total': 'Total',
        'Population - Age 0-17': 'Age 0-17',
        'Population - Age 18-29': 'Age 18-29',
        'Population - Age 30-39': 'Age 30-39',
        'Population - Age 40-49': 'Age 40-49',
        'Population - Age 50-59': 'Age 50-59',
        'Population - Age 60-69': 'Age 60-69',
        'Population - Age 70-79': 'Age 70-79',
        'Population - Age 80+': 'Age 80+',
        'Population - Female': 'Female',
        'Population - Male': 'Male',
        'Population - Latinx': 'Latinx',
        'Population - Asian Non-Latinx': 'Asian Non-Latinx',
        'Population - Black Non-Latinx': 'Black Non-Latinx',
        'Population - White Non-Latinx': 'White Non-Latinx',
        'Population - Other Race Non-Latinx': 'Other Race Non-Latinx',
    },
    errors='raise',
)
chicago_pop_counts_2020_df.head()

Unnamed: 0,Geography Type,Year,Geography,Total,Age 0-17,Age 18-29,Age 30-39,Age 40-49,Age 50-59,Age 60-69,...,Population - Age 18+,Population - Age 65+,Female,Male,Latinx,Asian Non-Latinx,Black Non-Latinx,White Non-Latinx,Population - Other Race Non-Latinx,Record ID
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,...,2146679,342174,1388469,1310878,772791,182251.0,776470.0,900055,67780.0,Citywide-Chicago-2020
121,ZIP Code,2020,60601,14513,825,4696,3048,1815,809,1974,...,13688,2605,7894,6619,1242,3528.0,679.0,8614,450.0,ZIP_CODE-60601-2020
122,ZIP Code,2020,60602,1596,115,332,860,191,81,17,...,1481,4,744,852,120,435.0,37.0,794,210.0,ZIP_CODE-60602-2020
123,ZIP Code,2020,60603,1186,15,423,248,68,176,146,...,1171,198,560,626,62,397.0,29.0,692,6.0,ZIP_CODE-60603-2020
124,ZIP Code,2020,60604,729,5,313,171,30,77,89,...,724,65,441,288,23,127.0,72.0,507,0.0,ZIP_CODE-60604-2020


In [23]:
# Remove the columns that are not needed: the finer-grained and cumulative
# age columns listed below, plus 'Record ID'
chicago_pop_counts_2020_df = chicago_pop_counts_2020_df.drop(
    labels=[
        'Population - Age 0-4',
        'Population - Age 5-11',
        'Population - Age 12-17',
        'Population - Age 5+',
        'Population - Age 18+',
        'Population - Age 65+',
        'Record ID',
    ],
    axis='columns',
)
chicago_pop_counts_2020_df.head(n=5)

Unnamed: 0,Geography Type,Year,Geography,Total,Age 0-17,Age 18-29,Age 30-39,Age 40-49,Age 50-59,Age 60-69,Age 70-79,Age 80+,Female,Male,Latinx,Asian Non-Latinx,Black Non-Latinx,White Non-Latinx,Population - Other Race Non-Latinx
120,Citywide,2020,Chicago,2699347,552668,548747,463143,336591,313865,255435,145426,83472,1388469,1310878,772791,182251.0,776470.0,900055,67780.0
121,ZIP Code,2020,60601,14513,825,4696,3048,1815,809,1974,1070,276,7894,6619,1242,3528.0,679.0,8614,450.0
122,ZIP Code,2020,60602,1596,115,332,860,191,81,17,0,0,744,852,120,435.0,37.0,794,210.0
123,ZIP Code,2020,60603,1186,15,423,248,68,176,146,91,19,560,626,62,397.0,29.0,692,6.0
124,ZIP Code,2020,60604,729,5,313,171,30,77,89,36,8,441,288,23,127.0,72.0,507,0.0


In [24]:
# Save the new cleaned data to a new csv file.
# to_csv does not create missing parent directories, so make sure the output
# folder exists first; exist_ok=True makes this a no-op when it already does.
os.makedirs('cleaned_data', exist_ok=True)
chicago_pop_counts_2020_df.to_csv('cleaned_data/chicago_pop_counts_2020.csv', index=False)

### Housing Data Metric 

In [25]:
# read csv file 

