# Houston Dataset

In [None]:
# Import dependencies
import os
import csv
import pandas as pd
import json
from pathlib import Path

## Demographic Metric

**Measuring**: ACS Median Age, Racial Demographics, Education Level

Data was pulled from the 2020 Houston Census Database and the American Community Survey (ACS) by the U.S. Census Bureau using various coded tables to ensure unified data for each metropolitan city. Two of the metrics share the same table code (DP05) and were separated into their own files to ensure proper measurement.
* ACS Median Age: Table DP05
* ACS Race: Table DP05
* Education Level: S1501

##### ACS Median Age (Table DP05)

In [2]:
# Path to the raw ACS DP05 age export for Harris County (Houston)
houston_age_data = '../houston_data/houston_raw_csv/houston_age.csv'

# Read the CSV file into a DataFrame and preview the first rows
houston_age_df = pd.read_csv(houston_age_data)
houston_age_df.head()

Unnamed: 0,Label (Grouping),SEX AND AGE!!Total population,SEX AND AGE!!Total population!!Under 5 years,SEX AND AGE!!Total population!!5 to 9 years,SEX AND AGE!!Total population!!10 to 14 years,SEX AND AGE!!Total population!!15 to 19 years,SEX AND AGE!!Total population!!20 to 24 years,SEX AND AGE!!Total population!!25 to 34 years,SEX AND AGE!!Total population!!35 to 44 years,SEX AND AGE!!Total population!!45 to 54 years,SEX AND AGE!!Total population!!55 to 59 years,SEX AND AGE!!Total population!!60 to 64 years,SEX AND AGE!!Total population!!65 to 74 years,SEX AND AGE!!Total population!!75 to 84 years,SEX AND AGE!!Total population!!85 years and over,SEX AND AGE!!Total population!!Median age (years)
0,"Harris County, Texas",,,,,,,,,,,,,,,
1,Estimate,4835125.0,323950,342195,353847,346582,328705,734824,706339,601580,266006,245511,365995,171533,48058,34.8
2,Percent,4835125.0,6.7%,7.1%,7.3%,7.2%,6.8%,15.2%,14.6%,12.4%,5.5%,5.1%,7.6%,3.5%,1.0%,(X)


In [3]:
# Display column names and data types.
# The recorded output shows every column as `object`, so values such as "6.7%"
# and "(X)" are strings and need conversion before any numeric use.
print(houston_age_df.dtypes)


Label (Grouping)                                     object
SEX AND AGE!!Total population                        object
SEX AND AGE!!Total population!!Under 5 years         object
SEX AND AGE!!Total population!!5 to 9 years          object
SEX AND AGE!!Total population!!10 to 14 years        object
SEX AND AGE!!Total population!!15 to 19 years        object
SEX AND AGE!!Total population!!20 to 24 years        object
SEX AND AGE!!Total population!!25 to 34 years        object
SEX AND AGE!!Total population!!35 to 44 years        object
SEX AND AGE!!Total population!!45 to 54 years        object
SEX AND AGE!!Total population!!55 to 59 years        object
SEX AND AGE!!Total population!!60 to 64 years        object
SEX AND AGE!!Total population!!65 to 74 years        object
SEX AND AGE!!Total population!!75 to 84 years        object
SEX AND AGE!!Total population!!85 years and over     object
SEX AND AGE!!Total population!!Median age (years)    object
dtype: object


In [4]:
# Strip the literal 'SEX AND AGE!!' prefix from every column name.
# regex=False makes the match explicitly literal, independent of the default
# that the installed pandas version uses for str.replace.
houston_age_df.columns = houston_age_df.columns.str.replace('SEX AND AGE!!', '', regex=False)

# Drop rows containing any NaN (the "Harris County, Texas" header row has no values).
# Reassign rather than mutate with inplace=True, matching the other cleaning cells.
houston_age_df = houston_age_df.dropna()

# Display the updated DataFrame
houston_age_df.head()

Unnamed: 0,Label (Grouping),Total population,Total population!!Under 5 years,Total population!!5 to 9 years,Total population!!10 to 14 years,Total population!!15 to 19 years,Total population!!20 to 24 years,Total population!!25 to 34 years,Total population!!35 to 44 years,Total population!!45 to 54 years,Total population!!55 to 59 years,Total population!!60 to 64 years,Total population!!65 to 74 years,Total population!!75 to 84 years,Total population!!85 years and over,Total population!!Median age (years)
1,Estimate,4835125,323950,342195,353847,346582,328705,734824,706339,601580,266006,245511,365995,171533,48058,34.8
2,Percent,4835125,6.7%,7.1%,7.3%,7.2%,6.8%,15.2%,14.6%,12.4%,5.5%,5.1%,7.6%,3.5%,1.0%,(X)


In [52]:
# Rename columns for clarity.
# Names are assigned positionally, so this list must match the 16 existing
# columns in order. The first column (the Estimate / Percent row label) is
# left unnamed.
houston_age_df.columns = ['',
  'Total Population',
  'Under 5',
  '5 to 9',
  '10 to 14',
  '15 to 19',
  '20 to 24',
  '25 to 34',
  '35 to 44',
  '45 to 54',
  '55 to 59',
  '60 to 64',
  '65 to 74',
  '75 to 84',
  '85 and Over',
  'Median Age'
]

houston_age_df.head()

Unnamed: 0,Unnamed: 1,Total Population,Under 5,5 to 9,10 to 14,15 to 19,20 to 24,25 to 34,35 to 44,45 to 54,55 to 59,60 to 64,65 to 74,75 to 84,85 and Over,Median Age
1,Estimate,4835125,323950,342195,353847,346582,328705,734824,706339,601580,266006,245511,365995,171533,48058,34.8
2,Percent,4835125,6.7%,7.1%,7.3%,7.2%,6.8%,15.2%,14.6%,12.4%,5.5%,5.1%,7.6%,3.5%,1.0%,(X)


In [57]:
# Save the cleaned age DataFrame to a new CSV file (index=False omits the row index)
houston_age_df.to_csv('../houston_data/cleaned_hou_age.csv', index=False)

#### Racial Demographics (Table DP05)

In [7]:
# Path to the raw ACS DP05 race export
houston_race_data = "../houston_data/houston_raw_csv/houston_race.csv"

# Read the CSV file into a DataFrame and preview the first rows
houston_race_df = pd.read_csv(houston_race_data)
houston_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races!!Total population,Race alone or in combination with one or more other races!!Total population!!White,Race alone or in combination with one or more other races!!Total population!!Black or African American,Race alone or in combination with one or more other races!!Total population!!American Indian and Alaska Native,Race alone or in combination with one or more other races!!Total population!!Asian,Race alone or in combination with one or more other races!!Total population!!Native Hawaiian and Other Pacific Islander,Race alone or in combination with one or more other races!!Total population!!Some Other Race
0,"Harris County, Texas",,,,,,,
1,Estimate,4835125.0,2666766,1046412,141031,409092,12471,1722952
2,Percent,4835125.0,55.2%,21.6%,2.9%,8.5%,0.3%,35.6%


In [8]:
# Drop every row that contains a NaN. In the preview above this removes the
# "Harris County, Texas" header row and keeps the Estimate and Percent rows.
houston_race_df = houston_race_df.dropna()

In [9]:
# Strip 'Race alone or in combination with one or more other races!!Total population!!'
# from the column names. regex=False makes the match explicitly literal,
# independent of the default that the installed pandas version uses.
# The bare '...!!Total population' column has no trailing '!!' and is left unchanged here.
houston_race_df.columns = houston_race_df.columns.str.replace('Race alone or in combination with one or more other races!!Total population!!', '', regex=False)
houston_race_df.head()

Unnamed: 0,Label (Grouping),Race alone or in combination with one or more other races!!Total population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Some Other Race
1,Estimate,4835125,2666766,1046412,141031,409092,12471,1722952
2,Percent,4835125,55.2%,21.6%,2.9%,8.5%,0.3%,35.6%


In [54]:
# Rename columns for clarity.
# Names are assigned positionally (8 columns). The first column (the
# Estimate / Percent row label) is left unnamed and 'Some Other Race'
# becomes 'Other'.
houston_race_df.columns = ['', 
                           'Total Population', 
                           'White', 
                           'Black or African American', 
                           'American Indian and Alaska Native', 
                           'Asian', 
                           'Native Hawaiian and Other Pacific Islander', 
                           'Other']

# Display the renamed DataFrame
houston_race_df.head()


Unnamed: 0,Unnamed: 1,Total Population,White,Black or African American,American Indian and Alaska Native,Asian,Native Hawaiian and Other Pacific Islander,Other
1,Estimate,4835125,2666766,1046412,141031,409092,12471,1722952
2,Percent,4835125,55.2%,21.6%,2.9%,8.5%,0.3%,35.6%


In [56]:
# Save the cleaned race DataFrame as a new CSV file (index=False omits the row index)
houston_race_df.to_csv('../houston_data/cleaned_hou_race.csv', index=False)

#### Education Level (Table S1501)

In [23]:
# Path to the raw ACS S1501 educational attainment export for Harris County
houston_edu_data = '../houston_data/houston_raw_csv/houston_edu.csv'

# Read the CSV file into a DataFrame and display it
houston_edu_df = pd.read_csv(houston_edu_data)
houston_edu_df

Unnamed: 0,Label (Grouping),"Harris County, Texas!!Total!!Estimate","Harris County, Texas!!Male!!Estimate","Harris County, Texas!!Female!!Estimate"
0,AGE BY EDUCATIONAL ATTAINMENT,,,
1,Population 18 to 24 years,435750,221736,214014
2,Less than high school graduate,71828,40706,31122
3,High school graduate (includes equival...,140708,78510,62198
4,Some college or associate's degree,177602,83928,93674
...,...,...,...,...
63,Less than high school graduate,24671,29109,16801
64,High school graduate (includes equival...,30842,36537,23888
65,Some college or associate's degree,38664,47368,32054
66,Bachelor's degree,60215,73460,53063


In [46]:
# Source data (filter only what is needed out of dataset): keep the first
# 28 rows (positions 0-27) and discard the rest of the table.
# NOTE(review): presumably this is the age-by-attainment section; confirm
# the cutoff against the raw table.
houston_edu_df = houston_edu_df.iloc[:28]
houston_edu_df.head()

Unnamed: 0,Age by Educational Attainment,Total,Male,Female
1,Population 18 to 24 years,435750,221736,214014
2,Less than high school graduate,71828,40706,31122
3,High school graduate (includes equival...,140708,78510,62198
4,Some college or associate's degree,177602,83928,93674
5,Bachelor's degree or higher,45612,18592,27020


In [27]:
# Drop rows containing NaN, e.g. the 'AGE BY EDUCATIONAL ATTAINMENT'
# section-header row, which carries no values
houston_edu_df = houston_edu_df.dropna()

In [29]:
# Rename columns for clarity (assigned positionally: label, total, male, female).
# This drops the 'Harris County, Texas!!...!!Estimate' wrappers from the raw headers.
houston_edu_df.columns = ['Age by Educational Attainment',
                          'Total', 
                            'Male',
                            'Female']
houston_edu_df.head()

Unnamed: 0,Age by Educational Attainment,Total,Male,Female
1,Population 18 to 24 years,435750,221736,214014
2,Less than high school graduate,71828,40706,31122
3,High school graduate (includes equival...,140708,78510,62198
4,Some college or associate's degree,177602,83928,93674
5,Bachelor's degree or higher,45612,18592,27020


In [30]:
# Save the cleaned education DataFrame as a new CSV file (index=False omits the row index)
houston_edu_df.to_csv('../houston_data/cleaned_hou_edu_level.csv', index=False)

## Economic Metric

**Measuring**: Median Household Income, Unemployment Rate, Poverty Rate.

Data for this section was pulled from the Houston 2020 Census Database using various coded tables:
* Median Household Income: Table S1901
* Unemployment Rate: Table S2301
* Poverty Rate: Table S1701 (cut from this analysis; not processed below)

##### Median Household Income (Table S1901)

In [47]:
# Path to the raw ACS S1901 household income export for Harris County
houston_income_data = '../houston_data/houston_raw_csv/houston_median_income.csv'

# Read the CSV file into a DataFrame and preview the first rows
houston_income_df = pd.read_csv(houston_income_data)
houston_income_df.head()

Unnamed: 0,Label (Grouping),"Harris County, Texas!!Households!!Estimate","Harris County, Texas!!Families!!Estimate","Harris County, Texas!!Married-couple families!!Estimate","Harris County, Texas!!Nonfamily households!!Estimate"
0,Total,1635749,1113437,766641,522312
1,"Less than $10,000",6.0%,4.5%,1.7%,11.1%
2,"$10,000 to $14,999",3.6%,2.6%,1.4%,6.1%
3,"$15,000 to $24,999",8.5%,7.1%,4.4%,12.2%
4,"$25,000 to $34,999",9.4%,8.5%,6.5%,11.6%


In [49]:
# Drop unnecessary rows: keep only the first 13 rows (positions 0-12).
# All columns are kept; only rows are sliced here.
houston_income_df = houston_income_df.iloc[:13]


In [50]:
# Rename columns for clarity (assigned positionally, 5 columns).
# The first column (the income-bracket label) is left unnamed.
houston_income_df.columns = ['',
                             'Household', 
                              'Families', 
                              'Married-couple Households', 
                              'Nonfamily Households']
houston_income_df.head()

Unnamed: 0,Unnamed: 1,Household,Families,Married-couple Households,Nonfamily Households
0,Total,1635749,1113437,766641,522312
1,"Less than $10,000",6.0%,4.5%,1.7%,11.1%
2,"$10,000 to $14,999",3.6%,2.6%,1.4%,6.1%
3,"$15,000 to $24,999",8.5%,7.1%,4.4%,12.2%
4,"$25,000 to $34,999",9.4%,8.5%,6.5%,11.6%


In [51]:
# Save the cleaned income DataFrame as a new CSV file (index=False omits the row index)
houston_income_df.to_csv('../houston_data/cleaned_hou_income.csv', index=False)

##### Unemployment Rate (Table S2301)

In [31]:
# Path to the raw ACS S2301 employment status export (source of the unemployment rate)
houston_unemployment_data = '../houston_data/houston_raw_csv/houston_employment.csv'

# Read the CSV file into a DataFrame and preview the first rows
houston_unemployment_df = pd.read_csv(houston_unemployment_data)
houston_unemployment_df.head()

Unnamed: 0,Label (Grouping),"Harris County, Texas!!Total!!Estimate","Harris County, Texas!!Labor Force Participation Rate!!Estimate","Harris County, Texas!!Employment/Population Ratio!!Estimate","Harris County, Texas!!Unemployment rate!!Estimate"
0,Population 16 years and over,3565983.0,67.4%,63.2%,6.2%
1,AGE,,,,
2,16 to 19 years,253685.0,31.8%,25.2%,20.6%
3,20 to 24 years,316390.0,73.3%,65.1%,10.9%
4,25 to 29 years,380200.0,81.6%,76.4%,6.2%


In [17]:
# Filter the DataFrame to only include rows 0-11: the overall
# "Population 16 years and over" row, the 'AGE' section header, and the
# age-group rows that follow it
houston_unemployment_age_df = houston_unemployment_df.iloc[0:12]

# Display the filtered DataFrame
houston_unemployment_age_df.head()

Unnamed: 0,Label (Grouping),"Harris County, Texas!!Total!!Estimate","Harris County, Texas!!Labor Force Participation Rate!!Estimate","Harris County, Texas!!Employment/Population Ratio!!Estimate","Harris County, Texas!!Unemployment rate!!Estimate"
0,Population 16 years and over,3565983.0,67.4%,63.2%,6.2%
1,AGE,,,,
2,16 to 19 years,253685.0,31.8%,25.2%,20.6%
3,20 to 24 years,316390.0,73.3%,65.1%,10.9%
4,25 to 29 years,380200.0,81.6%,76.4%,6.2%


In [18]:
# Drop any NaN values from the age-filtered slice (rows 0-11), which removes
# the empty 'AGE' section-header row. Start from houston_unemployment_age_df
# rather than the unfiltered frame so that only the overall row and the
# age-group rows reach the 'Age Group' rename and the saved CSV below.
houston_unemployment_df = houston_unemployment_age_df.dropna()

In [19]:
# Drop the literal 'Harris County, Texas!!' prefix from every column name.
# regex=False makes the match explicitly literal, independent of the default
# that the installed pandas version uses for str.replace.
houston_unemployment_df.columns = houston_unemployment_df.columns.str.replace('Harris County, Texas!!', '', regex=False)

# Display the updated DataFrame
houston_unemployment_df.head()

Unnamed: 0,Label (Grouping),Total!!Estimate,Labor Force Participation Rate!!Estimate,Employment/Population Ratio!!Estimate,Unemployment rate!!Estimate
0,Population 16 years and over,3565983,67.4%,63.2%,6.2%
2,16 to 19 years,253685,31.8%,25.2%,20.6%
3,20 to 24 years,316390,73.3%,65.1%,10.9%
4,25 to 29 years,380200,81.6%,76.4%,6.2%
5,30 to 34 years,373426,82.3%,77.5%,5.6%


In [20]:
# Rename columns for clarity (assigned positionally, 5 columns).
# The label column becomes 'Age Group' and the '!!Estimate' suffixes are dropped.
houston_unemployment_df.columns = ['Age Group', 
                                   'Total', 
                                   'Labor Force Participation', 
                                   'Employment Population Ratio', 
                                   'Unemployment Rate']
houston_unemployment_df.head()

Unnamed: 0,Age Group,Total,Labor Force Participation,Employment Population Ratio,Unemployment Rate
0,Population 16 years and over,3565983,67.4%,63.2%,6.2%
2,16 to 19 years,253685,31.8%,25.2%,20.6%
3,20 to 24 years,316390,73.3%,65.1%,10.9%
4,25 to 29 years,380200,81.6%,76.4%,6.2%
5,30 to 34 years,373426,82.3%,77.5%,5.6%


In [21]:
# Save the cleaned unemployment DataFrame to a new CSV file (index=False omits the row index)
houston_unemployment_df.to_csv('../houston_data/cleaned_hou_unemployment.csv', index=False)

## Housing Metric

**Measuring**: Median Home Price, Median Rent, Homeownership Rate

Data for this section was pulled from the Houston 2020 Census Database using the following American Community Survey (ACS) table from the U.S. Census Bureau.
* Median Rent: Table DP04

#### Median Rent (DP04)

In [33]:
# Path to the raw ACS DP04 gross rent export for Harris County
houston_rent_data = '../houston_data/houston_raw_csv/houston_rent.csv'

# Read the CSV file into a DataFrame and preview the first rows
houston_rent_df = pd.read_csv(houston_rent_data)
houston_rent_df.head()

Unnamed: 0,Label (Grouping),UNITS IN STRUCTURE!!Total housing units,GROSS RENT!!Occupied units paying rent,GROSS RENT!!Occupied units paying rent!!Less than $500,GROSS RENT!!Occupied units paying rent!!$500 to $999,"GROSS RENT!!Occupied units paying rent!!$1,000 to $1,499","GROSS RENT!!Occupied units paying rent!!$1,500 to $1,999","GROSS RENT!!Occupied units paying rent!!$2,000 to $2,499","GROSS RENT!!Occupied units paying rent!!$2,500 to $2,999","GROSS RENT!!Occupied units paying rent!!$3,000 or more",GROSS RENT!!Occupied units paying rent!!Median (dollars),GROSS RENT!!Occupied units paying rent!!No rent paid
0,"Harris County, Texas",,,,,,,,,,,
1,Estimate,1795219.0,716666.0,25671,256351,274196,115480,27320,8892,8756,1115,21170
2,Percent,1795219.0,716666.0,3.6%,35.8%,38.3%,16.1%,3.8%,1.2%,1.2%,(X),(X)


In [None]:
# Drop rows containing NaN. In the preview above this removes the
# "Harris County, Texas" header row and keeps the Estimate and Percent rows.
houston_rent_df = houston_rent_df.dropna()

In [35]:
# Remove the literal 'GROSS RENT!!Occupied units paying rent!!' prefix from every column name.
# regex=False makes the match explicitly literal, independent of the default
# that the installed pandas version uses for str.replace.
houston_rent_df.columns = houston_rent_df.columns.str.replace('GROSS RENT!!Occupied units paying rent!!', '', regex=False)

# Display the updated DataFrame
houston_rent_df.head()

Unnamed: 0,Label (Grouping),UNITS IN STRUCTURE!!Total housing units,GROSS RENT!!Occupied units paying rent,Less than $500,$500 to $999,"$1,000 to $1,499","$1,500 to $1,999","$2,000 to $2,499","$2,500 to $2,999","$3,000 or more",Median (dollars),No rent paid
1,Estimate,1795219,716666,25671,256351,274196,115480,27320,8892,8756,1115,21170
2,Percent,1795219,716666,3.6%,35.8%,38.3%,16.1%,3.8%,1.2%,1.2%,(X),(X)


In [36]:
# Rename columns for clarity (assigned positionally, 12 columns).
# The first column (the Estimate / Percent row label) is left unnamed and
# 'Median (dollars)' becomes 'Median Rent Price'.
houston_rent_df.columns = ['',
                             'Total Housing Units',
                             'Occupied Housing Units Paying Rent',
                             'Less than $500',
                             '$500 to $999',
                             '$1,000 to $1,499',
                             '$1,500 to $1,999',
                             '$2,000 to $2,499',
                             '$2,500 to $2,999',
                             '$3,000 or more',
                             'Median Rent Price',
                             'No rent paid'
]
houston_rent_df.head()

Unnamed: 0,Unnamed: 1,Total Housing Units,Occupied Housing Units Paying Rent,Less than $500,$500 to $999,"$1,000 to $1,499","$1,500 to $1,999","$2,000 to $2,499","$2,500 to $2,999","$3,000 or more",Median Rent Price,No rent paid
1,Estimate,1795219,716666,25671,256351,274196,115480,27320,8892,8756,1115,21170
2,Percent,1795219,716666,3.6%,35.8%,38.3%,16.1%,3.8%,1.2%,1.2%,(X),(X)


In [None]:
# Save the cleaned rent DataFrame to a new CSV file (index=False omits the row index)
houston_rent_df.to_csv('../houston_data/cleaned_hou_rent.csv', index=False)

## Quality of Life Metric

**Measuring**: Commute Time 

Data was pulled from the Census Database using the following 2020 American Community Survey (ACS) table.
* Commute Time: Table S0801 

In [40]:
# Path to the raw ACS S0801 commuting export for Harris County
houston_commute_data = '../houston_data/houston_raw_csv/houston_commute.csv'

# Read the CSV file into a DataFrame and display it
houston_commute_df = pd.read_csv(houston_commute_data)
houston_commute_df

Unnamed: 0,Label (Grouping),"Harris County, Texas!!Total!!Estimate","Harris County, Texas!!Male!!Estimate","Harris County, Texas!!Female!!Estimate"
0,Workers 16 years and over,2209641,1227892,981749
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",88.1%,89.0%,87.0%
3,Drove alone,78.0%,79.1%,76.7%
4,Carpooled,10.1%,9.9%,10.2%
...,...,...,...,...
58,Private vehicle occupancy,13.8%,(X),(X)
59,Place of work,15.9%,(X),(X)
60,Time of departure to go to work,25.1%,(X),(X)
61,Travel time to work,19.2%,(X),(X)


In [42]:
# Keep only the first 14 rows (positions 0-13) of the commuting table.
# NOTE(review): this section is titled "Commute Time", but the rows previewed
# here are means-of-transportation shares. Confirm this slice is the intended one.
houston_commute_df = houston_commute_df.iloc[:14]
houston_commute_df.head()

Unnamed: 0,Label (Grouping),"Harris County, Texas!!Total!!Estimate","Harris County, Texas!!Male!!Estimate","Harris County, Texas!!Female!!Estimate"
0,Workers 16 years and over,2209641,1227892,981749
1,MEANS OF TRANSPORTATION TO WORK,,,
2,"Car, truck, or van",88.1%,89.0%,87.0%
3,Drove alone,78.0%,79.1%,76.7%
4,Carpooled,10.1%,9.9%,10.2%


In [43]:
# Drop rows containing NaN, e.g. the 'MEANS OF TRANSPORTATION TO WORK'
# section-header row, which carries no values
houston_commute_df = houston_commute_df.dropna()

In [44]:
# Rename columns for clarity (assigned positionally, 4 columns).
# The first column (the row label) is left unnamed.
houston_commute_df.columns = ['',
                              'Total Estimate',
                              'Male',
                              'Female'
                              ]
houston_commute_df.head()

Unnamed: 0,Unnamed: 1,Total Estimate,Male,Female
0,Workers 16 years and over,2209641,1227892,981749
2,"Car, truck, or van",88.1%,89.0%,87.0%
3,Drove alone,78.0%,79.1%,76.7%
4,Carpooled,10.1%,9.9%,10.2%
5,In 2-person carpool,7.4%,7.1%,7.7%


In [45]:
# Save the cleaned commute DataFrame as a new CSV file (index=False omits the row index)
houston_commute_df.to_csv('../houston_data/cleaned_hou_commute.csv', index=False)

### Creating JSON File

In [10]:
# Load the cleaned Houston datasets back from disk.
# The cleaning cells in this notebook write their CSVs directly into
# ../houston_data/, so read them from that same directory. Reading from a
# different folder fails, or silently picks up stale copies, on a fresh
# Restart & Run All.
houston_cleaned_dir = Path('../houston_data')

houston_commute_csv = houston_cleaned_dir / 'cleaned_hou_commute.csv'
houston_unemployment_csv = houston_cleaned_dir / 'cleaned_hou_unemployment.csv'
houston_rent_csv = houston_cleaned_dir / 'cleaned_hou_rent.csv'
houston_income_csv = houston_cleaned_dir / 'cleaned_hou_income.csv'
houston_edu_csv = houston_cleaned_dir / 'cleaned_hou_edu_level.csv'
houston_age_csv = houston_cleaned_dir / 'cleaned_hou_age.csv'
houston_race_csv = houston_cleaned_dir / 'cleaned_hou_race.csv'

# Read the csv files
houston_commute_df = pd.read_csv(houston_commute_csv)
houston_unemployment_df = pd.read_csv(houston_unemployment_csv)
houston_rent_df = pd.read_csv(houston_rent_csv)
houston_income_df = pd.read_csv(houston_income_csv)
houston_edu_df = pd.read_csv(houston_edu_csv)
houston_age_df = pd.read_csv(houston_age_csv)
houston_race_df = pd.read_csv(houston_race_csv)

In [11]:
# Build one dictionary keyed by metric name; each value is that metric's
# table converted to a list of row records (one dict per row).
houston_data = {
    section: frame.to_dict(orient='records')
    for section, frame in (
        ("commute", houston_commute_df),
        ("unemployment", houston_unemployment_df),
        ("rent", houston_rent_df),
        ("income", houston_income_df),
        ("education", houston_edu_df),
        ("age", houston_age_df),
        ("race", houston_race_df),
    )
}

In [13]:
# Serialize the combined dictionary and write it out as a single JSON file
with open('../houston_data/houston_data.json', 'w') as json_file:
    json_file.write(json.dumps(houston_data))