# Houston Dataset

In [1]:
# Import dependencies
import os
import csv
import pandas as pd
import openpyxl

## Demographic Metric

**Measuring**: ACS Population Density, Median Age, Education Level

Data was pulled from the 2020 Houston Census Database and the American Community Survey (ACS) by the U.S. Census Bureau using various coded tables to ensure unified data for each metropolitan city. Two tables used the same code and were seperated to ensure proper measurement. 
* ACS Population Density: Table DP05
* Median Age: Table DP05
* Education Level: S1501

##### ACS Population Density & Median Age (Table DP05)

In [2]:
# Import csv file for Texas ACS Population Density
houston_population_data = '../houston_data/tx_ACS_pop_density.csv'

# Read the CSV file into a DataFrame
houston_population_df = pd.read_csv(houston_population_data)
houston_population_df.head()

Unnamed: 0,Label (Grouping),Texas!!Estimate,Texas!!Margin of Error,Texas!!Percent,Texas!!Percent Margin of Error,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Margin of Error"
0,SEX AND AGE,,,,,,,,
1,Total population,28635442.0,*****,28635442,(X),6979613.0,*****,6979613,(X)
2,Male,14221720.0,"±2,292",49.7%,±0.1,3461935.0,±408,49.6%,±0.1
3,Female,14413722.0,"±2,292",50.3%,±0.1,3517678.0,±408,50.4%,±0.1
4,Sex ratio (males per 100 females),98.7,±0.1,(X),(X),98.4,±0.1,(X),(X)


In [3]:
# Display column names and data types
print(houston_population_df.dtypes)


Label (Grouping)                                                            object
Texas!!Estimate                                                             object
Texas!!Margin of Error                                                      object
Texas!!Percent                                                              object
Texas!!Percent Margin of Error                                              object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Estimate                   object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Margin of Error            object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent                    object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Margin of Error    object
dtype: object


QUESTIONS TO ASK:
- DO WE KEEP THE TEXAS DATA HERE? 
- IS IT TALKING ABOUT THE WHOLE OF TEXAS? IS THIS RELEVANT? IN A COMPARISON??

In [None]:
# Drop columns (i.e any columns that contain 'Margin of Error')
houston_population_df = houston_population_df.drop(columns=['Texas!!Margin of Error', 
                                                            'Texas!!Percent Margin of Error', 
                                                            'Houston-The Woodlands-Sugar Land, TX Metro Area!!Margin of Error',
                                                            'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Margin of Error'])

# Rename columns for clarity
houston_population_df = houston_population_df.rename(columns={
    'Texas!!Estimate': 'Texas Population',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Estimate': 'Houston Metro Population',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent': 'Houston Metro Population Percent'
})

# Display the first few rows of the cleaned DataFrame
houston_population_df.head()

Unnamed: 0,Label (Grouping),Texas Population,Texas!!Percent,Houston Metro Population,Houston Metro Population Percent
0,SEX AND AGE,,,,
1,Total population,28635442.0,28635442,6979613.0,6979613
2,Male,14221720.0,49.7%,3461935.0,49.6%
3,Female,14413722.0,50.3%,3517678.0,50.4%
4,Sex ratio (males per 100 females),98.7,(X),98.4,(X)


In [59]:
# Save the cleaned DataFrame to a new CSV file
houston_population_df.to_csv('../houston_data/cleaned_ACS_pop_density.csv', index=False)

Note: will have to seperate the pop density and median age eventual...

#### Education Level (Table S1501)

In [6]:
# Import csv file for Texas edu level 
houston_edu_data = '../houston_data/tx_edu_level.csv'

# Read the CSV file into a DataFrame
houston_edu_df = pd.read_csv(houston_edu_data)
houston_edu_df.head()

Unnamed: 0,Label (Grouping),Texas!!Total!!Estimate,Texas!!Total!!Margin of Error,Texas!!Percent!!Estimate,Texas!!Percent!!Margin of Error,Texas!!Male!!Estimate,Texas!!Male!!Margin of Error,Texas!!Percent Male!!Estimate,Texas!!Percent Male!!Margin of Error,Texas!!Female!!Estimate,...,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Male!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Male!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Female!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Female!!Margin of Error"
0,AGE BY EDUCATIONAL ATTAINMENT,,,,,,,,,,...,,,,,,,,,,
1,Population 18 to 24 years,2804109.0,"±2,484",(X),(X),1445292.0,"±2,083",(X),(X),1358817.0,...,(X),(X),322856.0,±501,(X),(X),310209.0,±450,(X),(X)
2,Less than high school graduate,393121.0,"±6,126",14.0%,±0.2,225073.0,"±4,785",15.6%,±0.3,168048.0,...,15.8%,±0.5,55243.0,"±2,565",17.1%,±0.8,44494.0,"±1,751",14.3%,±0.6
3,High school graduate (includes equival...,945458.0,"±8,382",33.7%,±0.3,528851.0,"±5,368",36.6%,±0.4,416607.0,...,32.6%,±0.7,115202.0,"±2,948",35.7%,±0.9,91110.0,"±2,909",29.4%,±0.9
4,Some college or associate's degree,1193185.0,"±9,051",42.6%,±0.3,578268.0,"±5,669",40.0%,±0.4,614917.0,...,41.0%,±0.6,124807.0,"±2,788",38.7%,±0.9,134818.0,"±3,011",43.5%,±1.0


In [7]:
# Display column names and data types
print(houston_edu_df.dtypes)

Label (Grouping)                                                                    object
Texas!!Total!!Estimate                                                              object
Texas!!Total!!Margin of Error                                                       object
Texas!!Percent!!Estimate                                                            object
Texas!!Percent!!Margin of Error                                                     object
Texas!!Male!!Estimate                                                               object
Texas!!Male!!Margin of Error                                                        object
Texas!!Percent Male!!Estimate                                                       object
Texas!!Percent Male!!Margin of Error                                                object
Texas!!Female!!Estimate                                                             object
Texas!!Female!!Margin of Error                                                      object

In [10]:
# Drop unnecessary columns (i.e anything with "Margin of Error")
houston_edu_df = houston_edu_df.drop(columns=['Texas!!Total!!Margin of Error', 
                                               'Texas!!Percent!!Margin of Error',
                                               'Texas!!Male!!Margin of Error',
                                               'Texas!!Percent Male!!Margin of Error',
                                               'Texas!!Female!!Margin of Error',
                                               'Texas!!Percent Female!!Margin of Error',
                                               'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error',
                                               'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent!!Margin of Error',
                                               'Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Margin of Error',
                                               'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Male!!Margin of Error',
                                               'Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Margin of Error',
                                               'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Female!!Margin of Error'
                                            ])

In [51]:
# Drop NaN values
houston_edu_df = houston_edu_df.dropna()

In [52]:
# Rename columns for clarity
houston_edu_df = houston_edu_df.rename(columns={
    'Label (Grouping)': 'Age by Educational Attainment',
    'Texas!!Total!!Estimate': 'Texas Population',
    'Texas!!Percent!!Estimate': 'Texas Population %',
    'Texas!!Male!!Estimate': 'Male Population',
    'Texas!!Percent Male!!Estimate': 'Male Population %',
    'Texas!!Female!!Estimate': 'Female Population',
    'Texas!!Percent Female!!Estimate': 'Female Population %',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate': 'Houston Metro Population',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent!!Estimate': 'Houston Metro Population %',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Estimate': 'Houston Male Population',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Male!!Estimate': 'Houston Male %',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Estimate': 'Houston Female Population',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Female!!Estimate': 'Houston Female %'
    })

# Display the first few rows of the cleaned DataFrame
houston_edu_df.head()

Unnamed: 0,Age by Educational Attainment,Texas Population,Texas Population %,Male Population,Male Population %,Female Population,Female Population %,Houston Metro Population,Houston Metro Population %,Houston Male Population,Houston Male %,Houston Female Population,Houston Female %
1,Population 18 to 24 years,2804109,(X),1445292,(X),1358817,(X),633065,(X),322856,(X),310209,(X)
2,Less than high school graduate,393121,14.0%,225073,15.6%,168048,12.4%,99737,15.8%,55243,17.1%,44494,14.3%
3,High school graduate (includes equival...,945458,33.7%,528851,36.6%,416607,30.7%,206312,32.6%,115202,35.7%,91110,29.4%
4,Some college or associate's degree,1193185,42.6%,578268,40.0%,614917,45.3%,259625,41.0%,124807,38.7%,134818,43.5%
5,Bachelor's degree or higher,272345,9.7%,113100,7.8%,159245,11.7%,67391,10.6%,27604,8.5%,39787,12.8%


    quesiton to ask: will we want to seperate this further??? 

In [60]:
# save as new csv file with cleaned data
houston_edu_df.to_csv('../houston_data/cleaned_hou_edu_level.csv', index=False)

## Economic Metric

**Measuring**: Median Household Income, Unemployment Rate, Poverty Rate.

Data for this section was pulled from the Houston 2020 Census Database using various coded tables:
* Median Household Income: Table S1901
* Unemployment Rate: Table S2301
* Poverty Rate: S1501

##### Median Household Income (Table S1901)

In [12]:
# Import csv file from Median Household Income
houston_income_data = '../houston_data/tx_median_income.csv'

# Read the CSV file into a DataFrame
houston_income_df = pd.read_csv(houston_income_data)
houston_income_df.head()

Unnamed: 0,Label (Grouping),Texas!!Households!!Estimate,Texas!!Households!!Margin of Error,Texas!!Families!!Estimate,Texas!!Families!!Margin of Error,Texas!!Married-couple families!!Estimate,Texas!!Married-couple families!!Margin of Error,Texas!!Nonfamily households!!Estimate,Texas!!Nonfamily households!!Margin of Error,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Households!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Households!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Families!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Families!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Married-couple families!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Married-couple families!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Nonfamily households!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Nonfamily households!!Margin of Error"
0,Total,9906070,"±18,881",6838900,"±21,582",4974588,"±25,868",3067170,"±12,481",2407993,"±5,139",1702047,"±8,704",1234726,"±9,041",705946,"±7,632"
1,"Less than $10,000",6.0%,±0.1,3.9%,±0.1,1.6%,±0.1,11.9%,±0.2,5.3%,±0.1,3.8%,±0.2,1.5%,±0.1,10.8%,±0.3
2,"$10,000 to $14,999",3.9%,±0.1,2.5%,±0.1,1.3%,±0.1,7.5%,±0.1,3.3%,±0.1,2.2%,±0.1,1.2%,±0.1,6.3%,±0.3
3,"$15,000 to $24,999",8.4%,±0.1,6.5%,±0.1,4.1%,±0.1,13.5%,±0.2,7.8%,±0.2,6.1%,±0.2,3.8%,±0.2,12.6%,±0.4
4,"$25,000 to $34,999",8.9%,±0.1,7.7%,±0.1,5.7%,±0.1,12.1%,±0.2,8.5%,±0.2,7.3%,±0.2,5.4%,±0.2,11.5%,±0.4


In [13]:
# Display column names and data types
print(houston_income_df.dtypes)

Label (Grouping)                                                                             object
Texas!!Households!!Estimate                                                                  object
Texas!!Households!!Margin of Error                                                           object
Texas!!Families!!Estimate                                                                    object
Texas!!Families!!Margin of Error                                                             object
Texas!!Married-couple families!!Estimate                                                     object
Texas!!Married-couple families!!Margin of Error                                              object
Texas!!Nonfamily households!!Estimate                                                        object
Texas!!Nonfamily households!!Margin of Error                                                 object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Households!!Estimate                        object


In [14]:
# Drop unnecessary columns (i.e anything with "Margin of Error")
houston_income_df = houston_income_df.drop(columns=['Texas!!Households!!Margin of Error', 
                                                     'Texas!!Families!!Margin of Error',
                                                     'Texas!!Married-couple families!!Margin of Error',
                                                     'Texas!!Nonfamily households!!Margin of Error',
                                                     'Houston-The Woodlands-Sugar Land, TX Metro Area!!Households!!Margin of Error',
                                                     'Houston-The Woodlands-Sugar Land, TX Metro Area!!Families!!Margin of Error',
                                                     'Houston-The Woodlands-Sugar Land, TX Metro Area!!Married-couple families!!Margin of Error',
                                                     'Houston-The Woodlands-Sugar Land, TX Metro Area!!Nonfamily households!!Margin of Error'
                                                     ])

In [50]:
# Rename columns for clarity
houston_income_df = houston_income_df.rename(columns={
    'Label (Grouping)': 'Income Grouping',
    'Texas!!Households!!Estimate': 'Texas Households',
    'Texas!!Families!!Estimate': 'Texas Families',
    'Texas!!Married-couple families!!Estimate': 'Texas Married-couple Households',
    'Texas!!Nonfamily households!!Estimate': 'Texas Nonfamily Households',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Households!!Estimate': 'Houston Metro Households',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Families!!Estimate': 'Houston Metro Families',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Married-couple families!!Estimate': 'Houston Metro Married-couple Households',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Nonfamily households!!Estimate': 'Houston Metro Nonfamily Households'
})

houston_income_df.head()

Unnamed: 0,Income Grouping,Texas Households,Texas Families,Texas Married-couple Households,Texas Nonfamily Households,Houston Metro Households,Houston Metro Families,Houston Metro Married-couple Households,Houston Metro Nonfamily Households
0,Total,9906070,6838900,4974588,3067170,2407993,1702047,1234726,705946
1,"Less than $10,000",6.0%,3.9%,1.6%,11.9%,5.3%,3.8%,1.5%,10.8%
2,"$10,000 to $14,999",3.9%,2.5%,1.3%,7.5%,3.3%,2.2%,1.2%,6.3%
3,"$15,000 to $24,999",8.4%,6.5%,4.1%,13.5%,7.8%,6.1%,3.8%,12.6%
4,"$25,000 to $34,999",8.9%,7.7%,5.7%,12.1%,8.5%,7.3%,5.4%,11.5%


In [61]:
# Save as new csv file with cleaned data
houston_income_df.to_csv('../houston_data/cleaned_hou_income.csv', index=False)

##### Unemployment Rate (Table S2301)

In [16]:
# Import csv file from unemployment rate
houston_unemployment_data = '../houston_data/tx_employment_status.csv'

# Read the CSV file into a DataFrame
houston_unemployment_df = pd.read_csv(houston_unemployment_data)
houston_unemployment_df.head()

Unnamed: 0,Label (Grouping),Texas!!Total!!Estimate,Texas!!Total!!Margin of Error,Texas!!Labor Force Participation Rate!!Estimate,Texas!!Labor Force Participation Rate!!Margin of Error,Texas!!Employment/Population Ratio!!Estimate,Texas!!Employment/Population Ratio!!Margin of Error,Texas!!Unemployment rate!!Estimate,Texas!!Unemployment rate!!Margin of Error,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Labor Force Participation Rate!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Labor Force Participation Rate!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Employment/Population Ratio!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Employment/Population Ratio!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Unemployment rate!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Unemployment rate!!Margin of Error"
0,Population 16 years and over,22078090.0,"±4,615",64.8%,±0.1,61.0%,±0.1,5.3%,±0.1,5328736.0,"±2,632",66.6%,±0.2,62.6%,±0.2,5.9%,±0.1
1,AGE,,,,,,,,,,,,,,,,
2,16 to 19 years,1627356.0,"±5,341",34.3%,±0.3,28.1%,±0.3,16.9%,±0.5,382828.0,"±2,768",31.9%,±0.8,25.8%,±0.7,19.0%,±1.3
3,20 to 24 years,2000883.0,"±3,068",73.4%,±0.3,65.1%,±0.3,9.5%,±0.3,454655.0,±906,73.2%,±0.8,65.0%,±0.9,11.1%,±0.6
4,25 to 29 years,2130985.0,"±2,205",81.3%,±0.3,75.7%,±0.3,5.8%,±0.2,522073.0,±700,81.4%,±0.6,76.4%,±0.7,5.9%,±0.4


In [17]:
# Display column names and data types
print(houston_unemployment_df.dtypes)

Label (Grouping)                                                                                    object
Texas!!Total!!Estimate                                                                              object
Texas!!Total!!Margin of Error                                                                       object
Texas!!Labor Force Participation Rate!!Estimate                                                     object
Texas!!Labor Force Participation Rate!!Margin of Error                                              object
Texas!!Employment/Population Ratio!!Estimate                                                        object
Texas!!Employment/Population Ratio!!Margin of Error                                                 object
Texas!!Unemployment rate!!Estimate                                                                  object
Texas!!Unemployment rate!!Margin of Error                                                           object
Houston-The Woodlands-Sugar Land, TX 

In [18]:
# Clean data by dropping unnecessary columns (i.e anything with "Margin of Error")
houston_unemployment_df = houston_unemployment_df.drop(columns=['Texas!!Total!!Margin of Error', 
                                                                  'Texas!!Labor Force Participation Rate!!Margin of Error', 
                                                                  'Texas!!Employment/Population Ratio!!Margin of Error',
                                                                  'Texas!!Unemployment rate!!Margin of Error',
                                                                  'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error',
                                                                  'Houston-The Woodlands-Sugar Land, TX Metro Area!!Labor Force Participation Rate!!Margin of Error',
                                                                  'Houston-The Woodlands-Sugar Land, TX Metro Area!!Employment/Population Ratio!!Margin of Error',
                                                                  'Houston-The Woodlands-Sugar Land, TX Metro Area!!Unemployment rate!!Margin of Error'
                                                                 ])

In [48]:
# Drop NaN values
houston_unemployment_df = houston_unemployment_df.dropna()

In [49]:
# Rename for clarity 
houston_unemployment_df = houston_unemployment_df.rename(columns={
    'Label (Grouping)': 'Age',
    'Texas!!Total!!Estimate': 'Texas Total',
    'Texas!!Labor Force Participation Rate!!Estimate': 'Texas Labor Force Participation Rate',
    'Texas!!Employment/Population Ratio!!Estimate': 'Texas Employment/Population Ratio',
    'Texas!!Unemployment rate!!Estimate': 'Texas Unemployment Rate',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate': 'Houston Metro Total',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Labor Force Participation Rate!!Estimate': 'Houston Metro Labor Force Participation',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Employment/Population Ratio!!Estimate': 'Houston Metro Employment/Population Ratio',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Unemployment rate!!Estimate': 'Houston Metro Unemployment Rate'
})

# Display the first few rows of the cleaned DataFrame
houston_unemployment_df.head()

Unnamed: 0,Age,Texas Total,Texas Labor Force Participation,Texas Employment/Population Ratio,Texas Unemployment Rate,Houston Metro Total,Houston Metro Labor Force Participation,Houston Metro Employment/Population Ratio,Houston Metro Unemployment Rate
0,Population 16 years and over,22078090,64.8%,61.0%,5.3%,5328736,66.6%,62.6%,5.9%
2,16 to 19 years,1627356,34.3%,28.1%,16.9%,382828,31.9%,25.8%,19.0%
3,20 to 24 years,2000883,73.4%,65.1%,9.5%,454655,73.2%,65.0%,11.1%
4,25 to 29 years,2130985,81.3%,75.7%,5.8%,522073,81.4%,76.4%,5.9%
5,30 to 34 years,2079503,81.4%,77.0%,4.7%,526366,81.9%,77.4%,5.3%


In [62]:
# save as new csv file with cleaned data
houston_unemployment_df.to_csv('../houston_data/cleaned_hou_unemployment.csv', index=False)

#### Poverty Rate (S1501)

In [20]:
# Import csv file from poverty rate
houston_poverty_data = '../houston_data/tx_poverty_rate.csv'

# Read the CSV file into a DataFrame
houston_poverty_df = pd.read_csv(houston_poverty_data)
houston_poverty_df.head()

Unnamed: 0,Label (Grouping),Texas!!Total!!Estimate,Texas!!Total!!Margin of Error,Texas!!Below poverty level!!Estimate,Texas!!Below poverty level!!Margin of Error,Texas!!Percent below poverty level!!Estimate,Texas!!Percent below poverty level!!Margin of Error,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Below poverty level!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Below poverty level!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent below poverty level!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent below poverty level!!Margin of Error"
0,Population for whom poverty status is determined,28013446.0,"±3,263",3984260.0,"±39,546",14.2%,±0.1,6891791.0,"±3,606",927139.0,"±18,239",13.5%,±0.3
1,AGE,,,,,,,,,,,,
2,Under 18 years,7293273.0,"±3,160",1462277.0,"±22,028",20.0%,±0.3,1835442.0,"±1,956",352610.0,"±9,992",19.2%,±0.5
3,Under 5 years,1970973.0,"±1,960",430469.0,"±8,385",21.8%,±0.4,498414.0,"±1,006",102306.0,"±4,121",20.5%,±0.8
4,5 to 17 years,5322300.0,"±2,924",1031808.0,"±16,612",19.4%,±0.3,1337028.0,"±1,882",250304.0,"±7,407",18.7%,±0.6


In [21]:
# Display column names and data types
print(houston_poverty_df.dtypes)

Label (Grouping)                                                                                 object
Texas!!Total!!Estimate                                                                           object
Texas!!Total!!Margin of Error                                                                    object
Texas!!Below poverty level!!Estimate                                                             object
Texas!!Below poverty level!!Margin of Error                                                      object
Texas!!Percent below poverty level!!Estimate                                                     object
Texas!!Percent below poverty level!!Margin of Error                                              object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate                                 object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error                          object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Below poverty l

In [22]:
# Clean data by dropping unnecessary columns (i.e anything with "Margin of Error")
houston_poverty_df = houston_poverty_df.drop(columns=['Texas!!Total!!Margin of Error',
                                                       'Texas!!Below poverty level!!Margin of Error',
                                                       'Texas!!Percent below poverty level!!Margin of Error',
                                                       'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error',
                                                       'Houston-The Woodlands-Sugar Land, TX Metro Area!!Below poverty level!!Margin of Error',
                                                       'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent below poverty level!!Margin of Error'
                                                      ])

In [46]:
# Drop NaN values
houston_poverty_df = houston_poverty_df.dropna()

In [47]:
# Rename columns for clarity 
houston_poverty_df = houston_poverty_df.rename(columns={
    'Label (Grouping)': 'Age',
    'Texas!!Total!!Estimate': 'Texas Total',
    'Texas!!Below poverty level!!Estimate': 'Texas Below Poverty Level',
    'Texas!!Percent below poverty level!!Estimate': 'Texas % Below Poverty Level',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate': 'Houston Metro Total',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Below poverty level!!Estimate': 'Houston Metro Below Poverty Level',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent below poverty level!!Estimate': 'Houston Metro % Below Poverty Level'
})

# Display the first few rows of the cleaned DataFrame
houston_poverty_df.head()

Unnamed: 0,Age,Texas Total,Texas Below Poverty Level,Texas % Below Poverty Level,Houston Metro Total,Houston Metro Below Poverty Level,Houston Metro % Below Poverty Level
0,Population for whom poverty status is determined,28013446,3984260,14.2%,6891791,927139,13.5%
2,Under 18 years,7293273,1462277,20.0%,1835442,352610,19.2%
3,Under 5 years,1970973,430469,21.8%,498414,102306,20.5%
4,5 to 17 years,5322300,1031808,19.4%,1337028,250304,18.7%
5,Related children of householder un...,7268253,1439337,19.8%,1830458,348183,19.0%


In [63]:
# save as new csv file with cleaned data
houston_unemployment_df.to_csv('../houston_data/cleaned_hou_poverty.csv', index=False)

## Housing Metric

**Measuring**: Median Home Price, Median Rent, Homeownership Rate

Data for this section was pulled from the Houston 2020 Census Database using the same table code from the American Community Survey by the U.S. Census Bureau (ACS). 
* Median Home Price: Table DP04
* Median Rent: Table DP04
* Homeownership Rate: Table DP04

In [24]:
# Import csv file from Houston housing metrics
houston_housing_data = '../houston_data/tx_housing.csv'

# Read the CSV file into a DataFrame
houston_housing_df = pd.read_csv(houston_housing_data)
houston_housing_df.head()

Unnamed: 0,Label (Grouping),Texas!!Estimate,Texas!!Margin of Error,Texas!!Percent,Texas!!Percent Margin of Error,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent","Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Margin of Error"
0,HOUSING OCCUPANCY,,,,,,,,
1,Total housing units,11112975.0,"±3,353",11112975,(X),2646682.0,±980,2646682,(X)
2,Occupied housing units,9906070.0,"±18,881",89.1%,±0.2,2407993.0,"±5,139",91.0%,±0.2
3,Vacant housing units,1206905.0,"±17,491",10.9%,±0.2,238689.0,"±5,172",9.0%,±0.2
4,Homeowner vacancy rate,1.4,±0.1,(X),(X),1.4,±0.1,(X),(X)


    NOTE: will need to seperate all these out.

In [25]:
# display column names and data types
print(houston_housing_df.dtypes)

Label (Grouping)                                                            object
Texas!!Estimate                                                             object
Texas!!Margin of Error                                                      object
Texas!!Percent                                                              object
Texas!!Percent Margin of Error                                              object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Estimate                   object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Margin of Error            object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent                    object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Margin of Error    object
dtype: object


In [26]:
# Clean data by dropping unnecessary columns (i.e anything with "Margin of Error")
houston_housing_df = houston_housing_df.drop(columns=['Texas!!Margin of Error',
                                                       'Texas!!Percent Margin of Error',
                                                       'Houston-The Woodlands-Sugar Land, TX Metro Area!!Margin of Error',
                                                       'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent Margin of Error'
                                                     ])

In [None]:
# Delete NaN values
houston_housing_df = houston_housing_df.dropna()

In [45]:
# Rename columns for clarity
houston_housing_df = houston_housing_df.rename(columns={
    'Label (Grouping)': 'Housing Occupancy Status',
    'Texas!!Estimate': 'Texas Total',
    'Texas!!Percent': 'Texas %',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Estimate': 'Houston Metro Housing Units',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Percent': 'Houston Metro % Housing Units'
})

# Display the first few rows of the cleaned DataFrame
houston_housing_df.head()

Unnamed: 0,Housing Occupancy Status,Texas Total,Texas %,Houston Metro Housing Units,Houston Metro % Housing Units
1,Total housing units,11112975.0,11112975,2646682.0,2646682
2,Occupied housing units,9906070.0,89.1%,2407993.0,91.0%
3,Vacant housing units,1206905.0,10.9%,238689.0,9.0%
4,Homeowner vacancy rate,1.4,(X),1.4,(X)
5,Rental vacancy rate,7.8,(X),8.5,(X)


In [64]:
# save as new csv file with cleaned data
houston_unemployment_df.to_csv('../houston_data/cleaned_hou_housing.csv', index=False)

## Quality of Life Metric

**Measuring**: Commute Time 

Data was pulled from the Census Database using the following survey by ASC in 2020. 
* Commute Time: Table S0801 

In [28]:
# Import csv file from commute time
houston_commute_data = '../houston_data/tx_commute.csv'

# Read the CSV file into a DataFrame
houston_commute_df = pd.read_csv(houston_commute_data)
houston_commute_df.head()

Unnamed: 0,Label (Grouping),Texas!!Total!!Estimate,Texas!!Total!!Margin of Error,Texas!!Male!!Estimate,Texas!!Male!!Margin of Error,Texas!!Female!!Estimate,Texas!!Female!!Margin of Error,"Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Margin of Error","Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Estimate","Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Margin of Error"
0,Workers 16 years and over,13307640,"±22,902",7264180,"±15,254",6043460,"±16,237",3271855,"±11,374",1812680,"±8,170",1459175,"±7,841"
1,MEANS OF TRANSPORTATION TO WORK,,,,,,,,,,,,
2,"Car, truck, or van",88.6%,±0.1,88.9%,±0.1,88.2%,±0.1,88.5%,±0.2,89.3%,±0.3,87.5%,±0.3
3,Drove alone,78.7%,±0.1,79.2%,±0.1,78.0%,±0.2,78.9%,±0.3,79.9%,±0.4,77.7%,±0.4
4,Carpooled,9.9%,±0.1,9.7%,±0.1,10.2%,±0.1,9.6%,±0.2,9.4%,±0.3,9.8%,±0.2


In [29]:
# Display column names and data types
print(houston_commute_df.dtypes)

Label (Grouping)                                                            object
Texas!!Total!!Estimate                                                      object
Texas!!Total!!Margin of Error                                               object
Texas!!Male!!Estimate                                                       object
Texas!!Male!!Margin of Error                                                object
Texas!!Female!!Estimate                                                     object
Texas!!Female!!Margin of Error                                              object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate            object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error     object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Estimate             object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Margin of Error      object
Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Estimate           object
Hous

In [38]:
# Clean data by dropping unnecessary columns (i.e anything with "Margin of Error")
columns_to_drop = ['Texas!!Total!!Margin of Error',
                   'Texas!!Male!!Margin of Error',
                   'Texas!!Female!!Margin of Error',
                   'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Margin of Error',
                   'Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Margin of Error',
                   'Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Margin of Error']

# Check if columns exist before dropping
columns_to_drop = [col for col in columns_to_drop if col in houston_commute_df.columns]

houston_commute_df = houston_commute_df.drop(columns=columns_to_drop)

In [40]:
# Delete NaN values
houston_commute_df = houston_commute_df.dropna()

In [43]:
# Rename columns for clarity
houston_commute_df = houston_commute_df.rename(columns={
    'Label (Grouping)': 'Means of Transportation',
    'Texas!!Total!!Estimate': 'Texas Total Commute Time',
    'Texas!!Male!!Estimate': 'Male Total',
    'Texas!!Female!!Estimate': 'Female Total',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Total!!Estimate': 'Houston Metro Total',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Male!!Estimate': 'Houston Male Total',
    'Houston-The Woodlands-Sugar Land, TX Metro Area!!Female!!Estimate': 'Houston Female Total'
})

# Display the first few rows of the cleaned DataFrame
houston_commute_df.head()

Unnamed: 0,Means of Transportation,Texas Total,Male Total,Female Total,Houston Metro Total,Houston Male Total,Houston Female Total
0,Workers 16 years and over,13307640,7264180,6043460,3271855,1812680,1459175
2,"Car, truck, or van",88.6%,88.9%,88.2%,88.5%,89.3%,87.5%
3,Drove alone,78.7%,79.2%,78.0%,78.9%,79.9%,77.7%
4,Carpooled,9.9%,9.7%,10.2%,9.6%,9.4%,9.8%
5,In 2-person carpool,7.3%,7.0%,7.6%,7.0%,6.8%,7.4%


In [65]:
# Save as new csv file with cleaned data
houston_commute_df.to_csv('../houston_data/cleaned_hou_commute.csv', index=False)