## Data analysis of Austin, TX household income by income bucket, 2013–2023
Data source: https://datausa.io/profile/geo/austin-tx/?race-income-income_geo=incomeRace5

In [1]:
# Import Necessary Libraries
import numpy as np
import pandas as pd

In [2]:
# Read the raw household-income table into a DataFrame

# CSV file name; a relative path, so it is looked up from the working directory
path_1 = 'austin_household_income.csv'

austin_income1_df = pd.read_csv(filepath_or_buffer=path_1)

### Sanity Checks

#### Data set 1

In [3]:
# Dimensions of the raw table as (rows, columns)
austin_income1_df.shape

(352, 8)

In [4]:
# Preview the first five rows of the raw table
austin_income1_df.head(5)

Unnamed: 0,Household Income Bucket ID,Household Income Bucket,Year,Household Income,Household Income Moe,Place,Place ID,share
0,0,"< $10,000",2013,8380364,19177.0,United States,01000US,0.072488
1,0,"< $10,000",2014,8395338,16627.0,United States,01000US,0.072242
2,0,"< $10,000",2015,8421482,18763.0,United States,01000US,0.072024
3,0,"< $10,000",2016,8243664,18695.0,United States,01000US,0.07003
4,0,"< $10,000",2017,7942251,17662.0,United States,01000US,0.066839


In [5]:
# Preview the last five rows of the raw table
austin_income1_df.tail(5)

Unnamed: 0,Household Income Bucket ID,Household Income Bucket,Year,Household Income,Household Income Moe,Place,Place ID,share
347,15,"$200,000+",2019,41778,1220.0,"Austin, TX",16000US4805000,0.109829
348,15,"$200,000+",2020,44716,1591.0,"Austin, TX",16000US4805000,0.113125
349,15,"$200,000+",2021,51144,1903.0,"Austin, TX",16000US4805000,0.126556
350,15,"$200,000+",2022,66699,1837.0,"Austin, TX",16000US4805000,0.157651
351,15,"$200,000+",2023,76544,2249.0,"Austin, TX",16000US4805000,0.173847


In [6]:
# Column names, non-null counts, and dtypes of the raw table
austin_income1_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 352 entries, 0 to 351
Data columns (total 8 columns):
 #   Column                      Non-Null Count  Dtype  
---  ------                      --------------  -----  
 0   Household Income Bucket ID  352 non-null    int64  
 1   Household Income Bucket     352 non-null    object 
 2   Year                        352 non-null    int64  
 3   Household Income            352 non-null    int64  
 4   Household Income Moe        352 non-null    float64
 5   Place                       352 non-null    object 
 6   Place ID                    352 non-null    object 
 7   share                       352 non-null    float64
dtypes: float64(2), int64(3), object(3)
memory usage: 22.1+ KB


In [7]:
# Summary statistics for the numeric columns of the raw table
# (this still mixes the United States and Austin rows)
austin_income1_df.describe()

Unnamed: 0,Household Income Bucket ID,Year,Household Income,Household Income Moe,share
count,352.0,352.0,352.0,352.0,352.0
mean,7.5,2018.0,3777064.0,13721.650568,0.0625
std,4.616334,3.166779,4352742.0,15329.638323,0.028219
min,0.0,2013.0,9564.0,666.0,0.021722
25%,3.75,2015.0,17728.0,1071.0,0.041112
50%,7.5,2018.0,2030578.0,3528.0,0.051712
75%,11.25,2021.0,6105956.0,19755.75,0.078656
max,15.0,2023.0,16202720.0,59924.0,0.173847


### Remove unnecessary columns and the United States rows

In [8]:
# Build the cleaned, Austin-only table from the raw frame in one chain.
# The raw frame is left untouched, so this cell can be re-run safely.
austin_income1_df_cleaned = (
    austin_income1_df
    # drop() returns a new DataFrame, so no extra .copy() is needed here
    .drop(columns=["Household Income Bucket ID", "Household Income Moe", "Place ID"])
    # Remove the United States data points
    .loc[lambda frame: frame["Place"] == "Austin, TX"]
    # Keep an explicit, independent copy of the filtered rows
    .copy()
)

# Fail loudly if the filter matched nothing (e.g. the place label changed in
# the source file); otherwise an empty table would be carried forward silently.
assert not austin_income1_df_cleaned.empty, 'no rows with Place == "Austin, TX"'

# Quick check
print(austin_income1_df_cleaned.head())
print(austin_income1_df_cleaned['Place'].unique())

    Household Income Bucket  Year  Household Income       Place     share
176               < $10,000  2013             26966  Austin, TX  0.079830
177               < $10,000  2014             26767  Austin, TX  0.077746
178               < $10,000  2015             25675  Austin, TX  0.073108
179               < $10,000  2016             24851  Austin, TX  0.069339
180               < $10,000  2017             23261  Austin, TX  0.064389
['Austin, TX']


In [9]:
# Preview the first rows of the cleaned, Austin-only table
austin_income1_df_cleaned.head()

Unnamed: 0,Household Income Bucket,Year,Household Income,Place,share
176,"< $10,000",2013,26966,"Austin, TX",0.07983
177,"< $10,000",2014,26767,"Austin, TX",0.077746
178,"< $10,000",2015,25675,"Austin, TX",0.073108
179,"< $10,000",2016,24851,"Austin, TX",0.069339
180,"< $10,000",2017,23261,"Austin, TX",0.064389


In [10]:
# Preview the last five rows of the cleaned, Austin-only table
austin_income1_df_cleaned.tail(5)

Unnamed: 0,Household Income Bucket,Year,Household Income,Place,share
347,"$200,000+",2019,41778,"Austin, TX",0.109829
348,"$200,000+",2020,44716,"Austin, TX",0.113125
349,"$200,000+",2021,51144,"Austin, TX",0.126556
350,"$200,000+",2022,66699,"Austin, TX",0.157651
351,"$200,000+",2023,76544,"Austin, TX",0.173847


In [11]:
# Summary statistics for the numeric columns of the cleaned, Austin-only table
austin_income1_df_cleaned.describe()

Unnamed: 0,Year,Household Income,share
count,176.0,176.0,176.0
mean,2018.0,23671.261364,0.0625
std,3.1713,12356.19453,0.03081
min,2013.0,9564.0,0.021722
25%,2015.0,14442.5,0.038574
50%,2018.0,17728.0,0.051818
75%,2021.0,31011.0,0.07985
max,2023.0,76544.0,0.173847


In [12]:
# Write the cleaned table to disk; the row index is not written to the file
austin_income1_df_cleaned.to_csv(
    path_or_buf="austin_income1_cleaned.csv",
    index=False,
)