Housing Costs + Property Taxes over 5-year span (2018-2022)

In [56]:
import requests
import pandas as pd
import csv

def process_census_data(file_path):
    census_data_dict = {}
    with open(file_path, 'r', newline='', encoding='utf-8') as csvfile:
        csv_reader = csv.reader(csvfile)
        next(csv_reader, None)
        for row in csv_reader:
            if len(row) == 2:
                census_data_dict[row[0]] = row[1]
    return census_data_dict

def process_year_data(year):
    file_path = f'ACSST5Y{year}.S2506-Data.csv'
    meta_data_file_path = 'ACSST5Y2022.S2506-Column-Metadata.csv'

    df = pd.read_csv(file_path)
    census_data_dict = process_census_data(meta_data_file_path)

    df = df.rename(columns=census_data_dict)

    filtered_columns = [
        col for col in df.columns
        if "MONTHLY HOUSING COSTS" in col or "REAL ESTATE TAXES" in col or "Geography" in col or "Geographic Area Name" in col or "Median household income" in col
    ]

    df = df[filtered_columns]
    df = df.iloc[1:]
    df['Geographic Area Name'] = df['Geographic Area Name'].str.replace('ZCTA5', '', regex=False).str.strip()

    # Add a year column
    df['Year'] = year

    return df

# Process data for years 2018 to 2022
years = range(2018, 2023)
dfs = [process_year_data(year) for year in years]

# Combine all dataframes
combined_df = pd.concat(dfs, ignore_index=True)

# Display the first few rows of the combined dataframe
print(combined_df.head())

# combined_df.to_csv('combined_housing_data_2018_2022.csv', index=False)

# Print some information about the combined dataset
# print(f"\nTotal number of rows: {len(combined_df)}")
print(f"Years included: {combined_df['Year'].unique()}")
# print(f"\nColumns in the dataset:")
# for col in combined_df.columns:
#     print(col)

  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)
  df = pd.read_csv(file_path)


        Geography Geographic Area Name  \
0  8600000US00601                00601   
1  8600000US00602                00602   
2  8600000US00603                00603   
3  8600000US00606                00606   
4  8600000US00610                00610   

  Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2022 INFLATION-ADJUSTED DOLLARS)!!Median household income (dollars)  \
0                                                 85                                                                                                                                                                     
1                                                538                                                                                                                                                                     
2                                                493                                         

Data Quality checks

In [57]:
combined_df.isna().sum().sum() / (combined_df.shape[0] * combined_df.shape[1])

0.010241686167461773

In [58]:
total_nan = combined_df.isna().sum().sum()

nan_percentage = combined_df.isna().mean() * 100


nan_percentage_sorted = nan_percentage.sort_values(ascending=False)


In [59]:
combined_df = combined_df.loc[:, ~combined_df.columns.str.contains('Margin of Error', case=False)]

In [60]:
# combined_df.sample(10).to_csv('housing_costs_sample.csv')

In [61]:
import pandas as pd
import numpy as np

def process_housing_costs(df):

    columns_to_keep = [
        'Geographic Area Name',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2022 INFLATION-ADJUSTED DOLLARS)!!Median household income (dollars)',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!Median (dollars)',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!Less than $200',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$200 to $399',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$400 to $599',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$600 to $799',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$800 to $999',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$1,000 to $1,499',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$1,500 to $1,999',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$2,000 to $2,499',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$2,500 to $2,999',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!$3,000 or more',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!REAL ESTATE TAXES!!Median (dollars)',
        'Year'
    ]

    df_selected = df[columns_to_keep]

    new_column_names = {
        'Geographic Area Name': 'Area',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!HOUSEHOLD INCOME IN THE PAST 12 MONTHS (IN 2022 INFLATION-ADJUSTED DOLLARS)!!Median household income (dollars)': 'Median_Income',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!MONTHLY HOUSING COSTS!!Median (dollars)': 'Median_Monthly_Cost',
        'Estimate!!Owner-occupied housing units with a mortgage!!Owner-occupied housing units with a mortgage!!REAL ESTATE TAXES!!Median (dollars)': 'Median_RE_Taxes'
    }
    df_selected.rename(columns=new_column_names, inplace=True)

    # Convert columns to numeric, replacing '-' with NaN
    numeric_columns = df_selected.columns.drop(['Area', 'Year'])
    for col in numeric_columns:
        df_selected[col] = pd.to_numeric(df_selected[col].replace('-', np.nan), errors='coerce')

    # Calculate derived features
    df_selected['Housing_Cost_Less_1000'] = df_selected.iloc[:, 3:8].sum(axis=1)
    df_selected['Housing_Cost_1000_1999'] = df_selected.iloc[:, 8:10].sum(axis=1)
    df_selected['Housing_Cost_2000_2999'] = df_selected.iloc[:, 10:12].sum(axis=1)
    df_selected['Housing_Cost_3000_Plus'] = df_selected.iloc[:, 12]

    # Calculate affordability ratio
    df_selected['Affordability_Ratio'] = df_selected['Median_Monthly_Cost'] / (df_selected['Median_Income'] / 12)

    # Select final columns
    final_columns = ['Area', 'Median_Income', 'Median_Monthly_Cost', 'Housing_Cost_Less_1000',
                     'Housing_Cost_1000_1999', 'Housing_Cost_2000_2999', 'Housing_Cost_3000_Plus',
                     'Median_RE_Taxes', 'Affordability_Ratio', 'Year']

    df_final = df_selected[final_columns]

    return df_final


processed_df = process_housing_costs(combined_df)

# Display some information about the processed dataset
display(f"\nProcessed dataset shape: {processed_df.shape}")
display("\nColumns in the processed dataset:")
display(processed_df.columns.tolist())
display("\nSample of the processed data:")
display(processed_df.head())

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected.rename(columns=new_column_names, inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected[col] = pd.to_numeric(df_selected[col].replace('-', np.nan), errors='coerce')
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_selected['Housing_Cost_Less_1000'] = df_selected.iloc[:, 3:8].sum(axis=1)
A value is trying to be set on a copy

'\nProcessed dataset shape: (166908, 10)'

'\nColumns in the processed dataset:'

['Area',
 'Median_Income',
 'Median_Monthly_Cost',
 'Housing_Cost_Less_1000',
 'Housing_Cost_1000_1999',
 'Housing_Cost_2000_2999',
 'Housing_Cost_3000_Plus',
 'Median_RE_Taxes',
 'Affordability_Ratio',
 'Year']

'\nSample of the processed data:'

Unnamed: 0,Area,Median_Income,Median_Monthly_Cost,Housing_Cost_Less_1000,Housing_Cost_1000_1999,Housing_Cost_2000_2999,Housing_Cost_3000_Plus,Median_RE_Taxes,Affordability_Ratio,Year
0,601,85.0,92.0,514,28,0,801.0,,12.988235,2018
1,602,538.0,416.0,1783,233,0,877.0,,9.27881,2018
2,603,493.0,739.0,3169,402,51,839.0,,17.98783,2018
3,606,23.0,78.0,167,0,0,551.0,,40.695652,2018
4,610,462.0,426.0,1783,95,0,743.0,,11.064935,2018
