# Predicting U.S. real estate housing sale prices using demographic data and identifying similar neighborhood clusters

### Team 15
- Aparna Gopalakrishnan
- Mexi Liang
- Paul Schickler 

## Data Collection
We retrieve U.S. Census data via the Census API. First, request an API key from the Census website (https://api.census.gov/data/key_signup.html). With a key in hand, select the fields of interest and the year to pull, then call the Census API to retrieve those fields for that year.

In [3]:
import pandas as pd
import requests
import json
import pyarrow as pa
from pyarrow import csv

In [1]:
key = 'your_key'

In [2]:
def get_census_data_by_zip(api_key, fields, year=2020, timeout=120):
    """
    Fetch census data by ZIP Code Tabulation Areas (ZCTAs) for specified fields.

    The request goes to the ``acs/acs5`` dataset of the given year and asks
    for every ZCTA (``zip code tabulation area:*``).

    :param api_key: Your Census API Key.
    :param fields: List of fields to fetch.
    :param year: Census year (default is 2020).
    :param timeout: Seconds ``requests`` waits on the server before giving up
        (default is 120).
    :return: DataFrame with the fetched fields plus a ``year`` column, or
        ``None`` if the API answers with a non-200 status or an empty payload
        (the problem is printed in that case).
    :raises requests.exceptions.RequestException: on connection failures or
        when the timeout expires.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"

    # Combine the fields into a comma-separated string
    fields_str = ",".join(fields)

    # Construct the final URL
    url = f"{base_url}?get={fields_str}&for=zip%20code%20tabulation%20area:*"

    headers = {
        "Content-Type": "application/json",
    }

    # The key travels as a query parameter; the timeout keeps a stalled
    # connection from blocking the notebook indefinitely.
    response = requests.get(
        url, headers=headers, params={"key": api_key}, timeout=timeout
    )

    if response.status_code != 200:
        print(f"Error {response.status_code}: {response.text}")
        return None

    data = response.json()
    if not data:
        # Nothing to build a header from; report it instead of raising IndexError.
        print(f"Error: empty response for year {year}")
        return None

    # The first row of the payload holds the column names, the rest are records.
    df = pd.DataFrame(data[1:], columns=data[0])
    df['year'] = year
    return df

The Census API endpoint accepts at most 50 fields per request for a given year; therefore, we made multiple calls and merged the responses to build our final census data. We downloaded the list of available variable labels to a local file, census_data_labels.csv. We then hand-picked the features we are interested in and added a column called include_in_master_table, setting it to 1 for every feature we want to include in our master table.


In [None]:
# Load the hand-labelled variable list and keep only the variables flagged
# for the master table (include_in_master_table == 1).
label = 'assets/census_data_labels.csv'
label_df = pd.read_csv(label)  # the path variable is `label`; `label2` was never defined
label_df = label_df[label_df['include_in_master_table'] == 1]
features = list(label_df['Name'])

# B27019 variables, requested in addition to the flagged ones; the
# health-insurance ratios computed later in this notebook read them.
health_features = ['B27019_001E', 'B27019_002E', 'B27019_004E', 'B27019_009E', 'B27019_014E', 'B27019_019E',
                   'B27019_023E', 'B27019_025E', 'B27019_030E', 'B27019_035E', 'B27019_040E']

features = features + health_features

In [None]:
# Additional features selected after initial data exploration.
# Each variable name has the form "<table>_<item>E"; the mapping lists, per
# table, the item numbers to request, in request order.
_add_feature_items = {
    'B15002': ['001', '002', '014', '015', '016', '017', '018',
               '019', '031', '032', '033', '034', '035'],
    'B28002': ['001', '002', '013'],
    'B25127': ['001', '002', '045'],
    'B25105': ['001'],
    'B25099': ['001', '002', '003'],
    'B25035': ['001'],
    'B25087': ['001', '002', '020'],
    'B08201': ['001', '002', '003', '004', '005', '006'],
}

add_features = [
    f'{table}_{item}E'
    for table, items in _add_feature_items.items()
    for item in items
]

In [None]:

year = 2021 # eg.2021

# Pull the selected features for one year and save them as census<year>.csv.
df = get_census_data_by_zip(key, features, year)
if df is None:
    # get_census_data_by_zip prints the HTTP error and returns None; stop here
    # rather than failing inside pyarrow with an unrelated-looking error.
    raise RuntimeError(f'Census request for {year} returned no data; see the error printed above.')
out = pa.Table.from_pandas(df)
file_name = f'census{year}.csv'
csv.write_csv(out, file_name)

# uncomment lines below to pull census data for add_features and save as a csv file
# df = get_census_data_by_zip(key, add_features, year)
# out = pa.Table.from_pandas(df)
# file_name = 'census_add{}.csv'.format(year)
# csv.write_csv(out, file_name) 

The census data files were downloaded to our local census directory. We read these files and merge them to build our master census table. The data is joined on year and zipcode.

In [None]:
# Years with a downloaded extract, newest first (the order used for the concat below).
_census_years = [2021, 2020, 2019, 2018, 2017]


def _read_census_csv(path):
    """Read one census extract and drop the columns that are entirely empty.

    The ZCTA code and the year are read as strings. The converter key is
    'year', matching the column that get_census_data_by_zip adds; the earlier
    key 'year:' carried a stray colon and matched no column.
    """
    frame = pd.read_csv(path, converters={'zip code tabulation area': str, 'year': str})
    frame.dropna(how='all', axis=1, inplace=True)
    return frame


# Per-year names are kept so any other cell that refers to them still works.
census2021, census2020, census2019, census2018, census2017 = [
    _read_census_csv(f'census/census{yr}.csv') for yr in _census_years
]
census2021h, census2020h, census2019h, census2018h, census2017h = [
    _read_census_csv(f'census/census_add{yr}.csv') for yr in _census_years
]

# Stack the years of each extract, then join the two extracts on ZCTA and year.
census1 = pd.concat([census2021, census2020, census2019, census2018, census2017])
census2 = pd.concat([census2021h, census2020h, census2019h, census2018h, census2017h])
census = census1.merge(census2, on=['zip code tabulation area', 'year'])


In order to have meaningful features, we engineered some new features using existing features. 

In [None]:
census.drop_duplicates(inplace=True)
census.rename(columns={'zip code tabulation area':'zipcode', 
                       'B25035_001E': 'median_year_structure_built',
                      'B25099_001E': 'median_household_income',
                      'B25099_002E': 'median_household_income_with_mortgage',
                      'B25099_003E': 'median_household_income_with_no_mortgage',
                      'B25105_001E': 'median_monthly_housing_costs'}, inplace=True)


census['year'] = census['year'].apply(str)


def _ratio(frame, numerator_cols, denominator_col):
    """Return sum(numerator_cols) / denominator_col as a Series, floored at 0.

    :param frame: DataFrame holding the source columns.
    :param numerator_cols: Column names added together (with ``+``) to form the numerator.
    :param denominator_col: Column name of the denominator.
    :return: Series of ratios; negative results and results over a zero
        denominator are 0, NaN stays NaN.
    """
    numerator = frame[numerator_cols[0]]
    for col in numerator_cols[1:]:
        numerator = numerator + frame[col]
    denominator = frame[denominator_col]
    ratio = numerator / denominator
    # A zero denominator yields NaN or +/-inf. Use 0 for all of them, the same
    # value the final fillna(0) gives to the other undefined ratios; +inf
    # would otherwise pass through untouched.
    ratio = ratio.where(denominator != 0, 0)
    # Negative ratios are floored at 0 (presumably they come from negative
    # "no data" placeholder values in the census extract -- TODO confirm).
    return ratio.clip(lower=0)


# New column name -> (numerator columns, denominator column), in creation order.
_ratio_specs = {
    'house_occupied_ratio': (['B25002_002E'], 'B25002_001E'),
    'house_vacant_ratio': (['B25002_003E'], 'B25002_001E'),
    'has_one_or_more_computing_device_ratio': (['B28010_002E'], 'B28010_001E'),
    'has_no_computing_device_ratio': (['B28010_007E'], 'B28010_001E'),
    'has_health_insurance_ratio': (
        ['B27019_004E', 'B27019_009E', 'B27019_014E', 'B27019_019E',
         'B27019_025E', 'B27019_030E', 'B27019_035E', 'B27019_040E'],
        'B27019_001E'),
    'has_health_insurance_ratio_26_to_64': (
        ['B27019_004E', 'B27019_009E', 'B27019_014E', 'B27019_019E'], 'B27019_002E'),
    'has_health_insurance_ratio_65_and_over': (
        ['B27019_025E', 'B27019_030E', 'B27019_035E', 'B27019_040E'], 'B27019_023E'),
    'household_no_vehicles_ratio': (['B08201_002E'], 'B08201_001E'),
    'household_1_vehicle_ratio': (['B08201_003E'], 'B08201_001E'),
    'household_2_vehicles_ratio': (['B08201_004E'], 'B08201_001E'),
    'household_3_vehicles_ratio': (['B08201_005E'], 'B08201_001E'),
    'household_4_vehicles_ratio_or_more': (['B08201_006E'], 'B08201_001E'),
    'housing_units_with_mortgage_ratio': (['B25087_002E'], 'B25087_001E'),
    'housing_units_no_mortgage_ratio': (['B25087_020E'], 'B25087_001E'),
    'owner_occupied_ratio': (['B25127_002E'], 'B25127_001E'),
    'renter_occupied_ratio': (['B25127_045E'], 'B25127_001E'),
    'has_internet_ratio': (['B28002_002E'], 'B28002_001E'),
    'has_no_internet_ratio': (['B28002_013E'], 'B28002_001E'),
    'male_25_old_and_over_has_associate_or_higher_ratio': (
        ['B15002_014E', 'B15002_015E', 'B15002_016E', 'B15002_017E', 'B15002_018E'],
        'B15002_002E'),
    'female_25_old_and_over_has_associate_or_higher_ratio': (
        ['B15002_031E', 'B15002_032E', 'B15002_033E', 'B15002_034E', 'B15002_035E'],
        'B15002_019E'),
}

for _ratio_name, (_numerator_cols, _denominator_col) in _ratio_specs.items():
    census[_ratio_name] = _ratio(census, _numerator_cols, _denominator_col)

# Names of the engineered ratio columns, in the order they were created.
cols_to_treat = list(_ratio_specs)

# Remaining missing values (including ratios that are still NaN) become 0.
census.fillna(0, inplace=True)

In [None]:
# Stage the master census table for export as a CSV file.
file_name = 'census_final.csv'
out = pa.Table.from_pandas(census)
# The export itself is disabled; uncomment the next line to write the file.
# csv.write_csv(out, file_name)

We downloaded the Zillow dataset from Zillow Research. We only want to look at home sales data from 2017 onward, so we drop the columns for dates before 2017.

In [None]:
zillow_df = pd.read_csv('assets/ZHVI_Single_Family_Homes.csv', converters={'RegionName': str})

# The monthly value columns are named by date (e.g. '2000-01-31'). Parse the
# header instead of hard-coding every month: column names that are not dates
# become NaT, and NaT never compares as earlier than the cutoff, so only dated
# columns before 2017 are selected. Unlike a fixed list, this cannot raise a
# KeyError when the file lacks one of the listed months.
_first_kept_date = pd.Timestamp('2017-01-01')
_column_dates = pd.to_datetime(zillow_df.columns, format='%Y-%m-%d', errors='coerce')
to_drop = list(zillow_df.columns[_column_dates < _first_kept_date])

# drop data before 2017
zillow_df.drop(columns=to_drop, inplace=True)

In [None]:
# Give the Zillow key column the same name as the census key, then join the
# two tables on it.
zillow_df = zillow_df.rename(columns={'RegionName':'zipcode'})
master = pd.merge(zillow_df, census, on=['zipcode'])

In [None]:
# Stage the merged master table for export as a CSV file.
file_name = 'assets/master_v4.csv'
out = pa.Table.from_pandas(master)
# The export itself is disabled; uncomment the next line to write the file.
# csv.write_csv(out, file_name)

In [None]:
# Reload the master table from disk, reading zipcode as a string.
# NOTE(review): the second converter key is 'year:' (trailing colon). It
# presumably matches no column, so `year` keeps its inferred dtype -- and the
# later filter `master_df['year'] == 2021` compares against an int. Confirm
# that dependency before correcting the key.
# NOTE(review): this reads 'assets/master.csv', while the export cell stages
# 'assets/master_v4.csv' -- confirm which file is intended.
master_df = pd.read_csv('assets/master.csv', converters={'zipcode': str, 'year:': str})

In [None]:
# Within-year price change: the December column minus the January column.
for _yr in range(2017, 2022):
    master_df[f'price_changed_in_{_yr}'] = master_df[f'{_yr}-12-31'] - master_df[f'{_yr}-01-31']

# The ranking and map cells read 'price_changed_in_5yr', which no other cell
# in view creates. Derive it only when the loaded file does not already
# provide it, so an existing column is never overwritten.
# NOTE(review): defined here as December 2021 minus January 2017, to match the
# "from 2017 to 2021" plot titles -- confirm this is the intended definition.
if 'price_changed_in_5yr' not in master_df.columns:
    master_df['price_changed_in_5yr'] = master_df['2021-12-31'] - master_df['2017-01-31']

In [None]:
# Take the rows whose year is 2021, rank them by 'price_changed_in_5yr' and
# keep the ten largest and the ten smallest changes.
_rows_2021 = master_df.loc[master_df['year'] == 2021]
most_increased = _rows_2021.sort_values(by='price_changed_in_5yr', ascending=False).iloc[:10]
least_increased = _rows_2021.sort_values(by='price_changed_in_5yr', ascending=True).iloc[:10]


In [None]:
# Preview the first rows of the ranking.
most_increased.head(n=5)

In [None]:
import pandas as pd
import pandas_bokeh
import matplotlib.pyplot as plt
import pgeocode
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
pandas_bokeh.output_notebook()
import plotly.graph_objects as go

nomi = pgeocode.Nominatim('us')

# Look up the ZIP codes once; the single result supplies both coordinate
# columns instead of issuing the same query twice.
_most_increased_geo = nomi.query_postal_code(most_increased['zipcode'].tolist())
most_increased['Latitude'] = list(_most_increased_geo.latitude)
most_increased['Longitude'] = list(_most_increased_geo.longitude)

# One marker per ZIP code, labelled with its city and coloured by the price change.
fig = go.Figure(data=go.Scattergeo(
        lon = most_increased['Longitude'],
        lat = most_increased['Latitude'],
        text = most_increased['City'],
        mode = 'markers',
        marker=dict(color=most_increased['price_changed_in_5yr'],
               colorscale='Viridis',
               showscale=True)
        ))

fig.update_layout(
        title = 'Home Sales Price increased the most from 2017 to 2021 TOP 10',
        geo_scope='usa',
    )
fig.show()

In [None]:
# Look up the ZIP codes once; the single result supplies both coordinate
# columns instead of issuing the same query twice.
_least_increased_geo = nomi.query_postal_code(least_increased['zipcode'].tolist())
least_increased['Latitude'] = list(_least_increased_geo.latitude)
least_increased['Longitude'] = list(_least_increased_geo.longitude)

# One marker per ZIP code, labelled with its city and coloured by the price change.
fig = go.Figure(data=go.Scattergeo(
        lon = least_increased['Longitude'],
        lat = least_increased['Latitude'],
        text = least_increased['City'],
        mode = 'markers',
        marker=dict(color=least_increased['price_changed_in_5yr'],
               colorscale='Viridis',
               showscale=True)
        ))

fig.update_layout(
        title = 'Home Sales Price decreased the most from 2017 to 2021 TOP 10',
        geo_scope='usa',
    )
fig.show()