In [10]:
import pandas as pd
import requests
import json
import pyarrow as pa
from pyarrow import csv

In [11]:
# pip install pyarrow

In [12]:
key = '658f931dafd58fa7948b59ebdb5c381ff7f84235'

In [13]:
def get_census_data_by_zip(api_key, fields, year):
    """
    Fetch census data by ZIP Code Tabulation Areas (ZCTAs) for specified fields.
    
    :param api_key: Your Census API Key.
    :param fields: List of fields to fetch.
    :param year: Census year (default is 2020).
    :return: DataFrame with fetched data.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"
    
    # Combine the fields into a comma-separated string
    fields_str = ",".join(fields)

    # Construct the final URL
    url = f"{base_url}?get={fields_str}&for=zip%20code%20tabulation%20area:*"
    # url = 'https://api.census.gov/data/2018/zbp?get=NAME,GEO_ID,NAICS2017_LABEL,EMPSZES_LABEL,EMPSZES,ESTAB&for=zip%20code:*&NAICS2017=00&key={}'.format(key)

    headers = {
        "Content-Type": "application/json",
    }

    # Make the API request
    response = requests.get(url, headers=headers, params={"key": api_key})
    
    if response.status_code == 200:
        data = response.json()
        # Convert data to DataFrame
        df = pd.DataFrame(data[1:], columns=data[0])
        df['year'] = year
        return df
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

In [177]:
# get_census_data_by_zip(key, FIELDS50, 2018)

In [39]:
# get labels
# label = 'census data labels.xlsx - raw labels.csv'
# label_df = pd.read_csv(label)
# label_df = label_df[label_df['include'] == 1]
#     'S1101_C01_002E': 'Estimate!!Total!!HOUSEHOLDS!!Average household size',
#     'S1101_C01_003E': 'Estimate!!Total!!FAMILIES!!Total families',
#     'S1101_C01_004E': 'Estimate!!Total!!FAMILIES!!Average family size',
#     'S1101_C01_009E': 'Estimate!!Total!!Total households',
#     'S1101_C01_018E': 'Estimate!!Total!!Total households!!HOUSING TENURE!!Owner-occupied housing units',
#     'S1101_C01_019E': 'Estimate!!Total!!Total households!!HOUSING TENURE!!Renter-occupied housing units',
#     'S1101_C05_003E': 'Estimate!!Nonfamily household!!FAMILIES!!Total families'
#     B28010_001E	Estimate!!Total: Computers in Household
#     'B28010_002E': 'Estimate!!Total:!!Has one or more types of computing devices:',
#     'B28010_005E': 'Estimate!!Total:!!Has one or more types of computing devices:!!Smartphone, tablet or other portable wireless computer or other computer',
#     'B28010_007E': 'Estimate!!Total:!!No Computer'
# B27019_001E	Estimate!!Total:
# B27019_002E	Estimate!!Total:!!26 to 64 years:
# B25002_001E	Estimate!!Total:	Occupancy Status
# B25002_002E	Estimate!!Total:!!Occupied	Occupancy Status
# B25002_003E	Estimate!!Total:!!Vacant	Occupancy Status

# B27019_001E	Estimate!!Total:
# B27019_002E	Estimate!!Total:!!26 to 64 years:
# B27019_004E	Estimate!!Total:!!26 to 64 years:!!Less than high school graduate:!!With health insurance coverage
# B27019_009E	Estimate!!Total:!!26 to 64 years:!!High school graduate (includes equivalency):!!With health insurance coverage
# B27019_014E	Estimate!!Total:!!26 to 64 years:!!Some college or associate's degree:!!With health insurance coverage
# B27019_019E	Estimate!!Total:!!26 to 64 years:!!Bachelor's degree or higher:!!With health insurance coverage

# B27019_023E	Estimate!!Total:!!65 years and over:
# B27019_025E	Estimate!!Total:!!65 years and over:!!Less than high school graduate:!!With health insurance coverage
# B27019_030E	Estimate!!Total:!!65 years and over:!!High school graduate (includes equivalency):!!With health insurance coverage
# B27019_035E	Estimate!!Total:!!65 years and over:!!Some college or associate's degree:!!With health insurance coverage
# B27019_040E	Estimate!!Total:!!65 years and over:!!Bachelor's degree or higher:!!With health insurance coverage



label2 = 'assets/census data labels v2.csv'
label_df = pd.read_csv(label2)
label_df = label_df[label_df['include_in_master_table'] == 1]
features = list(label_df['Name'])
health_features = ['B27019_001E', 'B27019_002E', 'B27019_004E', 'B27019_009E', 'B27019_014E', 'B27019_019E',
                  'B27019_023E', 'B27019_025E', 'B27019_030E', 'B27019_035E', 'B27019_040E']
# print(len(features))
# print(len(health_features))
# print(features+health_features)
features = features+health_features

In [45]:
# len(features) 50
year = 2021

df = get_census_data_by_zip(key, features, year)
out = pa.Table.from_pandas(df)
file_name = 'census{}.csv'.format(year)
csv.write_csv(out, file_name)

In [29]:
'''
education features

B15002_001E	Estimate!!Total:	Sex by Educational Attainment for the Population 25 Years and Over
B15002_002E	Estimate!!Total:!!Male:	Sex by Educational Attainment for the Population 25 Years and Over
B15002_014E	Estimate!!Total:!!Male:!!Associate's degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_015E	Estimate!!Total:!!Male:!!Bachelor's degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_016E	Estimate!!Total:!!Male:!!Master's degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_017E	Estimate!!Total:!!Male:!!Professional school degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_018E	Estimate!!Total:!!Male:!!Doctorate degree	Sex by Educational Attainment for the Population 25 Years and Over

B15002_019E	Estimate!!Total:!!Female:	Sex by Educational Attainment for the Population 25 Years and Over
B15002_031E	Estimate!!Total:!!Female:!!Associate's degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_032E	Estimate!!Total:!!Female:!!Bachelor's degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_033E	Estimate!!Total:!!Female:!!Master's degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_034E	Estimate!!Total:!!Female:!!Professional school degree	Sex by Educational Attainment for the Population 25 Years and Over
B15002_035E	Estimate!!Total:!!Female:!!Doctorate degree	Sex by Educational Attainment for the Population 25 Years and Over

census['male_25_old_and_over_has_associate_or_higher_ratio'] = (census['B15002_014E']+census['B15002_015E']+
census['B15002_016E']+census['B15002_017E']+census['B15002_018E'])/census['B15002_002E']
census['female_25_old_and_over_has_associate_or_higher_ratio'] = (census['B15002_031E']+census['B15002_032E']+
census['B15002_033E']+census['B15002_034E']+census['B15002_035E'])/census['B15002_019E']

internet
B28002_001E	Estimate!!Total:	Presence and Types of Internet Subscriptions in Household
B28002_002E	Estimate!!Total:!!With an Internet subscription	Presence and Types of Internet Subscriptions in Household
B28002_013E	Estimate!!Total:!!No Internet access	Presence and Types of Internet Subscriptions in Household

census['has_internet_ratio'] = census['B28002_002E']/census['B28002_001E']
census['has_no_internet_ratio'] = census['B28002_013E']/census['B28002_001E']


occupancy 
B25127_001E	Estimate!!Total:	Tenure by Year Structure Built by Units in Structure
B25127_002E	Estimate!!Total:!!Owner occupied:	Tenure by Year Structure Built by Units in Structure
B25127_045E	Estimate!!Total:!!Renter occupied:	Tenure by Year Structure Built by Units in Structure

census['owner_occupied_ratio'] = census['B25127_002E']/census['B25127_001E']
census['renter_occupied_ratio'] = census['B25127_045E']/census['B25127_001E']

B25105_001E	Estimate!!Median monthly housing costs	Median Monthly Housing Costs (Dollars)

median household income
B25099_001E	Estimate!!Median household income --!!Total:	Mortgage Status by Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)
B25099_002E	Estimate!!Median household income --!!Total:!!Median household income for units with a mortgage	Mortgage Status by Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)
B25099_003E	Estimate!!Median household income --!!Total:!!Median household income for units without a mortgage	Mortgage Status by Median Household Income in the Past 12 Months (in 2022 Inflation-Adjusted Dollars)



B25035_001E	Estimate!!Median year structure built	Median Year Structure Built
census['median_year_structure_built']

B25087_001E	Estimate!!Total:	Mortgage Status and Selected Monthly Owner Costs
B25087_002E	Estimate!!Total:!!Housing units with a mortgage:	Mortgage Status and Selected Monthly Owner Costs
B25087_020E	Estimate!!Total:!!Housing units without a mortgage:	Mortgage Status and Selected Monthly Owner Costs

census['housing_units_with_mortgage_ratio'] = census['B25087_002E'] /census['B25087_001E']
census['housing_units_no_mortgage_ratio'] = census['B25087_020E'] /census['B25087_001E']


vehicle status
B08201_001E	Estimate!!Total:	Household Size by Vehicles Available
B08201_002E	Estimate!!Total:!!No vehicle available	Household Size by Vehicles Available
B08201_003E	Estimate!!Total:!!1 vehicle available	Household Size by Vehicles Available
B08201_004E	Estimate!!Total:!!2 vehicles available	Household Size by Vehicles Available
B08201_005E	Estimate!!Total:!!3 vehicles available	Household Size by Vehicles Available
B08201_006E	Estimate!!Total:!!4 or more vehicles available	Household Size by Vehicles Available

census['household_no_vehicles_ratio'] = census['B08201_002E']/census['B08201_001E']
census['household_1_vehicle_ratio'] = census['B08201_003E']/census['B08201_001E']
census['household_2_vehicles_ratio'] = census['B08201_004E']/census['B08201_001E']
census['household_3_vehicles_ratio'] = census['B08201_005E']/census['B08201_001E']
census['household_4_vehicles_ratio_or_more'] = census['B08201_006E']/census['B08201_001E']
'''



add_features = ['B15002_001E', 'B15002_002E', 'B15002_014E',  'B15002_015E', 'B15002_016E', 'B15002_017E', 'B15002_018E',
              'B15002_019E', 'B15002_031E', 'B15002_032E', 'B15002_033E', 'B15002_034E', 'B15002_035E',
              'B28002_001E', 'B28002_002E', 'B28002_013E',
              'B25127_001E', 'B25127_002E', 'B25127_045E',
              'B25105_001E',
              'B25099_001E', 'B25099_002E', 'B25099_003E',
              'B25035_001E', 
              'B25087_001E', 'B25087_002E', 'B25087_020E',
              'B08201_001E', 'B08201_002E', 'B08201_003E', 'B08201_004E', 'B08201_005E', 'B08201_006E']


In [34]:
year =2021

df = get_census_data_by_zip(key, add_features, year)
out = pa.Table.from_pandas(df)
file_name = 'census_add{}.csv'.format(year)
# csv.write_csv(out, file_name)

# df = get_census_data_by_zip(key, features, year)
# out = pa.Table.from_pandas(df)
# file_name = 'census{}.csv'.format(year)
# csv.write_csv(out, file_name)

In [123]:
# list(df.columns)

census2021 = pd.read_csv('census/census2021.csv', converters={'zip code tabulation area': str, 'year:': str})
census2020 = pd.read_csv('census/census2020.csv', converters={'zip code tabulation area': str, 'year:': str})
census2019 = pd.read_csv('census/census2019.csv', converters={'zip code tabulation area': str, 'year:': str})
census2018 = pd.read_csv('census/census2018.csv', converters={'zip code tabulation area': str, 'year:': str})
census2017 = pd.read_csv('census/census2017.csv', converters={'zip code tabulation area': str, 'year:': str})

census2021.dropna(how='all', axis=1, inplace=True)
census2020.dropna(how='all', axis=1,inplace=True)
census2019.dropna(how='all', axis=1,inplace=True)
census2018.dropna(how='all', axis=1,inplace=True)
census2017.dropna(how='all', axis=1,inplace=True)


census2021h = pd.read_csv('census/census_add2021.csv', converters={'zip code tabulation area': str, 'year:': str})
census2020h = pd.read_csv('census/census_add2020.csv', converters={'zip code tabulation area': str, 'year:': str})
census2019h = pd.read_csv('census/census_add2019.csv', converters={'zip code tabulation area': str, 'year:': str})
census2018h = pd.read_csv('census/census_add2018.csv', converters={'zip code tabulation area': str, 'year:': str})
census2017h = pd.read_csv('census/census_add2017.csv', converters={'zip code tabulation area': str, 'year:': str})

census2021h.dropna(how='all', axis=1, inplace=True)
census2020h.dropna(how='all', axis=1,inplace=True)
census2019h.dropna(how='all', axis=1,inplace=True)
census2018h.dropna(how='all', axis=1,inplace=True)
census2017h.dropna(how='all', axis=1,inplace=True)



census1 = pd.concat([census2021, census2020, census2019, census2018, census2017])
census2 = pd.concat([census2021h, census2020h, census2019h, census2018h, census2017h])
census = census1.merge(census2, on=['zip code tabulation area', 'year'])

print(census2021.shape)
print(census1.shape)
print(census2.shape)
print(census.shape)
# census[['zip code tabulation area', 'has_health_insurance_ratio', 'has_health_insurance_ratio_26_to_64', 'has_health_insurance_ratio_65_and_over']]

(33774, 49)
(166254, 50)
(166254, 36)
(166254, 84)


In [141]:
census.drop_duplicates(inplace=True)
census.rename(columns={'zip code tabulation area':'zipcode', 
                       'B25035_001E': 'median_year_structure_built',
                      'B25099_001E': 'median_household_income',
                      'B25099_002E': 'median_household_income_with_mortgage',
                      'B25099_003E': 'median_household_income_with_no_mortgage',
                      'B25105_001E': 'median_monthly_housing_costs'}, inplace=True)
census['year'] = census['year'].apply(str)
# census[census['zip code tabulation area'] == 19713]
# census[['zip code tabulation area','year', 'B25002_001E', 'B25002_002E', 'B25002_003E']]
census['house_occupied_ratio'] = census['B25002_002E']/census['B25002_001E']
census['house_occupied_ratio'] = census['house_occupied_ratio'].apply(lambda x: 0 if x < 0 else x)

census['house_vacant_ratio'] = census['B25002_003E']/census['B25002_001E']
census['house_vacant_ratio'] = census['house_vacant_ratio'].apply(lambda x: 0 if x < 0 else x)

census['has_one_or_more_computing_device_ratio'] = census['B28010_002E']/census['B28010_001E']
census['has_one_or_more_computing_device_ratio'] = census['has_one_or_more_computing_device_ratio'].apply(lambda x: 0 if x < 0 else x)

census['has_no_computing_device_ratio'] = census['B28010_007E']/census['B28010_001E']
census['has_no_computing_device_ratio'] = census['has_no_computing_device_ratio'].apply(lambda x: 0 if x < 0 else x)

census['has_health_insurance_ratio'] = (census['B27019_004E']+census['B27019_009E']+census['B27019_014E']+census['B27019_019E']+census['B27019_025E']+census['B27019_030E']+census['B27019_035E']+census['B27019_040E'])/census['B27019_001E']
census['has_health_insurance_ratio'] = census['has_health_insurance_ratio'].apply(lambda x: 0 if x < 0 else x)

census['has_health_insurance_ratio_26_to_64'] =  (census['B27019_004E']+census['B27019_009E']+census['B27019_014E']+census['B27019_019E'])/census['B27019_002E']
census['has_health_insurance_ratio_26_to_64'] = census['has_health_insurance_ratio_26_to_64'].apply(lambda x: 0 if x < 0 else x)

census['has_health_insurance_ratio_65_and_over'] = (census['B27019_025E']+census['B27019_030E']+census['B27019_035E']+census['B27019_040E'])/census['B27019_023E']
census['has_health_insurance_ratio_65_and_over'] = census['has_health_insurance_ratio_65_and_over'].apply(lambda x: 0 if x < 0 else x)

census['household_no_vehicles_ratio'] = census['B08201_002E']/census['B08201_001E']
census['household_no_vehicles_ratio'] = census['household_no_vehicles_ratio'].apply(lambda x: 0 if x < 0 else x)

census['household_1_vehicle_ratio'] = census['B08201_003E']/census['B08201_001E']
census['household_1_vehicle_ratio'] = census['household_1_vehicle_ratio'].apply(lambda x: 0 if x < 0 else x)

census['household_2_vehicles_ratio'] = census['B08201_004E']/census['B08201_001E']
census['household_2_vehicles_ratio'] = census['household_2_vehicles_ratio'].apply(lambda x: 0 if x < 0 else x)

census['household_3_vehicles_ratio'] = census['B08201_005E']/census['B08201_001E']
census['household_3_vehicles_ratio'] = census['household_3_vehicles_ratio'].apply(lambda x: 0 if x < 0 else x)

census['household_4_vehicles_ratio_or_more'] = census['B08201_006E']/census['B08201_001E']
census['household_4_vehicles_ratio_or_more'] = census['household_4_vehicles_ratio_or_more'].apply(lambda x: 0 if x < 0 else x)

census['housing_units_with_mortgage_ratio'] = census['B25087_002E'] /census['B25087_001E']
census['housing_units_with_mortgage_ratio'] = census['housing_units_with_mortgage_ratio'].apply(lambda x: 0 if x < 0 else x)

census['housing_units_no_mortgage_ratio'] = census['B25087_020E'] /census['B25087_001E']
census['housing_units_no_mortgage_ratio'] = census['housing_units_no_mortgage_ratio'].apply(lambda x: 0 if x < 0 else x)

census['owner_occupied_ratio'] = census['B25127_002E']/census['B25127_001E']
census['owner_occupied_ratio'] = census['owner_occupied_ratio'].apply(lambda x: 0 if x < 0 else x)

census['renter_occupied_ratio'] = census['B25127_045E']/census['B25127_001E']
census['renter_occupied_ratio'] = census['renter_occupied_ratio'].apply(lambda x: 0 if x < 0 else x)

census['has_internet_ratio'] = census['B28002_002E']/census['B28002_001E']
census['has_internet_ratio'] = census['has_internet_ratio'].apply(lambda x: 0 if x < 0 else x)

census['has_no_internet_ratio'] = census['B28002_013E']/census['B28002_001E']
census['has_no_internet_ratio'] = census['has_no_internet_ratio'].apply(lambda x: 0 if x < 0 else x)

census['male_25_old_and_over_has_associate_or_higher_ratio'] = (census['B15002_014E']+census['B15002_015E']+census['B15002_016E']+census['B15002_017E']+census['B15002_018E'])/census['B15002_002E']
census['male_25_old_and_over_has_associate_or_higher_ratio'] = census['male_25_old_and_over_has_associate_or_higher_ratio'].apply(lambda x: 0 if x < 0 else x)

census['female_25_old_and_over_has_associate_or_higher_ratio'] = (census['B15002_031E']+census['B15002_032E']+census['B15002_033E']+census['B15002_034E']+census['B15002_035E'])/census['B15002_019E']
census['female_25_old_and_over_has_associate_or_higher_ratio'] = census['female_25_old_and_over_has_associate_or_higher_ratio'].apply(lambda x: 0 if x < 0 else x)

cols_to_treat = ['house_occupied_ratio', 'house_vacant_ratio',
       'has_one_or_more_computing_device_ratio', 'has_no_computing_device_ratio',
       'has_health_insurance_ratio', 'has_health_insurance_ratio_26_to_64', 'has_health_insurance_ratio_65_and_over',
        'household_no_vehicles_ratio', 'household_1_vehicle_ratio', 'household_2_vehicles_ratio', 'household_3_vehicles_ratio', 'household_4_vehicles_ratio_or_more',
        'housing_units_with_mortgage_ratio', 'housing_units_no_mortgage_ratio', 
        'owner_occupied_ratio', 'renter_occupied_ratio',
        'has_internet_ratio', 'has_no_internet_ratio',
        'male_25_old_and_over_has_associate_or_higher_ratio', 'female_25_old_and_over_has_associate_or_higher_ratio']

census.fillna(0, inplace=True)

In [142]:
census[['zipcode', 'has_health_insurance_ratio', 'has_health_insurance_ratio_26_to_64', 'has_health_insurance_ratio_65_and_over']]

Unnamed: 0,zipcode,has_health_insurance_ratio,has_health_insurance_ratio_26_to_64,has_health_insurance_ratio_65_and_over
0,00601,0.942744,0.921803,0.995078
1,00602,0.946758,0.927786,0.994841
2,00603,0.928244,0.903989,0.980371
3,00606,0.967347,0.959384,0.984721
4,00610,0.945542,0.923600,1.000000
...,...,...,...,...
166249,99923,0.000000,0.000000,0.000000
166250,99925,0.700468,0.630058,1.000000
166251,99926,0.692155,0.620818,0.980000
166252,99927,0.152542,0.152542,0.000000


In [143]:
census.shape

(166254, 104)

In [68]:
out = pa.Table.from_pandas(census)
file_name = 'census_final.csv'
# csv.write_csv(out, file_name)

In [64]:
census_final = pd.read_csv(file_name)
census_final.shape

(166254, 105)

In [358]:
zillow_seasonal = pd.read_csv('Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
zillow_seasonal.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,213207.449193,...,484545.423106,482998.627982,480762.710077,479445.795903,478396.792362,479623.197138,481652.685632,484818.440189,487650.169224,490161.927693
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,138706.618807,...,525459.564754,528481.179889,530866.429595,533099.554913,536015.609542,540209.612968,545621.736221,551618.956629,557787.374606,563740.488083
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,104005.403992,...,287817.295002,285751.839606,283305.514483,280947.848337,279077.781291,277875.217836,277392.281244,277773.660692,278539.454227,279441.879993
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,149493.2119,...,508472.778857,505169.076617,499583.980419,492648.821682,486263.256758,479571.79702,472397.526197,466948.659683,465108.150687,465102.144897
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,102704.068566,...,277720.958818,276694.069269,275123.071172,273514.546846,272181.096873,271383.921551,271240.340967,271842.305613,272691.518967,273570.680709


In [144]:
zillow_df = pd.read_csv('ZHVI_Single_Family_Homes.csv', converters={'RegionName': str})

In [145]:
zillow_df.shape

(26264, 293)

In [146]:
# list(zillow_df.columns)[-20::]

In [147]:


to_drop =['2000-01-31',
 '2000-02-29',
 '2000-03-31',
 '2000-04-30',
 '2000-05-31',
 '2000-06-30',
 '2000-07-31',
 '2000-08-31',
 '2000-09-30',
 '2000-10-31',
 '2000-11-30',
 '2000-12-31',
 '2001-01-31',
 '2001-02-28',
 '2001-03-31',
 '2001-04-30',
 '2001-05-31',
 '2001-06-30',
 '2001-07-31',
 '2001-08-31',
 '2001-09-30',
 '2001-10-31',
 '2001-11-30',
 '2001-12-31',
 '2002-01-31',
 '2002-02-28',
 '2002-03-31',
 '2002-04-30',
 '2002-05-31',
 '2002-06-30',
 '2002-07-31',
 '2002-08-31',
 '2002-09-30',
 '2002-10-31',
 '2002-11-30',
 '2002-12-31',
 '2003-01-31',
 '2003-02-28',
 '2003-03-31',
 '2003-04-30',
 '2003-05-31',
 '2003-06-30',
 '2003-07-31',
 '2003-08-31',
 '2003-09-30',
 '2003-10-31',
 '2003-11-30',
 '2003-12-31',
 '2004-01-31',
 '2004-02-29',
 '2004-03-31',
 '2004-04-30',
 '2004-05-31',
 '2004-06-30',
 '2004-07-31',
 '2004-08-31',
 '2004-09-30',
 '2004-10-31',
 '2004-11-30',
 '2004-12-31',
 '2005-01-31',
 '2005-02-28',
 '2005-03-31',
 '2005-04-30',
 '2005-05-31',
 '2005-06-30',
 '2005-07-31',
 '2005-08-31',
 '2005-09-30',
 '2005-10-31',
 '2005-11-30',
 '2005-12-31',
 '2006-01-31',
 '2006-02-28',
 '2006-03-31',
 '2006-04-30',
 '2006-05-31',
 '2006-06-30',
 '2006-07-31',
 '2006-08-31',
 '2006-09-30',
 '2006-10-31',
 '2006-11-30',
 '2006-12-31',
 '2007-01-31',
 '2007-02-28',
 '2007-03-31',
 '2007-04-30',
 '2007-05-31',
 '2007-06-30',
 '2007-07-31',
 '2007-08-31',
 '2007-09-30',
 '2007-10-31',
 '2007-11-30',
 '2007-12-31',
 '2008-01-31',
 '2008-02-29',
 '2008-03-31',
 '2008-04-30',
 '2008-05-31',
 '2008-06-30',
 '2008-07-31',
 '2008-08-31',
 '2008-09-30',
 '2008-10-31',
 '2008-11-30',
 '2008-12-31',
 '2009-01-31',
 '2009-02-28',
 '2009-03-31',
 '2009-04-30',
 '2009-05-31',
 '2009-06-30',
 '2009-07-31',
 '2009-08-31',
 '2009-09-30',
 '2009-10-31',
 '2009-11-30',
 '2009-12-31',
 '2010-01-31',
 '2010-02-28',
 '2010-03-31',
 '2010-04-30',
 '2010-05-31',
 '2010-06-30',
 '2010-07-31',
 '2010-08-31',
 '2010-09-30',
 '2010-10-31',
 '2010-11-30',
 '2010-12-31',
 '2011-01-31',
 '2011-02-28',
 '2011-03-31',
 '2011-04-30',
 '2011-05-31',
 '2011-06-30',
 '2011-07-31',
 '2011-08-31',
 '2011-09-30',
 '2011-10-31',
 '2011-11-30',
 '2011-12-31',
 '2012-01-31',
 '2012-02-29',
 '2012-03-31',
 '2012-04-30',
 '2012-05-31',
 '2012-06-30',
 '2012-07-31',
 '2012-08-31',
 '2012-09-30',
 '2012-10-31',
 '2012-11-30',
 '2012-12-31',
 '2013-01-31',
 '2013-02-28',
 '2013-03-31',
 '2013-04-30',
 '2013-05-31',
 '2013-06-30',
 '2013-07-31',
 '2013-08-31',
 '2013-09-30',
 '2013-10-31',
 '2013-11-30',
 '2013-12-31',
 '2014-01-31',
 '2014-02-28',
 '2014-03-31',
 '2014-04-30',
 '2014-05-31',
 '2014-06-30',
 '2014-07-31',
 '2014-08-31',
 '2014-09-30',
 '2014-10-31',
 '2014-11-30',
 '2014-12-31',
 '2015-01-31',
 '2015-02-28',
 '2015-03-31',
 '2015-04-30',
 '2015-05-31',
 '2015-06-30',
 '2015-07-31',
 '2015-08-31',
 '2015-09-30',
 '2015-10-31',
 '2015-11-30',
 '2015-12-31',
 '2016-01-31',
 '2016-02-29',
 '2016-03-31',
 '2016-04-30',
 '2016-05-31',
 '2016-06-30',
 '2016-07-31',
 '2016-08-31',
 '2016-09-30',
 '2016-10-31',
 '2016-11-30',
 '2016-12-31']


# drop data before 2017
zillow_df.drop(columns=to_drop, inplace=True)

zillow_df.head(5)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,484812.248823,483270.636456,481038.183265,479721.873619,478668.36715,479881.403656,481894.597228,485046.382049,487872.99138,490385.169211
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,394156.300927,...,644904.087044,648741.824038,650794.311229,652508.254601,655175.941212,660548.696785,668432.531461,677540.593485,687378.218275,696773.567354
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,171583.47755,...,287849.265624,285784.837604,283338.919177,280982.358438,279112.875127,277911.26653,277429.5663,277812.780825,278579.748133,279482.307522
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,719129.427216,...,886897.867278,885628.363831,885552.350921,884145.746022,881743.541864,878675.145891,875712.760185,872584.255384,871420.265788,871160.471818
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,168888.858279,...,278537.547066,277502.648841,275922.593554,274305.254214,272963.850362,272159.780823,272011.02478,272611.025961,273460.285442,274335.613083


In [148]:
zillow_df.shape

(26264, 89)

In [149]:
zillow_df = zillow_df.rename(columns={'RegionName':'zipcode'})
master = zillow_df.merge(census, on=['zipcode'])
master.shape

(131073, 192)

In [150]:
master.head()

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,household_3_vehicles_ratio,household_4_vehicles_ratio_or_more,housing_units_with_mortgage_ratio,housing_units_no_mortgage_ratio,owner_occupied_ratio,renter_occupied_ratio,has_internet_ratio,has_no_internet_ratio,male_25_old_and_over_has_associate_or_higher_ratio,female_25_old_and_over_has_associate_or_higher_ratio
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,0.165541,0.053898,0.726426,0.273574,0.738505,0.261495,0.979717,0.011842,0.764861,0.696519
1,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,0.169958,0.052421,0.74162,0.25838,0.75132,0.24868,0.973933,0.014074,0.748471,0.642034
2,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,0.165988,0.053643,0.765346,0.234654,0.756774,0.243226,0.965359,0.022037,0.74714,0.638257
3,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,0.174863,0.049997,0.748063,0.251937,0.777856,0.222144,0.962283,0.024956,0.739351,0.651352
4,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,0.169937,0.045658,0.766256,0.233744,0.799307,0.200693,0.952196,0.034251,0.745646,0.636017


In [151]:
out = pa.Table.from_pandas(master)
file_name = 'master_v4.csv'
csv.write_csv(out, file_name)

In [80]:
master_df = pd.read_csv('master_v3.csv', converters={'zipcode': str, 'year:': str})

In [81]:
master_df[master_df['zipcode'] == '02108']

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,household_4_vehicles_ratio_or_more,housing_units_with_mortgage_ratio,housing_units_no_mortgage_ratio,owner_occupied_ratio,renter_occupied_ratio,has_internet_ratio,has_no_internet_ratio,male_25_old_and_over_has_associate_or_higher_ratio,female_25_old_and_over_has_associate_or_higher_ratio,__index_level_0__
69969,58622,14268,2108,zip,MA,MA,Boston,"Boston-Cambridge-Newton, MA-NH",Suffolk County,4003640.0,...,0.0,0.465845,0.534155,0.468766,0.531234,0.953806,0.040945,0.721717,0.826853,69969
69970,58622,14268,2108,zip,MA,MA,Boston,"Boston-Cambridge-Newton, MA-NH",Suffolk County,4003640.0,...,0.0,0.459796,0.540204,0.431574,0.568426,0.946725,0.047898,0.825406,0.854296,69970
69971,58622,14268,2108,zip,MA,MA,Boston,"Boston-Cambridge-Newton, MA-NH",Suffolk County,4003640.0,...,0.0,0.547674,0.452326,0.411286,0.588714,0.952654,0.042085,0.878559,0.888014,69971
69972,58622,14268,2108,zip,MA,MA,Boston,"Boston-Cambridge-Newton, MA-NH",Suffolk County,4003640.0,...,0.0,0.444853,0.555147,0.450829,0.549171,0.941989,0.048619,0.847159,0.872976,69972
69973,58622,14268,2108,zip,MA,MA,Boston,"Boston-Cambridge-Newton, MA-NH",Suffolk County,4003640.0,...,0.0,0.511732,0.488268,0.474047,0.525953,0.971928,0.017479,0.832889,0.877477,69973


In [82]:
master_df['price_changed_in_5yr'] = master_df['2021-12-31'] - master_df['2017-01-31']

In [116]:
most_increased = master_df[master_df['year'] == 2021].sort_values('price_changed_in_5yr', ascending=False).head(30)

In [117]:
least_increased = master_df[master_df['year'] == 2021].sort_values('price_changed_in_5yr', ascending=True).head(30)

In [118]:
# pip install plotly

In [119]:
most_increased

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,housing_units_with_mortgage_ratio,housing_units_no_mortgage_ratio,owner_occupied_ratio,renter_occupied_ratio,has_internet_ratio,has_no_internet_ratio,male_25_old_and_over_has_associate_or_higher_ratio,female_25_old_and_over_has_associate_or_higher_ratio,__index_level_0__,price_changed_in_5yr
44873,72636,9121,33480,zip,FL,FL,Palm Beach,"Miami-Fort Lauderdale-Pompano Beach, FL",Palm Beach County,3957452.0,...,0.257601,0.742399,0.832971,0.167029,0.945463,0.021597,0.731085,0.696547,44873,4316371.0
43147,93816,8769,81611,zip,CO,CO,Aspen,"Glenwood Springs, CO",Pitkin County,3921426.0,...,0.611484,0.388516,0.635846,0.364154,0.941916,0.026553,0.724855,0.740129,43147,3518660.0
31468,96086,6389,90210,zip,CA,CA,Beverly Hills,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,2520645.0,...,0.673748,0.326252,0.746319,0.253681,0.940481,0.052095,0.735661,0.665279,31468,3430541.0
43922,96149,8928,90402,zip,CA,CA,Santa Monica,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,2345816.0,...,0.612658,0.387342,0.724623,0.275377,0.964941,0.016307,0.839153,0.858457,43922,2222702.0
55951,97518,11388,94027,zip,CA,CA,Atherton,"San Francisco-Oakland-Berkeley, CA",San Mateo County,5482394.0,...,0.53927,0.46073,0.884513,0.115487,0.958407,0.016372,0.948379,0.834272,55951,2124587.0
31383,96116,6372,90265,zip,CA,CA,Malibu,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,1801163.0,...,0.62793,0.37207,0.792202,0.207798,0.969209,0.026922,0.750367,0.739861,31383,1944029.0
71339,93798,14553,81435,zip,CO,CO,,,San Miguel County,2036972.0,...,0.582896,0.417104,0.534912,0.465088,0.89366,0.061396,0.688791,0.722369,71339,1907623.0
114515,72914,24277,33921,zip,FL,FL,,"Cape Coral-Fort Myers, FL",Lee County,1561109.0,...,0.22439,0.77561,0.942529,0.057471,0.914943,0.085057,0.666667,0.619145,114515,1815062.0
78014,94026,15964,83014,zip,WY,WY,Wilson,"Jackson, WY-ID",Teton County,2047947.0,...,0.430085,0.569915,0.770612,0.229388,0.966531,0.0,0.608856,0.766533,78014,1800859.0
83114,99524,17025,98039,zip,WA,WA,Medina,"Seattle-Tacoma-Bellevue, WA",King County,2465796.0,...,0.534342,0.465658,0.814991,0.185009,0.968691,0.031309,0.929448,0.831313,83114,1799851.0


In [121]:
import pandas as pd
import pandas_bokeh
import matplotlib.pyplot as plt
import pgeocode
import geopandas as gpd
from shapely.geometry import Point
from geopandas import GeoDataFrame
pandas_bokeh.output_notebook()
import plotly.graph_objects as go

nomi = pgeocode.Nominatim('us')

# edf = pd.read_csv('myFile.tsv', sep='\t',header=None, index_col=False ,names=['colC','zipcode','count'])
most_increased['Latitude'] = list(nomi.query_postal_code(most_increased['zipcode'].tolist()).latitude)
most_increased['Longitude'] = list(nomi.query_postal_code(most_increased['zipcode'].tolist()).longitude)

fig = go.Figure(data=go.Scattergeo(
        lon = most_increased['Longitude'],
        lat = most_increased['Latitude'],
        text = most_increased['City'],
        mode = 'markers',
        marker_color = most_increased['price_changed_in_5yr'],
        ))

fig.update_layout(
        title = 'Home Sales Price increased the most from 2017 to 2021 TOP 30',
        geo_scope='usa',
    )
fig.show()

In [122]:
least_increased['Latitude'] = list(nomi.query_postal_code(least_increased['zipcode'].tolist()).latitude)
least_increased['Longitude'] = list(nomi.query_postal_code(least_increased['zipcode'].tolist()).longitude)

fig = go.Figure(data=go.Scattergeo(
        lon = least_increased['Longitude'],
        lat = least_increased['Latitude'],
        text = least_increased['City'],
        mode = 'markers',
        marker_color = least_increased['price_changed_in_5yr'],
        ))

fig.update_layout(
        title = 'Home Sales Price decreased the most from 2017 to 2021 TOP 30',
        geo_scope='usa',
    )
fig.show()

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,owner_occupied_ratio,renter_occupied_ratio,has_internet_ratio,has_no_internet_ratio,male_25_old_and_over_has_associate_or_higher_ratio,female_25_old_and_over_has_associate_or_higher_ratio,__index_level_0__,price_changed_in_5yr,Latitude,Longitude
44873,72636,9121,33480,zip,FL,FL,Palm Beach,"Miami-Fort Lauderdale-Pompano Beach, FL",Palm Beach County,3957452.0,...,0.832971,0.167029,0.945463,0.021597,0.731085,0.696547,44873,4316371.0,,
43147,93816,8769,81611,zip,CO,CO,Aspen,"Glenwood Springs, CO",Pitkin County,3921426.0,...,0.635846,0.364154,0.941916,0.026553,0.724855,0.740129,43147,3518660.0,,
31468,96086,6389,90210,zip,CA,CA,Beverly Hills,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,2520645.0,...,0.746319,0.253681,0.940481,0.052095,0.735661,0.665279,31468,3430541.0,,
43922,96149,8928,90402,zip,CA,CA,Santa Monica,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,2345816.0,...,0.724623,0.275377,0.964941,0.016307,0.839153,0.858457,43922,2222702.0,,
55951,97518,11388,94027,zip,CA,CA,Atherton,"San Francisco-Oakland-Berkeley, CA",San Mateo County,5482394.0,...,0.884513,0.115487,0.958407,0.016372,0.948379,0.834272,55951,2124587.0,,
31383,96116,6372,90265,zip,CA,CA,Malibu,"Los Angeles-Long Beach-Anaheim, CA",Los Angeles County,1801163.0,...,0.792202,0.207798,0.969209,0.026922,0.750367,0.739861,31383,1944029.0,,
71339,93798,14553,81435,zip,CO,CO,,,San Miguel County,2036972.0,...,0.534912,0.465088,0.89366,0.061396,0.688791,0.722369,71339,1907623.0,,
114515,72914,24277,33921,zip,FL,FL,,"Cape Coral-Fort Myers, FL",Lee County,1561109.0,...,0.942529,0.057471,0.914943,0.085057,0.666667,0.619145,114515,1815062.0,,
78014,94026,15964,83014,zip,WY,WY,Wilson,"Jackson, WY-ID",Teton County,2047947.0,...,0.770612,0.229388,0.966531,0.0,0.608856,0.766533,78014,1800859.0,,
83114,99524,17025,98039,zip,WA,WA,Medina,"Seattle-Tacoma-Bellevue, WA",King County,2465796.0,...,0.814991,0.185009,0.968691,0.031309,0.929448,0.831313,83114,1799851.0,,


In [395]:
column_names = zillow_df.columns
    
    # Get all the column names used to pivot the data 
column_names_ids = ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName']

# Get all the column names used to pivot the data 
column_names_values = list(set(column_names).difference(column_names_ids))
    
zillow_df = zillow_df.melt(id_vars=column_names_ids,
                    var_name='date',
                    value_vars=column_names_values,
                    value_name='sale_price')
zillow_df = zillow_df.rename(columns={'RegionName':'zipcode'})
zillow_df['year'] = zillow_df['date'].apply(lambda x: x.split('-')[0])
zillow_df['year'] = zillow_df['year'].apply(str)

In [396]:
zillow_df['sale_price_lagged_1month'] = (zillow_df.sort_values(by=['date'], ascending=True)
                       .groupby(['zipcode'])['sale_price'].shift(1))

zillow_df['sale_price_lagged_12months'] = (zillow_df.sort_values(by=['date'], ascending=True)
                       .groupby(['zipcode'])['sale_price'].shift(12))

zillow_df['month_over_month_change'] = (zillow_df['sale_price_lagged_1month'] - zillow_df['sale_price'])/zillow_df['sale_price']
zillow_df['year_over_year_change'] = (zillow_df['sale_price_lagged_12months'] - zillow_df['sale_price'])/zillow_df['sale_price']


In [57]:
zillow_df[zillow_df['RegionName'] == '19713']

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31
2893,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,200619.482897,...,277510.631733,277958.135956,278222.105761,279009.132871,280325.282797,282153.942023,284090.682863,285872.520046,287665.858067,289718.975144


In [385]:
# pip install altair

In [398]:
import altair as alt

zillow_df['date'] = pd.to_datetime(zillow_df['date'])

df_19713 = zillow_df[zillow_df['zipcode'] == '19713']
alt.Chart(df_19713).mark_line().encode(
    x='date:T',
    y= 'sale_price'
)

In [399]:
# list(master.columns)
master = zillow_df.merge(census, how='left', on=['zipcode', 'year'])
# zillow_df.shape # (2101120, 11)
master
# census['year'].dtypes

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,date,...,B27019_030E,B27019_035E,B27019_040E,house_occupied_ratio,house_vacant_ratio,has_one_or_more_computing_device_ratio,has_no_computing_device_ratio,has_health_insurance_ratio,has_health_insurance_ratio_26_to_64,has_health_insurance_ratio_65_and_over
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,2021-12-31,...,,,,0.959002,0.040998,0.991112,0.008888,,,
1,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,2021-12-31,...,1713.0,2772.0,4325.0,,,,,0.926645,0.925834,0.931672
2,61148,2,08701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,2021-12-31,...,,,,0.923638,0.076362,0.822053,0.177947,,,
3,61148,2,08701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,2021-12-31,...,4265.0,3076.0,3539.0,,,,,0.912155,0.886346,0.995142
4,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,2021-12-31,...,,,,0.964913,0.035087,0.986556,0.013444,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3673991,80861,39992,52163,zip,IA,IA,Protivin,,Howard County,2019-03-31,...,28.0,20.0,4.0,,,,,0.987342,0.983051,1.000000
3673992,69074,39992,26576,zip,WV,WV,Farmington,"Fairmont, WV",Marion County,2019-03-31,...,,,,0.694087,0.305913,1.000000,0.000000,,,
3673993,69074,39992,26576,zip,WV,WV,Farmington,"Fairmont, WV",Marion County,2019-03-31,...,34.0,9.0,9.0,,,,,0.952688,0.946731,1.000000
3673994,80190,39992,50160,zip,IA,IA,Martensdale,"Des Moines-West Des Moines, IA",Warren County,2019-03-31,...,,,,0.931507,0.068493,0.960784,0.039216,,,


In [390]:
# out = pa.Table.from_pandas(master)
# csv.write_csv(out, 'master_v2.csv')

# master_df = pd.read_csv('master.csv')
# # list(master_df.columns)
# master_df.head()

In [401]:
top20 = master.sort_values(by=['year_over_year_change'], ascending=False).head(20)
top20

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,date,...,B27019_030E,B27019_035E,B27019_040E,house_occupied_ratio,house_vacant_ratio,has_one_or_more_computing_device_ratio,has_no_computing_device_ratio,has_health_insurance_ratio,has_health_insurance_ratio_26_to_64,has_health_insurance_ratio_65_and_over
3195718,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-02-28,...,,,,0.865017,0.134983,0.841445,0.158555,,,
3195719,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-02-28,...,827.0,470.0,253.0,,,,,0.958939,0.940529,1.0
3641926,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-03-31,...,,,,0.865017,0.134983,0.841445,0.158555,,,
3641927,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-03-31,...,827.0,470.0,253.0,,,,,0.958939,0.940529,1.0
881106,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-01-31,...,,,,0.880197,0.119803,0.844836,0.155164,,,
881107,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-01-31,...,967.0,645.0,706.0,,,,,0.943271,0.922252,1.0
886323,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-01-31,...,827.0,470.0,253.0,,,,,0.958939,0.940529,1.0
886322,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-01-31,...,,,,0.865017,0.134983,0.841445,0.158555,,,
3190503,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-02-28,...,967.0,645.0,706.0,,,,,0.943271,0.922252,1.0
3190502,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-02-28,...,,,,0.880197,0.119803,0.844836,0.155164,,,


In [403]:
import matplotlib.pyplot as plt

In [406]:
alt.Chart(top20).mark_line().encode(
    x='date:T',
    y= 'sale_price',
    color='zipcode'
)

In [8]:
label_df = pd.read_csv('assets/census data labels.xlsx - raw labels.csv')
label_df = label_df[label_df['include_in_data_pull'] == 1]
len(label_df)

339

In [9]:
# set year, feature is set
year =2021


FIELDS = list(label_df['Name'])
FIELDS50 = FIELDS[0:50]
FIELDS100 = FIELDS[50:100]
FIELDS150 = FIELDS[100:150]
FIELDS200 = FIELDS[150:200]
FIELDS250 = FIELDS[200:250]
FIELDS300 = FIELDS[250:300]
FIELDS310 = FIELDS[300::]




df50 = get_census_data_by_zip(key, FIELDS50, year)
df100 = get_census_data_by_zip(key, FIELDS100, year)
df150 = get_census_data_by_zip(key, FIELDS150, year)
df200 = get_census_data_by_zip(key, FIELDS200, year)
df250 = get_census_data_by_zip(key, FIELDS250, year)
df300 = get_census_data_by_zip(key, FIELDS300, year)
df310 = get_census_data_by_zip(key, FIELDS310, year)
# frames = [df50, df100, df150, df200, df250, df300, df310]
# df = pd.concat(frames)

df = df50.merge(df100, on='zip code tabulation area')
df = df.merge(df150, on='zip code tabulation area')
df = df.merge(df200, on='zip code tabulation area')
df = df.merge(df250, on='zip code tabulation area')
df = df.merge(df300, on='zip code tabulation area')
df = df.merge(df310, on='zip code tabulation area')

## saving to file locally
out = pa.Table.from_pandas(df)
file_name = 'census{}.csv'.format(year)
csv.write_csv(out, file_name)





# '''
# Error 400: error: error: unknown variable 'B25130_001E'
# Error 400: error: error: unknown variable 'B25130_007E'
# Error 400: error: error: unknown variable 'C15003_013E'
# Error 400: error: error: unknown variable 'B25130_002E'
# Error 400: error: error: unknown variable 'B25130_009E'
# Error 400: error: error: unknown variable 'C15003_016E'
# Error 400: error: error: unknown variable 'B24114_002E'
# Error 400: error: error: unknown variable 'B25130_002E'
# Error 400: error: error: unknown variable 'B25130_009E'
# Error 400: error: error: unknown variable 'C15003_016E'
# '''

Error 400: error: error: unknown variable 'B25130_001E'
Error 400: error: error: unknown variable 'C15002_003E'


TypeError: Can only merge Series or DataFrame objects, a <class 'NoneType'> was passed

In [130]:
home_sales_df = pd.read_csv('ZHVI_Single_Family_Homes.csv')


In [157]:
census2021 = pd.read_csv('census/census2021.csv')
census2020 = pd.read_csv('census/census2020.csv')
census2019 = pd.read_csv('census/census2019.csv')
census2018 = pd.read_csv('census/census2018.csv')
census2017 = pd.read_csv('census/census2017.csv')
census2016 = pd.read_csv('census/census2016.csv')
census2015 = pd.read_csv('census/census2015.csv')

# print('before dropping  ' + str(census2021.shape))

census2021.dropna(how='all', axis=1, inplace=True)
census2020.dropna(how='all', axis=1,inplace=True)
census2019.dropna(how='all', axis=1,inplace=True)
census2018.dropna(how='all', axis=1,inplace=True)
census2017.dropna(how='all', axis=1,inplace=True)
census2016.dropna(how='all', axis=1,inplace=True)
census2015.dropna(how='all', axis=1,inplace=True)

col_list = list(census2021.columns)

for col in col_list:
    if 'zip' in col:
        print(col)
# census = pd.concat([census2021, census2020, census2019, census2018, census2017, census2016, census2015])
# census.drop_duplicates(inplace=True)
# # master = home_sales_df.merge(census2017, left_on='RegionName', right_on='zip code tabulation area')

# census.drop(columns=['__index_level_0__', 'state'], inplace=True)
# census.dropna(how='all', axis=1, inplace=True)
# # census.head()
# census.set_index('year', inplace=True)
# f = census[census['zip code tabulation area'] == 19713]
# # master[master['RegionName'] ==19713]
# f['zip code tabulation area']

zip code tabulation area


In [166]:
FIELDS50

['B01001_001E',
 'B01001_002E',
 'B01001_026E',
 'B01002_001E',
 'B01002_002E',
 'B01002_003E',
 'B05010_001E',
 'B06011_001E',
 'B06012_001E',
 'B06012_002E',
 'B06012_003E',
 'B06012_004E',
 'B07001_001E',
 'B07001_049E',
 'B07001_065E',
 'B07001_081E',
 'B07002_001E',
 'B07009_002E',
 'B07009_003E',
 'B07009_004E',
 'B07009_005E',
 'B07009_006E',
 'B07010_002E',
 'B07010_003E',
 'B07010_004E',
 'B07010_005E',
 'B07010_006E',
 'B07010_007E',
 'B07010_008E',
 'B07010_009E',
 'B07010_010E',
 'B07010_011E',
 'B07013_002E',
 'B07013_003E',
 'B07101_002E',
 'B07101_003E',
 'B07101_004E',
 'B07101_005E',
 'B07101_006E',
 'B07101_007E',
 'B07101_008E',
 'B07403_002E',
 'B07403_003E',
 'B07409_002E',
 'B07409_003E',
 'B07409_004E',
 'B07409_005E',
 'B07409_006E',
 'B07410_002E',
 'B07410_003E']