In [173]:
import pandas as pd
import requests
import json
import pyarrow as pa
from pyarrow import csv

In [174]:
# pip install pyarrow

In [175]:
key = '658f931dafd58fa7948b59ebdb5c381ff7f84235'

In [176]:
def get_census_data_by_zip(api_key, fields, year):
    """
    Fetch census data by ZIP Code Tabulation Areas (ZCTAs) for specified fields.
    
    :param api_key: Your Census API Key.
    :param fields: List of fields to fetch.
    :param year: Census year (default is 2020).
    :return: DataFrame with fetched data.
    """
    base_url = f"https://api.census.gov/data/{year}/acs/acs5"
    
    # Combine the fields into a comma-separated string
    fields_str = ",".join(fields)

    # Construct the final URL
    url = f"{base_url}?get={fields_str}&for=zip%20code%20tabulation%20area:*"
    # url = 'https://api.census.gov/data/2018/zbp?get=NAME,GEO_ID,NAICS2017_LABEL,EMPSZES_LABEL,EMPSZES,ESTAB&for=zip%20code:*&NAICS2017=00&key={}'.format(key)

    headers = {
        "Content-Type": "application/json",
    }

    # Make the API request
    response = requests.get(url, headers=headers, params={"key": api_key})
    
    if response.status_code == 200:
        data = response.json()
        # Convert data to DataFrame
        df = pd.DataFrame(data[1:], columns=data[0])
        df['year'] = year
        return df
    else:
        print(f"Error {response.status_code}: {response.text}")
        return None

In [177]:
# get_census_data_by_zip(key, FIELDS50, 2018)

In [247]:
# get labels
# label = 'census data labels.xlsx - raw labels.csv'
# label_df = pd.read_csv(label)
# label_df = label_df[label_df['include'] == 1]
#     'S1101_C01_002E': 'Estimate!!Total!!HOUSEHOLDS!!Average household size',
#     'S1101_C01_003E': 'Estimate!!Total!!FAMILIES!!Total families',
#     'S1101_C01_004E': 'Estimate!!Total!!FAMILIES!!Average family size',
#     'S1101_C01_009E': 'Estimate!!Total!!Total households',
#     'S1101_C01_018E': 'Estimate!!Total!!Total households!!HOUSING TENURE!!Owner-occupied housing units',
#     'S1101_C01_019E': 'Estimate!!Total!!Total households!!HOUSING TENURE!!Renter-occupied housing units',
#     'S1101_C05_003E': 'Estimate!!Nonfamily household!!FAMILIES!!Total families'
#     B28010_001E	Estimate!!Total: Computers in Household
#     'B28010_002E': 'Estimate!!Total:!!Has one or more types of computing devices:',
#     'B28010_005E': 'Estimate!!Total:!!Has one or more types of computing devices:!!Smartphone, tablet or other portable wireless computer or other computer',
#     'B28010_007E': 'Estimate!!Total:!!No Computer'
# B27019_001E	Estimate!!Total:
# B27019_002E	Estimate!!Total:!!26 to 64 years:
# B25002_001E	Estimate!!Total:	Occupancy Status
# B25002_002E	Estimate!!Total:!!Occupied	Occupancy Status
# B25002_003E	Estimate!!Total:!!Vacant	Occupancy Status

# B27019_001E	Estimate!!Total:
# B27019_002E	Estimate!!Total:!!26 to 64 years:
# B27019_004E	Estimate!!Total:!!26 to 64 years:!!Less than high school graduate:!!With health insurance coverage
# B27019_009E	Estimate!!Total:!!26 to 64 years:!!High school graduate (includes equivalency):!!With health insurance coverage
# B27019_014E	Estimate!!Total:!!26 to 64 years:!!Some college or associate's degree:!!With health insurance coverage
# B27019_019E	Estimate!!Total:!!26 to 64 years:!!Bachelor's degree or higher:!!With health insurance coverage

# B27019_023E	Estimate!!Total:!!65 years and over:
# B27019_025E	Estimate!!Total:!!65 years and over:!!Less than high school graduate:!!With health insurance coverage
# B27019_030E	Estimate!!Total:!!65 years and over:!!High school graduate (includes equivalency):!!With health insurance coverage
# B27019_035E	Estimate!!Total:!!65 years and over:!!Some college or associate's degree:!!With health insurance coverage
# B27019_040E	Estimate!!Total:!!65 years and over:!!Bachelor's degree or higher:!!With health insurance coverage



label2 = 'census data labels v2.csv'
label_df = pd.read_csv(label2)
label_df = label_df[label_df['include_in_master_table'] == 1]
features = list(label_df['Name'])
health_features = ['B27019_001E', 'B27019_002E', 'B27019_004E', 'B27019_009E', 'B27019_014E', 'B27019_019E',
                  'B27019_023E', 'B27019_025E', 'B27019_030E', 'B27019_035E', 'B27019_040E']
print(len(features))
print(features)


47
['B01001_001E', 'B01001_002E', 'B01001_026E', 'B01002_001E', 'B01002_002E', 'B01002_003E', 'B07101_002E', 'B07101_003E', 'B07101_004E', 'B08012_002E', 'B08012_003E', 'B08012_004E', 'B08012_005E', 'B08012_006E', 'B08012_007E', 'B08012_008E', 'B08012_009E', 'B08012_010E', 'B08012_011E', 'B08012_012E', 'B08012_013E', 'B17026_002E', 'B17026_003E', 'B17026_004E', 'B17026_005E', 'B17026_006E', 'B17026_007E', 'B17026_008E', 'B17026_009E', 'B17026_010E', 'B17026_011E', 'B17026_012E', 'B17026_013E', 'B25001_001E', 'B25002_001E', 'B25002_002E', 'B25002_003E', 'B25003_001E', 'B25003_002E', 'B25003_003E', 'B27019_001E', 'B27019_002E', 'B27019_023E', 'B28010_001E', 'B28010_002E', 'B28010_005E', 'B28010_007E']


In [252]:
year =2021

df = get_census_data_by_zip(key, health_features, year)
out = pa.Table.from_pandas(df)
file_name = 'census_health{}.csv'.format(year)
csv.write_csv(out, file_name)

In [378]:
# list(df.columns)

census2021 = pd.read_csv('census/census_ad2021.csv', converters={'zip code tabulation area': str, 'year:': str})
census2020 = pd.read_csv('census/census_ad2020.csv', converters={'zip code tabulation area': str, 'year:': str})
census2019 = pd.read_csv('census/census_ad2019.csv', converters={'zip code tabulation area': str, 'year:': str})
census2018 = pd.read_csv('census/census_ad2018.csv', converters={'zip code tabulation area': str, 'year:': str})
census2017 = pd.read_csv('census/census_ad2017.csv', converters={'zip code tabulation area': str, 'year:': str})

census2021.dropna(how='all', axis=1, inplace=True)
census2020.dropna(how='all', axis=1,inplace=True)
census2019.dropna(how='all', axis=1,inplace=True)
census2018.dropna(how='all', axis=1,inplace=True)
census2017.dropna(how='all', axis=1,inplace=True)


census2021h = pd.read_csv('census/census_health2021.csv', converters={'zip code tabulation area': str, 'year:': str})
census2020h = pd.read_csv('census/census_health2020.csv', converters={'zip code tabulation area': str, 'year:': str})
census2019h = pd.read_csv('census/census_health2019.csv', converters={'zip code tabulation area': str, 'year:': str})
census2018h = pd.read_csv('census/census_health2018.csv', converters={'zip code tabulation area': str, 'year:': str})
census2017h = pd.read_csv('census/census_health2017.csv', converters={'zip code tabulation area': str, 'year:': str})

census2021h.dropna(how='all', axis=1, inplace=True)
census2020h.dropna(how='all', axis=1,inplace=True)
census2019h.dropna(how='all', axis=1,inplace=True)
census2018h.dropna(how='all', axis=1,inplace=True)
census2017h.dropna(how='all', axis=1,inplace=True)



census = pd.concat([census2021, census2020, census2019, census2018, census2017,
                   census2021h, census2020h, census2019h, census2018h, census2017h])
census.drop_duplicates(inplace=True)
census.rename(columns={'zip code tabulation area':'zipcode'}, inplace=True)
census['year'] = census['year'].apply(str)
# census[census['zip code tabulation area'] == 19713]
# census[['zip code tabulation area','year', 'B25002_001E', 'B25002_002E', 'B25002_003E']]
census['house_occupied_ratio'] = census['B25002_002E']/census['B25002_001E']
census['house_vacant_ratio'] = census['B25002_003E']/census['B25002_001E']
census['has_one_or_more_computing_device_ratio'] = census['B28010_002E']/census['B28010_001E']
census['has_no_computing_device_ratio'] = census['B28010_007E']/census['B28010_001E']
census['has_health_insurance_ratio'] = (census['B27019_004E']+census['B27019_009E']+census['B27019_014E']+census['B27019_019E']+census['B27019_025E']+census['B27019_030E']+census['B27019_035E']+census['B27019_040E'])/census['B27019_001E']
census['has_health_insurance_ratio_26_to_64'] =  (census['B27019_004E']+census['B27019_009E']+census['B27019_014E']+census['B27019_019E'])/census['B27019_002E']
census['has_health_insurance_ratio_65_and_over'] = (census['B27019_025E']+census['B27019_030E']+census['B27019_035E']+census['B27019_040E'])/census['B27019_023E']

# census[['zip code tabulation area', 'has_health_insurance_ratio', 'has_health_insurance_ratio_26_to_64', 'has_health_insurance_ratio_65_and_over']]

In [357]:
census[['zipcode', 'has_health_insurance_ratio', 'has_health_insurance_ratio_26_to_64', 'has_health_insurance_ratio_65_and_over']]

Unnamed: 0,zipcode,has_health_insurance_ratio,has_health_insurance_ratio_26_to_64,has_health_insurance_ratio_65_and_over
0,00601,,,
1,00602,,,
2,00603,,,
3,00606,,,
4,00610,,,
...,...,...,...,...
33115,71454,0.825511,0.780635,1.0
33116,71455,0.606061,0.500000,1.0
33117,71921,0.845838,0.797648,1.0
33118,71336,0.794872,0.708879,1.0


In [358]:
zillow_seasonal = pd.read_csv('Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
zillow_seasonal.head()

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2000-01-31,...,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,213207.449193,...,484545.423106,482998.627982,480762.710077,479445.795903,478396.792362,479623.197138,481652.685632,484818.440189,487650.169224,490161.927693
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,138706.618807,...,525459.564754,528481.179889,530866.429595,533099.554913,536015.609542,540209.612968,545621.736221,551618.956629,557787.374606,563740.488083
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,104005.403992,...,287817.295002,285751.839606,283305.514483,280947.848337,279077.781291,277875.217836,277392.281244,277773.660692,278539.454227,279441.879993
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,149493.2119,...,508472.778857,505169.076617,499583.980419,492648.821682,486263.256758,479571.79702,472397.526197,466948.659683,465108.150687,465102.144897
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,102704.068566,...,277720.958818,276694.069269,275123.071172,273514.546846,272181.096873,271383.921551,271240.340967,271842.305613,272691.518967,273570.680709


In [392]:
zillow_df = pd.read_csv('ZHVI_Single_Family_Homes.csv', converters={'RegionName': str})

In [393]:
zillow_df.shape

(26264, 293)

In [394]:


to_drop =['2000-01-31',
 '2000-02-29',
 '2000-03-31',
 '2000-04-30',
 '2000-05-31',
 '2000-06-30',
 '2000-07-31',
 '2000-08-31',
 '2000-09-30',
 '2000-10-31',
 '2000-11-30',
 '2000-12-31',
 '2001-01-31',
 '2001-02-28',
 '2001-03-31',
 '2001-04-30',
 '2001-05-31',
 '2001-06-30',
 '2001-07-31',
 '2001-08-31',
 '2001-09-30',
 '2001-10-31',
 '2001-11-30',
 '2001-12-31',
 '2002-01-31',
 '2002-02-28',
 '2002-03-31',
 '2002-04-30',
 '2002-05-31',
 '2002-06-30',
 '2002-07-31',
 '2002-08-31',
 '2002-09-30',
 '2002-10-31',
 '2002-11-30',
 '2002-12-31',
 '2003-01-31',
 '2003-02-28',
 '2003-03-31',
 '2003-04-30',
 '2003-05-31',
 '2003-06-30',
 '2003-07-31',
 '2003-08-31',
 '2003-09-30',
 '2003-10-31',
 '2003-11-30',
 '2003-12-31',
 '2004-01-31',
 '2004-02-29',
 '2004-03-31',
 '2004-04-30',
 '2004-05-31',
 '2004-06-30',
 '2004-07-31',
 '2004-08-31',
 '2004-09-30',
 '2004-10-31',
 '2004-11-30',
 '2004-12-31',
 '2005-01-31',
 '2005-02-28',
 '2005-03-31',
 '2005-04-30',
 '2005-05-31',
 '2005-06-30',
 '2005-07-31',
 '2005-08-31',
 '2005-09-30',
 '2005-10-31',
 '2005-11-30',
 '2005-12-31',
 '2006-01-31',
 '2006-02-28',
 '2006-03-31',
 '2006-04-30',
 '2006-05-31',
 '2006-06-30',
 '2006-07-31',
 '2006-08-31',
 '2006-09-30',
 '2006-10-31',
 '2006-11-30',
 '2006-12-31',
 '2007-01-31',
 '2007-02-28',
 '2007-03-31',
 '2007-04-30',
 '2007-05-31',
 '2007-06-30',
 '2007-07-31',
 '2007-08-31',
 '2007-09-30',
 '2007-10-31',
 '2007-11-30',
 '2007-12-31',
 '2008-01-31',
 '2008-02-29',
 '2008-03-31',
 '2008-04-30',
 '2008-05-31',
 '2008-06-30',
 '2008-07-31',
 '2008-08-31',
 '2008-09-30',
 '2008-10-31',
 '2008-11-30',
 '2008-12-31',
 '2009-01-31',
 '2009-02-28',
 '2009-03-31',
 '2009-04-30',
 '2009-05-31',
 '2009-06-30',
 '2009-07-31',
 '2009-08-31',
 '2009-09-30',
 '2009-10-31',
 '2009-11-30',
 '2009-12-31',
 '2010-01-31',
 '2010-02-28',
 '2010-03-31',
 '2010-04-30',
 '2010-05-31',
 '2010-06-30',
 '2010-07-31',
 '2010-08-31',
 '2010-09-30',
 '2010-10-31',
 '2010-11-30',
 '2010-12-31',
 '2011-01-31',
 '2011-02-28',
 '2011-03-31',
 '2011-04-30',
 '2011-05-31',
 '2011-06-30',
 '2011-07-31',
 '2011-08-31',
 '2011-09-30',
 '2011-10-31',
 '2011-11-30',
 '2011-12-31',
 '2012-01-31',
 '2012-02-29',
 '2012-03-31',
 '2012-04-30',
 '2012-05-31',
 '2012-06-30',
 '2012-07-31',
 '2012-08-31',
 '2012-09-30',
 '2012-10-31',
 '2012-11-30',
 '2012-12-31',
 '2013-01-31',
 '2013-02-28',
 '2013-03-31',
 '2013-04-30',
 '2013-05-31',
 '2013-06-30',
 '2013-07-31',
 '2013-08-31',
 '2013-09-30',
 '2013-10-31',
 '2013-11-30',
 '2013-12-31',
 '2014-01-31',
 '2014-02-28',
 '2014-03-31',
 '2014-04-30',
 '2014-05-31',
 '2014-06-30',
 '2014-07-31',
 '2014-08-31',
 '2014-09-30',
 '2014-10-31',
 '2014-11-30',
 '2014-12-31',
 '2015-01-31',
 '2015-02-28',
 '2015-03-31',
 '2015-04-30',
 '2015-05-31',
 '2015-06-30',
 '2015-07-31',
 '2015-08-31',
 '2015-09-30',
 '2015-10-31',
 '2015-11-30',
 '2015-12-31',
 '2016-01-31',
 '2016-02-29',
 '2016-03-31',
 '2016-04-30',
 '2016-05-31',
 '2016-06-30',
 '2016-07-31',
 '2016-08-31',
 '2016-09-30',
 '2016-10-31',
 '2016-11-30',
 '2016-12-31']


# drop data before 2017
zillow_df.drop(columns=to_drop, inplace=True)

zillow_df.head(5)

Unnamed: 0,RegionID,SizeRank,RegionName,RegionType,StateName,State,City,Metro,CountyName,2017-01-31,...,2022-11-30,2022-12-31,2023-01-31,2023-02-28,2023-03-31,2023-04-30,2023-05-31,2023-06-30,2023-07-31,2023-08-31
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,323580.258551,...,484812.248823,483270.636456,481038.183265,479721.873619,478668.36715,479881.403656,481894.597228,485046.382049,487872.99138,490385.169211
1,61148,2,8701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,394156.300927,...,644904.087044,648741.824038,650794.311229,652508.254601,655175.941212,660548.696785,668432.531461,677540.593485,687378.218275,696773.567354
2,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,171583.47755,...,287849.265624,285784.837604,283338.919177,280982.358438,279112.875127,277911.26653,277429.5663,277812.780825,278579.748133,279482.307522
3,62080,4,11368,zip,NY,NY,New York,"New York-Newark-Jersey City, NY-NJ-PA",Queens County,719129.427216,...,886897.867278,885628.363831,885552.350921,884145.746022,881743.541864,878675.145891,875712.760185,872584.255384,871420.265788,871160.471818
4,91733,5,77084,zip,TX,TX,Houston,"Houston-The Woodlands-Sugar Land, TX",Harris County,168888.858279,...,278537.547066,277502.648841,275922.593554,274305.254214,272963.850362,272159.780823,272011.02478,272611.025961,273460.285442,274335.613083


In [395]:
column_names = zillow_df.columns
    
    # Get all the column names used to pivot the data 
column_names_ids = ['RegionID', 'SizeRank', 'RegionName', 'RegionType', 'StateName', 'State', 'City', 'Metro', 'CountyName']

# Get all the column names used to pivot the data 
column_names_values = list(set(column_names).difference(column_names_ids))
    
zillow_df = zillow_df.melt(id_vars=column_names_ids,
                    var_name='date',
                    value_vars=column_names_values,
                    value_name='sale_price')
zillow_df = zillow_df.rename(columns={'RegionName':'zipcode'})
zillow_df['year'] = zillow_df['date'].apply(lambda x: x.split('-')[0])
zillow_df['year'] = zillow_df['year'].apply(str)

In [396]:
zillow_df['sale_price_lagged_1month'] = (zillow_df.sort_values(by=['date'], ascending=True)
                       .groupby(['zipcode'])['sale_price'].shift(1))

zillow_df['sale_price_lagged_12months'] = (zillow_df.sort_values(by=['date'], ascending=True)
                       .groupby(['zipcode'])['sale_price'].shift(12))

zillow_df['month_over_month_change'] = (zillow_df['sale_price_lagged_1month'] - zillow_df['sale_price'])/zillow_df['sale_price']
zillow_df['year_over_year_change'] = (zillow_df['sale_price_lagged_12months'] - zillow_df['sale_price'])/zillow_df['sale_price']


In [397]:
zillow_df[zillow_df['zipcode'] == '19713']

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,date,sale_price,year,sale_price_lagged_1month,sale_price_lagged_12months,month_over_month_change,year_over_year_change
2893,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2021-12-31,251400.037571,2021,250185.881263,233633.190236,-0.004830,-0.070672
29157,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2017-05-31,203608.629697,2017,203706.724832,,0.000482,
55421,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2020-03-31,218222.921683,2020,217445.184082,215847.467550,-0.003564,-0.010885
81685,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2020-07-31,217947.190913,2020,217765.479555,215971.011140,-0.000834,-0.009067
107949,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2017-08-31,202525.478978,2017,202572.280781,,0.000231,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1972693,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2021-02-28,242371.946635,2021,237997.258425,217445.184082,-0.018049,-0.102845
1998957,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2017-01-31,200619.482897,2017,,,,
2025221,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2018-10-31,211994.430843,2018,211572.093051,203661.997176,-0.001992,-0.039305
2051485,66037,2929,19713,zip,DE,DE,Newark,"Philadelphia-Camden-Wilmington, PA-NJ-DE-MD",New Castle County,2021-09-30,252501.567323,2021,255043.382996,221770.854690,0.010067,-0.121705


In [385]:
# pip install altair

In [398]:
import altair as alt

zillow_df['date'] = pd.to_datetime(zillow_df['date'])

df_19713 = zillow_df[zillow_df['zipcode'] == '19713']
alt.Chart(df_19713).mark_line().encode(
    x='date:T',
    y= 'sale_price'
)

In [399]:
# list(master.columns)
master = zillow_df.merge(census, how='left', on=['zipcode', 'year'])
# zillow_df.shape # (2101120, 11)
master
# census['year'].dtypes

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,date,...,B27019_030E,B27019_035E,B27019_040E,house_occupied_ratio,house_vacant_ratio,has_one_or_more_computing_device_ratio,has_no_computing_device_ratio,has_health_insurance_ratio,has_health_insurance_ratio_26_to_64,has_health_insurance_ratio_65_and_over
0,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,2021-12-31,...,,,,0.959002,0.040998,0.991112,0.008888,,,
1,91982,1,77494,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Fort Bend County,2021-12-31,...,1713.0,2772.0,4325.0,,,,,0.926645,0.925834,0.931672
2,61148,2,08701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,2021-12-31,...,,,,0.923638,0.076362,0.822053,0.177947,,,
3,61148,2,08701,zip,NJ,NJ,Lakewood,"New York-Newark-Jersey City, NY-NJ-PA",Ocean County,2021-12-31,...,4265.0,3076.0,3539.0,,,,,0.912155,0.886346,0.995142
4,91940,3,77449,zip,TX,TX,Katy,"Houston-The Woodlands-Sugar Land, TX",Harris County,2021-12-31,...,,,,0.964913,0.035087,0.986556,0.013444,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3673991,80861,39992,52163,zip,IA,IA,Protivin,,Howard County,2019-03-31,...,28.0,20.0,4.0,,,,,0.987342,0.983051,1.000000
3673992,69074,39992,26576,zip,WV,WV,Farmington,"Fairmont, WV",Marion County,2019-03-31,...,,,,0.694087,0.305913,1.000000,0.000000,,,
3673993,69074,39992,26576,zip,WV,WV,Farmington,"Fairmont, WV",Marion County,2019-03-31,...,34.0,9.0,9.0,,,,,0.952688,0.946731,1.000000
3673994,80190,39992,50160,zip,IA,IA,Martensdale,"Des Moines-West Des Moines, IA",Warren County,2019-03-31,...,,,,0.931507,0.068493,0.960784,0.039216,,,


In [390]:
# out = pa.Table.from_pandas(master)
# csv.write_csv(out, 'master_v2.csv')

# master_df = pd.read_csv('master.csv')
# # list(master_df.columns)
# master_df.head()

In [401]:
top20 = master.sort_values(by=['year_over_year_change'], ascending=False).head(20)
top20

Unnamed: 0,RegionID,SizeRank,zipcode,RegionType,StateName,State,City,Metro,CountyName,date,...,B27019_030E,B27019_035E,B27019_040E,house_occupied_ratio,house_vacant_ratio,has_one_or_more_computing_device_ratio,has_no_computing_device_ratio,has_health_insurance_ratio,has_health_insurance_ratio_26_to_64,has_health_insurance_ratio_65_and_over
3195718,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-02-28,...,,,,0.865017,0.134983,0.841445,0.158555,,,
3195719,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-02-28,...,827.0,470.0,253.0,,,,,0.958939,0.940529,1.0
3641926,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-03-31,...,,,,0.865017,0.134983,0.841445,0.158555,,,
3641927,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-03-31,...,827.0,470.0,253.0,,,,,0.958939,0.940529,1.0
881106,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-01-31,...,,,,0.880197,0.119803,0.844836,0.155164,,,
881107,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-01-31,...,967.0,645.0,706.0,,,,,0.943271,0.922252,1.0
886323,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-01-31,...,827.0,470.0,253.0,,,,,0.958939,0.940529,1.0
886322,85705,10368,62832,zip,IL,IL,Du Quoin,,Perry County,2019-01-31,...,,,,0.865017,0.134983,0.841445,0.158555,,,
3190503,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-02-28,...,967.0,645.0,706.0,,,,,0.943271,0.922252,1.0
3190502,85826,7700,62966,zip,IL,IL,Murphysboro,"Carbondale-Marion, IL",Jackson County,2019-02-28,...,,,,0.880197,0.119803,0.844836,0.155164,,,


In [403]:
import matplotlib.pyplot as plt

In [406]:
alt.Chart(top20).mark_line().encode(
    x='date:T',
    y= 'sale_price',
    color='zipcode'
)

In [172]:
# set year, feature is set
year =2021


FIELDS = list(label_df['Name'])
FIELDS50 = FIELDS[0:50]
FIELDS100 = FIELDS[50:100]
FIELDS150 = FIELDS[100:150]
FIELDS200 = FIELDS[150:200]
FIELDS250 = FIELDS[200:250]
FIELDS300 = FIELDS[250:300]
FIELDS310 = FIELDS[300::]




df50 = get_census_data_by_zip(key, FIELDS50, year)
df100 = get_census_data_by_zip(key, FIELDS100, year)
df150 = get_census_data_by_zip(key, FIELDS150, year)
df200 = get_census_data_by_zip(key, FIELDS200, year)
df250 = get_census_data_by_zip(key, FIELDS250, year)
df300 = get_census_data_by_zip(key, FIELDS300, year)
df310 = get_census_data_by_zip(key, FIELDS310, year)
# frames = [df50, df100, df150, df200, df250, df300, df310]
# df = pd.concat(frames)

df = df50.merge(df100, on='zip code tabulation area')
df = df.merge(df150, on='zip code tabulation area')
df = df.merge(df200, on='zip code tabulation area')
df = df.merge(df250, on='zip code tabulation area')
df = df.merge(df300, on='zip code tabulation area')
df = df.merge(df310, on='zip code tabulation area')

## saving to file locally
# out = pa.Table.from_pandas(df)
# file_name = 'census{}.csv'.format(year)
# csv.write_csv(out, file_name)





# '''
# Error 400: error: error: unknown variable 'B25130_001E'
# Error 400: error: error: unknown variable 'B25130_007E'
# Error 400: error: error: unknown variable 'C15003_013E'
# Error 400: error: error: unknown variable 'B25130_002E'
# Error 400: error: error: unknown variable 'B25130_009E'
# Error 400: error: error: unknown variable 'C15003_016E'
# Error 400: error: error: unknown variable 'B24114_002E'
# Error 400: error: error: unknown variable 'B25130_002E'
# Error 400: error: error: unknown variable 'B25130_009E'
# Error 400: error: error: unknown variable 'C15003_016E'
# '''

Error 400: error: error: unknown variable 'B25130_002E'
Error 400: error: error: unknown variable 'B25130_009E'
Error 400: error: error: unknown variable 'C15003_016E'


Unnamed: 0,B01001_001E,B01001_002E,B01001_026E,B01002_001E,B01002_002E,B01002_003E,B05010_001E,B06011_001E,B06012_001E,B06012_002E,...,B07403_003E,B07409_002E,B07409_003E,B07409_004E,B07409_005E,B07409_006E,B07410_002E,B07410_003E,zip code tabulation area,year
0,17126,8451,8675,43.7,42.6,45.1,3170,,,,...,,,,,,,,,00601,2021
1,37895,18588,19307,44.4,43.4,45.6,6130,,,,...,,,,,,,,,00602,2021
2,49136,23817,25319,44.1,42.3,45.7,8252,,,,...,,,,,,,,,00603,2021
3,5751,2817,2934,44.9,44.6,45.8,922,,,,...,,,,,,,,,00606,2021
4,26153,12678,13475,43.5,41.9,45.8,4229,,,,...,,,,,,,,,00610,2021
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33769,13,0,13,-666666666.0,-666666666.0,-666666666.0,0,-666666666,13,0,...,,,,,,,,,99923,2021
33770,917,513,404,43.3,43.9,41.8,184,24392,916,182,...,,,,,,,,,99925,2021
33771,1445,801,644,38.0,38.6,37.4,318,21970,1429,252,...,,,,,,,,,99926,2021
33772,11,0,11,-666666666.0,-666666666.0,-666666666.0,0,-666666666,11,0,...,,,,,,,,,99927,2021


In [130]:
home_sales_df = pd.read_csv('ZHVI_Single_Family_Homes.csv')


In [157]:
census2021 = pd.read_csv('census/census2021.csv')
census2020 = pd.read_csv('census/census2020.csv')
census2019 = pd.read_csv('census/census2019.csv')
census2018 = pd.read_csv('census/census2018.csv')
census2017 = pd.read_csv('census/census2017.csv')
census2016 = pd.read_csv('census/census2016.csv')
census2015 = pd.read_csv('census/census2015.csv')

# print('before dropping  ' + str(census2021.shape))

census2021.dropna(how='all', axis=1, inplace=True)
census2020.dropna(how='all', axis=1,inplace=True)
census2019.dropna(how='all', axis=1,inplace=True)
census2018.dropna(how='all', axis=1,inplace=True)
census2017.dropna(how='all', axis=1,inplace=True)
census2016.dropna(how='all', axis=1,inplace=True)
census2015.dropna(how='all', axis=1,inplace=True)

col_list = list(census2021.columns)

for col in col_list:
    if 'zip' in col:
        print(col)
# census = pd.concat([census2021, census2020, census2019, census2018, census2017, census2016, census2015])
# census.drop_duplicates(inplace=True)
# # master = home_sales_df.merge(census2017, left_on='RegionName', right_on='zip code tabulation area')

# census.drop(columns=['__index_level_0__', 'state'], inplace=True)
# census.dropna(how='all', axis=1, inplace=True)
# # census.head()
# census.set_index('year', inplace=True)
# f = census[census['zip code tabulation area'] == 19713]
# # master[master['RegionName'] ==19713]
# f['zip code tabulation area']

zip code tabulation area


In [166]:
FIELDS50

['B01001_001E',
 'B01001_002E',
 'B01001_026E',
 'B01002_001E',
 'B01002_002E',
 'B01002_003E',
 'B05010_001E',
 'B06011_001E',
 'B06012_001E',
 'B06012_002E',
 'B06012_003E',
 'B06012_004E',
 'B07001_001E',
 'B07001_049E',
 'B07001_065E',
 'B07001_081E',
 'B07002_001E',
 'B07009_002E',
 'B07009_003E',
 'B07009_004E',
 'B07009_005E',
 'B07009_006E',
 'B07010_002E',
 'B07010_003E',
 'B07010_004E',
 'B07010_005E',
 'B07010_006E',
 'B07010_007E',
 'B07010_008E',
 'B07010_009E',
 'B07010_010E',
 'B07010_011E',
 'B07013_002E',
 'B07013_003E',
 'B07101_002E',
 'B07101_003E',
 'B07101_004E',
 'B07101_005E',
 'B07101_006E',
 'B07101_007E',
 'B07101_008E',
 'B07403_002E',
 'B07403_003E',
 'B07409_002E',
 'B07409_003E',
 'B07409_004E',
 'B07409_005E',
 'B07409_006E',
 'B07410_002E',
 'B07410_003E']