# Real Estate Market Selection Model

<b> Goal of Model and Analysis </b>
    
The goal of this analysis is to help inform which city Mehrmah and I decide to buy an investment property in.  The output of the analysis will be a ranked list of cities with forecasted 5YR and 10YR appreciation values.

In order to reduce the risk of us losing money, a strict criteria for our next investment property is that it will be cashflow neutral or better, however we recognize that appreciation will likely be the primary way to get strong returns.  Therefore, our objective function is to maximize appreciation under the constraint that the property cashflows with a 20% downpayment. 

Given a property’s ability to cashflow requires analysis on the individual property, we will first do the analysis to determine which markets to look at individual properties in.  


In [1]:
pip install openpyxl

Looking in indexes: https://pypi.lyft.net/simple/, https://pypi.lyft.net/pypi/
Note: you may need to restart the kernel to use updated packages.


In [2]:
import pandas as pd

# Data Imports and Reformating

In [3]:
poverty = pd.read_excel("PovertyEstimates.xls", header = 4) #, skiprows = 3) #, skipfooter = 3, dtype = str)
poverty


Unnamed: 0,FIPStxt,Stabr,Area_name,Rural-urban_Continuum_Code_2003,Urban_Influence_Code_2003,Rural-urban_Continuum_Code_2013,Urban_Influence_Code_2013,POVALL_2019,CI90LBALL_2019,CI90UBALL_2019,...,CI90UB517P_2019,MEDHHINC_2019,CI90LBINC_2019,CI90UBINC_2019,POV04_2019,CI90LB04_2019,CI90UB04_2019,PCTPOV04_2019,CI90LB04P_2019,CI90UB04P_2019
0,0,US,United States,,,,,39490096,39248096,39732096,...,16.0,65712,65594,65830,3457689.0,3405854.0,3509524.0,18.2,17.9,18.5
1,1000,AL,Alabama,,,,,747478,730491,764465,...,21.6,51771,51179,52363,69236.0,65296.0,73176.0,24.2,22.8,25.6
2,1001,AL,Autauga County,2.0,2.0,2.0,2.0,6723,5517,7929,...,19.4,58233,52517,63949,,,,,,
3,1003,AL,Baldwin County,4.0,5.0,3.0,2.0,22360,18541,26179,...,17.2,59871,54593,65149,,,,,,
4,1005,AL,Barbour County,6.0,6.0,6.0,6.0,5909,4787,7031,...,49.0,35972,31822,40122,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3188,56037,WY,Sweetwater County,5.0,8.0,5.0,8.0,3453,2743,4163,...,11.1,80639,73437,87841,,,,,,
3189,56039,WY,Teton County,7.0,8.0,7.0,8.0,1396,1073,1719,...,6.7,98837,86531,111143,,,,,,
3190,56041,WY,Uinta County,7.0,8.0,7.0,8.0,1699,1264,2134,...,11.1,70756,63191,78321,,,,,,
3191,56043,WY,Washakie County,7.0,11.0,7.0,11.0,845,626,1064,...,17.4,55122,50050,60194,,,,,,


In [4]:
poverty.columns

Index(['FIPStxt', 'Stabr', 'Area_name', 'Rural-urban_Continuum_Code_2003',
       'Urban_Influence_Code_2003', 'Rural-urban_Continuum_Code_2013',
       'Urban_Influence_Code_2013', 'POVALL_2019', 'CI90LBALL_2019',
       'CI90UBALL_2019', 'PCTPOVALL_2019', 'CI90LBALLP_2019',
       'CI90UBALLP_2019', 'POV017_2019', 'CI90LB017_2019', 'CI90UB017_2019',
       'PCTPOV017_2019', 'CI90LB017P_2019', 'CI90UB017P_2019', 'POV517_2019',
       'CI90LB517_2019', 'CI90UB517_2019', 'PCTPOV517_2019', 'CI90LB517P_2019',
       'CI90UB517P_2019', 'MEDHHINC_2019', 'CI90LBINC_2019', 'CI90UBINC_2019',
       'POV04_2019', 'CI90LB04_2019', 'CI90UB04_2019', 'PCTPOV04_2019',
       'CI90LB04P_2019', 'CI90UB04P_2019'],
      dtype='object')

In [5]:
employment = pd.read_excel("Employment/laucnty00.xlsx", engine='openpyxl', header = None, skiprows = 6, skipfooter = 3, dtype = str)


for i in range(1,20):
    i = str(i)
    if len(i) == 1: i = "0" + i
    print(i)
    temp = pd.read_excel("Employment/laucnty" + i + ".xlsx", engine='openpyxl', header = None, skiprows = 6, skipfooter = 3, dtype = str)
    employment = pd.concat([employment, temp], ignore_index = True)

employment.rename(columns={0: "laus", 1: "fips_state", 2: "fips_county", 3: "county", 4: "year", 6: "labor_force", 7: "employed", 8: "unemployed", 9: "unemployment_rate"}, inplace = True)
employment.drop(columns = [5], inplace = True)
employment["fips"] = employment["fips_state"] + employment["fips_county"]

employment = employment[~(employment["labor_force"] == "N.A.")]
employment["fips"] = pd.to_numeric(employment["fips"])
employment["year"] = pd.to_numeric(employment["year"])
employment["labor_force"] = pd.to_numeric(employment["labor_force"])
employment["employed"] = pd.to_numeric(employment["employed"])
employment["unemployed"] = pd.to_numeric(employment["unemployed"])
employment["unemployment_rate"] = pd.to_numeric(employment["unemployment_rate"])


01
02
03
04
05
06
07
08
09
10
11
12
13
14
15
16
17
18
19


In [6]:
# import zipcode to FIPS mapping
zip_mapping = pd.read_csv('ZIP-COUNTY-FIPS_2018-03.csv')
zip_mapping.rename(columns={"ZIP": "zipcode", "STCOUNTYFP": "fips", "CITY": "city_mapped", "STATE": "state_mapped", "COUNTYNAME":"county_mapped", "CLASSFP":"classfp"}, inplace = True)

In [7]:
# import 2010 population by zipcode data for ZHVI weighting
pop_weights = pd.read_csv('Population/Census+Population+By+Zipcode+(ZCTA).csv')

In [8]:
# import and reformat ZHVI data
zhvi = pd.read_csv('ZHVI/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')
zhvi = pd.melt(zhvi, id_vars=['RegionID','SizeRank','RegionName','RegionType','StateName','State','City','Metro','CountyName'])
zhvi.rename(columns={"RegionName": "zipcode", "StateName": "state", "Metro": "msa", "CountyName": "county", "variable":"month", "value":"zhvi"}, inplace = True)
zhvi['month'] = pd.to_datetime(zhvi.month, infer_datetime_format=True)
zhvi['year'] = zhvi.month.dt.year
zhvi = zhvi[["zipcode","state","msa","county","year","zhvi"]].loc[zhvi.month.dt.month == 1]
zhvi = zhvi.merge(zip_mapping, how = 'left',left_on = 'zipcode',right_on = 'zipcode')


In [9]:
# import and reformat GDP data
gdp_county = pd.read_csv('GDP/CAGDP2__ALL_AREAS_2001_2019.csv', skipfooter = 4)
gdp_county.GeoFIPS = gdp_county.GeoFIPS.str.replace('"', '')
gdp_county = pd.melt(gdp_county, id_vars=['GeoFIPS','GeoName','Region','TableName','LineCode','IndustryClassification','Description','Unit'])
gdp_county.rename(columns={"GeoFIPS": "fips", "GeoName": "county", "Description": "metric", "variable": "year", "value":"gdp"}, inplace = True)
gdp_county = gdp_county[["fips","county","metric","year","gdp"]]
gdp_county = gdp_county[gdp_county.gdp.str.isnumeric() == True]  #dropping empty values
gdp_county.gdp = pd.to_numeric(gdp_county.gdp)
gdp_county = gdp_county.pivot_table(index=["fips","county","year"], columns="metric")['gdp'].reset_index()
gdp_county.columns.name = None
gdp_county.fips = gdp_county.fips.str.replace(' ','')
gdp_county.fips = pd.to_numeric(gdp_county.fips)
gdp_county.year = gdp_county.year.astype(int)


  


In [10]:
# import and reformat 2000 Population data
pop_2000 = pd.read_csv('Population/DECENNIALSF12000.P001_data_with_overlays_2021-07-23T112021.csv')

#all FIPS start with 0500000US in this dataset.  cutting that part out
pop_2000.GEO_ID = pop_2000.GEO_ID.str.replace('0500000US','')
pop_2000 = pop_2000[~(pop_2000.GEO_ID == 'id')] #filtering out header row
pop_2000.GEO_ID = pd.to_numeric(pop_2000.GEO_ID)
pop_2000["year"] = 2000
pop_2000.rename(columns={"GEO_ID": "fips", "NAME": "county", "P001001": "population"}, inplace = True)


In [11]:
# import and reformat 2010 to 2020 Population data
pop_2010_2020 = pd.read_csv('Population/PEPPOP2019.PEPANNRES_data_with_overlays_2021-08-08T193054.csv')
pop_2010_2020.head()
pop_2010_2020 = pop_2010_2020[~pop_2010_2020.DATE_CODE.str.contains('Census')]
pop_2010_2020 = pop_2010_2020[~pop_2010_2020.DATE_CODE.str.contains('base')]
pop_2010_2020.DATE_CODE = pop_2010_2020.DATE_CODE.str.slice(stop = 8)
pop_2010_2020.DATE_CODE = pop_2010_2020.DATE_CODE.str.slice(start = 4)
pop_2010_2020 = pop_2010_2020[~(pop_2010_2020.GEO_ID == 'id')] #filtering out header row
pop_2010_2020.GEO_ID = pop_2010_2020.GEO_ID.str.replace('0500000US','')
pop_2010_2020.rename(columns={"GEO_ID": "fips", "NAME": "county","DATE_CODE": "year", "POP": "population"}, inplace = True)
pop_2010_2020.fips = pd.to_numeric(pop_2010_2020.fips)
pop_2010_2020.year = pd.to_numeric(pop_2010_2020.year)
pop_2010_2020.population = pd.to_numeric(pop_2010_2020.population)


In [12]:
#merge the two population datasets together
population = pd.concat([pop_2000, pop_2010_2020])

#linearly interpolate missing years from 2000 to 2010
population_start = population[population.year ==2000][['fips','population']]
population_start.rename(columns={"population":"2000_pop"}, inplace = True)

population_end = population[population.year ==2010][['fips','population']]
population_end.rename(columns={"population":"2010_pop"}, inplace = True)

population_slope = population_start.merge(population_end, how = 'left', on = 'fips')
population_slope["slope"] = (pd.to_numeric(population_slope["2010_pop"]) - pd.to_numeric(population_slope["2000_pop"]))/10
population_slope

fips = pd.DataFrame(population.fips.unique(), columns = ['fips'])
years = pd.DataFrame({'year': [x for x in range(2001, 2010)]})

fips['key'] = 1
years['key'] = 1

interpolated_pop = pd.merge(fips, years, on ='key').drop("key", 1)
interpolated_pop = interpolated_pop.merge(population_slope, how = 'left', on = 'fips')
interpolated_pop["baseline_year"] = 2000
interpolated_pop["2000_pop"] = pd.to_numeric(interpolated_pop["2000_pop"])
interpolated_pop["2010_pop"]= pd.to_numeric(interpolated_pop["2010_pop"])
interpolated_pop["population"] = interpolated_pop["2000_pop"] + (interpolated_pop["year"]-interpolated_pop["baseline_year"])*interpolated_pop["slope"]

#15 missing fips due to 2000 population data not existing
print(interpolated_pop[interpolated_pop.population.isnull()].fips.nunique())
interpolated_pop = interpolated_pop[["fips","year","population"]]

population_w_estimates = pd.concat([population[["fips","year","population"]], interpolated_pop]) 

#df['key'] = 1
#df1['key'] = 1
#result = pd.merge(df, df1, on ='key').drop("key", 1)

15


# Preparing ZHVI Data and Aggregating to the FIPS Level

In [13]:
zhvi = zhvi.merge(pop_weights, how = 'left',left_on = 'zipcode',right_on = 'Zip Code ZCTA')
zhvi['zhvi_pop_prod'] = zhvi['Census Population 2010']*zhvi['zhvi']

In [14]:
#Filter to dates with well populated ZHVI data and GDP data
zhvi = zhvi[zhvi.year > 2000]
zhvi = zhvi[zhvi.year < 2020]
zhvi['missing'] = zhvi.zhvi.isnull()
print(zhvi[zhvi.missing == True]['Census Population 2010'].sum()/zhvi['Census Population 2010'].sum())

#make a list of zipcodes with partial data to exclude from the analysis
zipmissing = zhvi.groupby('zipcode', as_index = False).missing.mean()
print(zipmissing[zipmissing.missing > 0].shape[0] / zipmissing.shape[0])
zipmissing = zipmissing[zipmissing.missing>0].zipcode
zhvi = zhvi[~zhvi.zipcode.isin(zipmissing)]

0.05319918337346067
0.39723717491406707


In [15]:
#Aggregate ZHVI from the zipcode level up to the FIPS level, weighting by 2010 population
zhvi_fips_agg = zhvi.groupby(['fips','year'], as_index = False).agg({'zhvi': 'mean','zhvi_pop_prod': 'sum', 'Census Population 2010': 'sum'})
zhvi_fips_agg['zhvi_weighted'] = zhvi_fips_agg['zhvi_pop_prod'] / zhvi_fips_agg['Census Population 2010']
zhvi_fips_agg.fips = zhvi_fips_agg.fips.astype(int)


In [16]:
# population and therefore weighted ZHVI is missing for 1 FIPS -filter to only data that mapped to 2010 population data
print(zhvi_fips_agg[zhvi_fips_agg.zhvi_weighted.isnull()].fips.nunique())
zhvi_fips_agg.zhvi_weighted.isnull().sum()/zhvi_fips_agg.shape[0]
zhvi_fips_agg = zhvi_fips_agg[zhvi_fips_agg.zhvi_weighted.isnull()==False]

1


# Joining ZHVI Data to Other Data Sources

In [17]:
# Merge ZHVI data with GDP data.  2% of FIPS don't have matching GDP data, filter those out
df_agg = zhvi_fips_agg.merge(gdp_county, how = 'left', on = ['fips','year'])
print(df_agg["All industry total"].isnull().sum()/df_agg.shape[0])
df_agg = df_agg[df_agg["All industry total"].isnull()==False]

0.02125124131082423


In [18]:
# Merge ZHVI data with population data
df_agg = df_agg.merge(population_w_estimates, how = 'left', on = ['fips','year'])
print(df_agg["population"].isnull().sum()/df_agg.shape[0])
df_agg = df_agg[~df_agg["population"].isnull()]

0.00020292207792207794


In [19]:
employment

Unnamed: 0,laus,fips_state,fips_county,county,year,labor_force,employed,unemployed,unemployment_rate,fips
0,CN0100100000000,01,001,"Autauga County, AL",2000,21861,20971,890,4.1,1001
1,CN0100300000000,01,003,"Baldwin County, AL",2000,69979,67370,2609,3.7,1003
2,CN0100500000000,01,005,"Barbour County, AL",2000,11449,10812,637,5.6,1005
3,CN0100700000000,01,007,"Bibb County, AL",2000,8623,8160,463,5.4,1007
4,CN0100900000000,01,009,"Blount County, AL",2000,25266,24375,891,3.5,1009
...,...,...,...,...,...,...,...,...,...,...
64355,CN7214500000000,72,145,"Vega Baja Municipio, PR",2019,13172,11910,1262,9.6,72145
64356,CN7214700000000,72,147,"Vieques Municipio, PR",2019,2616,2433,183,7.0,72147
64357,CN7214900000000,72,149,"Villalba Municipio, PR",2019,7492,6307,1185,15.8,72149
64358,CN7215100000000,72,151,"Yabucoa Municipio, PR",2019,8840,7685,1155,13.1,72151


In [20]:
df_agg = df_agg.merge(employment[["fips","year","labor_force","employed","unemployed","unemployment_rate"]], how = 'left', on = ['fips','year'])
#employment

In [21]:
df_agg

Unnamed: 0,fips,year,zhvi,zhvi_pop_prod,Census Population 2010,zhvi_weighted,county,Accommodation and food services,Administrative and support and waste management and remediation services,"Arts, entertainment, and recreation",...,Natural resources and mining,Private goods-producing industries 2/,Private services-providing industries 3/,Trade,Transportation and utilities,population,labor_force,employed,unemployed,unemployment_rate
0,1001,2001,125288.666667,1.145516e+10,108787.0,105298.942429,"Autauga, AL",32162.0,,1662.0,...,16213.0,276566.0,375844.0,94895.0,32302.0,44781.2,22081.0,21166.0,915.0,4.1
1,1001,2002,127334.833333,1.162606e+10,108787.0,106869.923686,"Autauga, AL",32655.0,,1536.0,...,11951.0,245163.0,426077.0,102459.0,55406.0,45891.4,22161.0,21096.0,1065.0,4.8
2,1001,2003,129304.250000,1.196953e+10,108787.0,110027.238972,"Autauga, AL",35535.0,,1511.0,...,28183.0,227255.0,455589.0,112787.0,61079.0,47001.6,22695.0,21557.0,1138.0,5.0
3,1001,2004,132550.166667,1.228776e+10,108787.0,112952.424159,"Autauga, AL",36899.0,,1580.0,...,41858.0,283720.0,524450.0,119589.0,95693.0,48111.8,23241.0,22146.0,1095.0,4.7
4,1001,2005,141508.166667,1.296247e+10,108787.0,119154.617252,"Autauga, AL",35707.0,,1947.0,...,43221.0,296290.0,528930.0,126576.0,83560.0,49222,23887.0,22986.0,901.0,3.8
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
39411,56023,2015,228781.000000,1.077559e+08,471.0,228781.000000,"Lincoln, WY",,,,...,141433.0,205955.0,354929.0,53183.0,120341.0,18747,8343.0,7955.0,388.0,4.7
39412,56023,2016,233033.000000,1.097585e+08,471.0,233033.000000,"Lincoln, WY",,,,...,134235.0,203323.0,377571.0,55421.0,133049.0,19072,8725.0,8313.0,412.0,4.7
39413,56023,2017,240792.000000,1.134130e+08,471.0,240792.000000,"Lincoln, WY",15888.0,,2718.0,...,146892.0,223287.0,383581.0,,126567.0,19278,8834.0,8500.0,334.0,3.8
39414,56023,2018,235748.000000,1.110373e+08,471.0,235748.000000,"Lincoln, WY",15254.0,,2982.0,...,159387.0,232744.0,405279.0,55166.0,124692.0,19445,8889.0,8550.0,339.0,3.8


# Preparing data for export

In [48]:
# make a function to prepare the target feature desired (ie appreciation)
def target_prep(timeseries_df, start_year, lookforward, target_variable):
    baseline_df = timeseries_df[timeseries_df.year == start_year]
    lookforward_df = timeseries_df[timeseries_df.year == start_year + lookforward]
    df_out = baseline_df[["fips", target_variable]]
    df_out = df_out.merge(lookforward_df[['fips',target_variable]], how = 'left', on = 'fips')
    df_out[target_variable] = df_out.iloc[:, [2]].values/df_out.iloc[:, [1]].values
    df_out = df_out[["fips", target_variable]]
    df_out.columns = ['fips', target_variable + '_future_growth']
    return df_out

In [49]:
#function to calculate growth rate features over X year lookback

def growth_feature_prep(timeseries_df, start_year, look_back, cols_to_adj):
    baseline_df = timeseries_df[timeseries_df.year == start_year]
    lookback_df = timeseries_df[timeseries_df.year == start_year - look_back]

    df_out = baseline_df[["fips"]]
    
    for col in cols_to_adj:
        temp = baseline_df[['fips',col]]
        temp = temp.merge(lookback_df[['fips',col]], how = 'left', on = 'fips')
        temp[col] = temp.iloc[:, [1]].values/temp.iloc[:, [2]].values
        temp = temp[['fips',col]]
        temp.columns = ['fips', col + '_growth']
        df_out = df_out.merge(temp, how = 'left', on= 'fips')        
    return df_out


In [50]:
#function to calculate year over year growth rate features over X year lookback

def yoy_growth_feature_prep(timeseries_df, start_year, look_back, cols_to_adj):
    i = 0 
    df_out = timeseries_df[timeseries_df.year == start_year][["fips"]]
    
    while i < look_back:
        next_year_df = timeseries_df[timeseries_df.year == start_year - i]
        prev_year_df = timeseries_df[timeseries_df.year == start_year - i - 1]
        
        for col in cols_to_adj:
            temp = next_year_df[['fips',col]]
            temp = temp.merge(prev_year_df[['fips',col]], how = 'left', on = 'fips')
            temp[col] = temp.iloc[:, [1]].values/temp.iloc[:, [2]].values
            temp = temp[['fips',col]]
            col_name = col + "_" + str(i) + "_yoy__growth"
            temp.columns = ['fips', col_name]
            df_out = df_out.merge(temp, how = 'left', on= 'fips')
        i+=1
    return df_out


In [51]:
cols_to_adj = [#'   Accommodation and food services',
               #'   Administrative and support and waste management and remediation services',
               #'   Arts, entertainment, and recreation',
               #'   Durable goods manufacturing', 
               #'   Educational services',
               #'   Finance and insurance', 
               #'   Health care and social assistance',
               #'   Management of companies and enterprises',
               #'   Nondurable goods manufacturing',
               #'   Professional, scientific, and technical services',
               #'   Real estate and rental and leasing',
               #'  Agriculture, forestry, fishing and hunting',
               #'  Arts, entertainment, recreation, accommodation, and food services',
               '  Construction',
               #'  Educational services, health care, and social assistance',
               '  Finance, insurance, real estate, rental, and leasing',
               #'  Information', 
               '  Manufacturing',
               '  Mining, quarrying, and oil and gas extraction',
               #'  Other services (except government and government enterprises)',
               #'  Professional and business services', 
               '  Retail trade',
               #'  Transportation and warehousing', 
               #'  Utilities', 
               #'  Wholesale trade',
               ' Private industries', 
               'All industry total',
               'Government and government enterprises',
               #'Manufacturing and information', 
               #'Natural resources and mining',
               #'Private goods-producing industries 2/',
               #'Private services-providing industries 3/', 
               #'Trade',
               #'Transportation and utilities',
               'population', 'labor_force','employed','unemployed','unemployment_rate','zhvi_weighted'
]

In [52]:
fips_map = gdp_county.groupby(['fips','county'], as_index = False).year.count()
fips_map = fips_map[['fips','county']]

In [53]:
df_agg.columns

Index(['fips', 'year', 'zhvi', 'zhvi_pop_prod', 'Census Population 2010',
       'zhvi_weighted', 'county', '   Accommodation and food services',
       '   Administrative and support and waste management and remediation services',
       '   Arts, entertainment, and recreation',
       '   Durable goods manufacturing', '   Educational services',
       '   Finance and insurance', '   Health care and social assistance',
       '   Management of companies and enterprises',
       '   Nondurable goods manufacturing',
       '   Professional, scientific, and technical services',
       '   Real estate and rental and leasing',
       '  Agriculture, forestry, fishing and hunting',
       '  Arts, entertainment, recreation, accommodation, and food services',
       '  Construction',
       '  Educational services, health care, and social assistance',
       '  Finance, insurance, real estate, rental, and leasing',
       '  Information', '  Manufacturing',
       '  Mining, quarrying, and o

In [54]:
baseline_features = ['fips'
                    ,'Census Population 2010',
                     'zhvi_weighted',
                    #'   Accommodation and food services',
                    #'   Administrative and support and waste management and remediation services',
                    #'   Arts, entertainment, and recreation',
                    #'   Durable goods manufacturing', 
                    #'   Educational services',
                    #'   Finance and insurance', 
                    #'   Health care and social assistance',
                    #'   Management of companies and enterprises',
                    #'   Nondurable goods manufacturing',
                    #'   Professional, scientific, and technical services',
                    #'   Real estate and rental and leasing',
                    #'  Agriculture, forestry, fishing and hunting',
                    #'  Arts, entertainment, recreation, accommodation, and food services',
                    #'  Construction',
                    #'  Educational services, health care, and social assistance',
                    #'  Finance, insurance, real estate, rental, and leasing',
                    #'  Information', 
                    #'  Manufacturing',
                    #'  Mining, quarrying, and oil and gas extraction',
                    #'  Other services (except government and government enterprises)',
                    #'  Professional and business services', 
                    #'  Retail trade',
                    #'  Transportation and warehousing', 
                    #'  Utilities', 
                    #'  Wholesale trade',
                    ' Private industries', 
                    'All industry total',
                    'Government and government enterprises',
                    #'Manufacturing and information', 
                    #'Natural resources and mining',
                    #'Private goods-producing industries 2/',
                    #'Private services-providing industries 3/', 
                    #'Trade',
                     #'Transportation and utilities',
                     'population', 
                     'labor_force', 
                     'employed',
                     'unemployed', 
                     'unemployment_rate']

In [55]:
#Preparing Training & Testing Dataset
start_year = 2004
lookforward = 3
look_back = 3


df_target = target_prep(df_agg, start_year = start_year, lookforward = lookforward, target_variable = 'zhvi_weighted')
df_growth_features = growth_feature_prep(df_agg, start_year = start_year, look_back = look_back, cols_to_adj = cols_to_adj)
df_yoy_growth = yoy_growth_feature_prep(df_agg, start_year = start_year, look_back = look_back, cols_to_adj = cols_to_adj)
baseline_df = df_agg[df_agg.year == start_year]
df = df_target.merge(df_growth_features, how = 'left', on = 'fips')
df = df.merge(fips_map, how = 'left',on = 'fips')
df = df.merge(baseline_df[baseline_features], how = 'left',on = 'fips')
df = df.merge(df_yoy_growth, how = 'left',on = 'fips')


df_train_test = df

df_train_test.to_csv('df_train_test.csv', index = False)

  if sys.path[0] == '':
  if sys.path[0] == '':
  
  
  
  
  
  


In [56]:
# Preparing Validation Dataset
start_year = 2016
lookforward = 3
look_back = 3

df_target = target_prep(df_agg, start_year = start_year, lookforward = lookforward, target_variable = 'zhvi_weighted')
df_growth_features = growth_feature_prep(df_agg, start_year = start_year, look_back = look_back, cols_to_adj = cols_to_adj)
df_yoy_growth = yoy_growth_feature_prep(df_agg, start_year = start_year, look_back = look_back, cols_to_adj = cols_to_adj)
baseline_df = df_agg[df_agg.year == start_year]
df = df_target.merge(df_growth_features, how = 'left', on = 'fips')
df = df.merge(fips_map, how = 'left',on = 'fips')
df = df.merge(baseline_df[baseline_features], how = 'left',on = 'fips')
df = df.merge(df_yoy_growth, how = 'left',on = 'fips')

df_val = df

df_val.to_csv('df_validation.csv', index = False)

  if sys.path[0] == '':
  if sys.path[0] == '':
  
  
  
  
  
  


In [57]:
# Preparing Scoring Dataset
start_year = 2019
#lookforward = 3
look_back = 3

#df_target = target_prep(df_agg, start_year = start_year, lookforward = lookforward, target_variable = 'zhvi_weighted')
df_growth_features = growth_feature_prep(df_agg, start_year = start_year, look_back = look_back, cols_to_adj = cols_to_adj)
df_yoy_growth = yoy_growth_feature_prep(df_agg, start_year = start_year, look_back = look_back, cols_to_adj = cols_to_adj)
baseline_df = df_agg[df_agg.year == start_year]
#df = df_target.merge(df_growth_features, how = 'left', on = 'fips')
df = df_growth_features.merge(fips_map, how = 'left',on = 'fips')
df = df.merge(baseline_df[baseline_features], how = 'left',on = 'fips')
df = df.merge(df_yoy_growth, how = 'left',on = 'fips')

df_score = df

df_score.to_csv('df_score.csv', index = False)


  if sys.path[0] == '':
  if sys.path[0] == '':
  
  
  
  
  
  


In [58]:
percent_missing = df_train_test.isnull().sum() * 100 / len(df_train_test)
missing_value_df = pd.DataFrame({'column_name': df_train_test.columns,
                                 'percent_missing': percent_missing})
missing_value_df.sort_values('percent_missing', inplace=True, ascending = False)
missing_value_df.head(50)

Unnamed: 0,column_name,percent_missing
"Mining, quarrying, and oil and gas extraction_1_yoy__growth","Mining, quarrying, and oil and gas extractio...",17.261331
"Mining, quarrying, and oil and gas extraction_0_yoy__growth","Mining, quarrying, and oil and gas extractio...",17.020251
"Mining, quarrying, and oil and gas extraction_2_yoy__growth","Mining, quarrying, and oil and gas extractio...",16.730955
"Mining, quarrying, and oil and gas extraction_growth","Mining, quarrying, and oil and gas extractio...",15.33269
Construction_growth,Construction_growth,8.919961
Construction_2_yoy__growth,Construction_2_yoy__growth,8.630665
Construction_1_yoy__growth,Construction_1_yoy__growth,8.534233
Construction_0_yoy__growth,Construction_0_yoy__growth,7.955641
Manufacturing_growth,Manufacturing_growth,6.460945
Manufacturing_1_yoy__growth,Manufacturing_1_yoy__growth,6.412729


In [59]:
df_agg[df_agg.county.str.contains('San Fran')].groupby(['year']).agg({'zhvi_weighted':'sum'}).to_csv('sf_growth.csv')

# QA

In [67]:
# Check that values in the validation dataset match expectations vs the raw data
#Looks good for GDP data comparing 2016 to 2013
df_val[["fips","zhvi_weighted_growth"]]

Unnamed: 0,fips,zhvi_weighted_growth
0,1001,1.077572
1,1003,1.127344
2,1005,1.291814
3,1007,1.048176
4,1009,1.066646
...,...,...
2070,55131,1.147973
2071,55133,1.123654
2072,56001,1.220023
2073,56021,1.143621


In [47]:
df_train_test.head()

Unnamed: 0,fips,zhvi_weightedfuture_growth,Construction_growth,"Finance, insurance, real estate, rental, and leasing_growth",Manufacturing_growth,"Mining, quarrying, and oil and gas extraction_growth",Retail trade_growth,Private industries_growth,All industry total_growth,Government and government enterprises_growth,...,Retail trade_2_yoy__growth,Private industries_2_yoy__growth,All industry total_2_yoy__growth,Government and government enterprises_2_yoy__growth,population_2_yoy__growth,labor_force_2_yoy__growth,employed_2_yoy__growth,unemployed_2_yoy__growth,unemployment_rate_2_yoy__growth,zhvi_weighted_2_yoy__growth
0,1001,1.209146,0.694951,1.236581,1.01648,1.691146,1.244984,1.238746,1.264443,1.438165,...,1.076243,1.028862,1.044089,1.147046,1.02479,1.003623,0.996693,1.163934,1.170732,1.014919
1,1003,1.457189,1.226898,1.285057,1.199572,1.305015,1.338502,1.292162,1.290783,1.281135,...,1.06726,1.078431,1.075456,1.054643,1.02951,0.997269,0.990022,1.156746,1.162791,1.022033
2,1005,1.195766,1.099945,1.144663,1.291125,0.992997,1.242456,1.206069,1.19244,1.107652,...,0.97908,1.006969,1.007976,1.014238,0.994073,0.971918,0.970004,0.995327,1.013158,1.007245
3,1007,1.169099,1.897191,1.109655,1.194022,0.859638,1.064291,1.242564,1.224252,1.169466,...,1.00135,1.05097,1.050479,1.04901,1.00972,0.98106,0.978856,1.011272,1.029412,1.02664
4,1009,1.127905,1.086735,1.14616,,1.982682,1.023986,1.113936,1.126677,1.205297,...,1.068311,1.02283,1.024953,1.038054,1.0123,1.015167,0.996534,1.506997,1.459459,1.03924


In [61]:
gdp_qa = pd.read_csv('GDP/CAGDP2__ALL_AREAS_2001_2019.csv', skipfooter = 4)
gdp_qa[(gdp_qa.GeoFIPS.str.contains("1001"))][["Description","2001","2004","2013","2016"]]

  """Entry point for launching an IPython kernel.


Unnamed: 0,Description,2001,2004,2013,2016
68,All industry total,748918,946964,1457702,1716235
69,Private industries,652410,808170,1239612,1467403
70,"Agriculture, forestry, fishing and hunting",14756,39394,28355,22791
71,"Mining, quarrying, and oil and gas extraction",1457,2464,14681,17077
72,Utilities,19515,81339,376088,456310
...,...,...,...,...,...
97609,Trade,62825,69162,92745,97884
97610,Transportation and utilities,17624,22520,(D),(D)
97611,Manufacturing and information,921317,654996,1095804,490533
97612,Private goods-producing industries 2/,1019309,817522,1261385,617639


In [62]:
zhvi_zip= pd.read_csv('ZHVI/Zip_zhvi_uc_sfrcondo_tier_0.33_0.67_sm_sa_month.csv')

In [63]:
zipcodes_in_01001 = zip_mapping[zip_mapping.fips == 1001].zipcode.values

In [64]:
zhvi_zip = zhvi_zip[zhvi_zip.RegionName.isin(zipcodes_in_01001)]

In [65]:
zhvi_zip = zhvi_zip.merge(pop_weights, how = 'left', left_on = 'RegionName', right_on = 'Zip Code ZCTA')

In [70]:
zhvi_zip['weighed_zhvi_2019'] = zhvi_zip['2019-01-31']*zhvi_zip['Census Population 2010']
zhvi_zip['weighed_zhvi_2016'] = zhvi_zip['2016-01-31']*zhvi_zip['Census Population 2010']
#zhvi_zip['weighed_zhvi_2013'] = zhvi_zip['2013-01-31']*zhvi_zip['Census Population 2010']
zhvi_zip['weighed_zhvi_2019'].sum() / zhvi_zip['weighed_zhvi_2016'].sum()

#1.082575

1.0825746902163775

In [71]:
zhvi_zip['weighed_zhvi_2007'] = zhvi_zip['2007-01-31']*zhvi_zip['Census Population 2010']
zhvi_zip['weighed_zhvi_2004'] = zhvi_zip['2004-01-31']*zhvi_zip['Census Population 2010']
zhvi_zip['weighed_zhvi_2007'].sum() / zhvi_zip['weighed_zhvi_2004'].sum()

1.209146

1.209146