In [24]:
import pandas as pd
import numpy as np
from datetime import datetime
import seaborn as sns
import matplotlib.pyplot as plt
from statsmodels.tsa.seasonal import seasonal_decompose
from sklearn.preprocessing import StandardScaler, MinMaxScaler


In [25]:
#housing price index (2015 baseline)
hpi = pd.read_csv('./data/data_raw/home_price_index.csv')


# HPI

In [26]:
def snake_case(dataset):
    dataset.columns = [x.lower().replace(' ', '_') for x in dataset.columns]
    return

In [27]:
#before applying snake_case the duplicates and unnecessary columns need to be dropped
hpi = hpi.drop(columns = ['TL', 'Territory Level and Typology', 'REG_ID', 'VAR', 'Variable', 'VINTAGE', 'Vintage', 
                          'DWELLINGS', 'Dwellings', 'MEASURE', 'Frequency', 'FREQ', 'TIME', 'Unit Code', 'Unit', 'Reference Period Code', 'PowerCode Code', 'PowerCode', 'Reference Period', 'Flag Codes', 'Flags'])

In [28]:
snake_case(hpi)

In [29]:
#converting Q1... notation into dates
quarterly = {'Q1': '01-01',
            'Q2' : '03-01',
            'Q3' : '06-01',
            'Q4' : '09-01'}

hpi['time'] = hpi['time'].replace(to_replace = quarterly, regex = True)

hpi['time'] = hpi['time'].astype('datetime64[ns]')


In [30]:
hpi.head()

Unnamed: 0,region,measure,time,value
0,United States,Index publication base,2010-01-01,91.332
1,United States,Index publication base,2010-03-01,90.506
2,United States,Index publication base,2010-06-01,91.385
3,United States,Index publication base,2010-09-01,90.733
4,United States,Index publication base,2011-01-01,88.219


In [31]:
#the time series is repeated for each 'measure'. Took measure vals and condensed it all into 1 df.

#py
pct_chg_py = hpi[hpi['measure'] == 'Percentage change on the same period of the previous year'].reset_index()
hpi['pct_chg_py'] = pct_chg_py['value']

#pp
pct_chg_pp = hpi[hpi['measure'] == 'Percentage change from previous period'].reset_index()
hpi['pct_chg_pp'] = pct_chg_pp['value']

In [32]:
#adding the values of py and pp onto original hpi dataset
hpi = hpi[hpi['measure'] == 'Index publication base']
hpi = hpi.rename(columns = {'value':'hpi'})


In [33]:
#removing '(USA)' from region
hpi['region'] = hpi['region'].map(lambda x: x.replace(' (USA)', ''))

In [34]:
hpi.head()

Unnamed: 0,region,measure,time,hpi,pct_chg_py,pct_chg_pp
0,United States,Index publication base,2010-01-01,91.332,-7.013,-1.196
1,United States,Index publication base,2010-03-01,90.506,-5.37,-0.904
2,United States,Index publication base,2010-06-01,91.385,-1.887,0.971
3,United States,Index publication base,2010-09-01,90.733,-1.844,-0.714
4,United States,Index publication base,2011-01-01,88.219,-3.408,-2.77


In [35]:
# grouping data by state

def by_state(dataframe, metric):
    df = pd.DataFrame()
    df['time'] = dataframe['time'][:48]
    for x in hpi['region'].unique():
        df1 = hpi[hpi['region'] == x]
        df[x] = df1[metric].reset_index(drop = True)
        
    df.columns = df.columns.str.lower().str.replace(' ', '_')
    # df = df.drop(columns = )
       
    return df

In [36]:
hpi = by_state(hpi, 'hpi').set_index('time')
# hpi_py = by_state(hpi, 'pct_chg_py').set_index('time')
# hpi_pp = by_state(hpi, 'pct_chg_pp').set_index('time')

In [37]:
hpi.head()

Unnamed: 0_level_0,united_states,alabama,alaska,arizona,arkansas,california,colorado,connecticut,delaware,district_of_columbia,...,south_dakota,tennessee,texas,utah,vermont,virginia,washington,west_virginia,wisconsin,wyoming
time,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
2010-01-01,91.332,101.43,89.961,84.858,95.728,78.345,80.626,105.04,103.837,75.459,...,87.739,92.851,82.999,87.836,98.93,96.438,90.661,95.1,99.374,91.333
2010-03-01,90.506,99.937,90.753,81.897,96.025,78.116,79.687,103.88,102.746,75.779,...,87.611,92.425,83.055,86.568,98.661,95.8,89.618,94.891,98.809,91.768
2010-06-01,91.385,101.275,91.293,81.76,96.361,78.542,80.685,105.048,102.661,76.975,...,88.997,93.753,83.576,87.309,98.845,96.783,89.598,95.457,99.847,91.652
2010-09-01,90.733,100.441,91.516,79.732,95.927,77.678,80.327,105.231,101.882,76.914,...,89.07,93.078,83.209,86.843,99.197,96.252,88.005,96.195,99.213,91.758
2011-01-01,88.219,97.81,91.477,75.258,94.394,74.909,78.303,102.508,99.355,75.875,...,88.566,90.856,82.003,82.986,98.501,94.006,85.055,94.079,97.024,90.507


In [None]:
# hpi_hpi.to_csv('./data/hpi_state.csv')
# hpi_py.to_csv('./data/hpi_state_py.csv')
# hpi_pp.to_csv('./data/hpi_state_pp.csv')

#### Aligning pop and hpi datasets to create per cap hpi data (DEPRECATED)

In [None]:
#converting objects to floats
# def mod_pop(df):
#     df.columns = df.columns.str.lower().str.replace(' ', '_')
#     df = df.drop(columns = ['puerto_rico']) 
#     return df

In [439]:
# pop1 = pop.copy()
# pop1 = mod_pop(pop1)

In [440]:
#removing 04 months in year
# pop1 = pop1.drop(index= ['2010-04-01','2020-04-01'])

In [441]:
# pop1 = pop1.resample('Q').ffill()

In [442]:
# def per_cap(df):
#     #starts at 2010-09-01 to match population data
#     start = (df.index > '2010-06-01')
#     df = df.loc[start]
#     #drop US from hpi
#     df = df.drop(columns = ['united_states'])
#     pop1.index = df.index
#     percap = df.div(pop1)
#     return percap

In [443]:
# percap_hpi = per_cap(hpi_hpi)
# percap_hpi_pp = per_cap(hpi_pp)
# percap_hpi_py = per_cap(hpi_py)

In [453]:
# percap_hpi.to_csv('./data/percap_hpi.csv')
# percap_hpi_pp.to_csv('./data/percap_hpi_pp.csv')
# percap_hpi_py.to_csv('./data/percap_hpi_py.csv')