# Hedonic model

A hedonic model of rents in Los Angeles.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import shapely
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import statsmodels.api as sm
import geopandas as gp
from geoalchemy2 import Geography
import sqlalchemy as sq

In [None]:
# Load one row per renter household (serial) from the IPUMS extract.
# The query keeps rented units with gross rent above 200 and drops
# mobile homes/trailers and boats/tents/vans.
ipums_db_uri = 'sqlite:////Volumes/Pheasant Ridge/IPUMS/scag_sorting_5yr/scag_sorting_5yr.db'
renter_household_query = '''
SELECT
    serial,
    -- they are all the same within a hh, MIN just selects the value
    MIN(rentgrs) AS rentgrs,
     MIN(bedrooms) AS bedrooms,
     MIN(builtyr2) AS builtyr2,
     MIN(unitsstr) AS unitsstr,
     MIN(puma) AS puma
FROM ipums
WHERE numprec >= 1
AND ownershp = 'Rented'
AND rentgrs > 200
AND unitsstr NOT IN ('Mobile home or trailer', 'Boat, tent, van, other')
GROUP BY serial
'''
data = pd.read_sql(renter_household_query, ipums_db_uri)

In [None]:
# builtyr2 labels that denote construction in 2000 or later.
built_2000_or_later_codes = [
    '2000-2004 (1999-2002 in the 2000-2002 ACS)',
    '2005 (2005 or later in datasets containing 2005, 2006, or 2007 ACS/PRCS data)',
    '2006',
    '2007',
    '2008',
    '2009',
    '2010',
    '2011',
    '2012',
    '2013',
    '2014',
    '2015',
    '2016',
    '2017',
]
data['built_2000_or_later'] = data['builtyr2'].isin(built_2000_or_later_codes)

In [None]:
# Collapse the raw bedroom labels into four categories: Studio, 1, 2 and 3+.
bedroom_categories = {'No bedrooms': 'Studio', '1': '1', '2': '2'}
three_plus_labels = [
    '3',
    '4 (1970-2000, 2000-2007 ACS/PRCS)',
    '5+ (1970-2000, 2000-2007 ACS/PRCS)',
    '7',
    '9',
    '10',
]
bedroom_categories.update(dict.fromkeys(three_plus_labels, '3+'))

data['bedcat'] = data['bedrooms'].map(bedroom_categories).astype('category')
# Any label missing from the mapping becomes NaN; fail fast if that happens.
assert data['bedcat'].notnull().all()

In [None]:
# Inspect the distinct unitsstr labels present in the sample.
data['unitsstr'].unique()

In [None]:
# Flag single-family homes (detached or attached).
single_family_labels = ['1-family house, detached', '1-family house, attached']
data['sfh'] = data['unitsstr'].isin(single_family_labels)

In [None]:
# Dependent variable of the hedonic model: natural log of gross rent.
data['lnrentgrs'] = np.log(data['rentgrs'])

In [None]:
# Estimation matrix: dummy-encode bedroom category and PUMA.
model_columns = ['lnrentgrs', 'built_2000_or_later', 'bedcat', 'puma', 'sfh']
est_data = pd.get_dummies(data[model_columns], columns=['bedcat', 'puma'])
# Studio is the omitted (reference) bedroom category.
est_data = est_data.drop(columns='bedcat_Studio')

In [None]:
# OLS of log rent on unit characteristics and PUMA dummies.
# No intercept is added: each PUMA dummy serves as that PUMA's intercept.
endog = est_data['lnrentgrs']
exog = est_data.drop(columns=['lnrentgrs']).astype('float64')
mod = sm.OLS(endog, exog)
fit = mod.fit(cov_type='HC3')  # heteroskedasticity-robust (HC3) covariance
fit.summary()

In [None]:
# Residual variance estimate, SSR / (n - k). Half of this value is added to
# predicted log rents before exponentiating (lognormal retransformation
# correction).
# https://www.r-bloggers.com/2013/08/forecasting-from-log-linear-regressions/
# The vectorised Series.sum() avoids a Python-level loop over every residual.
varnk = (fit.resid ** 2).sum() / (fit.nobs - len(fit.params))
varnk

In [None]:
# Build the regression table for the non-location coefficients. The PUMA
# fixed effects are summarised by their range instead of being listed.
def _significance_stars(p):
    """Return the star marker for a p-value: *** < 0.001, ** < 0.01, * < 0.05."""
    if p < 0.001:
        return '***'
    if p < 0.01:
        return '**'
    if p < 0.05:
        return '*'
    return ''


non_fe_terms = [c for c in fit.params.index if not c.startswith('puma')]
res = pd.DataFrame({
    'Coefficient': fit.params,
    'Std. Err.': fit.bse,
    't': fit.tvalues,
    'p': fit.pvalues,
}).loc[non_fe_terms].round(2)

# Stars are derived from the unrounded p-values rather than hard-coded as
# '***' for every row, so a non-significant coefficient is never starred.
stars = fit.pvalues[non_fe_terms].map(_significance_stars)
res['Coefficient'] = res['Coefficient'].astype('str') + stars

res = res.rename(index={
    'built_2000_or_later': 'Built in 2000 or later',
    'sfh': 'Single-family home',
    'bedcat_1': '1 bedroom',
    'bedcat_2': '2 bedroom',
    # The category covers three or more bedrooms, not exactly three.
    'bedcat_3+': '3+ bedroom',
})

# Summary rows: fixed-effect range, fit statistics, sample size and footnote.
fes = fit.params[[i for i in fit.params.index if i.startswith('puma_')]]
res.loc['PUMA fixed effects', 'Coefficient'] = f'{fes.min().round(2)}–{fes.max().round(2)}'
res.loc['R-squared', 'Coefficient'] = fit.rsquared.round(2)
res.loc['Adj. R-squared', 'Coefficient'] = fit.rsquared_adj.round(2)
res.loc['Sample size', 'Coefficient'] = f'{fit.nobs:,.0f}'
res.loc['', 'Coefficient'] = 'Standard errors are heteroskedasticity-robust (HC3) \\autocite[300]{angrist_mostly_2009}'
# The summary rows have no Std. Err./t/p values; show them as blanks.
res = res.fillna('')

In [None]:
# Display the assembled regression table for inspection.
res

In [None]:
# Print the table as LaTeX. The column-width limit is lifted for the call and
# escaping is off so the \autocite command in the footnote row is emitted verbatim.
with pd.option_context('display.max_colwidth', None):
    latex_table = res.to_latex(escape=False)
    print(latex_table)

In [None]:
# TODO: the read_file call below throws an error - figure this out later
#pumas = gp.read_file('/Volumes/Pheasant Ridge/IPUMS/pumas/socal_pumas_projected.shp')

In [None]:
# One row per PUMA fixed effect; the PUMA code is the dummy column name
# without its 'puma_' prefix.
fedf = fes.rename('fixed_effect').reset_index()
fedf['puma'] = fedf['index'].str[len('puma_'):]

In [None]:
#pumas = pumas.merge(fedf, left_on='PUMA', right_on='puma', validate='m:1', how='inner')

In [None]:
#pumas.plot(column='fixed_effect', legend=True)

In [None]:
#pumas.to_file('../data/fixed_effects.shp')

In [None]:
# Connection string and SQLAlchemy engine for the local PostgreSQL database
# that holds the diss.* tables read and written in the cells below.
DB_URI = 'postgresql://matthewc@localhost:5432/matthewc'
engine = sq.create_engine(DB_URI)

### Prediction

In [None]:
# Building prototypes with the PUMA of their parcel (gid), taken from diss.gp16.
prototype_puma_query = '''
SELECT b.gid, prototype, p.puma FROM diss.building_costs b
    LEFT JOIN diss.gp16 p
    ON (p.gid = b.gid);
'''
new_pred_data = pd.read_sql(prototype_puma_query, DB_URI)

In [None]:
# Unit mix of each building prototype, given as the bedroom category of every
# unit. Every unit is treated as built in 2000 or later, and only the 'sfh'
# prototype is flagged as a single-family home.
prototype_bedcats = {
    'sfh': ['3+'],
    'duplex': ['2', '2'],
    'threeplex': ['1', '2', '2'],
    'sixplex': ['3+', '2', '2', '2', '1', '1'],
}

# One DataFrame per prototype, one row per unit, with the model's regressors.
specifications = {
    proto: pd.DataFrame({
        'built_2000_or_later': [True] * len(bedcats),
        'sfh': [proto == 'sfh'] * len(bedcats),
        'bedcat': bedcats,
    })
    for proto, bedcats in prototype_bedcats.items()
}

In [None]:
# Non-location part of predicted log rent for every unit of every prototype:
# the sum of the fitted coefficients for the unit's characteristics.
unit_log_rents = {}
for proto, units in specifications.items():
    dummies = pd.get_dummies(units, columns=['bedcat'])
    coefs = fit.params[dummies.columns]
    unit_log_rents[proto] = (dummies * coefs).sum(axis=1)

# Stack into one long table keyed by prototype and unit number.
stacked = pd.concat(list(unit_log_rents.values()), keys=list(unit_log_rents.keys()))
non_location_rents = (
    pd.DataFrame(stacked)
    .reset_index()
    .rename(columns={'level_0': 'prototype', 'level_1': 'unit', 0: 'non_location_rent'})
)
non_location_rents

In [None]:
# Constant key column used to pair every prototype unit with every PUMA.
non_location_rents = non_location_rents.assign(tr=True)

In [None]:
# Matching constant key column on the fixed-effects table.
fedf = fedf.assign(tr=True)

In [None]:
# 'tr' is constant in both frames, so this merge pairs every prototype unit
# with every PUMA fixed effect.
puma_unit_rents = pd.merge(non_location_rents, fedf, how='left', on='tr')

In [None]:
# Predicted rent per unit: exponentiate the predicted log rent plus varnk / 2,
# the correction factor for a lognormally distributed error term.
# https://www.r-bloggers.com/2013/08/forecasting-from-log-linear-regressions/
predicted_log_rent = (
    puma_unit_rents['non_location_rent']
    + puma_unit_rents['fixed_effect']
    + varnk / 2
)
puma_unit_rents['rent'] = np.exp(predicted_log_rent)

# Total rent of a prototype in a PUMA is the sum over its units.
puma_prototype_rents = (
    puma_unit_rents
    .groupby(['prototype', 'puma'])['rent']
    .sum()
    .rename('total_rent')
    .reset_index()
)
puma_prototype_rents

In [None]:
# Store the prototype rents as diss.puma_prototype_rents for the SQL update below.
puma_prototype_rents.to_sql(name='puma_prototype_rents', con=DB_URI, schema='diss')

In [None]:
# Attach the predicted total rent of each prototype to diss.building_costs.
# engine.begin() commits on success and rolls back on error; a plain
# engine.connect() block is never committed under SQLAlchemy 2.0.
# Statements are wrapped in sq.text() because raw SQL strings are not
# executable there, and each statement is executed separately.
# NOTE(review): the UPDATE assumes diss.building_costs has its own puma
# column - confirm against the table definition.
with engine.begin() as con:
    con.execute(sq.text(
        'ALTER TABLE diss.building_costs ADD COLUMN total_rent DOUBLE PRECISION;'
    ))
    con.execute(sq.text('''
        UPDATE diss.building_costs b SET total_rent = r.total_rent
                FROM diss.puma_prototype_rents r
                WHERE b.puma = r.puma AND b.prototype = r.prototype;
    '''))

## Rents for existing buildings

In [None]:
# Store the PUMA fixed effects as diss.puma_fixed_effects for the SQL update below.
fixed_effects_table = fedf[['puma', 'fixed_effect']]
fixed_effects_table.to_sql(name='puma_fixed_effects', con=DB_URI, schema='diss')

In [None]:
# Predict rents for existing buildings with land use code 'RR101' directly in
# the database, using the fitted coefficients as bind parameters.
# Only the coefficients the statement references are passed (not every PUMA
# dummy), as a single parameter dict: keyword bind parameters on
# Connection.execute were removed in SQLAlchemy 2.0.
coefficient_binds = {
    'built_2000_or_later': 'built_2000_or_later',
    'sfh': 'sfh',
    'bedcat_1': 'bedcat_1',
    'bedcat_2': 'bedcat_2',
    'bedcat_3+': 'bedcat_3plus',  # '+' is not valid in a bind parameter name
}
rent_params = {
    bind: float(fit.params[coef]) for coef, bind in coefficient_binds.items()
}
# Correction factor for the lognormally distributed error term.
rent_params['correction_factor'] = float(varnk / 2)

# engine.begin() commits on success and rolls back on error; a plain
# engine.connect() block is never committed under SQLAlchemy 2.0.
with engine.begin() as con:
    con.execute(sq.text('ALTER TABLE diss.gp16 ADD COLUMN total_rent double precision;'))
    con.execute(sq.text('''
    UPDATE diss.gp16 a SET total_rent = EXP(
        f.fixed_effect
        + CASE WHEN Building_YearBuilt >= 2000 THEN :built_2000_or_later ELSE 0 END
        + :sfh
        + CASE
            WHEN Building_TotalBedrooms = 1 THEN :bedcat_1
            WHEN Building_TotalBedrooms = 2 THEN :bedcat_2
            WHEN Building_TotalBedrooms >= 3 then :bedcat_3plus
            ELSE 0
        END
        + :correction_factor
    )
    FROM diss.puma_fixed_effects f
    WHERE a.building_propertylandusestndcode = 'RR101'
        AND f.PUMA = a.puma;
    '''), rent_params)