# Property sales

Given that a property is profitable to redevelop, what is the likelihood it actually will be? Build a logistic model of property sales and an auction model of developers and homeowners bidding on a property.

In [None]:
%matplotlib inline
%config InlineBackend.figure_format = 'retina'

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib
import statsmodels.api as sm
import scipy.stats
import geopandas as gp
import libpysal.weights
from glob import glob
from census import Census
from os import environ
import matplotlib.patches as mpatch
import textwrap

DB_URI = 'postgresql://matthewc@localhost/matthewc'

In [None]:
capi = Census(environ['CENSUS_API_KEY'])

In [None]:
# Load parcels with land-use code RR101 that sit in single-family / rural residential zoning
# and have both a ZTRAX parcel ID and a PUMA. `sold` is True when the parcel appears in the
# transaction table with a recording year of 2013-2017.
orig_sales = pd.read_sql('''
    WITH trans_hist AS (SELECT DISTINCT "PropertyInfo_ImportParcelID" 
        FROM diss.ztrans z
        WHERE SUBSTRING(z."Main_RecordingDate", 1, 4) IN ('2013', '2014', '2015', '2016', '2017'))
    SELECT gid, county, clean_apn, building_yearbuilt, building_totalbedrooms, building_noofunits, building_totalcalculatedbathcount,
    puma, ST_Area(geog) AS area_sqm, h."PropertyInfo_ImportParcelID" IS NOT NULL AS sold, scag_zn_co, building_propertylandusestndcode
    FROM diss.gp16 p
        LEFT JOIN trans_hist h ON (p.Main_ImportParcelID = h."PropertyInfo_ImportParcelID")
        
        WHERE p.scag_zn_co IN (
            '1110', -- single family residential
            '1111', -- high dens SF residential
            '1112', -- med dens SF residential
            '1113', -- low dens SF residential
            '1150'  -- rural residential`
        )
        AND p.building_propertylandusestndcode = 'RR101'
        AND NOT Main_ImportParcelID IS NULL
        AND puma IS NOT NULL
        
''', DB_URI)

In [None]:
orig_sales.county.value_counts()

In [None]:
orig_sales.loc[orig_sales.county == 'Imperial'].head()

In [None]:
orig_sales.loc[orig_sales.county == 'Imperial', ['building_totalbedrooms', 'building_totalcalculatedbathcount']].value_counts()

In [None]:
orig_len = len(orig_sales)

In [None]:
orig_sales[(orig_sales.building_noofunits <= 1) | orig_sales.building_noofunits.isnull()].county.value_counts()

In [None]:
# Keep parcels with at most one unit (or an unknown unit count)
single_unit = orig_sales.building_noofunits.isnull() | orig_sales.building_noofunits.le(1)

# allow nan year builts in Imperial County, average will just get absorbed into fixed effects
# There is a single PUMA, 02500, that covers all of Imperial County and nothing else
year_known_or_imperial = orig_sales.building_yearbuilt.notnull() | orig_sales.county.eq('Imperial')

# Bedroom and bathroom counts are model covariates, so rows missing either are dropped
sales = orig_sales.loc[single_unit & year_known_or_imperial].dropna(
    subset=['building_totalbedrooms', 'building_totalcalculatedbathcount']
)

In [None]:
len(sales) / orig_len

In [None]:
assert (sales.loc[sales.building_yearbuilt.isnull(), 'county'] == 'Imperial').all()

In [None]:
sales['building_yearbuilt'] = sales.building_yearbuilt.fillna(0) # will wind up in base category in model, PUMA fixed effect will account for average

In [None]:
sales.county.value_counts()

In [None]:
assert sales.puma.value_counts().min() > 4 # avoid convergence problems with fixed effects

In [None]:
# Logistic model of whether an existing single-family home sold, with PUMA fixed effects.
# Year built is binned into eras and bedrooms are top-coded at four.
sales['yrbltcat'] = pd.cut(sales.building_yearbuilt, [-np.inf, 1940, 1960, 1980, 2000, np.inf])
sales['beds'] = np.minimum(sales.building_totalbedrooms, 4)
exog = sales[['yrbltcat', 'beds', 'building_totalcalculatedbathcount', 'puma', 'area_sqm']]
# dtype=float: pandas >= 2 emits bool dummies by default, and a frame mixing bool and float
# columns is cast to object dtype, which statsmodels refuses to fit.
# Dropped columns are the reference categories: PUMA 07104, built 1940 or earlier, two bedrooms.
exog = sm.add_constant(
    pd.get_dummies(exog, columns=['yrbltcat', 'beds', 'puma'], dtype=float)
    .drop(columns=['puma_07104', 'yrbltcat_(-inf, 1940.0]', 'beds_2'])
)
mod = sm.Logit(sales.sold, exog)
fit = mod.fit()
fit.summary()

In [None]:
ex_sale_prob = pd.Series(fit.predict(), index=sales.gid)

## Vacant property sale model

In [None]:
# Load vacant parcels (land-use code VL101) in single-family / rural residential zoning,
# flagging whether each had a recorded transaction in 2013-2017.
# The comma after building_propertylandusestndcode is required: without it SQL treats `puma`
# as an alias for the land-use column and the real PUMA code is never returned.
vac = pd.read_sql('''
    WITH trans_hist AS (SELECT DISTINCT "PropertyInfo_ImportParcelID" 
        FROM diss.ztrans z
        WHERE SUBSTRING(z."Main_RecordingDate", 1, 4) IN ('2013', '2014', '2015', '2016', '2017'))
    SELECT gid, clean_apn, county, building_propertylandusestndcode,
    puma, ST_Area(geog) AS area_sqm, h."PropertyInfo_ImportParcelID" IS NOT NULL AS sold
    FROM diss.gp16 p
        LEFT JOIN trans_hist h ON (p.Main_ImportParcelID = h."PropertyInfo_ImportParcelID")
        
        WHERE p.scag_zn_co IN (
            '1110', -- single family residential
            '1111', -- high dens SF residential
            '1112', -- med dens SF residential
            '1113', -- low dens SF residential
            '1150'  -- rural residential
        )
        AND p.building_propertylandusestndcode = 'VL101'
        AND NOT Main_ImportParcelID IS NULL
        AND puma IS NOT NULL
''', DB_URI)

In [None]:
vac.county.value_counts()

In [None]:
puma_vcs = vac.puma.value_counts()
# lower bar here than in sales since there are fewer properties than property sales
pumas_to_merge = puma_vcs[puma_vcs < 5].index
pumas_to_merge

In [None]:
pumas = gp.read_file('/Volumes/Pheasant Ridge/IPUMS/pumas/socal_pumas_projected.shp').dissolve('PUMA')

In [None]:
queen_weights = libpysal.weights.Queen(pumas.geometry, ids=pumas.index.to_list()).to_adjlist().set_index('focal')

In [None]:
# merge with largest adjacent
# Each PUMA with too few vacant parcels is folded into whichever queen-contiguous neighbor
# has the most vacant parcels.
merges = {}
for puma in pumas_to_merge:
    # A list selector always returns a Series of neighbors. With a scalar label, a PUMA that
    # has exactly one neighbor would yield a single string, and the comprehension below would
    # then iterate over its characters.
    candidates = queen_weights.loc[[puma], 'neighbor']
    candidates = [i for i in candidates if i in puma_vcs.index]
    if len(candidates) == 0:
        print(f'no neighbors for {puma}!')
    else:
        neighbor = puma_vcs.loc[candidates].idxmax()
        merges[puma] = neighbor
# these don't end up merged into large enough pumas
merges['03710'] = '03738'
merges['03717'] = '03738'
merges['03738'] = '03738'
merges['03753'] = '03738'

In [None]:
vac['merged_puma'] = vac.puma.replace(merges)

In [None]:
# check for perfect prediction
# A merged PUMA where every parcel sold (or none did) has no finite fixed effect in the logit,
# so it is folded into a neighbor with a different sale rate.
pp = vac.groupby('merged_puma').sold.mean()
pumas_to_merge = pp[(pp == 1) | (pp == 0)].index

pp_merges = {}

# find adjacent PUMAs without the same problem
for puma in pumas_to_merge:
    # A list selector always returns a Series of neighbors, even when the PUMA has only one
    # (a scalar label would return a single string and be iterated character by character).
    candidates = queen_weights.loc[[puma], 'neighbor']
    # compare against the sale rate of the group each neighbor currently belongs to
    candidates = [i for i in candidates if i in puma_vcs.index and pp[merges.get(i, i)] != pp[puma]]
    if len(candidates) == 0:
        print(f'no neighbors for {puma}!')
    else:
        neighbor = puma_vcs.loc[candidates].idxmax()
        # point at the neighbor's merged group, not the raw neighbor
        # NOTE(review): PUMAs merged into `puma` in the first pass keep pointing at it after it
        # is re-pointed here - confirm that is intended.
        pp_merges[puma] = merges.get(neighbor, neighbor)

In [None]:
merges = {**merges, **pp_merges}

In [None]:
vac['merged_puma'] = vac.puma.replace(merges)

In [None]:
vac.merged_puma.value_counts()

In [None]:
vac.groupby('merged_puma').sold.mean()

In [None]:
puma_vcs.idxmax()

In [None]:
# Logistic model of whether a vacant parcel sold: lot area plus merged-PUMA fixed effects,
# with the PUMA holding the most vacant parcels as the reference category.
# dtype=float: pandas >= 2 emits bool dummies by default, and a frame mixing bool and float
# columns is cast to object dtype, which statsmodels refuses to fit.
exog = sm.add_constant(
    pd.get_dummies(vac[['area_sqm', 'merged_puma']], columns=['merged_puma'], dtype=float)
    .drop(columns=['merged_puma_' + puma_vcs.idxmax()])
)
vacmod = sm.Logit(vac.sold, exog)
vacfit = vacmod.fit()
vacfit.summary()

In [None]:
vac_sale_prob = pd.Series(vacfit.predict(), index=vac.gid)

## Put probabilities together

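Combine the predicted sale probabilities from the two models. Non-vacant properties that were dropped from the sales model because of missing data are assigned the mean predicted probability of their PUMA. All vacant properties should already have a predicted probability, since that model has no missing-data issues.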

In [None]:
# this is the number of properties in the profitability model
assert len(vac) + len(orig_sales) == 2777552

In [None]:
orig_sales = orig_sales.set_index('gid')
orig_sales['sale_prob'] = ex_sale_prob.reindex(orig_sales.index)

In [None]:
# fill in null sales
# Mean predicted probability within each PUMA, ignoring missing values; a PUMA with no
# predictions at all stays NaN.
orig_sales['ersatz_sale_prob'] = orig_sales.groupby('puma').sale_prob.transform('mean')

In [None]:
ex_sale_prob = orig_sales.sale_prob.fillna(orig_sales.ersatz_sale_prob)
assert not ex_sale_prob.isnull().any()

In [None]:
sale_prob = pd.concat([vac_sale_prob, ex_sale_prob])

In [None]:
assert len(sale_prob) == 2777552

In [None]:
def sigstars (p):
    """Return the significance marker for a p-value.

    '***' for p < 0.001, '**' for p < 0.01, '*' for p < 0.05, '.' for p < 0.1,
    and an empty string otherwise (including NaN, for which every comparison is False).
    """
    if p < 0.001:
        return '***'
    elif p < 0.01:
        return '**'
    elif p < 0.05:  # was a second `p < 0.01` test, which made the single star unreachable
        return '*'
    elif p < 0.1:
        return '.'
    else:
        return ''

In [None]:
# Coefficient table for the existing-home sale model, rounded to two decimals
fit_table = pd.concat(
    [np.exp(fit.params), fit.params, fit.bse, fit.tvalues, fit.pvalues],
    axis=1,
    keys=['Odds ratio', 'Coef', 'Std. err.', 't-value', 'p-value']
).round(2)

# Append significance stars (computed from the unrounded p-values) to the odds ratios
stars = fit.pvalues.apply(sigstars)
fit_table['Odds ratio'] = fit_table['Odds ratio'].astype(str) + stars

# remove fixed effects
fit_table = fit_table.loc[~fit_table.index.str.startswith('puma_')]

# Summary rows reuse the first column for their value
fit_table.loc['Sample size', 'Odds ratio'] = fit.nobs
fit_table.loc['Pseudo R^2', 'Odds ratio'] = round(fit.prsquared, 2)

In [None]:
# Coefficient table for the vacant-parcel sale model, rounded to two decimals
vacfit_table = pd.concat(
    [np.exp(vacfit.params), vacfit.params, vacfit.bse, vacfit.tvalues, vacfit.pvalues],
    axis=1,
    keys=['Odds ratio', 'Coef', 'Std. err.', 't-value', 'p-value']
).round(2)

# Append significance stars (computed from the unrounded p-values) to the odds ratios
vac_stars = vacfit.pvalues.apply(sigstars)
vacfit_table['Odds ratio'] = vacfit_table['Odds ratio'].astype(str) + vac_stars

# remove fixed effects
vacfit_table = vacfit_table.loc[~vacfit_table.index.str.startswith('merged_puma_')]

# Summary rows reuse the first column for their value
vacfit_table.loc['Sample size', 'Odds ratio'] = vacfit.nobs
vacfit_table.loc['Pseudo R^2', 'Odds ratio'] = round(vacfit.prsquared, 2)


In [None]:
# Human-readable row labels for the model terms; index values not listed here are left as-is
names = {
    'const': 'Constant',
    'building_totalcalculatedbathcount': 'Bathrooms',
    'yrbltcat_(1940.0, 1960.0]': 'Built 1941–1960',
    'yrbltcat_(1960.0, 1980.0]': 'Built 1961–1980',
    'yrbltcat_(1980.0, 2000.0]': 'Built 1981–2000',
    'yrbltcat_(2000.0, inf]': 'Built 2001–present',
    'area_sqm': 'Lot area (square meters)',
    'beds_0': 'No bedrooms',
    'beds_1': 'One bedroom',
    'beds_3': 'Three bedrooms',
    'beds_4': 'Four or more bedrooms'
}

# Stack the two coefficient tables under 'Existing' / 'Vacant' keys and replace NaN cells
# with empty strings
result_table = pd.concat([
    fit_table.rename(index=names), vacfit_table.rename(index=names)],
    keys=['Existing', 'Vacant']).fillna('')
result_table

In [None]:
print(result_table.to_latex())

## Map hedonic fixed effects

In [None]:
# Load the land, road and county layers, each reprojected to EPSG:26911
land = gp.read_file('../../sorting/data/ne_10m_land.shp').to_crs(epsg=26911)

# one shapefile per file in tl_roads/, concatenated into a single frame
roads = pd.concat([gp.read_file(i).to_crs(epsg=26911) for i in glob('../../sorting/data/tl_roads/*.shp')], ignore_index=True)

counties = gp.read_file('../../sorting/data/counties/tl_2019_us_county.shp').to_crs(26911)
# keep only the six listed counties in state FIPS 06
counties = counties[(counties.STATEFP == '06') & counties.NAME.isin(['Los Angeles', 'Ventura', 'Orange', 'Riverside', 'San Bernardino', 'Imperial'])]

In [None]:
pumas[pumas.GEOID == '0611102']

In [None]:
# Map the PUMA fixed effects of the two sale models, one panel per model.

# pumas.GEOID carries the '06' state prefix (it is matched against '06' + PUMA code below),
# while merges is keyed by bare five-digit PUMA codes, so prefix both sides before replacing.
pumas['merged_puma'] = pumas.GEOID.replace({f'06{src}': f'06{dst}' for src, dst in merges.items()})

f, axs = plt.subplots(2, 1, figsize=(9, 11))

for lbl, model, ax in zip(['Existing homes', 'Vacant properties'], [fit, vacfit], axs):
    # Pull the fixed effects from the model being mapped. This previously always read `fit`,
    # so both panels showed the existing-home model.
    fes = model.params.loc[[i for i in model.params.index if 'puma' in i]].rename('fe').reset_index()
    # the last five characters of the dummy name are the PUMA code
    fes['puma'] = '06' + fes['index'].str.slice(-5)
    fes = fes.set_index('puma')

    # add back base effect
    if lbl == 'Existing homes':
        fes.loc['0607104'] = 0
    elif lbl == 'Vacant properties':
        fes.loc['06' + puma_vcs.idxmax()] = 0
    else:
        assert False, 'unexpected label'

    # the vacant model's effects are per merged PUMA, so join on the merged code there
    pumas_fes = pumas.merge(fes, left_on='GEOID' if lbl == 'Existing homes' else 'merged_puma', right_index=True, validate='m:1')

    pumas_fes.to_crs(epsg=26911).plot(ax=ax, column='fe', cmap='Blues', scheme='quantiles', legend=True)
    roads.plot(color='#888888', ax=ax, lw=0.5)
    counties.plot(edgecolor='#000',  facecolor='none', ax=ax, lw=1)
    ax.set_ylim(3.59e6, 3.95e6)
    ax.set_xlim(2.74e5, 7.7e5)
    ax.set_xticks([])
    ax.set_yticks([])
    ax.set_xlabel(lbl)

plt.savefig('../../dissertation/fig/sales/sale_fes.png', dpi=300, bbox_inches='tight')


### Standard deviation of errors

In [None]:
rents = pd.read_sql('SELECT gid, total_rent, vacant_npv FROM diss.gp16', DB_URI).set_index('gid')

In [None]:
rents[~rents.total_rent.isnull() & ~rents.vacant_npv.isnull()]

In [None]:
sigma_lnp = 0.3988261988782725
discount_rate_ex = 0.04125
cap_rate = 0.0479
appreciation = 0.014
op_cost = 0.45
SCALE_FACTOR = 1.8031293436149882  # Scale factor to bring IPUMS rents in line with Zillow rental index
VACANCY_RATE = 0.04  # 4% assumed vacancy
TRANSACTION_COSTS = 0.09

def get_sigma_npv (discount_rate_ex=discount_rate_ex, cap_rate=cap_rate, appreciation=appreciation, op_cost=op_cost, **kwargs):
    """Standard deviation of the net present value of each parcel.

    Reads the module-level `rents` frame. Parcels with a non-null `total_rent` get a standard
    deviation derived from the lognormal rent error; parcels with a null `total_rent` and a
    non-null `vacant_npv` get one derived from the lognormal vacant-land value error.

    Parameters
    ----------
    discount_rate_ex : discount rate applied to the ten-period rent stream and the sale value
    cap_rate : capitalization rate used for the sale value
    appreciation : per-period appreciation rate
    op_cost : accepted for signature compatibility; not used in the calculation
    **kwargs : ignored, so a full scenario dict can be splatted in

    Returns
    -------
    pandas.Series indexed like `rents`, restricted to the parcels described above.
    """
    lnp = np.log(rents.total_rent.dropna())
    # s.d. of a lognormal: sqrt(exp(2 mu + sigma^2) (exp(sigma^2) - 1))
    # https://mathworld.wolfram.com/LogNormalDistribution.html
    sigma_price = np.sqrt(np.exp(sigma_lnp**2 + 2 * lnp) * (np.exp(sigma_lnp**2) - 1))
    # NOTE(review): 6.27 is an undocumented multiplier - confirm its derivation
    sigma_inc = 6.27 * sigma_price * SCALE_FACTOR
    # NOTE(review): appreciation sits in the denominator of the rent stream but in the numerator
    # of the sale value below - confirm the rent stream should not be (1 + appreciation)**i / (1 + discount_rate_ex)**i
    sigma_ex_npv = sigma_inc * (
                    sum([1 / ((1 + discount_rate_ex)**i * (1 + appreciation) ** i) for i in range(10)])  # rental value
            + 1 / cap_rate / (1 + discount_rate_ex) ** 10 * (1 + appreciation) ** 10 * (1 - TRANSACTION_COSTS) # ultimate sale value
    )

    sigma_lnvp = 1.2619061577511022
    # TODO dropping places where total_rent is not null b/c the ztrax and scag data don't completely agree on what is vacant
    lnvp = np.log(rents.vacant_npv[rents.total_rent.isnull()].dropna())
    # Same lognormal formula as sigma_price. The `- 1` must be inside the parentheses:
    # it was previously subtracted from the whole product.
    sigma_vacnpv = np.sqrt(np.exp(sigma_lnvp ** 2 + 2 * lnvp) * (np.exp(sigma_lnvp**2) - 1))

    return pd.concat([sigma_ex_npv, sigma_vacnpv])

In [None]:
sigma_npv = get_sigma_npv()

In [None]:
assert not sigma_npv.index.duplicated().any()

## Auction simulation

Compute the probability that a bid for the intended use is the winning bid.

In [None]:
npvs = pd.read_parquet('../data/Base_net_present_value.parquet')

In [None]:
npvs

In [None]:
# Step schedule of redevelopment probability. Each tuple is (lo, hi, prob): a parcel whose
# ratio of best new-construction NPV to existing NPV falls in [lo, hi) is assigned prob.
# Ratios below 0.95 match no tuple.
redevelopment_probs = [
    (0.95, 1.0, 0.05), # up to 5% less than profitable: 5% redevelopment probability
    (1, 1.1, 0.1), # 0-10%: 10%
    (1.1, 1.25, 0.2),
    (1.25, 1.5, 0.3),
    (1.5, 2, 0.4),
    (2, np.inf, 0.5)
]

In [None]:
def simulate_auction (npvs, sale_prob, expected_parcels=2777552):
    """Probability that each parcel ends up in each use.

    The redevelopment probability of a parcel is looked up in the module-level
    `redevelopment_probs` schedule from the ratio of its best new-construction NPV to its
    existing NPV, then multiplied by the parcel's sale probability. That combined probability
    is assigned to the most profitable new type; the remainder goes to `existing`.

    Parameters
    ----------
    npvs : DataFrame indexed by parcel with columns existing, sfh, duplex, threeplex, sixplex
    sale_prob : Series of sale probabilities indexed by parcel
    expected_parcels : number of parcels expected in both inputs; the default is the size of
        the profitability model. Pass None to skip the check (e.g. when running on a subset).

    Returns
    -------
    DataFrame with the index shared by both inputs and the columns of `npvs`.
    """
    joint_idx = npvs.index.intersection(sale_prob.index)

    # make sure we're not missing anything
    if expected_parcels is not None:
        assert len(joint_idx) == expected_parcels, \
            f'expected {expected_parcels} parcels, found {len(joint_idx)}'

    npvs = npvs.reindex(joint_idx) # why are there more in npv than sale_prob?
    sale_prob = sale_prob.reindex(joint_idx)

    ex_npv = npvs.existing
    new_npvs = npvs[['sfh', 'duplex', 'threeplex', 'sixplex']]
    max_new_npv = new_npvs.max(axis=1)
    which_new_npv = new_npvs.idxmax(axis=1)

    # parcels whose ratio matches no bucket keep a redevelopment probability of zero
    redev_probs = pd.Series(np.zeros(len(max_new_npv)), index=max_new_npv.index)

    potential_profit = max_new_npv / ex_npv

    print(f'{(potential_profit > 1).sum()} lots profitable to redevelop')

    for lo, hi, prob in redevelopment_probs:
        redev_probs.loc[(potential_profit >= lo) & (potential_profit < hi)] = prob

    combined_redev_prob = redev_probs * sale_prob
    # splay it back out to all property types. only the most profitable and existing types will have any profitability
    combined_prob = pd.DataFrame(np.zeros((len(npvs), len(npvs.columns)), np.float64), index=npvs.index, columns=npvs.columns)
    print('computing combined probabilities')
    for new_type in ['sfh', 'duplex', 'threeplex', 'sixplex']:
        is_best = which_new_npv == new_type
        combined_prob.loc[is_best, new_type] = combined_redev_prob.loc[is_best]
    combined_prob['existing'] = 1 - combined_redev_prob

    return combined_prob

In [None]:
puma_for_gid = pd.read_sql('SELECT gid, tract, puma, lu16, building_yearbuilt, building_propertylandusestndcode FROM diss.gp16', DB_URI).set_index('gid')

In [None]:
hh = pd.read_parquet('../../sorting/data/full_hh.parquet')

In [None]:
tract_to_puma = pd.read_csv('../../abm/data/2010_Census_Tract_to_2010_PUMA.csv', dtype='str')
tract_to_puma['tract_geoid'] = tract_to_puma.STATEFP.str.cat(tract_to_puma.COUNTYFP).str.cat(tract_to_puma.TRACTCE)
tract_to_puma['puma_geoid'] = tract_to_puma.STATEFP.str.cat(tract_to_puma.PUMA5CE)

In [None]:
CAP_RATE = 0.04424375
discount_rate_ex = 0.06813142841167415

# Parameters shared by most scenarios; each scenario below overrides only what differs
_base_scenario = {
    'discount_rate_new': 0.11,
    'discount_rate_ex': discount_rate_ex,
    'cap_rate': CAP_RATE,
    'appreciation': 0.014,  # CPI
    'op_cost': 0.45,
    'cost_scale': 1.1 # 10% contingency
}

scenarios = {
    'Current appreciation': {**_base_scenario, 'appreciation': 0.0480328437009123},
    'Base': dict(_base_scenario),
    'Low discount rate': {**_base_scenario, 'discount_rate_new': 0.08, 'discount_rate_ex': 0.04125},
    'Equal discount rate (8% existing and new)': {**_base_scenario, 'discount_rate_new': 0.08, 'discount_rate_ex': 0.08},
    # NOTE(review): this appreciation equals the 'Current appreciation' value, not the CPI
    # figure used by the other scenarios - confirm which is intended
    'Low operating cost (25%)': {**_base_scenario, 'appreciation': 0.0480328437009123, 'op_cost': 0.25},
    'High construction cost': {**_base_scenario, 'cost_scale': 1.4}, # 30% on top
}

# Per-scenario accumulators filled in by the scenario loop
marginal_units = {}
scenario_units = {}
construction_and_demolition = {}
tract_marginal_units = {}
tract_units = {}

# Column names used for each scenario in the sorting-model inputs
short_names = {
    'Base': 'npv_base',
    'Current appreciation': 'npv_current_appreciation',
    'Equal discount rate (8% existing and new)': 'npv_eq_discount',
    'Low discount rate': 'npv_low_discount',
    'Low operating cost (25%)': 'npv_low_opcost',
    'High construction cost': 'high_const_cost'
}

# Every scenario also has an '_hqta' variant that shares the same parameter dict
_plain_names = list(short_names.items())
short_names.update({f'{plain}_hqta': f'{short}_hqta' for plain, short in _plain_names})
scenarios.update({f'{plain}_hqta': scenarios[plain] for plain, _ in _plain_names})

# get combined probs for all scenarios

# The renter share of each housing type depends only on the household file, not on the
# scenario, so it is computed once up front for both geographies.
weighted_supply = (hh.groupby('choice').hhwt.sum() / 100).reset_index()
weighted_supply[['puma', 'sfmf', 'age', 'tenure']] = weighted_supply.choice.str.split('_', expand=True)


def _renter_share (df):
    """Share of the household weight in df that belongs to renters."""
    return df.loc[df.tenure == 'rent', 'hhwt'].sum() / df.hhwt.sum()


puma_rent_prop = weighted_supply.groupby(['puma', 'sfmf', 'age']).apply(_renter_share).rename('rent_prop')

# use puma level rent/own proportion, broadcast to tract level
tract_rent_prop = puma_rent_prop.reset_index().merge(
    tract_to_puma, left_on='puma', right_on='PUMA5CE', how='inner', validate='m:m'
)

# deal with one Census tract in Woodland Hills that is split between PUMAs by using the
# renter share of those two PUMAs put together
wdhls = (
    weighted_supply[weighted_supply.puma.isin(['03725', '03726'])].groupby(['sfmf', 'age'])
    .apply(_renter_share).rename('rent_prop')
).reset_index()
wdhls['tract_geoid'] = '06037137000'

tract_rent_prop = pd.concat([tract_rent_prop, wdhls], ignore_index=True).set_index(['tract_geoid', 'sfmf', 'age'])
tract_rent_prop.index = [f'{tract}_{sfmf}_{age}' for tract, sfmf, age in tract_rent_prop.index]
tract_rent_prop = tract_rent_prop.rent_prop  # peel back to just a series

puma_rent_prop.index = [f'{puma}_{sfmf}_{age}' for puma, sfmf, age in puma_rent_prop.index]

rent_prop_by_group = {'puma': puma_rent_prop, 'tract': tract_rent_prop}

for scenario in sorted(glob('../data/*_net_present_value.parquet')):
    # scenario name is the file name without the directory prefix and fixed suffix
    sname = scenario[len('../data/'):-len('_net_present_value.parquet')]
    print(sname)

    sc_npvs = pd.read_parquet(scenario)
    sc_probs = simulate_auction(sc_npvs[['existing', 'sfh', 'duplex', 'threeplex', 'sixplex']], sale_prob)
    sc_probs.to_parquet(f'../data/{sname}_redevelopment_probability.parquet')
    # expected number of parcels ending up in each use
    sc_buildings = sc_probs.sum()

    sc_probs_geo = sc_probs.join(puma_for_gid.reindex(sc_probs.index))
    assert (sc_probs_geo.building_propertylandusestndcode == 'VL101').any() # make sure we didn't lose vacant properties

    # and how many are torn down? Only parcels with an existing home (RR101) count.
    teardowns = sc_probs_geo[sc_probs_geo.building_propertylandusestndcode == 'RR101'].copy()
    teardowns['teardown_prob'] = 1 - teardowns.existing
    teardowns['built_2000_or_later'] = (teardowns.building_yearbuilt >= 2000).fillna(False) # assume missing is old

    # Figure out marginal units
    new_units = sc_buildings.sfh + 2 * sc_buildings.duplex + 3 * sc_buildings.threeplex + 6 * sc_buildings.sixplex
    destroyed_units = teardowns.teardown_prob.sum()

    margun = new_units - destroyed_units
    marginal_units[sname] = margun
    print(f'new units: {new_units} destroyed units: {destroyed_units} marginal units: {margun}')

    # prepare scenario for sorting
    # how many new multifamily homes are being created in each PUMA
    # Do also for tracts - used in population synthesis
    for grp in ('puma', 'tract'):
        grpd = sc_probs_geo.groupby(grp)
        mfh_new = grpd.sixplex.sum() * 6 + grpd.threeplex.sum() * 3 + grpd.duplex.sum() * 2

        # Net change in newer single-family homes. fill_value=0 keeps groups with no post-2000
        # teardowns: a plain subtraction aligns on the index and turns them into NaN, which
        # silently discarded the new single-family homes built there.
        sfh_new = grpd.sfh.sum().sub(
            teardowns[teardowns.built_2000_or_later].groupby(grp).teardown_prob.sum(),
            fill_value=0
        )
        sfh_old = -teardowns[~teardowns.built_2000_or_later].groupby(grp).teardown_prob.sum()

        sorting_inputs = pd.DataFrame({
            'MF_new': mfh_new,
            'MF_old': 0,
            'SF_new': sfh_new,
            'SF_old': sfh_old
        })

        sorting_inputs = sorting_inputs.stack()
        sorting_inputs.index = [f'{g}_{htype}' for g, htype in sorting_inputs.index]

        rent_prop = rent_prop_by_group[grp].reindex(sorting_inputs.index)
        assert not rent_prop.isnull().any()

        # split each unit count between renters and owners
        renter_sorting_inputs = sorting_inputs * rent_prop
        renter_sorting_inputs.index = [f'{i}_rent' for i in renter_sorting_inputs.index]

        owner_sorting_inputs = sorting_inputs * (1 - rent_prop)
        owner_sorting_inputs.index = [f'{i}_own' for i in owner_sorting_inputs.index]

        sorting_inputs = pd.concat([renter_sorting_inputs, owner_sorting_inputs])

        if grp == 'puma':
            scenario_units[short_names[sname]] = sorting_inputs
        else:
            tract_units[short_names[sname]] = sorting_inputs

    sc_buildings.loc['Teardowns'] = teardowns.teardown_prob.sum()
    construction_and_demolition[sname] = sc_buildings

    grpd_new = sc_probs_geo.groupby('tract')
    tract_marginal_units[sname] = (
        grpd_new.sfh.sum() + grpd_new.duplex.sum() * 2 + grpd_new.threeplex.sum() * 3 + grpd_new.sixplex.sum() * 6
        - teardowns.groupby('tract').teardown_prob.sum()
    )

In [None]:
construction_and_demolition = pd.DataFrame(construction_and_demolition).transpose()


In [None]:
# read profitability table from earlier chapter for comparison
profitability_table = pd.read_parquet('../data/profitability_table.parquet')

In [None]:
profitability_table

In [None]:
paper_table = construction_and_demolition.copy()
# need to do this first before converting rest to number of units
paper_table['ex_pct'] = paper_table.existing / (paper_table.existing + paper_table.sfh + paper_table.duplex + paper_table.threeplex + paper_table.sixplex) * 100
paper_table['duplex'] *= 2
paper_table['threeplex'] *= 3
paper_table['sixplex'] *= 6
paper_table['total_new'] = paper_table.sfh + paper_table.duplex + paper_table.threeplex + paper_table.sixplex
paper_table['marginal_units'] = paper_table.total_new - paper_table.Teardowns
paper_table.index = [i.replace('_hqta', ' (HQTA)') for i in paper_table.index]
paper_table = paper_table.reindex(profitability_table.index)
paper_table[['sfh', 'duplex', 'threeplex', 'sixplex', 'total_new', 'Teardowns', 'marginal_units']] /= 1000
paper_table['marginal_units_orig'] = profitability_table['Marginal units'] / 1000
paper_table[['sfh', 'duplex', 'threeplex', 'sixplex', 'total_new', 'marginal_units', 'Teardowns', 'marginal_units_orig']] =\
    paper_table[['sfh', 'duplex', 'threeplex', 'sixplex', 'total_new', 'marginal_units', 'Teardowns', 'marginal_units_orig']].round().astype('int64').apply(lambda c: c.apply('{:,d}'.format))
paper_table['ex_pct'] = paper_table.ex_pct.apply('{:.1f}%'.format)
paper_table = paper_table[['ex_pct', 'sfh', 'duplex', 'threeplex', 'sixplex', 'total_new', 'Teardowns', 'marginal_units', 'marginal_units_orig']].rename(columns={
    'sfh': 'Single-family home',
    'duplex': 'Duplex',
    'threeplex': 'Threeplex',
    'sixplex': 'Sixplex',
    'ex_pct': 'Non-redeveloped parcels',
    'total_new': 'Total',
    'marginal_units': 'Marginal units',
    'marginal_units_orig': 'Most profitable marginal units'
})
paper_table

In [None]:
print(paper_table.to_latex())

In [None]:
scenario_output = pd.DataFrame(scenario_units)

In [None]:
scenario_output.to_parquet('../data/npv_scenarios.parquet')

In [None]:
scenario_output.isnull().sum()

In [None]:
scenario_output

In [None]:
scenario_output.npv_base[scenario_output.npv_base > 0].sum()

In [None]:
tract_units = pd.DataFrame(tract_units)
tract_units.head()

In [None]:
tract_units.to_parquet('../data/npv_tract_scenarios.parquet')

In [None]:
mun = pd.Series(marginal_units).sort_values(ascending=False).rename(index={
    'Current (8.3%) Appreciation': 'Current appreciation',
       'Low discount rate (2.5% existing, 5% new)': 'Low discount rate',
       'Equal discount rate (8% existing and new)': 'Equal existing/new\ndiscount rate',
       'Low operating cost (25%)': 'Low operating cost',
       'High discount rate (6% existing, 12% new)': 'High discount rate'
})
plt.barh(np.arange(len(mun)), mun / 1_000_000, color='#8c1d40')
plt.yticks(np.arange(len(mun)), mun.index)
plt.xlabel('Number of new units (millions)')
plt.savefig('uncertainty.svg')
plt.savefig('uncertainty.pdf')