## To do:
- [ ] repair flags $\leftarrow$ talk to George
- [ ] multiply Ellis evictions by # of units
- [ ] include demolitions (demos)?
- [ ] counts > # of units?
- [ ] OMI counts > 1?
- [ ] add assessor value + sqft?
- [ ] 40 MAP_BLK_LOTs have 2 different neighborhood codes
- [ ] treatment of failure-to-pay evictions $\leftarrow$ remove from uncontrolled pop? estimate # missing from controlled?

In [1]:
import pandas as pd
import os
from matplotlib import pyplot as plt
import seaborn as sns
import numpy as np
import statsmodels.formula.api as smf

In [2]:
# Eviction building panel (Stata file); the path is relative to the notebook's
# working directory. Presumably one row per building-month — TODO confirm.
panel_path = "hpd_correction/Eviction_Building_Panel_Gardner_200307to201312_v2.dta"
df = pd.read_stata(panel_path)

In [3]:
# Scale selected eviction counters by building size, then cap at the unit count.
# For each counter N listed below this writes two columns:
#   counterN_expanded = counterN * initial_newUNITS
#   counterN_new      = row-wise min(counterN_expanded, initial_newUNITS)
# DataFrame.min(axis=1) skips NaN by default, so a missing product falls back
# to initial_newUNITS instead of propagating NaN.
whole_building_counters = [
    'counter9',   # condo
    'counter10',  # demo
    'counter13',  # Ellis
    'counter14',  # lead
    'counter15',  # dev agreement demo
]

# One loop replaces five copy-pasted stanzas; column names and creation order
# are unchanged.
for counter in whole_building_counters:
    expanded_col = f'{counter}_expanded'
    df[expanded_col] = df[counter] * df['initial_newUNITS']
    df[f'{counter}_new'] = df[[expanded_col, 'initial_newUNITS']].min(axis=1)

In [4]:
# No-fault evictions used in this analysis: OMI plus the unit-scaled demo and
# Ellis counts. Currently excluded from the sum: counter9_new (condo),
# counter11 (capital improvement), counter14_new (lead),
# counter15_new (dev agreement demo), counter16 (Good Samaritan).
omi = df['counter8']
demo = df['counter10_new']
ellis = df['counter13_new']
df['nofault_new'] = omi + demo + ellis

In [5]:
# Total evictions per row: at-fault count plus the nofault_new subset.
df['total_evic'] = df['atfault'].add(df['nofault_new'])

In [6]:
def _first_mode(values):
    """Return one most-frequent value of ``values`` (NaN when there is none).

    ``Series.mode()`` returns every tied value, sorted. Passing it to ``agg``
    directly leaves an array in the cell for groups with ties. Taking the
    first entry keeps the aggregated column scalar and deterministic.
    """
    modes = values.mode()
    return modes.iloc[0] if len(modes) else np.nan


# Collapse the panel to one row per lot-year. The building attributes are part
# of the group key, so a lot whose attributes change within a year yields more
# than one row; rows with NaN in any key are dropped by groupby's default.
group_keys = ['MAP_BLK_LOT', 'year', 'newYRBLT', 'initial_newUNITS', 'initial_rentcontrol']
df_year = (
    df.groupby(group_keys)
    .agg({
        'atfault': "sum",
        "nofault": "sum",
        "total_evic": "sum",
        "month": "count",          # number of non-null monthly rows observed
        "RP1NBRCDE": _first_mode,  # single code even when two codes tie
    })
    .reset_index()
    .rename(columns={'month': 'months_obs'})
)

# NOTE(review): the no-fault rate below uses the raw `nofault` column, while
# `total_evic` is built from `nofault_new`, so the at-fault and no-fault rates
# need not add up to the total rate — confirm this is intended.

# Evictions per unit per observed month.
for kind in ('nofault', 'atfault', 'total_evic'):
    df_year[f'{kind}_per_unit_per_month'] = (
        (df_year[kind] / df_year['initial_newUNITS']) / df_year['months_obs']
    )

# Annualize the monthly rates (x12); same column order as before.
for kind in ('nofault', 'total_evic', 'atfault'):
    df_year[f'{kind}_per_unit_per_yr'] = df_year[f'{kind}_per_unit_per_month'] * 12

In [7]:
# Regression-discontinuity sample: buildings built strictly within `bandwidth`
# years of the 1980 cutoff, with a positive unit count.
bandwidth = 23
cutoff_year = 1980

year_built = df_year['newYRBLT']
within_bandwidth = (year_built > cutoff_year - bandwidth) & (year_built < cutoff_year + bandwidth)

# Build years dropped from the sample:
#   1985 - huge outlier (?)
#   1979 - partial RC (?)
#   1980 - should be RC but could be fuzzy boundary (?)
dropped_years = [1985, 1979, 1980]
not_dropped = ~year_built.isin(dropped_years)

has_units = df_year['initial_newUNITS'] > 0

rd_df = df_year[within_bandwidth & not_dropped & has_units].copy()

In [8]:
# Treatment indicator and centered running variable for the RD models.
# rent_control is the same boolean as pre_1980 (True when built before 1980).
built_before_1980 = rd_df['newYRBLT'] < 1980
rd_df['pre_1980'] = built_before_1980
rd_df['rent_control'] = built_before_1980.copy()
rd_df['year_built_centered'] = rd_df['newYRBLT'].sub(1980)

### Model 1

In [9]:
# Model 1: total eviction rate on the rent-control indicator, the centered
# build year, and their interaction. In formula syntax `a * b` already expands
# to a + b + a:b, so the leading `rent_control` term is redundant but harmless.
model1_formula = "total_evic_per_unit_per_yr ~ rent_control + year_built_centered * rent_control"
rd = smf.ols(formula=model1_formula, data=rd_df)

In [10]:
# Fit Model 1 with statsmodels' default covariance (reported as "nonrobust").
# NOTE(review): rd_df has one row per lot-year, so the same building appears in
# several rows; consider cluster-robust standard errors by MAP_BLK_LOT — confirm.
fitted = rd.fit()

In [11]:
# Show the Model 1 regression table as plain text.
print(fitted.summary())

                                OLS Regression Results                                
Dep. Variable:     total_evic_per_unit_per_yr   R-squared:                       0.001
Model:                                    OLS   Adj. R-squared:                  0.001
Method:                         Least Squares   F-statistic:                     15.29
Date:                        Sat, 24 Dec 2022   Prob (F-statistic):           6.10e-10
Time:                                11:42:56   Log-Likelihood:                 80414.
No. Observations:                       52007   AIC:                        -1.608e+05
Df Residuals:                           52003   BIC:                        -1.608e+05
Df Model:                                   3                                         
Covariance Type:                    nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------

### Model 2

In [12]:
# Model 2: Model 1 plus log building size (initial_newUNITS > 0 is enforced
# when rd_df is built, so the log is defined).
model2_formula = "total_evic_per_unit_per_yr ~ rent_control + year_built_centered*rent_control + np.log(initial_newUNITS)"
rd2 = smf.ols(formula=model2_formula, data=rd_df)

In [13]:
# Fit Model 2 (default non-robust covariance) and print its regression table.
fitted2 = rd2.fit()
print(fitted2.summary())

                                OLS Regression Results                                
Dep. Variable:     total_evic_per_unit_per_yr   R-squared:                       0.001
Model:                                    OLS   Adj. R-squared:                  0.001
Method:                         Least Squares   F-statistic:                     12.05
Date:                        Sat, 24 Dec 2022   Prob (F-statistic):           8.60e-10
Time:                                11:42:56   Log-Likelihood:                 80415.
No. Observations:                       52007   AIC:                        -1.608e+05
Df Residuals:                           52002   BIC:                        -1.608e+05
Df Model:                                   4                                         
Covariance Type:                    nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------

### Model 3

In [22]:
# Tract identifiers are read as strings in both files so they can be used as
# merge keys (presumably also to keep leading zeros — TODO confirm).
tract_id_dtypes = {'tract_id': str, 'tract': str}

# ACS 2006-2010 tract-level covariates.
census_df = pd.read_csv('data/census_tract_acs_data_2006_2010.csv', dtype=tract_id_dtypes)

# Parcel file that carries a tract_id for each MAP_BLK_LOT.
df_geo = pd.read_csv('hpd_correction/asquith_data_with_fipscd.csv', dtype=tract_id_dtypes)

In [26]:
# Reduce the parcel file to a lot -> tract lookup.
# NOTE(review): duplicates are dropped on the (MAP_BLK_LOT, tract_id) pair, so a
# lot listed under two different tracts keeps two rows and would be duplicated
# by a later merge on MAP_BLK_LOT — check whether any such lots exist.
df_geo = df_geo[['MAP_BLK_LOT', 'tract_id']].drop_duplicates()

In [27]:
# Attach a tract_id to every lot-year row. Inner join (the merge default):
# rows whose MAP_BLK_LOT is missing from df_geo are dropped.
rd_df2 = pd.merge(rd_df, df_geo, how='inner', on='MAP_BLK_LOT')

In [28]:
# Attach tract-level ACS columns. Inner join (the merge default): rows whose
# tract_id has no match in census_df are dropped.
rd_df2 = pd.merge(rd_df2, census_df, how='inner', on='tract_id')

In [29]:
# Tract-level shares built from the ACS counts. Despite the `pct_` prefix these
# are fractions (no x100 scaling). A zero denominator yields inf/NaN.
race_total = rd_df2['total_pop_race']
rental_units = rd_df2['total_rental_tenure']
renter_pop = rd_df2['total_renter_pop']

# Race / ethnicity
rd_df2['pct_non_white'] = 1 - rd_df2['white_only_pop'].div(race_total)
rd_df2['pct_black'] = rd_df2['black_only_pop'].div(race_total)
rd_df2['pct_black_multi'] = rd_df2['black_only_pop'].add(rd_df2['multiracial_pop']).div(race_total)
rd_df2['pct_latino'] = rd_df2['hispanic_pop'].div(race_total)

# Tenure
rd_df2['pct_occ_units_rental'] = rental_units.div(rd_df2['total_occupied_units'])
rd_df2['pct_renter_pop'] = renter_pop.div(rd_df2['total_pop_tenure'])

# Rental units whose household moved in before 2000: total minus the two
# later move-in buckets.
moved_in_before_2000 = (
    rental_units
    .sub(rd_df2['rental_movein_2005_later'])
    .sub(rd_df2['rental_movein_2000_2004'])
)
rd_df2['pct_rentals_pre2000_movein'] = moved_in_before_2000.div(rental_units)

# Renter mobility: share of renters in the same house last year, and its complement.
rd_df2['pct_renter_same_house_last_yr'] = rd_df2['renter_non_mover'].div(renter_pop)
rd_df2['pct_recent_mover_rent'] = 1 - rd_df2['pct_renter_same_house_last_yr']

In [30]:
# Model 3: Model 2 plus tract-level demographic controls.
# np.log(median_hh_income) is currently left out of the specification.
model3_terms = [
    "rent_control",
    "year_built_centered*rent_control",
    "pct_non_white",
    "pct_latino",
    "pct_recent_mover_rent",
    "np.log(initial_newUNITS)",
]
model3_formula = "total_evic_per_unit_per_yr ~ " + " + ".join(model3_terms)
rd3 = smf.ols(formula=model3_formula, data=rd_df2)

In [31]:
# Fit Model 3 (default non-robust covariance) and print its regression table.
fitted3 = rd3.fit()
print(fitted3.summary())

                                OLS Regression Results                                
Dep. Variable:     total_evic_per_unit_per_yr   R-squared:                       0.001
Model:                                    OLS   Adj. R-squared:                  0.001
Method:                         Least Squares   F-statistic:                     9.301
Date:                        Sat, 24 Dec 2022   Prob (F-statistic):           1.46e-11
Time:                                11:51:03   Log-Likelihood:                 78672.
No. Observations:                       51129   AIC:                        -1.573e+05
Df Residuals:                           51121   BIC:                        -1.573e+05
Df Model:                                   7                                         
Covariance Type:                    nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------

### Model 4

In [32]:
# Cast the neighborhood code to str so the formula treats it as categorical.
# NOTE(review): astype(str) also turns missing codes into the literal string
# 'nan', which then enters the model as its own level instead of being dropped
# — confirm that is intended.
rd_df2['RP1NBRCDE'] = rd_df2['RP1NBRCDE'].astype(str)

In [33]:
# Model 4: Model 3 plus RP1NBRCDE, which enters as a categorical term because
# the column holds strings. The literal is split differently here but
# concatenates to exactly the same formula string as before.
model4_formula = (
    "total_evic_per_unit_per_yr ~ rent_control + year_built_centered*rent_control + "
    "pct_non_white + pct_latino + pct_recent_mover_rent"
    "+ np.log(initial_newUNITS) + RP1NBRCDE"
)
rd4 = smf.ols(formula=model4_formula, data=rd_df2)

In [34]:
# Fit Model 4 (default non-robust covariance) and print its regression table.
fitted4 = rd4.fit()
print(fitted4.summary())

                                OLS Regression Results                                
Dep. Variable:     total_evic_per_unit_per_yr   R-squared:                       0.003
Model:                                    OLS   Adj. R-squared:                  0.001
Method:                         Least Squares   F-statistic:                     1.621
Date:                        Sat, 24 Dec 2022   Prob (F-statistic):           0.000325
Time:                                11:51:21   Log-Likelihood:                 78706.
No. Observations:                       51129   AIC:                        -1.572e+05
Df Residuals:                           51046   BIC:                        -1.565e+05
Df Model:                                  82                                         
Covariance Type:                    nonrobust                                         
                                               coef    std err          t      P>|t|      [0.025      0.975]
---------------------