### Description
Author: T. Majidzadeh

Date Created: March 9, 2025

Date Updated: March 17, 2025

Purpose: Test difference-in-difference regression for Zillow rent indices, 2015-2019, treatment in Dec 2017. Prototype version assumes "affected" metros are those with at least 35% post-merger penetration rate and at least 10% share gain from the merger. Conducts tests for parallel trend assumption and placebo tests.

In [2]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import altair as alt
from datetime import datetime
import json
import os
import re

In [3]:
paths = {
    "zillow_raw": "..\\data\\zillow_data_raw\\",
    "zillow_reg": "..\\data\\zillow_reg_data\\"
}

In [4]:
zillow_reg = pd.read_pickle(paths['zillow_reg']+'zillow_data_reg_proto_20250309.pkl')

In [5]:
zillow_reg['AffectedCityTime'] = zillow_reg \
    .apply(lambda x: x.AffectedCity * x.AffectedTime, axis=1)
zillow_reg['AffectedCityTimeTrend'] = zillow_reg \
    .apply(lambda x: x.AffectedCity * x.TimeTrend, axis=1)
zillow_reg['Year'] = zillow_reg['Year'].astype(str)
zillow_reg['Month'] = zillow_reg['Month'].astype(str)

In [6]:
zillow_chart = zillow_reg
zillow_chart['Year-Month'] = pd.to_datetime(zillow_chart['Year-Month'])
alt.Chart(zillow_chart).mark_line().encode(
    y = 'ZORI',
    x = 'Year-Month',
    color = 'RegionName'
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


In [7]:
alt.Chart(zillow_chart).mark_line().encode(
    y = alt.Y('average(ZORI)', scale=alt.Scale(domainMin=1000, domainMax=1900)),
    x = 'Year-Month',
    color = 'AffectedCity'
)

  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)
  col = df[col_name].apply(to_list_if_array, convert_dtype=False)


#### Get Dummies for Regression

In [9]:
zillow_reg = pd \
    .get_dummies(zillow_reg, columns=['RegionName', 'Year', 'Month'], drop_first = True)
zillow_reg

Unnamed: 0,StateName,Year-Month,ZORI,AffectedCity,AffectedTime,TimeTrend,ZORI-Lag1,AffectedCityTime,AffectedCityTimeTrend,"RegionName_Boston, MA",...,Month_11,Month_12,Month_2,Month_3,Month_4,Month_5,Month_6,Month_7,Month_8,Month_9
1,NY,2015-01-31,2209.959116,0,0,0,,0,0,False,...,False,False,False,False,False,False,False,False,False,False
2,CA,2015-01-31,1684.237669,0,0,0,,0,0,False,...,False,False,False,False,False,False,False,False,False,False
3,IL,2015-01-31,1366.441879,0,0,0,,0,0,False,...,False,False,False,False,False,False,False,False,False,False
4,TX,2015-01-31,1024.638892,1,0,0,,0,0,False,...,False,False,False,False,False,False,False,False,False,False
5,TX,2015-01-31,1138.420831,0,0,0,,0,0,False,...,False,False,False,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29103,MN,2019-12-31,1389.784723,0,1,59,1386.426145,0,0,False,...,False,True,False,False,False,False,False,False,False,False
29104,CA,2019-12-31,2018.979007,0,1,59,2011.736617,0,0,False,...,False,True,False,False,False,False,False,False,False,False
29105,FL,2019-12-31,1236.871601,1,1,59,1232.537853,1,59,False,...,False,True,False,False,False,False,False,False,False,False
29106,CO,2019-12-31,1538.059242,1,1,59,1539.900116,1,59,False,...,False,True,False,False,False,False,False,False,False,False


#### Test Parallel Trends

In [11]:
yvar = 'ZORI'
xvars = [
    'AffectedCityTimeTrend',
    'TimeTrend'
]

In [12]:
zillow_reg_parallel=zillow_reg[zillow_reg['Year-Month'] < '2017-12-01']
zillow_reg_parallel=zillow_reg_parallel[[yvar]+xvars].dropna()
y = np.asarray(zillow_reg_parallel[yvar])
X = sm.add_constant(zillow_reg_parallel[xvars], prepend=True) * 1


In [13]:
mod = sm.OLS(y, X, hasconst=True)
res = mod.fit(cov_type='HC3')
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.095
Model:                            OLS   Adj. R-squared:                  0.092
Method:                 Least Squares   F-statistic:                     45.03
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           4.04e-19
Time:                        17:27:08   Log-Likelihood:                -5255.8
No. Observations:                 700   AIC:                         1.052e+04
Df Residuals:                     697   BIC:                         1.053e+04
Df Model:                           2                                         
Covariance Type:                  HC3                                         
                            coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------------
const                  1356.54

#### Base Model
(expecting a spurious result, because the pre-treatment trends are not parallel).

In [57]:
yvar = 'ZORI'
xvars = [
    'AffectedCity', 
    'AffectedCityTime',
    'TimeTrend'
]
city_fe = [col for col in zillow_reg.columns if col.startswith('RegionName')]
year_fe = [col for col in zillow_reg.columns if (col.startswith('Year')) & (col != "Year-Month")]
month_fe = [col for col in zillow_reg.columns if col.startswith('Month')]

In [59]:
zillow_reg_base=zillow_reg[[yvar]+xvars+month_fe].dropna()
y = np.asarray(zillow_reg_base[yvar])
X = sm.add_constant(zillow_reg_base[xvars+month_fe], prepend=True) * 1


In [61]:
mod = sm.OLS(y, X, hasconst=True)
res = mod.fit(cov_type='HC3')
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.142
Model:                            OLS   Adj. R-squared:                  0.132
Method:                 Least Squares   F-statistic:                     18.66
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           1.24e-42
Time:                        18:49:08   Log-Likelihood:                -9008.2
No. Observations:                1200   AIC:                         1.805e+04
Df Residuals:                    1185   BIC:                         1.812e+04
Df Model:                          14                                         
Covariance Type:                  HC3                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const             1446.7388     49.811  

#### Placebo Test
(expecting it to fail, because the parallel trends fail).

In [19]:
yvar = 'ZORI'
xvars = [
    'AffectedCity', 
    'AffectedCityTime',
    'TimeTrend'
]
city_fe = [col for col in zillow_reg.columns if col.startswith('RegionName')]
year_fe = [col for col in zillow_reg.columns if (col.startswith('Year')) & (col != "Year-Month")]
month_fe = [col for col in zillow_reg.columns if col.startswith('Month')]

In [20]:
zillow_reg_placebo=zillow_reg[zillow_reg['Year-Month'] < '2017-12-01']
placebo_ym = datetime.strptime('2016-12-01', '%Y-%m-%d')
zillow_reg_placebo['AffectedTime'] = zillow_reg_placebo['Year-Month'].apply(lambda x: (x >= placebo_ym)*1)
zillow_reg_placebo['AffectedCityTime'] = zillow_reg_placebo \
    .apply(lambda x: x.AffectedCity * x.AffectedTime, axis=1)
zillow_reg_placebo=zillow_reg_placebo[[yvar]+xvars+month_fe].dropna()
y = np.asarray(zillow_reg_placebo[yvar])
X = sm.add_constant(zillow_reg_placebo[xvars+month_fe], prepend=True) * 1

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zillow_reg_placebo['AffectedTime'] = zillow_reg_placebo['Year-Month'].apply(lambda x: (x >= placebo_ym)*1)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zillow_reg_placebo['AffectedCityTime'] = zillow_reg_placebo \


In [21]:
mod = sm.OLS(y, X, hasconst=True)
res = mod.fit(cov_type='HC3')
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       0.121
Model:                            OLS   Adj. R-squared:                  0.103
Method:                 Least Squares   F-statistic:                     9.062
Date:                Mon, 17 Mar 2025   Prob (F-statistic):           2.03e-18
Time:                        17:27:08   Log-Likelihood:                -5245.8
No. Observations:                 700   AIC:                         1.052e+04
Df Residuals:                     685   BIC:                         1.059e+04
Df Model:                          14                                         
Covariance Type:                  HC3                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const             1440.4422     62.558  

#### With Lag Term

In [45]:
yvar = 'ZORI'
xvars = [
    'AffectedCity', 
    'AffectedCityTime',
    'ZORI-Lag1'
]
city_fe = [col for col in zillow_reg.columns if col.startswith('RegionName')]
year_fe = [col for col in zillow_reg.columns if (col.startswith('Year')) & (col != "Year-Month")]
month_fe = [col for col in zillow_reg.columns if col.startswith('Month')]
zillow_reg_lag=zillow_reg[[yvar]+xvars].dropna()

In [47]:
y = np.asarray(zillow_reg_lag[yvar])
X = sm.add_constant(zillow_reg_lag[xvars], prepend=True) * 1


In [49]:
mod = sm.OLS(y, X, hasconst=True)
res = mod.fit(cov_type='HC3')
print(res.summary())

                            OLS Regression Results                            
Dep. Variable:                      y   R-squared:                       1.000
Model:                            OLS   Adj. R-squared:                  1.000
Method:                 Least Squares   F-statistic:                 7.633e+05
Date:                Mon, 17 Mar 2025   Prob (F-statistic):               0.00
Time:                        17:27:43   Log-Likelihood:                -4222.4
No. Observations:                1180   AIC:                             8453.
Df Residuals:                    1176   BIC:                             8473.
Df Model:                           3                                         
Covariance Type:                  HC3                                         
                       coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------------------------------------------------
const                4.1250      1.006  