### Description
Author: T. Majidzadeh

Date Created: March 9, 2025

Date Updated: April 4, 2025

Purpose: Prepare data for Zillow rent indices, 2015-2019, treatment in Dec 2017. Prototype version assumes "affected" metros are those with at least 35% post-merger penetration rate and at least 10% share gain from the merger.

Update: Add-back rent index up to 2022-12-31 for graphing purposes.

In [2]:
import pandas as pd
import numpy as np
from datetime import datetime
import json
import os
import re

In [3]:
paths = {
    "zillow_raw": "..\\data\\zillow_data_raw\\",
    "zillow_reg": "..\\data\\zillow_reg_data\\"
}

In [4]:
first_ym = datetime.strptime('2015-01-01', '%Y-%m-%d').date()
treat_ym = datetime.strptime('2017-12-01', '%Y-%m-%d').date()
covid_ym = datetime.strptime('2020-01-01', '%Y-%m-%d').date()
end_ym = datetime.strptime('2023-01-01', '%Y-%m-%d').date()
lag_months = 1

def months_since_start(date, start_date):
    return (date.year - start_date.year) * 12 + (date.month - start_date.month)

def lag_term(df, num_months=1):
    sorted_df = df.sort_values(by=['RegionName', 'Year-Month'])
    shifted = sorted_df.groupby('RegionName')['ZORI'].shift(num_months)
    return shifted

In [5]:
# Assumes "affected" metros are those with at least 35% post-merger penetration rate and at least 10% share gain from the merger.

affected_msas = {
    "Atlanta, GA" : 1,
    "Dallas, TX" : 1,
    "Phoenix, AZ" : 1,
    "Denver, CO" : 1,
    "Tampa, FL" : 1,
    "Washington, DC" : 1,
    
    "Houston, TX" : 0,
    "Riverside, CA" : 0,
    "Las Vegas, NV" : 0,
    "Seattle, WA" : 0,
    "Philadelphia, PA" : 0,
    "Boston, MA" : 0,
    "Minneapolis, MN" : 0,
    "San Diego, CA" : 0,
    "Miami, FL" : 0,
    "San Francisco, CA" : 0,
    "Chicago, IL" : 0,
    "Detroit, MI" : 0,
    "Los Angeles, CA" : 0,
    "New York, NY" : 0
}

In [6]:
zillow_raw = pd.read_csv(
    paths["zillow_raw"]+"Metro_zori_uc_mfr_sm_month.csv"
)

In [7]:
zillow_long = zillow_raw.melt(
    id_vars=["RegionName", "StateName"],
    value_vars=zillow_raw.columns[5:],
    var_name="Year-Month",
    value_name="ZORI"
)
zillow_long['Year-Month'] = zillow_long['Year-Month'] \
    .apply(lambda x: datetime.strptime(x, '%Y-%m-%d').date())

In [8]:
zillow_long['AffectedCity'] = zillow_long['RegionName'].map(affected_msas)
zillow_long['AffectedTime'] = zillow_long['Year-Month'].apply(lambda x: (x >= treat_ym)*1)
zillow_long['Year'] = zillow_long['Year-Month'].apply(lambda x: x.year)
zillow_long['Month'] = zillow_long['Year-Month'].apply(lambda x: x.month)
zillow_long['TimeTrend'] = zillow_long['Year-Month'] \
    .apply(lambda x: months_since_start(x, first_ym))
zillow_long['ZORI-Lag1'] = lag_term(zillow_long, lag_months)

In [9]:
zillow_reg = zillow_long.dropna(subset='AffectedCity')
zillow_reg['AffectedCity'] = zillow_reg['AffectedCity'].astype(int)
zillow_reg = zillow_reg[zillow_reg['Year-Month'] < end_ym]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  zillow_reg['AffectedCity'] = zillow_reg['AffectedCity'].astype(int)


In [10]:
zillow_long.to_csv(paths['zillow_reg']+'zillow_data_long.csv')
zillow_reg.to_csv(paths['zillow_reg']+'zillow_data_reg_20250404.csv')
zillow_reg.to_pickle(paths['zillow_reg']+'zillow_data_reg_20250404.pkl')