In [1]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

# --- Load both association-by-quarter panels and flag their origin ----------
# has_hoa_data is 1 for rows from the Redfin file and 0 for rows from the MLS file.
hoa_df = pd.read_stata(
    '../../final_datasets/master_datasets/master_dataset_assoc_quarter_redfin.dta'
).assign(has_hoa_data=1)
mls_df = pd.read_stata(
    '../../final_datasets/master_datasets/master_dataset_assoc_quarter_mls.dta'
).assign(has_hoa_data=0)

# --- Restrict both panels to 2019Q1 through 2023Q2 (both bounds included) ---
hoa_df = hoa_df.loc[hoa_df['quarter'].between('2019Q1', '2023Q2')]
mls_df = mls_df.loc[mls_df['quarter'].between('2019Q1', '2023Q2')]

# --- County share of MLS rows within each quarter ---------------------------
# Row counts per (quarter, county), divided by that quarter's total row count.
county_counts = mls_df.groupby(['quarter', 'mm_fips_county_name_attom']).size()
quarter_totals = county_counts.groupby(level=0).transform('sum')
county_dist = (
    (county_counts / quarter_totals)
    .rename('county_fraction')
    .reset_index()
)

# --- Attach the MLS-derived county share to every row of both panels --------
# Left joins: rows whose (quarter, county) pair is absent from the MLS counts
# keep a missing county_fraction.
mls_df = mls_df.merge(
    county_dist, how='left', on=['quarter', 'mm_fips_county_name_attom']
)
hoa_df = hoa_df.merge(
    county_dist, how='left', on=['quarter', 'mm_fips_county_name_attom']
)

# --- Stack the HOA rows on top of the MLS rows ------------------------------
combined = pd.concat([hoa_df, mls_df], ignore_index=True)

combined.columns

Index(['assoc_name_final', 'category_reserve_assoc',
       'mm_fips_county_name_attom', 'imputed_freq', 'zip5_attom',
       'num_units_records_assoc', 'has_sb4d_disc_records_assoc',
       'zip5_owner_attom', 'primary_hoa_mgmt_attom_assoc',
       'add_primary_hoa_mgmt_attom_assoc', 'corp_mgmt_zip_attom_assoc',
       'corp_mgmt_city_attom_assoc', 'corp_mgmt_state_attom_assoc',
       'assoc_identifier_attom', 'num_units_attom_assoc',
       'nonprimary_zip_excorp_attom', 'nonprimary_state_excorp_attom',
       'frac_npexcorp_zip_attom_assoc', 'frac_npexcorp_state_attom_assoc',
       'frac_nonprim_zip_attom_assoc', 'frac_nonprim_state_attom_assoc',
       'frac_corp_own_attom_assoc', 'num_stories_final_assoc', 'treated_assoc',
       'miles_to_coast_assoc', 'three_miles_coast_assoc',
       'x_coord_attom_assoc', 'y_coord_attom_assoc', 'gym_redfin_assoc',
       'pool_redfin_assoc', 'spa_redfin_assoc', 'hot_tub_redfin_assoc',
       'sauna_redfin_assoc', 'spa_broad_redfin_assoc', 't

In [2]:
# Spot-check the merge keys, the county share and the source flag on the last rows.
combined.loc[:, ['mm_fips_county_name_attom', 'quarter', 'county_fraction', 'has_hoa_data']].tail()

Unnamed: 0,mm_fips_county_name_attom,quarter,county_fraction,has_hoa_data
95788,BROWARD,2021-04-01,0.218611,0
95789,BROWARD,2021-07-01,0.212592,0
95790,BROWARD,2023-04-01,0.200998,0
95791,BROWARD,2020-04-01,0.208461,0
95792,BROWARD,2022-04-01,0.194788,0


In [3]:
# Natural log of the median square footage. Non-positive and missing values are
# masked to NaN *before* taking the log, so no invalid-value warnings are raised.
# The float64 cast keeps the result dtype the same as the previous row-wise apply.
sqft_med = combined['sqft_med_attom_assoc']
combined['log_sqft_med_attom_assoc'] = np.log(sqft_med.where(sqft_med > 0).astype('float64'))

# Predictors of the propensity model, defined once so the complete-case filter
# and the design matrix can never drift apart.
features = ['num_stories_final_assoc', 'num_bldgs_final_assoc', 'num_units_final_assoc',
            'log_sqft_med_attom_assoc', 'list_price_sq_ft_assoc_qtr', 'county_fraction']

# One frame per quarter with columns:
# assoc_name_final, quarter, final_weight, has_hoa_data
weights = []

for q in combined['quarter'].unique():
    # Complete cases of this quarter only. The trailing .copy() makes df_q an
    # independent frame, so the columns added below are not written to a slice.
    df_q = combined.loc[combined['quarter'] == q].dropna(subset=features).copy()
    y_q = df_q['has_hoa_data']

    # A logit needs both outcomes present; with zero rows or a single class the
    # fit is not identified and any resulting weights would be meaningless.
    if y_q.nunique() < 2:
        raise ValueError(
            f"Quarter {q}: the propensity model needs rows with has_hoa_data == 1 "
            f"and has_hoa_data == 0 that have complete predictors; "
            f"found classes {sorted(y_q.unique().tolist())} in {len(df_q)} rows."
        )

    # Logistic regression of has_hoa_data on the predictors plus an intercept.
    X_q = sm.add_constant(df_q[features])
    model_q = sm.Logit(y_q, X_q).fit(disp=False)
    df_q['propensity_score'] = model_q.predict(X_q)

    # Inverse of the fitted probability for HOA rows; MLS rows get weight 0.
    df_q['final_weight'] = np.where(df_q['has_hoa_data'] == 1,
                                    1 / df_q['propensity_score'],
                                    0)

    weights.append(df_q[['assoc_name_final', 'quarter', 'final_weight', 'has_hoa_data']])

In [4]:
# Stack the per-quarter results into one frame.
weights_df = pd.concat(weights, ignore_index=True)

# Keep only rows flagged has_hoa_data == 1 and the three columns that are exported.
# A single .loc selection followed by .copy() yields an independent frame, so
# adding columns to hoa_weights_df does not raise SettingWithCopyWarning.
hoa_weights_df = weights_df.loc[
    weights_df['has_hoa_data'] == 1,
    ['assoc_name_final', 'quarter', 'final_weight'],
].copy()

# Distribution of the weights (displayed as the cell output).
hoa_weights_df['final_weight'].describe()

count    24835.000000
mean         3.076979
std          0.499027
min          1.032754
25%          2.789168
50%          3.024281
75%          3.327314
max         23.448078
Name: final_weight, dtype: float64

In [5]:
def datetime_to_stata_quarter(dt):
    """Return the number of quarters between 1960Q1 and the quarter containing ``dt``.

    ``dt`` is any object exposing ``year`` and ``month`` attributes
    (e.g. 2020-01-01). 1960Q1 maps to 0, 1960Q2 to 1, and so on.
    """
    years_since_1960 = dt.year - 1960
    quarter_of_year = (dt.month - 1) // 3  # 0 for Jan-Mar, ..., 3 for Oct-Dec
    return 4 * years_since_1960 + quarter_of_year

# Replace the datetime quarter with its quarters-since-1960Q1 integer.
# The chain builds a new frame (assign -> drop -> rename) instead of assigning a
# column into hoa_weights_df, which avoids SettingWithCopyWarning on a frame
# derived by slicing. Column order stays: assoc_name_final, final_weight, quarter.
hoa_weights_df = (
    hoa_weights_df
    .assign(quarter_stata=hoa_weights_df['quarter'].apply(datetime_to_stata_quarter))
    .drop(columns=['quarter'])
    .rename(columns={'quarter_stata': 'quarter'})
)

# Write the weights to a Stata file, without the pandas index.
hoa_weights_df.to_stata('../../final_datasets/master_datasets/hoa_redfin_weights.dta', write_index=False)