In [8]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
from sklearn.preprocessing import OneHotEncoder

# Inputs and shared literals for this cell.
HOA_DTA_PATH = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
MLS_DTA_PATH = '../../final_datasets/master_datasets/master_dataset_unit_obs_mls.dta'
QUARTER_START, QUARTER_END = '2019Q1', '2023Q2'
COUNTY_QUARTER_KEYS = ['quarter', 'mm_fips_county_name_attom']


def load_unit_obs(path, has_hoa_data):
    """Read one unit-level .dta file, keep the quarter window, and flag its source.

    Parameters
    ----------
    path : str
        Location of the Stata file to read.
    has_hoa_data : int
        Value written to the ``has_hoa_data`` column for every row of this file.

    Returns
    -------
    pandas.DataFrame
        Rows whose ``quarter`` lies in [QUARTER_START, QUARTER_END] (both ends
        inclusive), plus the ``has_hoa_data`` column.
    """
    raw = pd.read_stata(path)
    # NOTE(review): the bounds are quarter strings while the saved output shows
    # `quarter` holding dates (e.g. 2022-07-01); this relies on pandas parsing
    # '2019Q1' / '2023Q2' as the first day of those quarters.
    in_window = raw['quarter'].between(QUARTER_START, QUARTER_END)
    # Copy before inserting the flag: the saved output of this cell shows a
    # warning raised at the in-place `has_hoa_data` insert on the freshly read
    # frame (presumably pandas' fragmentation PerformanceWarning, for which a
    # copy is the documented remedy). The copy also detaches the filtered slice
    # from `raw`, so the insert cannot raise SettingWithCopyWarning.
    kept = raw[in_window].copy()
    kept['has_hoa_data'] = has_hoa_data
    return kept


# Load and filter data; rows from the Redfin file are flagged 1, MLS rows 0.
hoa_df = load_unit_obs(HOA_DTA_PATH, has_hoa_data=1)
mls_df = load_unit_obs(MLS_DTA_PATH, has_hoa_data=0)

# Compute county share per quarter from MLS:
# rows in (quarter, county) divided by all MLS rows in that quarter.
county_counts = mls_df.groupby(COUNTY_QUARTER_KEYS).size()
quarter_totals = county_counts.groupby(level=0).transform('sum')
county_dist = (
    (county_counts / quarter_totals)
    .rename('county_fraction')
    .reset_index()
)

# Merge county_fraction into MLS and HOA. `county_dist` has one row per
# (quarter, county), so validate='m:1' guards against accidental row duplication.
# HOA rows whose (quarter, county) never appears in MLS get NaN county_fraction.
mls_df = mls_df.merge(county_dist, on=COUNTY_QUARTER_KEYS, how='left', validate='m:1')
hoa_df = hoa_df.merge(county_dist, on=COUNTY_QUARTER_KEYS, how='left', validate='m:1')

# Combine datasets
combined = pd.concat([hoa_df, mls_df], ignore_index=True)

combined.columns

  hoa_df['has_hoa_data'] = 1


Index(['assoc_name_final', 'category_reserve_assoc',
       'mm_fips_county_name_attom', 'address_attom', 'building_address_attom',
       'reserve', 'last_obs_date', 'missing_tax_year', 'hoa_total_annual',
       'hoa_total_monthly',
       ...
       'property_age_assoc_qtr', 'property_age_effect_assoc_qtr', 'zip_qtr',
       'county_qtr', 'has_hoa_data', 'county_fraction', 'price_mls',
       'floor_size_mls', 'days_on_market_mls', 'first_price_mls'],
      dtype='object', length=116)

In [9]:
# Spot-check the last rows of the combined frame: merge keys, county share, source flag.
preview_columns = ['mm_fips_county_name_attom', 'quarter', 'county_fraction', 'has_hoa_data']
combined.tail()[preview_columns]

Unnamed: 0,mm_fips_county_name_attom,quarter,county_fraction,has_hoa_data
205608,BROWARD,2022-07-01,0.204915,0
205609,BROWARD,2021-01-01,0.242078,0
205610,BROWARD,2023-04-01,0.203998,0
205611,BROWARD,2022-10-01,0.224579,0
205612,BROWARD,2021-01-01,0.242078,0


In [10]:
# Log of the median square footage; non-positive and missing values become NaN.
# Cast to float64 first so the log is computed in double precision regardless of
# the stored dtype (matches what an element-wise Python-level np.log would give).
sqft_med = combined['sqft_med_attom_assoc'].astype('float64')
combined['log_sqft_med_attom_assoc'] = np.log(sqft_med.where(sqft_med > 0))

# Predictors of the propensity model, defined once so the null filter and the
# design matrix cannot drift apart.
PROPENSITY_FEATURES = [
    'num_stories_final_assoc', 'num_bldgs_final_assoc', 'num_units_final_assoc',
    'log_sqft_med_attom_assoc', 'listed_price_sq_ft', 'county_fraction',
]
WEIGHT_OUTPUT_COLUMNS = ['address_attom', 'quarter', 'final_weight', 'has_hoa_data']

weights = []

# Only the columns the model and the output need are carried into the loop.
model_columns = ['address_attom', 'quarter', 'has_hoa_data'] + PROPENSITY_FEATURES

# One logit per quarter: P(has_hoa_data == 1 | predictors).
# groupby(sort=False) visits quarters in order of first appearance (the order
# `.unique()` would give) in a single pass, and drops rows with a missing quarter.
for q, df_q in combined[model_columns].groupby('quarter', sort=False):
    # Ensure non-null entries for predictors; copy so the column inserts below
    # act on an independent frame.
    df_q = df_q.dropna(subset=PROPENSITY_FEATURES).copy()
    y_q = df_q['has_hoa_data']

    # A logit needs both outcomes present; an empty or one-sided quarter would
    # otherwise fail inside the fit with an opaque error.
    if y_q.nunique() < 2:
        print(f'Skipping quarter {q}: need both has_hoa_data classes after dropping nulls '
              f'(rows left: {len(df_q)})')
        continue

    X_q = sm.add_constant(df_q[PROPENSITY_FEATURES])

    model_q = sm.Logit(y_q, X_q).fit(disp=False)
    df_q['propensity_score'] = model_q.predict(X_q)
    # Inverse-propensity weight for rows flagged has_hoa_data == 1; other rows get 0.
    df_q['final_weight'] = np.where(df_q['has_hoa_data'] == 1,
                                    1 / df_q['propensity_score'],
                                    0)

    weights.append(df_q[WEIGHT_OUTPUT_COLUMNS])

In [11]:
# Stack the per-quarter weight frames into one table.
weights_df = pd.concat(weights, ignore_index=True)

# Keep only rows flagged has_hoa_data == 1 and the three columns that are used
# afterwards. A single .loc selection plus .copy() yields an independent frame,
# so adding columns to it later does not raise SettingWithCopyWarning.
hoa_weights_df = weights_df.loc[
    weights_df['has_hoa_data'] == 1,
    ['address_attom', 'quarter', 'final_weight'],
].copy()

# Summary of the weights (only a cell's last expression is displayed).
hoa_weights_df['final_weight'].describe()

count    39647.000000
mean         3.911568
std          0.794954
min          1.051570
25%          3.460296
50%          3.752018
75%          4.174561
max         69.450084
Name: final_weight, dtype: float64

In [12]:
def datetime_to_stata_quarter(dt):
    """Return how many quarters lie between 1960Q1 and the quarter containing ``dt``.

    ``dt`` is any object exposing ``year`` and ``month``. 1960Q1 maps to 0 and,
    for example, 2020-01-01 maps to 240; dates before 1960 give negative values.
    """
    years_since_1960 = dt.year - 1960
    quarter_of_year = (dt.month - 1) // 3  # 0 for Jan-Mar, ..., 3 for Oct-Dec
    return years_since_1960 * 4 + quarter_of_year

# Replace the date-valued `quarter` column with its integer quarter index
# (quarters since 1960Q1, via datetime_to_stata_quarter). Building the result as
# one chain returns a new frame instead of inserting a column into the filtered
# slice in place, which avoids SettingWithCopyWarning. The converted column ends
# up last, named `quarter`.
hoa_weights_df = (
    hoa_weights_df
    .assign(quarter_stata=hoa_weights_df['quarter'].apply(datetime_to_stata_quarter))
    .drop(columns=['quarter'])
    .rename(columns={'quarter_stata': 'quarter'})
)

# Write the weights to a Stata file, without the pandas index.
hoa_weights_df.to_stata('../../final_datasets/master_datasets/hoa_redfin_weights_unit.dta', write_index=False)