In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

# -------------------------------------------------
# Helper functions
# -------------------------------------------------

def q_to_start(q):
    """
    Convert quarter-like data to quarter-start timestamps.

    Parameters
    ----------
    q : Series, Index or array-like
        Either period-typed data (e.g. a ``PeriodIndex`` or a Series with a
        period dtype), or anything ``pd.to_datetime`` can parse (datetimes,
        date strings). Entries that cannot be parsed become ``NaT``.
        NOTE(review): raw integer Stata %tq codes are not decoded here; this
        presumably relies on ``pd.read_stata`` having already converted them
        to datetimes -- confirm against the loader.

    Returns
    -------
    DatetimeIndex
        One timestamp per input element, in input order. For datetime-like
        input this is the first day of the containing calendar quarter; for
        period input it is the start of each period.
    """
    # Period-typed input: convert the *values*. (Series.to_timestamp would
    # try to convert the Series index instead and fail on a RangeIndex.)
    if isinstance(getattr(q, 'dtype', None), pd.PeriodDtype):
        return pd.PeriodIndex(q).to_timestamp(how='start')

    dt = pd.to_datetime(q, errors='coerce')
    return pd.PeriodIndex(dt, freq='Q').to_timestamp(how='start')


def wmean(x, w):
    """
    Return the weighted average of `x` using weights `w`.

    Pairs where the value or the weight is non-finite, or where the weight
    is not strictly positive, are ignored. Returns NaN when no usable pair
    remains.
    """
    values = np.asarray(x, dtype="float64")
    weights = np.asarray(w, dtype="float64")

    usable = (weights > 0) & np.isfinite(weights) & np.isfinite(values)
    if usable.any():
        kept_w = weights[usable]
        return (values[usable] * kept_w).sum() / kept_w.sum()
    return np.nan


def exclude_mia_broward(df, county_col):
    """
    Return a copy of `df` without Miami-Dade and Broward rows.

    County names in `county_col` are compared after casting to str,
    upper-casing, trimming whitespace and removing a trailing 'COUNTY'.
    """
    names = df[county_col].astype(str)
    names = names.str.upper().str.strip()
    names = names.str.replace(r'\s*COUNTY$', '', regex=True)

    excluded = names.isin(['MIAMI-DADE', 'MIAMI DADE', 'BROWARD'])
    return df.loc[~excluded].copy()


# -------------------------------------------------
# SHARED Δ FUNCTION
# -------------------------------------------------

def _grouped_wmean(df, keys, x_col, w_col):
    """
    Weighted mean of `x_col` within each group of `keys`.

    Uses the same rules as `wmean`: rows with a non-finite value, a
    non-finite weight or a weight <= 0 are ignored, and a group with no
    usable row yields NaN. Returns a Series indexed by the group keys.
    """
    x = pd.to_numeric(df[x_col], errors='coerce').astype('float64')
    w = pd.to_numeric(df[w_col], errors='coerce').astype('float64')
    valid = np.isfinite(x) & np.isfinite(w) & (w > 0)

    tmp = df[keys].copy()
    tmp['_num'] = (x * w).where(valid)
    tmp['_den'] = w.where(valid)
    # min_count=1 keeps all-invalid groups as NaN instead of 0/0.
    sums = tmp.groupby(keys)[['_num', '_den']].sum(min_count=1)
    return sums['_num'] / sums['_den']


def compute_delta(
    rf,
    *,
    assoc_col,
    county_col,
    zip_col,
    q_col,
    stories_col,
    hoa_psf_col,
    w_col,
    covars,
    pre_start,
    pre_end,
    post_start,
    post_end,
    ctrl_story_max=2,
    risk_source=None,      # 'fema', 'firststreet', or None
    fema_col=None,
    fs_col=None,
    verbose=True
):
    """
    Compute association-level Δ_i in percentage points (pp).

    Exact definition:
        1. Run a weighted regression in the Redfin panel:

              ln(HOA/ft²)_a,q  ~  covariates_a,q
                                      + risk dummies
                                      + ZIP fixed effects
                                      + quarter fixed effects

           using WLS with weights = `w_col`.

        2. Take the **residuals** from this regression.

        3. For each county c and quarter q, compute the weighted mean residual
           among the **control buildings** (stories <= `ctrl_story_max`):

              r̄_c,q = weighted mean of residuals among controls in county c, q.

        4. For each association i, compute *association* and *control* means
           in the pre and post windows:

              ā_i,pre  = w-avg residual for assoc i in pre window
              ā_i,post = w-avg residual for assoc i in post window
              c̄_i,pre  = w-avg r̄_c,q (county-control mean) for quarters
                           of assoc i in pre window
              c̄_i,post = same in post window

        5. Define

              Δ_i = 100 * [ (ā_i,post − ā_i,pre)
                           −(c̄_i,post − c̄_i,pre) ]

           i.e., percentage-point change in residual log HOA for i
           relative to the change for controls in the same counties.

    Parameters
    ----------
    rf : DataFrame
        Panel with the columns named below. It is copied, not modified.
    assoc_col, county_col, zip_col, q_col, stories_col, hoa_psf_col, w_col : str
        Column names for association id, county, ZIP, quarter, number of
        stories, HOA fee per square foot and the observation weight.
    covars : list of str
        Candidate covariates; names absent from `rf` are silently skipped.
    pre_start, pre_end, post_start, post_end : Timestamp-like
        Inclusive bounds of the pre and post windows, compared against the
        quarter-start timestamps of `q_col`. A quarter falling in both
        windows is labelled 'pre'.
    ctrl_story_max : number
        Buildings with at most this many stories form the control group.
    risk_source : {'fema', 'firststreet', None}
        Which risk column (`fema_col` or `fs_col`) enters as dummies.
    verbose : bool
        Print row counts and the Δ distribution.

    Returns
    -------
    delta : DataFrame with columns [assoc_col, 'delta_pct']
    res1  : statsmodels WLS results object from step (1).

    Raises
    ------
    ValueError
        If no row survives the filters, so the regression cannot be run.
    """
    rf = rf.copy()

    # --- quarters as quarter-start timestamps
    rf[q_col] = q_to_start(rf[q_col])

    # --- log HOA per sq ft; non-positive fees become NaN rather than -inf,
    #     which would otherwise pass a notna() filter and poison the fit
    hoa = pd.to_numeric(rf[hoa_psf_col], errors='coerce')
    rf['ln_hoa_psf'] = np.log(hoa.where(hoa > 0))

    # --- pre / post indicators (object column; None outside both windows).
    #     Built by hand because np.select cannot combine string choices with
    #     a float NaN default on recent NumPy.
    quarter = rf[q_col]
    in_pre = ((quarter >= pre_start) & (quarter <= pre_end)).to_numpy()
    in_post = ((quarter >= post_start) & (quarter <= post_end)).to_numpy()
    period = np.full(len(rf), None, dtype=object)
    period[in_post] = 'post'
    period[in_pre] = 'pre'          # 'pre' wins if the windows overlap
    rf['period'] = period

    # --- which covariates / risk column exist
    covars_present = [c for c in covars if c in rf.columns]

    risk_col, risk_prefix = None, None
    if risk_source == 'fema' and fema_col is not None and fema_col in rf.columns:
        risk_col, risk_prefix = fema_col, 'fema'
    elif risk_source == 'firststreet' and fs_col is not None and fs_col in rf.columns:
        risk_col, risk_prefix = fs_col, 'fs'

    base_cols = [assoc_col, county_col, zip_col, q_col,
                 'period', 'ln_hoa_psf', w_col, stories_col]
    risk_cols = [risk_col] if risk_col is not None else []
    # dict.fromkeys de-duplicates while keeping order, so a column listed
    # twice does not produce duplicate labels in the subset.
    needed = list(dict.fromkeys(base_cols + covars_present + risk_cols))

    # --- subset to pre/post rows with valid ln(HOA)
    r0 = rf.loc[
        rf['period'].isin(['pre', 'post']) & rf['ln_hoa_psf'].notna(),
        needed
    ].copy()

    if verbose:
        print(f"[Δ] Raw rows in pre/post windows: {len(r0):,}")

    # --- build X matrix for Stage 1: covariates + risk + ZIP FE + Q FE.
    #     Dummies are requested as float so the design matrix has a single
    #     numeric dtype (bool dummies mixed with float columns turn into an
    #     object array, which statsmodels rejects).
    parts = []

    if covars_present:
        parts.append(r0[covars_present].apply(pd.to_numeric, errors='coerce'))

    if risk_col is not None:
        parts.append(pd.get_dummies(r0[risk_col], prefix=risk_prefix,
                                    drop_first=True, dtype=float))

    # ZIP FE
    parts.append(
        pd.get_dummies(
            r0[zip_col].astype(str).str.upper().str.strip(),
            prefix='zip', drop_first=True, dtype=float
        )
    )
    # Quarter FE
    parts.append(pd.get_dummies(r0[q_col], prefix='q', drop_first=True, dtype=float))

    X = pd.concat(parts, axis=1).astype('float64')
    X = sm.add_constant(X, has_constant='add')

    y = r0['ln_hoa_psf'].astype(float)
    # Missing / infinite weights default to 1.0 in the regression.
    w = pd.to_numeric(r0[w_col], errors='coerce').replace([np.inf, -np.inf], np.nan).fillna(1.0)

    # Keep rows with finite regressors and outcome and a non-negative weight
    # (a negative weight has no square root and would turn the fit into NaN).
    ok = np.isfinite(X).all(axis=1) & np.isfinite(y) & (w >= 0)
    r1 = r0.loc[ok].copy()
    X1, y1, w1 = X.loc[ok], y.loc[ok], w.loc[ok]

    if verbose:
        print(f"[Δ] Rows used in WLS: {len(r1):,} (dropped {len(r0)-len(r1):,})")

    if r1.empty:
        raise ValueError("compute_delta: no usable rows in the pre/post windows for the Stage-1 WLS.")

    res1 = sm.WLS(y1, X1, weights=w1).fit()
    r1['resid'] = res1.resid

    # --- controls: county×quarter weighted mean residuals.
    # NOTE: the means below use the raw `w_col` values, so rows with a
    # missing weight are skipped here although they entered the WLS with
    # weight 1.0 (same as the original behaviour).
    is_ctrl = pd.to_numeric(r1[stories_col], errors='coerce') <= ctrl_story_max
    ctrl = r1.loc[is_ctrl]
    if ctrl.empty:
        print("[Δ] WARNING: no controls; Δ will be empty.")
        return pd.DataFrame({assoc_col: [], 'delta_pct': []}), res1

    c_means = (_grouped_wmean(ctrl, [county_col, q_col], 'resid', w_col)
               .rename('rbar_c')
               .reset_index())
    r1 = r1.merge(c_means, on=[county_col, q_col], how='left')

    # --- association-level pre/post means of own residual and control mean
    by = [assoc_col, 'period']
    ac_means = pd.concat(
        {
            'a_bar': _grouped_wmean(r1, by, 'resid', w_col),
            'c_bar': _grouped_wmean(r1, by, 'rbar_c', w_col),
        },
        axis=1,
    )

    # one row per association, columns like 'a_bar_pre', 'c_bar_post'
    wide = ac_means.unstack('period')
    wide.columns = [f"{stat}_{per}" for stat, per in wide.columns]
    wide = wide.reset_index()

    required = ['a_bar_pre', 'a_bar_post', 'c_bar_pre', 'c_bar_post']
    for c in required:
        if c not in wide.columns:
            wide[c] = np.nan

    # require all four means to compute Δ
    wide = wide.loc[wide[required].notna().all(axis=1)].copy()

    if verbose:
        print(f"[Δ] Associations with both pre & post residual means + county controls: {len(wide):,}")

    wide['delta_pct'] = 100.0 * (
        (wide['a_bar_post'] - wide['a_bar_pre']) -
        (wide['c_bar_post'] - wide['c_bar_pre'])
    )

    delta = wide[[assoc_col, 'delta_pct']].copy()

    if verbose and not delta.empty:
        print("\n[Δ] Distribution (pp):")
        print(delta['delta_pct'].describe())

    return delta, res1


In [2]:
# ============================================================
# CAPITALIZATION: MLS prices/DOM on Δ_i  (shared Δ method) - post period; effective year built 
# ============================================================
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm

# ---------------- PATHS ----------------
# Input .dta files (relative paths) and the folder that receives this cell's
# CSV / LaTeX exports; the folder is created if it does not exist.
PATH_DTA_REDFIN = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
PATH_WGTS       = '../../final_datasets/master_datasets/hoa_redfin_weights_unit.dta'
PATH_DTA_MLS    = '../../final_datasets/master_datasets/master_dataset_price_dom_unit_obs_mls_redfin.dta'
OUTPUT_DIR      = 'exports/delta_all_vars'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- CORE COLS ----------------
# Column names shared by the datasets below. ADD_COL + QUARTER_COL is the key
# used to merge weights; ASSOC_COL is the key used to merge Δ_i onto MLS rows.
ADD_COL     = 'address_attom'
ASSOC_COL   = 'assoc_name_final'
COUNTY_COL  = 'mm_fips_county_name_attom'
ZIP_COL     = 'zip5_attom'
QUARTER_COL = 'quarter'
HOA_PSF_COL = 'hoa_sq_ft'
W_COL       = 'final_weight'
STORIES_COL = 'num_stories_final_assoc'

# MLS outcomes
MLS_LIST_PSF = 'listed_price_sq_ft'
MLS_SOLD_PSF = 'sold_price_sq_ft'
MLS_DOM      = 'dom'

# ---------------- WINDOWS ----------------
# Δ_i uses 2019–2022Q1 (pre) vs 2025 (post), consistent with lending spec
PRE_START   = pd.Timestamp('2019-01-01')
PRE_END     = pd.Timestamp('2022-03-31')   # through 2022Q1
POST_START  = pd.Timestamp('2025-01-01')
POST_END    = pd.Timestamp('2025-12-31')

# MLS capitalization regressions use this **post** window (your existing choice)
# NOTE: despite the `MLS_PRE_` prefix, in this cell the two constants bound the
# post-period sample (2022Q3–2024Q4); they are compared as 'YYYYQx' strings.
MLS_PRE_START_Q = '2022Q3'
MLS_PRE_END_Q   = '2024Q4'

# ---------------- COVARIATES & RISK ----------------
# Stage-1 regressors; compute_delta skips any name not present in the panel.
COVARS = [
    'gym_redfin_assoc','pool_redfin_assoc','spa_broad_redfin_assoc',
    'tennis_redfin_assoc','golf_redfin_assoc','garage_redfin_assoc',
    'boat_redfin_assoc','elevator_redfin_assoc','view_redfin_assoc',
    'senior_community_redfin_assoc','property_age_effect_assoc_qtr',
    'frac_npexcorp_state_attom_assoc','frac_corp_own_attom_assoc',
    'corp_mgmt_city_attom_assoc'
]


    
RISK_SOURCE = 'fema'          # 'fema', 'firststreet', or None
FEMA_COL    = 'fema_flood_risk_bucket_assoc'
FS_COL      = 'firststreet_risk_cat_assoc'

EXCLUDE_MIAMI_BROWARD = True

# ---------------- Stage 1: load Redfin + weights, compute Δ_i ----------------
# Load Redfin HOA panel
rf = pd.read_stata(PATH_DTA_REDFIN)

# Load weights and harmonize quarter keys
wg = pd.read_stata(PATH_WGTS)[[ADD_COL, QUARTER_COL, W_COL]].copy()
wg[QUARTER_COL] = q_to_start(wg[QUARTER_COL])
# NOTE(review): a weight missing inside the weights file becomes 0 here, while
# a panel row with no match in the weights file gets 1.0 after the merge below
# -- confirm that this asymmetry is intended.
wg[W_COL] = pd.to_numeric(wg[W_COL], errors='coerce').fillna(0)

# Aggregate any duplicate address×quarter weights (summing them) so the merge
# key below is unique
if wg.duplicated([ADD_COL, QUARTER_COL]).any():
    wg = wg.groupby([ADD_COL, QUARTER_COL], as_index=False)[W_COL].sum()

# Harmonize quarters in rf and merge weights
# (validate='m:1' makes pandas raise if the weight keys are not unique)
rf[QUARTER_COL] = q_to_start(rf[QUARTER_COL])
rf = rf.merge(wg, on=[ADD_COL, QUARTER_COL], how='left', validate='m:1')
rf[W_COL] = pd.to_numeric(rf[W_COL], errors='coerce').fillna(1.0)

# Exclude Miami-Dade & Broward
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in rf.columns:
    rf = exclude_mia_broward(rf, COUNTY_COL)

# Compute Δ_i with the shared method
delta, res_stage1 = compute_delta(
    rf,
    assoc_col=ASSOC_COL,
    county_col=COUNTY_COL,
    zip_col=ZIP_COL,
    q_col=QUARTER_COL,
    stories_col=STORIES_COL,
    hoa_psf_col=HOA_PSF_COL,
    w_col=W_COL,
    covars=COVARS,
    pre_start=PRE_START,
    pre_end=PRE_END,
    post_start=POST_START,
    post_end=POST_END,
    ctrl_story_max=2,
    risk_source=RISK_SOURCE,
    fema_col=FEMA_COL,
    fs_col=FS_COL,
    verbose=True
)
# delta has columns: [ASSOC_COL, 'delta_pct']
delta_out_path = os.path.join(OUTPUT_DIR, "delta_assoc_unit_effect_age.csv")

# Ensure clean ordering and no index
delta[[ASSOC_COL, 'delta_pct']].to_csv(
    delta_out_path,
    index=False
)

print(f"Exported delta_assoc to {delta_out_path}")

# ---------------- Stage 2: MLS capitalization regressions ----------------
def ensure_numeric_df(X):
    """
    Coerce every non-numeric column of `X` to numeric, in place.

    Values that cannot be parsed become NaN. The same DataFrame object is
    returned for convenience.
    """
    to_convert = [col for col in X.columns
                  if not np.issubdtype(X[col].dtype, np.number)]
    for col in to_convert:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    return X

def _to_qstr(q):
    """Convert MLS quarter variable to 'YYYYQx' string."""
    return pd.PeriodIndex(q, freq='Q').astype(str).str.upper()

# Load MLS dataset
mls = pd.read_stata(PATH_DTA_MLS).copy()

# Restrict to treated group: 3+ story associations
# (rows whose story count cannot be parsed as a number are dropped too)
mls = mls[pd.to_numeric(mls[STORIES_COL], errors='coerce') >= 3].copy()
# Quarter becomes a 'YYYYQx' string, so the window filter below is a
# lexicographic string comparison.
mls[QUARTER_COL] = _to_qstr(mls[QUARTER_COL])

# Exclude Miami-Dade & Broward to match Stage 1
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in mls.columns:
    mls = exclude_mia_broward(mls, COUNTY_COL)

# Keep only quarters inside [MLS_PRE_START_Q, MLS_PRE_END_Q], inclusive.
# NOTE: in this cell that range is the post window (2022Q3–2024Q4) even though
# the constants and the `mls_pre` frame carry a "pre" name.
mls_pre = mls[
    (mls[QUARTER_COL] >= MLS_PRE_START_Q) &
    (mls[QUARTER_COL] <= MLS_PRE_END_Q)
].copy()

# Merge Δ_i onto MLS panel (inner join → only associations with Δ)
mls_pre = mls_pre.merge(delta, on=ASSOC_COL, how='inner')

# --- build log outcomes ---

def ln_pos(df, col_in, col_out):
    """
    Return a copy of the rows of `df` where `col_in` is strictly positive,
    with the natural log of `col_in` stored in `col_out`.

    `col_in` is coerced to numeric first, so unparseable entries are dropped
    along with zeros and negatives. The input frame is left untouched.
    """
    values = pd.to_numeric(df[col_in], errors='coerce')
    positive = values > 0

    kept = df[positive].copy()
    kept[col_out] = np.log(values[positive].astype(float))
    return kept

# One estimation frame per outcome. Each starts from its own copy of mls_pre,
# so the three samples are filtered independently of one another.
pre_list = ln_pos(mls_pre.copy(), MLS_LIST_PSF, 'ln_list_psf')
pre_sold = ln_pos(mls_pre.copy(), MLS_SOLD_PSF, 'ln_sold_psf')

# Days on market: keep non-negative values (zero allowed) and use log(1 + DOM)
pre_dom = mls_pre.copy()
pre_dom = pre_dom[pd.to_numeric(pre_dom[MLS_DOM], errors='coerce') >= 0].copy()
pre_dom['ln_dom1p'] = np.log1p(pd.to_numeric(pre_dom[MLS_DOM], errors='coerce').astype(float))

# --- design matrix: Δ_i + ZIP FE + quarter FE ---

def design_cap(df, ycol):
    """
    Build the capitalization design: outcome on Δ_i + ZIP FE + quarter FE.

    Parameters
    ----------
    df : DataFrame
        Must contain `ycol`, 'delta_pct', and the columns named by the
        module-level ZIP_COL and QUARTER_COL.
    ycol : str
        Name of the outcome column.

    Returns
    -------
    y : Series
        Outcome for the usable rows.
    X : DataFrame
        Constant, 'delta_pct', ZIP dummies and quarter dummies (first level
        of each dropped), all as float.
    groups : Series
        Normalised ZIP label per usable row, for clustering.

    Rows with a missing or non-finite outcome or regressor are dropped.
    """
    y = pd.to_numeric(df[ycol], errors='coerce').astype(float)

    # Same ZIP normalisation for the fixed effects and the cluster labels.
    zip_key = df[ZIP_COL].astype(str).str.upper().str.strip()

    # dtype=float: bool dummies next to the float Δ column would give an
    # object-dtype design matrix, which statsmodels rejects.
    fe_zip = pd.get_dummies(zip_key, prefix='zip', drop_first=True, dtype=float)
    # Quarter FE (using the 'YYYYQx' strings)
    fe_q = pd.get_dummies(df[QUARTER_COL], prefix='q', drop_first=True, dtype=float)

    delta_x = pd.to_numeric(df['delta_pct'], errors='coerce').astype(float)
    X = pd.concat([delta_x.to_frame('delta_pct'), fe_zip, fe_q], axis=1)

    m = np.isfinite(y) & np.isfinite(X).all(axis=1)
    Xc = sm.add_constant(X.loc[m], has_constant='add')

    groups = zip_key.loc[m]  # cluster by ZIP
    return y.loc[m], Xc, groups

def run_cap(df, y_name, y_label):
    """
    Fit unweighted OLS of `y_name` on the design from `design_cap`, with
    standard errors clustered on the groups that `design_cap` returns.

    Prints a one-line header and the statsmodels summary. Returns the fitted
    results, or None (after printing a message) when no observation is usable.
    """
    endog, exog, clusters = design_cap(df, y_name)

    if endog.empty:
        print(f"[MLS] {y_label}: no usable observations.")
        return None

    model = sm.OLS(endog, exog)
    fit = model.fit(cov_type='cluster', cov_kwds={'groups': clusters})

    header = (f"\n[MLS PRE] {y_label} on Δ_i (pp) + ZIP FE + quarter FE, "
              f"unweighted | n={int(fit.nobs)} R²={fit.rsquared:.3f}")
    print(header)
    print(fit.summary())
    return fit

# Fit the three capitalization regressions; each result is None when its
# sample has no usable observations.
res_list = run_cap(pre_list, 'ln_list_psf', 'ln(list/ft²)')
res_sold = run_cap(pre_sold, 'ln_sold_psf', 'ln(sold/ft²)')
res_dom  = run_cap(pre_dom,  'ln_dom1p',   'ln(1+DOM)')


# ---------------- LaTeX body exporter (3 outcomes, Δ_i row only) ----------------
def stars(p):
    """
    Map a p-value to significance stars.

    '***' for p < 0.01, '**' for p < 0.05, '*' for p < 0.10, otherwise ''.
    None and non-finite p-values also give ''.
    """
    if p is None or not np.isfinite(p):
        return ''
    for cutoff, mark in ((0.01, '***'), (0.05, '**'), (0.10, '*')):
        if p < cutoff:
            return mark
    return ''

def coef_se_line(res, name, scale=1.0, dec=4):
    """
    Return ('b***', '(se)') for regressor `name`.

    Parameters
    ----------
    res : fitted results object exposing `params`, `bse` and `pvalues`
        indexable by regressor name.
    name : str
        Regressor to report.
    scale : float
        Multiplier applied to both the coefficient and its standard error.
    dec : int
        Number of decimals in the formatted output.

    Returns
    -------
    (str, str)
        Coefficient with significance stars and the standard error in
        parentheses. Both are empty strings when the value cannot be looked
        up or converted (e.g. `res` is None or `name` is not a regressor).
    """
    # Only lookup / conversion failures mean "leave the cell blank"; any
    # other exception is a genuine bug and is allowed to propagate.
    try:
        b = float(res.params[name]) * scale
        se = float(res.bse[name]) * scale
        p = float(res.pvalues[name])
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        return "", ""
    return f"{b:.{dec}f}{stars(p)}", f"({se:.{dec}f})"

def export_body_threecols(res_list, res_sold, res_dom,
                          out_path,
                          var_rows,
                          col_labels,
                          notes=None,
                          fe_markers=True,
                          dec=4):
    """
    Write the body of a three-column LaTeX results table to `out_path`.

    Only the table rows are written (no \\begin{tabular} wrapper). Columns
    are, in order, `res_list`, `res_sold` and `res_dom`; a result that is
    None leaves its column blank. `var_rows` is a list of
    (param_name, row_label, scale) tuples, one coefficient/SE row pair each.
    After the coefficients come R², the observation count and, if
    `fe_markers` is true, a fixed-effects marker row. `notes`, when given,
    is appended verbatim after a \\midrule. If all three results are None,
    the file contains just a '% empty' comment.
    """
    fits = (res_list, res_sold, res_dom)
    row_end = r" \\"

    # nothing to report: leave a placeholder file and stop
    if all(fit is None for fit in fits):
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write('% empty\n')
        print(f"[LaTeX] wrote empty {out_path}")
        return

    def cell_pair(fit, name, scale):
        # (coefficient, standard error) strings, blank for a missing model
        if fit is None:
            return "", ""
        return coef_se_line(fit, name, scale=scale, dec=dec)

    def r2_text(fit):
        try:
            return f"{fit.rsquared:.3f}"
        except Exception:
            return ""

    def nobs_text(fit):
        try:
            return str(int(round(fit.nobs)))
        except Exception:
            return ""

    def table_row(label, cells):
        return label + " & " + " & ".join(cells) + row_end

    # header row
    body = [table_row("", col_labels), r"\midrule"]

    # coefficient rows: estimates on one line, standard errors underneath
    for name, label, scale in var_rows:
        pairs = [cell_pair(fit, name, scale) for fit in fits]
        body.append(table_row(label, [coef for coef, _ in pairs]))
        body.append(table_row("", [se for _, se in pairs]))
        body.append(r"\addlinespace")

    # fit stats
    body.append(table_row(r"$R^2$", [r2_text(fit) for fit in fits]))
    body.append(table_row("Observations", [nobs_text(fit) for fit in fits]))
    if fe_markers:
        marks = ["" if fit is None else "Yes" for fit in fits]
        body.append(table_row(r"ZIP and Quarter FE", marks))

    if notes:
        body.extend([r"\midrule", notes])

    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write("\n".join(body) + "\n")

    print(f"[LaTeX] wrote {out_path}")


# rows to report (just Δ_i): (parameter name, LaTeX row label, scale factor)
rows_common = [('delta_pct', r'$\Delta_i$ (pp)', 1)]

# column headers in the LaTeX table
col_labels = [
    r'Listed Price',
    r'Sold Price',
    r'Days on Market'
]

# Output file for this cell's (post-window) table body
latex_path = os.path.join(OUTPUT_DIR, 'cap_mls_delta_3outcomes_body_post_unit_effect.tex')
# The note spans 4 columns: the row-label column plus the three outcomes.
export_body_threecols(
    res_list,
    res_sold,
    res_dom,
    latex_path,
    rows_common,
    col_labels,
    notes=(r"\multicolumn{4}{l}{ZIP-clustered SE in parentheses. "
           r"\sym{*} $p<0.10$, \sym{**} $p<0.05$, \sym{***} $p<0.01$} \\")
)

print("\n[Done] LaTeX body saved in", latex_path)


[Δ] Raw rows in pre/post windows: 20,088
[Δ] Rows used in WLS: 12,445 (dropped 7,643)
[Δ] Associations with both pre & post residual means + county controls: 965

[Δ] Distribution (pp):
count    965.000000
mean       6.002003
std       31.283355
min     -242.217475
25%       -9.423319
50%        6.144484
75%       21.402864
max      174.812632
Name: delta_pct, dtype: float64
Exported delta_assoc to exports/delta_all_vars\delta_assoc_unit_effect_age.csv

[MLS PRE] ln(list/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=5601 R²=0.721
                            OLS Regression Results                            
Dep. Variable:            ln_list_psf   R-squared:                       0.721
Model:                            OLS   Adj. R-squared:                  0.711
Method:                 Least Squares   F-statistic:                     2055.
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          2.70e-173
Time:                        15:10:10   Log-Likelihood:     




[MLS PRE] ln(sold/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=5035 R²=0.734
                            OLS Regression Results                            
Dep. Variable:            ln_sold_psf   R-squared:                       0.734
Model:                            OLS   Adj. R-squared:                  0.724
Method:                 Least Squares   F-statistic:                     2495.
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          1.74e-180
Time:                        15:10:10   Log-Likelihood:                -433.49
No. Observations:                5035   AIC:                             1233.
Df Residuals:                    4852   BIC:                             2427.
Df Model:                         182                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------




[MLS PRE] ln(1+DOM) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=5362 R²=0.112
                            OLS Regression Results                            
Dep. Variable:               ln_dom1p   R-squared:                       0.112
Model:                            OLS   Adj. R-squared:                  0.081
Method:                 Least Squares   F-statistic:                     73.31
Date:                Sat, 17 Jan 2026   Prob (F-statistic):           1.06e-56
Time:                        15:10:10   Log-Likelihood:                -9724.5
No. Observations:                5362   AIC:                         1.982e+04
Df Residuals:                    5179   BIC:                         2.102e+04
Df Model:                         182                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------



In [3]:
# ============================================================
# CAPITALIZATION: MLS prices/DOM on Δ_i  (shared Δ method) - pre period; effective year built 
# ============================================================
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm

# ---------------- PATHS ----------------
# Input .dta files (relative paths) and the folder that receives this cell's
# LaTeX export; the folder is created if it does not exist.
# NOTE(review): PATH_DTA_MLS below is the same file as PATH_DTA_REDFIN, so
# Stage 2 of this cell reads the Redfin unit-observation panel rather than a
# separate MLS file -- confirm this is intended for the pre-period sample.
PATH_DTA_REDFIN = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
PATH_WGTS       = '../../final_datasets/master_datasets/hoa_redfin_weights_unit.dta'
PATH_DTA_MLS    = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
OUTPUT_DIR      = 'exports/delta_all_vars'
os.makedirs(OUTPUT_DIR, exist_ok=True)

# ---------------- CORE COLS ----------------
# Column names shared by the datasets below. ADD_COL + QUARTER_COL is the key
# used to merge weights; ASSOC_COL is the key used to merge Δ_i onto MLS rows.
ADD_COL     = 'address_attom'
ASSOC_COL   = 'assoc_name_final'
COUNTY_COL  = 'mm_fips_county_name_attom'
ZIP_COL     = 'zip5_attom'
QUARTER_COL = 'quarter'
HOA_PSF_COL = 'hoa_sq_ft'
W_COL       = 'final_weight'
STORIES_COL = 'num_stories_final_assoc'

# MLS outcomes
MLS_LIST_PSF = 'listed_price_sq_ft'
MLS_SOLD_PSF = 'sold_price_sq_ft'
MLS_DOM      = 'dom'

# ---------------- WINDOWS ----------------
# Δ_i uses 2019–2022Q1 (pre) vs 2025 (post), consistent with lending spec
PRE_START   = pd.Timestamp('2019-01-01')
PRE_END     = pd.Timestamp('2022-03-31')   # through 2022Q1
POST_START  = pd.Timestamp('2025-01-01')
POST_END    = pd.Timestamp('2025-12-31')

# MLS capitalization regressions use this **pre** window (your existing choice)
# Bounds are inclusive and compared as 'YYYYQx' strings.
MLS_PRE_START_Q = '2019Q1'
MLS_PRE_END_Q   = '2022Q1'

# ---------------- COVARIATES & RISK ----------------
# Stage-1 regressors; compute_delta skips any name not present in the panel.
COVARS = [
    'gym_redfin_assoc','pool_redfin_assoc','spa_broad_redfin_assoc',
    'tennis_redfin_assoc','golf_redfin_assoc','garage_redfin_assoc',
    'boat_redfin_assoc','elevator_redfin_assoc','view_redfin_assoc',
    'senior_community_redfin_assoc','property_age_effect_assoc_qtr',
    'frac_npexcorp_state_attom_assoc','frac_corp_own_attom_assoc',
    'corp_mgmt_city_attom_assoc'
]



RISK_SOURCE = 'fema'          # 'fema', 'firststreet', or None
FEMA_COL    = 'fema_flood_risk_bucket_assoc'
FS_COL      = 'firststreet_risk_cat_assoc'

EXCLUDE_MIAMI_BROWARD = True

# ---------------- Stage 1: load Redfin + weights, compute Δ_i ----------------
# Load Redfin HOA panel
rf = pd.read_stata(PATH_DTA_REDFIN)

# Load weights and harmonize quarter keys
wg = pd.read_stata(PATH_WGTS)[[ADD_COL, QUARTER_COL, W_COL]].copy()
wg[QUARTER_COL] = q_to_start(wg[QUARTER_COL])
# NOTE(review): a weight missing inside the weights file becomes 0 here, while
# a panel row with no match in the weights file gets 1.0 after the merge below
# -- confirm that this asymmetry is intended.
wg[W_COL] = pd.to_numeric(wg[W_COL], errors='coerce').fillna(0)

# Aggregate any duplicate address×quarter weights (summing them) so the merge
# key below is unique
if wg.duplicated([ADD_COL, QUARTER_COL]).any():
    wg = wg.groupby([ADD_COL, QUARTER_COL], as_index=False)[W_COL].sum()

# Harmonize quarters in rf and merge weights
# (validate='m:1' makes pandas raise if the weight keys are not unique)
rf[QUARTER_COL] = q_to_start(rf[QUARTER_COL])
rf = rf.merge(wg, on=[ADD_COL, QUARTER_COL], how='left', validate='m:1')
rf[W_COL] = pd.to_numeric(rf[W_COL], errors='coerce').fillna(1.0)

# Exclude Miami-Dade & Broward
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in rf.columns:
    rf = exclude_mia_broward(rf, COUNTY_COL)

# Compute Δ_i with the shared method
delta, res_stage1 = compute_delta(
    rf,
    assoc_col=ASSOC_COL,
    county_col=COUNTY_COL,
    zip_col=ZIP_COL,
    q_col=QUARTER_COL,
    stories_col=STORIES_COL,
    hoa_psf_col=HOA_PSF_COL,
    w_col=W_COL,
    covars=COVARS,
    pre_start=PRE_START,
    pre_end=PRE_END,
    post_start=POST_START,
    post_end=POST_END,
    ctrl_story_max=2,
    risk_source=RISK_SOURCE,
    fema_col=FEMA_COL,
    fs_col=FS_COL,
    verbose=True
)
# delta has columns: [ASSOC_COL, 'delta_pct']

# ---------------- Stage 2: MLS capitalization regressions ----------------
def ensure_numeric_df(X):
    """
    Coerce every non-numeric column of `X` to numeric, in place.

    Values that cannot be parsed become NaN. The same DataFrame object is
    returned for convenience.
    """
    to_convert = [col for col in X.columns
                  if not np.issubdtype(X[col].dtype, np.number)]
    for col in to_convert:
        X[col] = pd.to_numeric(X[col], errors='coerce')
    return X

def _to_qstr(q):
    """Convert MLS quarter variable to 'YYYYQx' string."""
    return pd.PeriodIndex(q, freq='Q').astype(str).str.upper()

# Load MLS dataset
mls = pd.read_stata(PATH_DTA_MLS).copy()

# Restrict to treated group: 3+ story associations
# (rows whose story count cannot be parsed as a number are dropped too)
mls = mls[pd.to_numeric(mls[STORIES_COL], errors='coerce') >= 3].copy()
# Quarter becomes a 'YYYYQx' string, so the window filter below is a
# lexicographic string comparison.
mls[QUARTER_COL] = _to_qstr(mls[QUARTER_COL])

# Exclude Miami-Dade & Broward to match Stage 1
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in mls.columns:
    mls = exclude_mia_broward(mls, COUNTY_COL)

# Keep only the MLS pre window (your chosen capitalization window),
# bounds inclusive
mls_pre = mls[
    (mls[QUARTER_COL] >= MLS_PRE_START_Q) &
    (mls[QUARTER_COL] <= MLS_PRE_END_Q)
].copy()

# Merge Δ_i onto MLS panel (inner join → only associations with Δ)
mls_pre = mls_pre.merge(delta, on=ASSOC_COL, how='inner')

# --- build log outcomes ---

def ln_pos(df, col_in, col_out):
    """
    Return a copy of the rows of `df` where `col_in` is strictly positive,
    with the natural log of `col_in` stored in `col_out`.

    `col_in` is coerced to numeric first, so unparseable entries are dropped
    along with zeros and negatives. The input frame is left untouched.
    """
    values = pd.to_numeric(df[col_in], errors='coerce')
    positive = values > 0

    kept = df[positive].copy()
    kept[col_out] = np.log(values[positive].astype(float))
    return kept

# One estimation frame per outcome. Each starts from its own copy of mls_pre,
# so the three samples are filtered independently of one another.
pre_list = ln_pos(mls_pre.copy(), MLS_LIST_PSF, 'ln_list_psf')
pre_sold = ln_pos(mls_pre.copy(), MLS_SOLD_PSF, 'ln_sold_psf')

# Days on market: keep non-negative values (zero allowed) and use log(1 + DOM)
pre_dom = mls_pre.copy()
pre_dom = pre_dom[pd.to_numeric(pre_dom[MLS_DOM], errors='coerce') >= 0].copy()
pre_dom['ln_dom1p'] = np.log1p(pd.to_numeric(pre_dom[MLS_DOM], errors='coerce').astype(float))

# --- design matrix: Δ_i + ZIP FE + quarter FE ---

def design_cap(df, ycol):
    """
    Build the capitalization design: outcome on Δ_i + ZIP FE + quarter FE.

    Parameters
    ----------
    df : DataFrame
        Must contain `ycol`, 'delta_pct', and the columns named by the
        module-level ZIP_COL and QUARTER_COL.
    ycol : str
        Name of the outcome column.

    Returns
    -------
    y : Series
        Outcome for the usable rows.
    X : DataFrame
        Constant, 'delta_pct', ZIP dummies and quarter dummies (first level
        of each dropped), all as float.
    groups : Series
        Normalised ZIP label per usable row, for clustering.

    Rows with a missing or non-finite outcome or regressor are dropped.
    """
    y = pd.to_numeric(df[ycol], errors='coerce').astype(float)

    # Same ZIP normalisation for the fixed effects and the cluster labels.
    zip_key = df[ZIP_COL].astype(str).str.upper().str.strip()

    # dtype=float: bool dummies next to the float Δ column would give an
    # object-dtype design matrix, which statsmodels rejects.
    fe_zip = pd.get_dummies(zip_key, prefix='zip', drop_first=True, dtype=float)
    # Quarter FE (using the 'YYYYQx' strings)
    fe_q = pd.get_dummies(df[QUARTER_COL], prefix='q', drop_first=True, dtype=float)

    delta_x = pd.to_numeric(df['delta_pct'], errors='coerce').astype(float)
    X = pd.concat([delta_x.to_frame('delta_pct'), fe_zip, fe_q], axis=1)

    m = np.isfinite(y) & np.isfinite(X).all(axis=1)
    Xc = sm.add_constant(X.loc[m], has_constant='add')

    groups = zip_key.loc[m]  # cluster by ZIP
    return y.loc[m], Xc, groups

def run_cap(df, y_name, y_label):
    """
    Fit unweighted OLS of `y_name` on the design from `design_cap`, with
    standard errors clustered on the groups that `design_cap` returns.

    Prints a one-line header and the statsmodels summary. Returns the fitted
    results, or None (after printing a message) when no observation is usable.
    """
    endog, exog, clusters = design_cap(df, y_name)

    if endog.empty:
        print(f"[MLS] {y_label}: no usable observations.")
        return None

    model = sm.OLS(endog, exog)
    fit = model.fit(cov_type='cluster', cov_kwds={'groups': clusters})

    header = (f"\n[MLS PRE] {y_label} on Δ_i (pp) + ZIP FE + quarter FE, "
              f"unweighted | n={int(fit.nobs)} R²={fit.rsquared:.3f}")
    print(header)
    print(fit.summary())
    return fit

# Fit the three capitalization regressions; each result is None when its
# sample has no usable observations.
res_list = run_cap(pre_list, 'ln_list_psf', 'ln(list/ft²)')
res_sold = run_cap(pre_sold, 'ln_sold_psf', 'ln(sold/ft²)')
res_dom  = run_cap(pre_dom,  'ln_dom1p',   'ln(1+DOM)')


# ---------------- LaTeX body exporter (3 outcomes, Δ_i row only) ----------------
def stars(p):
    """
    Map a p-value to significance stars.

    '***' for p < 0.01, '**' for p < 0.05, '*' for p < 0.10, otherwise ''.
    None and non-finite p-values also give ''.
    """
    if p is None or not np.isfinite(p):
        return ''
    for cutoff, mark in ((0.01, '***'), (0.05, '**'), (0.10, '*')):
        if p < cutoff:
            return mark
    return ''

def coef_se_line(res, name, scale=1.0, dec=4):
    """
    Return ('b***', '(se)') for regressor `name`.

    Parameters
    ----------
    res : fitted results object exposing `params`, `bse` and `pvalues`
        indexable by regressor name.
    name : str
        Regressor to report.
    scale : float
        Multiplier applied to both the coefficient and its standard error.
    dec : int
        Number of decimals in the formatted output.

    Returns
    -------
    (str, str)
        Coefficient with significance stars and the standard error in
        parentheses. Both are empty strings when the value cannot be looked
        up or converted (e.g. `res` is None or `name` is not a regressor).
    """
    # Only lookup / conversion failures mean "leave the cell blank"; any
    # other exception is a genuine bug and is allowed to propagate.
    try:
        b = float(res.params[name]) * scale
        se = float(res.bse[name]) * scale
        p = float(res.pvalues[name])
    except (AttributeError, KeyError, IndexError, TypeError, ValueError):
        return "", ""
    return f"{b:.{dec}f}{stars(p)}", f"({se:.{dec}f})"

def export_body_threecols(res_list, res_sold, res_dom,
                          out_path,
                          var_rows,
                          col_labels,
                          notes=None,
                          fe_markers=True,
                          dec=4):
    """
    Write the body of a three-column LaTeX results table to `out_path`.

    Only the table rows are written (no \\begin{tabular} wrapper). Columns
    are, in order, `res_list`, `res_sold` and `res_dom`; a result that is
    None leaves its column blank. `var_rows` is a list of
    (param_name, row_label, scale) tuples, one coefficient/SE row pair each.
    After the coefficients come R², the observation count and, if
    `fe_markers` is true, a fixed-effects marker row. `notes`, when given,
    is appended verbatim after a \\midrule. If all three results are None,
    the file contains just a '% empty' comment.
    """
    fits = (res_list, res_sold, res_dom)
    row_end = r" \\"

    # nothing to report: leave a placeholder file and stop
    if all(fit is None for fit in fits):
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write('% empty\n')
        print(f"[LaTeX] wrote empty {out_path}")
        return

    def cell_pair(fit, name, scale):
        # (coefficient, standard error) strings, blank for a missing model
        if fit is None:
            return "", ""
        return coef_se_line(fit, name, scale=scale, dec=dec)

    def r2_text(fit):
        try:
            return f"{fit.rsquared:.3f}"
        except Exception:
            return ""

    def nobs_text(fit):
        try:
            return str(int(round(fit.nobs)))
        except Exception:
            return ""

    def table_row(label, cells):
        return label + " & " + " & ".join(cells) + row_end

    # header row
    body = [table_row("", col_labels), r"\midrule"]

    # coefficient rows: estimates on one line, standard errors underneath
    for name, label, scale in var_rows:
        pairs = [cell_pair(fit, name, scale) for fit in fits]
        body.append(table_row(label, [coef for coef, _ in pairs]))
        body.append(table_row("", [se for _, se in pairs]))
        body.append(r"\addlinespace")

    # fit stats
    body.append(table_row(r"$R^2$", [r2_text(fit) for fit in fits]))
    body.append(table_row("Observations", [nobs_text(fit) for fit in fits]))
    if fe_markers:
        marks = ["" if fit is None else "Yes" for fit in fits]
        body.append(table_row(r"ZIP and Quarter FE", marks))

    if notes:
        body.extend([r"\midrule", notes])

    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write("\n".join(body) + "\n")

    print(f"[LaTeX] wrote {out_path}")


# rows to report (just Δ_i): (parameter name, LaTeX row label, scale factor)
rows_common = [('delta_pct', r'$\Delta_i$ (pp)', 1)]

# column headers in the LaTeX table
col_labels = [
    r'Listed Price',
    r'Sold Price',
    r'Days on Market'
]

# Output file for this cell's (pre-window) table body
latex_path = os.path.join(OUTPUT_DIR, 'cap_mls_delta_3outcomes_body_pre_unit_effect.tex')
# The note spans 4 columns: the row-label column plus the three outcomes.
export_body_threecols(
    res_list,
    res_sold,
    res_dom,
    latex_path,
    rows_common,
    col_labels,
    notes=(r"\multicolumn{4}{l}{ZIP-clustered SE in parentheses. "
           r"\sym{*} $p<0.10$, \sym{**} $p<0.05$, \sym{***} $p<0.01$} \\")
)

print("\n[Done] LaTeX body saved in", latex_path)


[Δ] Raw rows in pre/post windows: 20,088
[Δ] Rows used in WLS: 12,445 (dropped 7,643)
[Δ] Associations with both pre & post residual means + county controls: 965

[Δ] Distribution (pp):
count    965.000000
mean       6.002003
std       31.283355
min     -242.217475
25%       -9.423319
50%        6.144484
75%       21.402864
max      174.812632
Name: delta_pct, dtype: float64

[MLS PRE] ln(list/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=4710 R²=0.748
                            OLS Regression Results                            
Dep. Variable:            ln_list_psf   R-squared:                       0.748
Model:                            OLS   Adj. R-squared:                  0.738
Method:                 Least Squares   F-statistic:                     354.5
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          1.01e-117
Time:                        15:11:29   Log-Likelihood:                -868.05
No. Observations:                4710   AIC:                




[MLS PRE] ln(sold/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=4703 R²=0.772
                            OLS Regression Results                            
Dep. Variable:            ln_sold_psf   R-squared:                       0.772
Model:                            OLS   Adj. R-squared:                  0.763
Method:                 Least Squares   F-statistic:                     276.4
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          9.97e-109
Time:                        15:11:29   Log-Likelihood:                -615.16
No. Observations:                4703   AIC:                             1606.
Df Residuals:                    4515   BIC:                             2820.
Df Model:                         187                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------




[MLS PRE] ln(1+DOM) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=4699 R²=0.161
                            OLS Regression Results                            
Dep. Variable:               ln_dom1p   R-squared:                       0.161
Model:                            OLS   Adj. R-squared:                  0.126
Method:                 Least Squares   F-statistic:                     192.4
Date:                Sat, 17 Jan 2026   Prob (F-statistic):           7.69e-96
Time:                        15:11:29   Log-Likelihood:                -5000.1
No. Observations:                4699   AIC:                         1.038e+04
Df Residuals:                    4511   BIC:                         1.159e+04
Df Model:                         187                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------



In [4]:
# ============================================================
# CAPITALIZATION: MLS prices/DOM on Δ_i  (shared Δ method) - post period
# ============================================================
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm

# ---------------- PATHS ----------------
# All inputs are Stata .dta files given as relative paths.
PATH_DTA_REDFIN = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
PATH_WGTS       = '../../final_datasets/master_datasets/hoa_redfin_weights_unit.dta'
PATH_DTA_MLS    = '../../final_datasets/master_datasets/master_dataset_price_dom_unit_obs_mls_redfin.dta'
OUTPUT_DIR      = 'exports/delta_all_vars'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the export folder if it is missing

# ---------------- CORE COLS ----------------
# Column names used as merge keys, fixed-effect keys and regression inputs.
ADD_COL     = 'address_attom'             # weights are merged on this column x quarter
ASSOC_COL   = 'assoc_name_final'          # Δ_i is merged onto the MLS panel on this column
COUNTY_COL  = 'mm_fips_county_name_attom'
ZIP_COL     = 'zip5_attom'
QUARTER_COL = 'quarter'
HOA_PSF_COL = 'hoa_sq_ft'
W_COL       = 'final_weight'
STORIES_COL = 'num_stories_final_assoc'

# MLS outcomes
MLS_LIST_PSF = 'listed_price_sq_ft'
MLS_SOLD_PSF = 'sold_price_sq_ft'
MLS_DOM      = 'dom'

# ---------------- WINDOWS ----------------
# Δ_i uses 2019–2022Q1 (pre) vs 2025 (post), consistent with lending spec
PRE_START   = pd.Timestamp('2019-01-01')
PRE_END     = pd.Timestamp('2022-03-31')   # through 2022Q1
POST_START  = pd.Timestamp('2025-01-01')
POST_END    = pd.Timestamp('2025-12-31')

# MLS capitalization regressions in this cell use the **post** window
# (2022Q3–2024Q4). NOTE: the constants keep the MLS_PRE_* names from the
# pre-period version of this cell even though they hold the post window here.
# Values are 'YYYYQx' strings and are compared as strings further down.
MLS_PRE_START_Q = '2022Q3'
MLS_PRE_END_Q   = '2024Q4'

# ---------------- COVARIATES & RISK ----------------
# Covariate column names passed to compute_delta as `covars`.
COVARS = [
    'gym_redfin_assoc','pool_redfin_assoc','spa_broad_redfin_assoc',
    'tennis_redfin_assoc','golf_redfin_assoc','garage_redfin_assoc',
    'boat_redfin_assoc','elevator_redfin_assoc','view_redfin_assoc',
    'senior_community_redfin_assoc','property_age_assoc_qtr',
    'frac_npexcorp_state_attom_assoc','frac_corp_own_attom_assoc',
    'corp_mgmt_city_attom_assoc'
]



RISK_SOURCE = 'fema'          # 'fema', 'firststreet', or None
FEMA_COL    = 'fema_flood_risk_bucket_assoc'
FS_COL      = 'firststreet_risk_cat_assoc'

# When True, Miami-Dade and Broward rows are dropped from both the Redfin and MLS data.
EXCLUDE_MIAMI_BROWARD = True

# ---------------- Stage 1: load Redfin + weights, compute Δ_i ----------------
# Load Redfin HOA panel
rf = pd.read_stata(PATH_DTA_REDFIN)

# Load weights and harmonize quarter keys
wg = pd.read_stata(PATH_WGTS)[[ADD_COL, QUARTER_COL, W_COL]].copy()
wg[QUARTER_COL] = q_to_start(wg[QUARTER_COL])
# Missing or non-numeric weights in the weights file become 0.
wg[W_COL] = pd.to_numeric(wg[W_COL], errors='coerce').fillna(0)

# Aggregate any duplicate address×quarter weights (the key is ADD_COL, not the
# association) so the m:1 merge below is valid.
if wg.duplicated([ADD_COL, QUARTER_COL]).any():
    wg = wg.groupby([ADD_COL, QUARTER_COL], as_index=False)[W_COL].sum()

# Harmonize quarters in rf and merge weights
rf[QUARTER_COL] = q_to_start(rf[QUARTER_COL])
rf = rf.merge(wg, on=[ADD_COL, QUARTER_COL], how='left', validate='m:1')
# Rows of rf with no matching address×quarter in wg get weight 1.0.
# NOTE(review): this default (1.0) differs from the 0 used for missing weights
# inside the weights file above — confirm that the asymmetry is intended.
rf[W_COL] = pd.to_numeric(rf[W_COL], errors='coerce').fillna(1.0)

# Exclude Miami-Dade & Broward
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in rf.columns:
    rf = exclude_mia_broward(rf, COUNTY_COL)

# Compute Δ_i with the shared method
delta, res_stage1 = compute_delta(
    rf,
    assoc_col=ASSOC_COL,
    county_col=COUNTY_COL,
    zip_col=ZIP_COL,
    q_col=QUARTER_COL,
    stories_col=STORIES_COL,
    hoa_psf_col=HOA_PSF_COL,
    w_col=W_COL,
    covars=COVARS,
    pre_start=PRE_START,
    pre_end=PRE_END,
    post_start=POST_START,
    post_end=POST_END,
    ctrl_story_max=2,
    risk_source=RISK_SOURCE,
    fema_col=FEMA_COL,
    fs_col=FS_COL,
    verbose=True
)
# delta has columns: [ASSOC_COL, 'delta_pct']
delta_out_path = os.path.join(OUTPUT_DIR, "delta_assoc_unit.csv")

# Ensure clean ordering and no index
delta[[ASSOC_COL, 'delta_pct']].to_csv(
    delta_out_path,
    index=False
)

print(f"Exported delta_assoc to {delta_out_path}")

# ---------------- Stage 2: MLS capitalization regressions ----------------
def ensure_numeric_df(X):
    """
    Coerce every column of X to a numeric dtype (in place) and return X.

    - Boolean columns (e.g. dummies from pandas >= 2.0) are cast to float so a
      frame mixing float and bool columns does not turn into an object array
      when handed to a model.
    - Columns that are already numeric, including pandas extension dtypes such
      as 'Int64', are left untouched.
    - Anything else goes through pd.to_numeric; unparseable values become NaN.
    """
    for c in X.columns:
        col = X[c]
        # Check bool first: pandas also reports bool as a numeric dtype.
        if pd.api.types.is_bool_dtype(col.dtype):
            X[c] = col.astype(float)
        elif not pd.api.types.is_numeric_dtype(col.dtype):
            X[c] = pd.to_numeric(col, errors='coerce')
    return X

def _to_qstr(q):
    """Convert MLS quarter variable to 'YYYYQx' string."""
    return pd.PeriodIndex(q, freq='Q').astype(str).str.upper()

# Load MLS dataset
mls = pd.read_stata(PATH_DTA_MLS).copy()

# Restrict to treated group: 3+ story associations
# (rows whose story count cannot be parsed as a number are dropped as well)
mls = mls[pd.to_numeric(mls[STORIES_COL], errors='coerce') >= 3].copy()
# Quarter becomes a 'YYYYQx' string; it is used both for the window filter
# below and later as the quarter fixed-effect key.
mls[QUARTER_COL] = _to_qstr(mls[QUARTER_COL])

# Exclude Miami-Dade & Broward to match Stage 1
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in mls.columns:
    mls = exclude_mia_broward(mls, COUNTY_COL)

# Keep only the MLS capitalization window configured above. In this cell that
# is the post window; the `mls_pre` / MLS_PRE_* names are inherited from the
# pre-period cell. The bounds are compared as 'YYYYQx' strings.
mls_pre = mls[
    (mls[QUARTER_COL] >= MLS_PRE_START_Q) &
    (mls[QUARTER_COL] <= MLS_PRE_END_Q)
].copy()

# Merge Δ_i onto MLS panel (inner join → only associations with Δ)
mls_pre = mls_pre.merge(delta, on=ASSOC_COL, how='inner')

# --- build log outcomes ---

def ln_pos(df, col_in, col_out):
    """
    Return a copy of `df` restricted to rows where `col_in` is strictly
    positive, with the natural log of `col_in` stored in `col_out`.
    Values that cannot be parsed as numbers are treated as missing and dropped.
    """
    values = pd.to_numeric(df[col_in], errors='coerce')
    is_positive = values > 0
    kept = df.loc[is_positive].copy()
    kept[col_out] = np.log(values.loc[is_positive].astype(float))
    return kept

# Price outcomes: keep strictly positive values and take logs.
pre_list = ln_pos(mls_pre.copy(), MLS_LIST_PSF, 'ln_list_psf')
pre_sold = ln_pos(mls_pre.copy(), MLS_SOLD_PSF, 'ln_sold_psf')

# DOM outcome: keep rows with non-negative DOM and use log(1 + DOM).
pre_dom = mls_pre.loc[
    pd.to_numeric(mls_pre[MLS_DOM], errors='coerce') >= 0
].copy()
pre_dom['ln_dom1p'] = np.log1p(
    pd.to_numeric(pre_dom[MLS_DOM], errors='coerce').astype(float)
)

# --- design matrix: Δ_i + ZIP FE + quarter FE ---

def design_cap(df, ycol):
    """
    Build the capitalization design: Δ_i + ZIP FE + quarter FE + constant.

    Parameters
    ----------
    df : DataFrame containing `ycol`, 'delta_pct', ZIP_COL and QUARTER_COL.
    ycol : name of the outcome column.

    Returns
    -------
    (y, X, groups) restricted to rows where the outcome and every regressor
    are non-missing. `groups` holds the ZIP of each kept row as a string and
    is meant to be used as the cluster variable.
    """
    y = pd.to_numeric(df[ycol], errors='coerce')

    # ZIP FE. dtype=float is requested explicitly: pandas >= 2.0 returns bool
    # dummies by default, and a frame mixing float and bool columns becomes an
    # object array when converted for the model.
    fe_zip = pd.get_dummies(
        df[ZIP_COL].astype(str).str.upper().str.strip(),
        prefix='zip', drop_first=True, dtype=float
    )
    # Quarter FE (using the 'YYYYQx' strings)
    fe_q = pd.get_dummies(df[QUARTER_COL], prefix='q', drop_first=True, dtype=float)

    X = pd.concat([df[['delta_pct']], fe_zip, fe_q], axis=1)
    X = ensure_numeric_df(X)

    # Complete cases only.
    m = y.notna() & X.notna().all(axis=1)
    Xc = sm.add_constant(X.loc[m], has_constant='add')

    groups = df.loc[m, ZIP_COL].astype(str)  # cluster by ZIP
    return y.loc[m], Xc, groups

def run_cap(df, y_name, y_label):
    """
    Fit the unweighted capitalization OLS of `y_name` on Δ_i + ZIP FE +
    quarter FE with ZIP-clustered standard errors, print a one-line header
    and the full summary, and return the fitted results.

    Returns None (after printing a notice) when no usable observations remain.
    `y_label` is only used in the printed messages.
    """
    y, X, g = design_cap(df, y_name)
    if y.empty:
        print(f"[MLS] {y_label}: no usable observations.")
        return None
    fit = sm.OLS(y, X).fit(cov_type='cluster', cov_kwds={'groups': g})
    # This cell estimates on the post window, so the header is tagged POST
    # (it used to print PRE, carried over from the pre-period cell).
    print(f"\n[MLS POST] {y_label} on Δ_i (pp) + ZIP FE + quarter FE, "
          f"unweighted | n={int(fit.nobs)} R²={fit.rsquared:.3f}")
    print(fit.summary())
    return fit

# One regression per outcome; each result is None if that outcome had no usable rows.
res_list = run_cap(pre_list, 'ln_list_psf', 'ln(list/ft²)')
res_sold = run_cap(pre_sold, 'ln_sold_psf', 'ln(sold/ft²)')
res_dom  = run_cap(pre_dom,  'ln_dom1p',   'ln(1+DOM)')


# ---------------- LaTeX body exporter (3 outcomes, Δ_i row only) ----------------
def stars(p):
    """Significance stars for p-value `p`: *** <0.01, ** <0.05, * <0.10, else ''."""
    if p is None or not np.isfinite(p):
        return ''
    for cutoff, mark in ((0.01, '***'), (0.05, '**'), (0.10, '*')):
        if p < cutoff:
            return mark
    return ''

def coef_se_line(res, name, scale=1.0, dec=4):
    """
    Format regressor `name` from fitted results `res` as ('b***', '(se)').

    The coefficient and standard error are multiplied by `scale` and printed
    with `dec` decimals; stars come from the p-value. Any failure (for example
    a regressor that is not in the model) yields ('', '').
    """
    try:
        coef = float(res.params[name]) * scale
        std_err = float(res.bse[name]) * scale
        p_value = float(res.pvalues[name])
        coef_cell = format(coef, f".{dec}f") + stars(p_value)
        se_cell = "(" + format(std_err, f".{dec}f") + ")"
        return coef_cell, se_cell
    except Exception:
        return "", ""

def export_body_threecols(res_list, res_sold, res_dom,
                          out_path,
                          var_rows,
                          col_labels,
                          notes=None,
                          fe_markers=True,
                          dec=4):
    """
    Write the body rows of a three-column LaTeX table (no tabular wrapper).

    Columns are, in order, the listed-price, sold-price and days-on-market
    fits; any of them may be None and then shows up as blank cells.
    `var_rows` is [(param_name, row_label, scale), ...] and produces one
    coefficient row plus one standard-error row per entry. R-squared and
    observation counts follow, then an optional fixed-effects marker row
    (`fe_markers`) and an optional `notes` line after a midrule.
    If all three fits are None the file only contains a '% empty' comment.
    """
    fits = (res_list, res_sold, res_dom)

    # Nothing was estimated: write a placeholder and stop.
    if all(fit is None for fit in fits):
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write('% empty\n')
        print(f"[LaTeX] wrote empty {out_path}")
        return

    def tex_row(first_cell, cells):
        # "<first> & c1 & c2 & c3 \\"
        return first_cell + " & " + " & ".join(cells) + r" \\"

    def coef_cells(fit, name, scale):
        if fit is None:
            return "", ""
        return coef_se_line(fit, name, scale=scale, dec=dec)

    def r2_cell(fit):
        try:
            return f"{fit.rsquared:.3f}"
        except Exception:
            return ""

    def nobs_cell(fit):
        try:
            return str(int(round(fit.nobs)))
        except Exception:
            return ""

    # header row
    body = [tex_row("", col_labels), r"\midrule"]

    # coefficient rows: estimates on one line, standard errors on the next
    for name, label, scale in var_rows:
        pairs = [coef_cells(fit, name, scale) for fit in fits]
        body.append(tex_row(label, [b for b, _ in pairs]))
        body.append(tex_row("", [se for _, se in pairs]))
        body.append(r"\addlinespace")

    # fit statistics
    body.append(tex_row(r"$R^2$", [r2_cell(fit) for fit in fits]))
    body.append(tex_row("Observations", [nobs_cell(fit) for fit in fits]))
    if fe_markers:
        markers = ["" if fit is None else "Yes" for fit in fits]
        body.append(tex_row("ZIP and Quarter FE", markers))

    if notes:
        body.extend([r"\midrule", notes])

    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write("\n".join(body) + "\n")

    print(f"[LaTeX] wrote {out_path}")


# rows to report (just Δ_i); each tuple is (param_name, row_label, scale)
rows_common = [('delta_pct', r'$\Delta_i$ (pp)', 1)]

# column headers in the LaTeX table
col_labels = [
    r'Listed Price',
    r'Sold Price',
    r'Days on Market'
]

# Post-period table body; the notes line spans 4 columns
# (one label column plus the three outcome columns).
latex_path = os.path.join(OUTPUT_DIR, 'cap_mls_delta_3outcomes_body_post_unit.tex')
export_body_threecols(
    res_list,
    res_sold,
    res_dom,
    latex_path,
    rows_common,
    col_labels,
    notes=(r"\multicolumn{4}{l}{ZIP-clustered SE in parentheses. "
           r"\sym{*} $p<0.10$, \sym{**} $p<0.05$, \sym{***} $p<0.01$} \\")
)

print("\n[Done] LaTeX body saved in", latex_path)


[Δ] Raw rows in pre/post windows: 20,088
[Δ] Rows used in WLS: 17,041 (dropped 3,047)
[Δ] Associations with both pre & post residual means + county controls: 1,276

[Δ] Distribution (pp):
count    1276.000000
mean        4.912511
std        30.557217
min      -271.624128
25%        -9.549780
50%         4.727499
75%        19.221543
max       176.199459
Name: delta_pct, dtype: float64
Exported delta_assoc to exports/delta_all_vars\delta_assoc_unit.csv

[MLS PRE] ln(list/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=7689 R²=0.659
                            OLS Regression Results                            
Dep. Variable:            ln_list_psf   R-squared:                       0.659
Model:                            OLS   Adj. R-squared:                  0.650
Method:                 Least Squares   F-statistic:                     2011.
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          1.10e-187
Time:                        15:13:04   Log-Likelihood:      




[MLS PRE] ln(sold/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=6865 R²=0.668
                            OLS Regression Results                            
Dep. Variable:            ln_sold_psf   R-squared:                       0.668
Model:                            OLS   Adj. R-squared:                  0.658
Method:                 Least Squares   F-statistic:                     1740.
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          9.68e-182
Time:                        15:13:04   Log-Likelihood:                -1856.9
No. Observations:                6865   AIC:                             4118.
Df Residuals:                    6663   BIC:                             5498.
Df Model:                         201                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------




[MLS PRE] ln(1+DOM) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=7327 R²=0.106
                            OLS Regression Results                            
Dep. Variable:               ln_dom1p   R-squared:                       0.106
Model:                            OLS   Adj. R-squared:                  0.081
Method:                 Least Squares   F-statistic:                     111.5
Date:                Sat, 17 Jan 2026   Prob (F-statistic):           3.85e-74
Time:                        15:13:04   Log-Likelihood:                -13333.
No. Observations:                7327   AIC:                         2.707e+04
Df Residuals:                    7125   BIC:                         2.846e+04
Df Model:                         201                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------



In [5]:
# ============================================================
# CAPITALIZATION: MLS prices/DOM on Δ_i  (shared Δ method) - pre period
# ============================================================
import os
import numpy as np
import pandas as pd
import statsmodels.api as sm

# ---------------- PATHS ----------------
# All inputs are Stata .dta files given as relative paths.
PATH_DTA_REDFIN = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
PATH_WGTS       = '../../final_datasets/master_datasets/hoa_redfin_weights_unit.dta'
# NOTE(review): PATH_DTA_MLS is the same file as PATH_DTA_REDFIN here, whereas
# the post-period cell reads 'master_dataset_price_dom_unit_obs_mls_redfin.dta'.
# Confirm that the pre-period outcomes are meant to come from the Redfin file.
PATH_DTA_MLS    = '../../final_datasets/master_datasets/master_dataset_unit_obs_redfin.dta'
OUTPUT_DIR      = 'exports/delta_all_vars'
os.makedirs(OUTPUT_DIR, exist_ok=True)  # create the export folder if it is missing

# ---------------- CORE COLS ----------------
# Column names used as merge keys, fixed-effect keys and regression inputs.
ADD_COL     = 'address_attom'             # weights are merged on this column x quarter
ASSOC_COL   = 'assoc_name_final'          # Δ_i is merged onto the MLS panel on this column
COUNTY_COL  = 'mm_fips_county_name_attom'
ZIP_COL     = 'zip5_attom'
QUARTER_COL = 'quarter'
HOA_PSF_COL = 'hoa_sq_ft'
W_COL       = 'final_weight'
STORIES_COL = 'num_stories_final_assoc'

# MLS outcomes
MLS_LIST_PSF = 'listed_price_sq_ft'
MLS_SOLD_PSF = 'sold_price_sq_ft'
MLS_DOM      = 'dom'

# ---------------- WINDOWS ----------------
# Δ_i uses 2019–2022Q1 (pre) vs 2025 (post), consistent with lending spec
PRE_START   = pd.Timestamp('2019-01-01')
PRE_END     = pd.Timestamp('2022-03-31')   # through 2022Q1
POST_START  = pd.Timestamp('2025-01-01')
POST_END    = pd.Timestamp('2025-12-31')

# MLS capitalization regressions use this **pre** window (your existing choice).
# Values are 'YYYYQx' strings and are compared as strings further down.
MLS_PRE_START_Q = '2019Q1'
MLS_PRE_END_Q   = '2022Q1'

# ---------------- COVARIATES & RISK ----------------
# Covariate column names passed to compute_delta as `covars`.
COVARS = [
    'gym_redfin_assoc','pool_redfin_assoc','spa_broad_redfin_assoc',
    'tennis_redfin_assoc','golf_redfin_assoc','garage_redfin_assoc',
    'boat_redfin_assoc','elevator_redfin_assoc','view_redfin_assoc',
    'senior_community_redfin_assoc','property_age_assoc_qtr',
    'frac_npexcorp_state_attom_assoc','frac_corp_own_attom_assoc',
    'corp_mgmt_city_attom_assoc'
]



RISK_SOURCE = 'fema'          # 'fema', 'firststreet', or None
FEMA_COL    = 'fema_flood_risk_bucket_assoc'
FS_COL      = 'firststreet_risk_cat_assoc'

# When True, Miami-Dade and Broward rows are dropped from both the Redfin and MLS data.
EXCLUDE_MIAMI_BROWARD = True

# ---------------- Stage 1: load Redfin + weights, compute Δ_i ----------------
# Load Redfin HOA panel
rf = pd.read_stata(PATH_DTA_REDFIN)

# Load weights and harmonize quarter keys
wg = pd.read_stata(PATH_WGTS)[[ADD_COL, QUARTER_COL, W_COL]].copy()
wg[QUARTER_COL] = q_to_start(wg[QUARTER_COL])
# Missing or non-numeric weights in the weights file become 0.
wg[W_COL] = pd.to_numeric(wg[W_COL], errors='coerce').fillna(0)

# Aggregate any duplicate address×quarter weights (the key is ADD_COL, not the
# association) so the m:1 merge below is valid.
if wg.duplicated([ADD_COL, QUARTER_COL]).any():
    wg = wg.groupby([ADD_COL, QUARTER_COL], as_index=False)[W_COL].sum()

# Harmonize quarters in rf and merge weights
rf[QUARTER_COL] = q_to_start(rf[QUARTER_COL])
rf = rf.merge(wg, on=[ADD_COL, QUARTER_COL], how='left', validate='m:1')
# Rows of rf with no matching address×quarter in wg get weight 1.0.
# NOTE(review): this default (1.0) differs from the 0 used for missing weights
# inside the weights file above — confirm that the asymmetry is intended.
rf[W_COL] = pd.to_numeric(rf[W_COL], errors='coerce').fillna(1.0)

# Exclude Miami-Dade & Broward
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in rf.columns:
    rf = exclude_mia_broward(rf, COUNTY_COL)

# Compute Δ_i with the shared method
delta, res_stage1 = compute_delta(
    rf,
    assoc_col=ASSOC_COL,
    county_col=COUNTY_COL,
    zip_col=ZIP_COL,
    q_col=QUARTER_COL,
    stories_col=STORIES_COL,
    hoa_psf_col=HOA_PSF_COL,
    w_col=W_COL,
    covars=COVARS,
    pre_start=PRE_START,
    pre_end=PRE_END,
    post_start=POST_START,
    post_end=POST_END,
    ctrl_story_max=2,
    risk_source=RISK_SOURCE,
    fema_col=FEMA_COL,
    fs_col=FS_COL,
    verbose=True
)
# delta has columns: [ASSOC_COL, 'delta_pct']; unlike the post-period cell,
# it is not exported to CSV here.

# ---------------- Stage 2: MLS capitalization regressions ----------------
def ensure_numeric_df(X):
    """
    Coerce every column of X to a numeric dtype (in place) and return X.

    - Boolean columns (e.g. dummies from pandas >= 2.0) are cast to float so a
      frame mixing float and bool columns does not turn into an object array
      when handed to a model.
    - Columns that are already numeric, including pandas extension dtypes such
      as 'Int64', are left untouched.
    - Anything else goes through pd.to_numeric; unparseable values become NaN.
    """
    for c in X.columns:
        col = X[c]
        # Check bool first: pandas also reports bool as a numeric dtype.
        if pd.api.types.is_bool_dtype(col.dtype):
            X[c] = col.astype(float)
        elif not pd.api.types.is_numeric_dtype(col.dtype):
            X[c] = pd.to_numeric(col, errors='coerce')
    return X

def _to_qstr(q):
    """Convert MLS quarter variable to 'YYYYQx' string."""
    return pd.PeriodIndex(q, freq='Q').astype(str).str.upper()

# Load MLS dataset
mls = pd.read_stata(PATH_DTA_MLS).copy()

# Restrict to treated group: 3+ story associations
# (rows whose story count cannot be parsed as a number are dropped as well)
mls = mls[pd.to_numeric(mls[STORIES_COL], errors='coerce') >= 3].copy()
# Quarter becomes a 'YYYYQx' string; it is used both for the window filter
# below and later as the quarter fixed-effect key.
mls[QUARTER_COL] = _to_qstr(mls[QUARTER_COL])

# Exclude Miami-Dade & Broward to match Stage 1
if EXCLUDE_MIAMI_BROWARD and COUNTY_COL in mls.columns:
    mls = exclude_mia_broward(mls, COUNTY_COL)

# Keep only the MLS pre window (your chosen capitalization window).
# The bounds are compared as 'YYYYQx' strings.
mls_pre = mls[
    (mls[QUARTER_COL] >= MLS_PRE_START_Q) &
    (mls[QUARTER_COL] <= MLS_PRE_END_Q)
].copy()

# Merge Δ_i onto MLS panel (inner join → only associations with Δ)
mls_pre = mls_pre.merge(delta, on=ASSOC_COL, how='inner')

# --- build log outcomes ---

def ln_pos(df, col_in, col_out):
    """
    Return a copy of `df` restricted to rows where `col_in` is strictly
    positive, with the natural log of `col_in` stored in `col_out`.
    Values that cannot be parsed as numbers are treated as missing and dropped.
    """
    values = pd.to_numeric(df[col_in], errors='coerce')
    is_positive = values > 0
    kept = df.loc[is_positive].copy()
    kept[col_out] = np.log(values.loc[is_positive].astype(float))
    return kept

# Price outcomes: keep strictly positive values and take logs.
pre_list = ln_pos(mls_pre.copy(), MLS_LIST_PSF, 'ln_list_psf')
pre_sold = ln_pos(mls_pre.copy(), MLS_SOLD_PSF, 'ln_sold_psf')

# DOM outcome: keep rows with non-negative DOM and use log(1 + DOM).
pre_dom = mls_pre.loc[
    pd.to_numeric(mls_pre[MLS_DOM], errors='coerce') >= 0
].copy()
pre_dom['ln_dom1p'] = np.log1p(
    pd.to_numeric(pre_dom[MLS_DOM], errors='coerce').astype(float)
)

# --- design matrix: Δ_i + ZIP FE + quarter FE ---

def design_cap(df, ycol):
    """
    Build the capitalization design: Δ_i + ZIP FE + quarter FE + constant.

    Parameters
    ----------
    df : DataFrame containing `ycol`, 'delta_pct', ZIP_COL and QUARTER_COL.
    ycol : name of the outcome column.

    Returns
    -------
    (y, X, groups) restricted to rows where the outcome and every regressor
    are non-missing. `groups` holds the ZIP of each kept row as a string and
    is meant to be used as the cluster variable.
    """
    y = pd.to_numeric(df[ycol], errors='coerce')

    # ZIP FE. dtype=float is requested explicitly: pandas >= 2.0 returns bool
    # dummies by default, and a frame mixing float and bool columns becomes an
    # object array when converted for the model.
    fe_zip = pd.get_dummies(
        df[ZIP_COL].astype(str).str.upper().str.strip(),
        prefix='zip', drop_first=True, dtype=float
    )
    # Quarter FE (using the 'YYYYQx' strings)
    fe_q = pd.get_dummies(df[QUARTER_COL], prefix='q', drop_first=True, dtype=float)

    X = pd.concat([df[['delta_pct']], fe_zip, fe_q], axis=1)
    X = ensure_numeric_df(X)

    # Complete cases only.
    m = y.notna() & X.notna().all(axis=1)
    Xc = sm.add_constant(X.loc[m], has_constant='add')

    groups = df.loc[m, ZIP_COL].astype(str)  # cluster by ZIP
    return y.loc[m], Xc, groups

def run_cap(df, y_name, y_label):
    """
    Fit the unweighted capitalization OLS of `y_name` on Δ_i + ZIP FE +
    quarter FE with ZIP-clustered standard errors, print a one-line header
    and the full summary, and return the fitted results.

    Returns None (after printing a notice) when no usable observations remain.
    `y_label` is only used in the printed messages.
    """
    outcome, design, clusters = design_cap(df, y_name)
    if outcome.empty:
        print(f"[MLS] {y_label}: no usable observations.")
        return None

    model = sm.OLS(outcome, design)
    fit = model.fit(cov_type='cluster', cov_kwds={'groups': clusters})

    header = (f"\n[MLS PRE] {y_label} on Δ_i (pp) + ZIP FE + quarter FE, "
              f"unweighted | n={int(fit.nobs)} R²={fit.rsquared:.3f}")
    print(header)
    print(fit.summary())
    return fit

# One regression per outcome; each result is None if that outcome had no usable rows.
res_list = run_cap(pre_list, 'ln_list_psf', 'ln(list/ft²)')
res_sold = run_cap(pre_sold, 'ln_sold_psf', 'ln(sold/ft²)')
res_dom  = run_cap(pre_dom,  'ln_dom1p',   'ln(1+DOM)')


# ---------------- LaTeX body exporter (3 outcomes, Δ_i row only) ----------------
def stars(p):
    """Significance stars for p-value `p`: *** <0.01, ** <0.05, * <0.10, else ''."""
    if p is None or not np.isfinite(p):
        return ''
    for cutoff, mark in ((0.01, '***'), (0.05, '**'), (0.10, '*')):
        if p < cutoff:
            return mark
    return ''

def coef_se_line(res, name, scale=1.0, dec=4):
    """
    Format regressor `name` from fitted results `res` as ('b***', '(se)').

    The coefficient and standard error are multiplied by `scale` and printed
    with `dec` decimals; stars come from the p-value. Any failure (for example
    a regressor that is not in the model) yields ('', '').
    """
    try:
        coef = float(res.params[name]) * scale
        std_err = float(res.bse[name]) * scale
        p_value = float(res.pvalues[name])
        coef_cell = format(coef, f".{dec}f") + stars(p_value)
        se_cell = "(" + format(std_err, f".{dec}f") + ")"
        return coef_cell, se_cell
    except Exception:
        return "", ""

def export_body_threecols(res_list, res_sold, res_dom,
                          out_path,
                          var_rows,
                          col_labels,
                          notes=None,
                          fe_markers=True,
                          dec=4):
    """
    Write the body rows of a three-column LaTeX table (no tabular wrapper).

    Columns are, in order, the listed-price, sold-price and days-on-market
    fits; any of them may be None and then shows up as blank cells.
    `var_rows` is [(param_name, row_label, scale), ...] and produces one
    coefficient row plus one standard-error row per entry. R-squared and
    observation counts follow, then an optional fixed-effects marker row
    (`fe_markers`) and an optional `notes` line after a midrule.
    If all three fits are None the file only contains a '% empty' comment.
    """
    fits = (res_list, res_sold, res_dom)

    # Nothing was estimated: write a placeholder and stop.
    if all(fit is None for fit in fits):
        with open(out_path, 'w', encoding='utf-8') as fh:
            fh.write('% empty\n')
        print(f"[LaTeX] wrote empty {out_path}")
        return

    def tex_row(first_cell, cells):
        # "<first> & c1 & c2 & c3 \\"
        return first_cell + " & " + " & ".join(cells) + r" \\"

    def coef_cells(fit, name, scale):
        if fit is None:
            return "", ""
        return coef_se_line(fit, name, scale=scale, dec=dec)

    def r2_cell(fit):
        try:
            return f"{fit.rsquared:.3f}"
        except Exception:
            return ""

    def nobs_cell(fit):
        try:
            return str(int(round(fit.nobs)))
        except Exception:
            return ""

    # header row
    body = [tex_row("", col_labels), r"\midrule"]

    # coefficient rows: estimates on one line, standard errors on the next
    for name, label, scale in var_rows:
        pairs = [coef_cells(fit, name, scale) for fit in fits]
        body.append(tex_row(label, [b for b, _ in pairs]))
        body.append(tex_row("", [se for _, se in pairs]))
        body.append(r"\addlinespace")

    # fit statistics
    body.append(tex_row(r"$R^2$", [r2_cell(fit) for fit in fits]))
    body.append(tex_row("Observations", [nobs_cell(fit) for fit in fits]))
    if fe_markers:
        markers = ["" if fit is None else "Yes" for fit in fits]
        body.append(tex_row("ZIP and Quarter FE", markers))

    if notes:
        body.extend([r"\midrule", notes])

    with open(out_path, 'w', encoding='utf-8') as fh:
        fh.write("\n".join(body) + "\n")

    print(f"[LaTeX] wrote {out_path}")


# rows to report (just Δ_i); each tuple is (param_name, row_label, scale)
rows_common = [('delta_pct', r'$\Delta_i$ (pp)', 1)]

# column headers in the LaTeX table
col_labels = [
    r'Listed Price',
    r'Sold Price',
    r'Days on Market'
]

# Pre-period table body; the notes line spans 4 columns
# (one label column plus the three outcome columns).
latex_path = os.path.join(OUTPUT_DIR, 'cap_mls_delta_3outcomes_body_pre_unit.tex')
export_body_threecols(
    res_list,
    res_sold,
    res_dom,
    latex_path,
    rows_common,
    col_labels,
    notes=(r"\multicolumn{4}{l}{ZIP-clustered SE in parentheses. "
           r"\sym{*} $p<0.10$, \sym{**} $p<0.05$, \sym{***} $p<0.01$} \\")
)

print("\n[Done] LaTeX body saved in", latex_path)

[Δ] Raw rows in pre/post windows: 20,088
[Δ] Rows used in WLS: 17,041 (dropped 3,047)
[Δ] Associations with both pre & post residual means + county controls: 1,276

[Δ] Distribution (pp):
count    1276.000000
mean        4.912511
std        30.557217
min      -271.624128
25%        -9.549780
50%         4.727499
75%        19.221543
max       176.199459
Name: delta_pct, dtype: float64

[MLS PRE] ln(list/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=6634 R²=0.708
                            OLS Regression Results                            
Dep. Variable:            ln_list_psf   R-squared:                       0.708
Model:                            OLS   Adj. R-squared:                  0.698
Method:                 Least Squares   F-statistic:                     381.1
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          6.87e-130
Time:                        15:14:15   Log-Likelihood:                -2061.4
No. Observations:                6634   AIC:      




[MLS PRE] ln(sold/ft²) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=6636 R²=0.741
                            OLS Regression Results                            
Dep. Variable:            ln_sold_psf   R-squared:                       0.741
Model:                            OLS   Adj. R-squared:                  0.732
Method:                 Least Squares   F-statistic:                     403.1
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          3.74e-132
Time:                        15:14:15   Log-Likelihood:                -1571.8
No. Observations:                6636   AIC:                             3558.
Df Residuals:                    6429   BIC:                             4965.
Df Model:                         206                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------




[MLS PRE] ln(1+DOM) on Δ_i (pp) + ZIP FE + quarter FE, unweighted | n=6627 R²=0.151
                            OLS Regression Results                            
Dep. Variable:               ln_dom1p   R-squared:                       0.151
Model:                            OLS   Adj. R-squared:                  0.124
Method:                 Least Squares   F-statistic:                     190.6
Date:                Sat, 17 Jan 2026   Prob (F-statistic):          1.76e-102
Time:                        15:14:15   Log-Likelihood:                -7113.4
No. Observations:                6627   AIC:                         1.464e+04
Df Residuals:                    6420   BIC:                         1.605e+04
Df Model:                         206                                         
Covariance Type:              cluster                                         
                 coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------

