In [1]:
import os
from pathlib import Path

import numpy as np
import pandas as pd

# Load the Stata dataset.
# The folder defaults to the original Dropbox location. Set the
# FLORIDA_CONDO_DATA_DIR environment variable to point at another copy of the
# data so the notebook can run on a different machine without editing this cell.
DATA_DIR = Path(os.environ.get(
    "FLORIDA_CONDO_DATA_DIR",
    r"C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\final_datasets\master_datasets",
))
df = pd.read_stata(DATA_DIR / "master_dataset_unit_crossection.dta")

# (dataset column, table label) pairs, in the order the variables are processed.
# Both lookups below are derived from this single list so they cannot drift apart.
_VAR_LABELS = [
    ('total_monthly_hoa_attom_unit', 'Monthly HOA Fee'),
    ('num_units_final_assoc', 'Units'),
    ('num_bldgs_final_assoc', 'Buildings'),
    ('num_stories_final_assoc', 'Stories'),
    ('sqft_attom_unit', 'Unit Sqft'),
    ('yr_blt_attom_bldg', 'Year Built'),
    ('miles_to_coast_assoc', 'Miles to Coast'),
    ('frac_npexcorp_state_attom_assoc', 'Frac. Out-of-State Owners'),
    ('frac_corp_own_attom_assoc', 'Frac. Corp Owners'),
    ('corp_mgmt_city_attom_assoc', 'Corp Mgmt (City)'),
    ('med_income_nonprimary_assoc', 'Med Income (Nonprimary)'),
    ('percap_income_prop_zip', 'Per Capita Income (ZIP)'),
    ('firststreet_risk_cat_assoc', 'Flood Risk (First Street)'),
]

# Variables to summarize
vars_of_interest = [column for column, _ in _VAR_LABELS]

# Label mapping for LaTeX
var_rename = dict(_VAR_LABELS)

# Assign groups
def story_group(n):
    """Map a story count to its bucket label.

    Returns NaN for a missing count, otherwise '1-2' (n <= 2), '3-4' (n <= 4),
    '5-9' (n <= 9) or '10+' (anything larger).
    """
    if pd.isna(n):
        return np.nan
    # Upper bounds are inclusive and checked in ascending order.
    for upper_bound, label in ((2, '1-2'), (4, '3-4'), (9, '5-9')):
        if n <= upper_bound:
            return label
    return '10+'

def county_group(c):
    """Collapse a county name into 'Miami/Broward', 'Other', or NaN when missing.

    The match is exact: only the upper-case strings 'MIAMI-DADE' and 'BROWARD'
    map to 'Miami/Broward'.
    """
    if c in ('MIAMI-DADE', 'BROWARD'):
        return 'Miami/Broward'
    return np.nan if pd.isna(c) else 'Other'

# Assign each row to its story-count and county group.
df['StoryGroup'] = df['num_stories_final_assoc'].apply(story_group)
df['CountyGroup'] = df['mm_fips_county_name_attom'].apply(county_group)

# Variants in which a missing group is labelled 'All'.
# NOTE: the summary below does not read these columns; its 'All' columns are
# produced by skipping the corresponding filter. They are kept because other
# cells may rely on them.
df['StoryGroupAll'] = df['StoryGroup'].fillna('All')
df['CountyGroupAll'] = df['CountyGroup'].fillna('All')

# Define story/county categories ('All' means "do not filter on this dimension").
story_groups = ['1-2', '3-4', '5-9', '10+', 'All']
county_groups = ['Miami/Broward', 'Other', 'All']
group_combos = [(s, c) for s in story_groups for c in county_groups]

# Statistic label -> function applied to the non-missing values of one subgroup.
stat_funcs = {
    'Mean': lambda values: values.mean(),
    'Median': lambda values: values.median(),
    'Std. Dev.': lambda values: values.std(),
}

# Build one boolean row mask per (story, county) combination up front, instead
# of copying and re-filtering the whole frame for every variable and statistic.
group_masks = {}
for s, c in group_combos:
    mask = np.ones(len(df), dtype=bool)
    if s != 'All':
        mask &= (df['StoryGroup'] == s).to_numpy()
    if c != 'All':
        mask &= (df['CountyGroup'] == c).to_numpy()
    group_masks[(s, c)] = mask

# Collect summary stats: one output row per (variable, statistic), one column
# per (story, county) combination.
rows = []

for var in vars_of_interest:
    # Non-missing values of this variable in every subgroup, extracted once and
    # reused for all three statistics.
    values_by_group = {
        combo: df.loc[mask, var].dropna() for combo, mask in group_masks.items()
    }
    for stat, stat_func in stat_funcs.items():
        row = {'Variable': var_rename.get(var, var), 'Statistic': stat}
        for (s, c), x in values_by_group.items():
            # An empty subgroup (or an undefined statistic, e.g. the std. dev.
            # of a single value) is shown as an empty cell.
            val = stat_func(x) if len(x) > 0 else np.nan
            row[f"{s} | {c}"] = round(val, 2) if pd.notna(val) else ''
        rows.append(row)

summary_df = pd.DataFrame(rows)

# Sort for cleaner presentation
summary_df = summary_df.sort_values(by=['Variable', 'Statistic'])

summary_df

Unnamed: 0,Variable,Statistic,1-2 | Miami/Broward,1-2 | Other,1-2 | All,3-4 | Miami/Broward,3-4 | Other,3-4 | All,5-9 | Miami/Broward,5-9 | Other,5-9 | All,10+ | Miami/Broward,10+ | Other,10+ | All,All | Miami/Broward,All | Other,All | All
6,Buildings,Mean,3.66,4.94,4.63,5.88,8.21,7.25,8.56,6.43,7.48,2.72,3.02,2.81,4.37,5.41,4.93
7,Buildings,Median,1.0,1.0,1.0,2.0,2.0,2.0,2.0,1.0,2.0,1.0,1.0,1.0,1.0,1.0,1.0
8,Buildings,Std. Dev.,6.59,12.44,11.34,8.78,16.15,13.66,16.05,14.49,15.31,3.25,4.73,3.75,8.83,12.63,11.04
27,Corp Mgmt (City),Mean,0.54,0.6,0.58,0.65,0.63,0.64,0.62,0.58,0.6,0.57,0.64,0.59,0.6,0.61,0.61
28,Corp Mgmt (City),Median,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0
29,Corp Mgmt (City),Std. Dev.,0.5,0.49,0.49,0.48,0.48,0.48,0.48,0.49,0.49,0.5,0.48,0.49,0.49,0.49,0.49
36,Flood Risk (First Street),Mean,2.73,2.88,2.84,2.32,2.89,2.65,2.9,4.04,3.43,4.83,4.45,4.73,3.67,3.53,3.6
37,Flood Risk (First Street),Median,3.0,3.0,3.0,1.0,3.0,1.0,3.0,5.0,4.0,5.0,5.0,5.0,4.0,4.0,4.0
38,Flood Risk (First Street),Std. Dev.,1.81,2.0,1.96,1.73,2.05,1.94,1.98,1.98,2.06,1.36,1.78,1.5,1.97,2.08,2.03
24,Frac. Corp Owners,Mean,0.21,0.13,0.15,0.15,0.12,0.14,0.15,0.09,0.12,0.23,0.09,0.19,0.2,0.11,0.16


In [2]:
from collections import OrderedDict

# Display names of the variables whose statistics are shown as whole numbers
# (listed alphabetically; membership is all that matters).
int_vars = {
    'Buildings',
    'Flood Risk (First Street)',
    'Med Income (Nonprimary)',
    'Per Capita Income (ZIP)',
    'Stories',
    'Unit Sqft',
    'Units',
    'Year Built',
}

# All other variables will be rounded to 2 decimals


# Step 1: Work out which columns hold labels and which hold statistics
fixed_cols = ['Variable', 'Statistic']
data_cols = [name for name in summary_df.columns if name not in fixed_cols]

# Each data column is named "<story group> | <county group>"; split it into a
# (story, county) pair with surrounding whitespace removed.
col_tuples = []
for name in data_cols:
    story_part, county_part = name.split('|')
    col_tuples.append((story_part.strip(), county_part.strip()))

# Presentation order: story group is the outer level, county group the inner one.
story_order = ['1-2', '3-4', '5-9', '10+', 'All']
county_order = ['Miami/Broward', 'Other', 'All']
ordered_cols = [
    (story, county)
    for story in story_order
    for county in county_order
    if (story, county) in col_tuples
]

# Reorder columns (label columns first, then the data columns in header order)
new_columns = fixed_cols + [' | '.join(pair) for pair in ordered_cols]
summary_df = summary_df[new_columns]

# Step 2: Format numeric values as display strings
def _format_stat(value, as_int):
    """Return one summary cell as a table-ready string.

    `value` is coerced with `pd.to_numeric`; anything that is not numeric
    (including the empty-string placeholder) becomes ''. Numeric values are
    rendered as a rounded whole number when `as_int` is true, otherwise with
    two decimals.
    """
    number = pd.to_numeric(value, errors="coerce")
    if pd.isna(number):
        return ""
    return f"{int(round(number))}" if as_int else f"{number:.2f}"


# Work on an independent copy: the frame may be a column subset of an earlier
# frame, and writing into it directly can raise SettingWithCopyWarning.
summary_df = summary_df.copy()

# Per-row flag: does this row's variable get whole-number formatting?
is_int_row = summary_df['Variable'].isin(int_vars).tolist()

for col in summary_df.columns:
    if col in ['Variable', 'Statistic']:
        continue
    # Replace the whole column with its formatted strings. Writing strings into
    # slices of a float64 column through .loc is deprecated in recent pandas
    # (incompatible-dtype setitem), so the column is rebuilt in one assignment.
    summary_df[col] = [
        _format_stat(value, as_int)
        for value, as_int in zip(summary_df[col].tolist(), is_int_row)
    ]

    
# for col in summary_df.columns:
#     if col in ['Variable', 'Statistic']:
#         continue

#     for var in summary_df['Variable'].unique():
#         mask = summary_df['Variable'] == var

#         if var in int_vars:
#             # Round and convert to int, then to string to avoid ".0"
#             summary_df.loc[mask, col] = (
#                 summary_df.loc[mask, col]
#                 .apply(lambda x: f"{int(round(x))}" if pd.notna(x) else '')
#             )
#         else:
#             # Round to 2 decimals and format as string
#             summary_df.loc[mask, col] = (
#                 summary_df.loc[mask, col]
#                 .apply(lambda x: f"{x:.2f}" if pd.notna(x) else '')
#             )

# Step 3: Generate LaTeX with custom multicolumn header
def generate_latex_table(df):
    """Render a summary frame as a booktabs ``tabular`` with a two-level header.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the label columns 'Variable' and 'Statistic'. Every other
        column is treated as a data column named "<story> | <county>"; the part
        before the first '|' becomes the top header level and the remainder the
        second level.

    Returns
    -------
    str
        The LaTeX source, lines joined with newlines (no trailing newline).

    Raises
    ------
    KeyError
        If 'Variable' or 'Statistic' is missing from ``df``.

    Notes
    -----
    The header is derived from ``df`` itself, so header and body always
    describe the same columns. The characters ``& % # _`` are backslash-escaped
    in all emitted text unless they are already escaped.
    """
    import re  # local import keeps the function usable on its own

    label_cols = ['Variable', 'Statistic']
    missing = [col for col in label_cols if col not in df.columns]
    if missing:
        raise KeyError(f"generate_latex_table: missing label column(s) {missing}")
    data_cols = [col for col in df.columns if col not in label_cols]

    def escape(text):
        # Escape LaTeX specials that would break the table, skipping any that
        # are already preceded by a backslash.
        return re.sub(r'(?<!\\)([&%#_])', r'\\\1', str(text))

    # Split every data column name into its (story, county) header parts.
    headers = []
    for col in data_cols:
        story, _, county = str(col).partition('|')
        headers.append((story.strip(), county.strip()))

    # Width of each top-level header cell. Only adjacent columns are merged, so
    # each \multicolumn sits exactly above the columns it describes.
    story_spans = []
    for story, _ in headers:
        if story_spans and story_spans[-1][0] == story:
            story_spans[-1][1] += 1
        else:
            story_spans.append([story, 1])

    lines = []
    lines.append(r'\begin{tabular}{ll' + 'c' * len(data_cols) + '}')
    lines.append(r'\toprule')

    # First header row: StoryGroup
    header1 = ['Variable', 'Statistic'] + [
        rf'\multicolumn{{{count}}}{{c}}{{{escape(story)} Stories}}'
        for story, count in story_spans
    ]
    lines.append(' & '.join(header1) + r' \\')
    if data_cols:
        # Skipped when there are no data columns: "3-2" is not a valid range.
        lines.append(r'\cmidrule(lr){3-' + f'{2+len(data_cols)}' + '}')

    # Second header row: CountyGroup
    header2 = [' ', ' '] + [escape(county) for _, county in headers]
    lines.append(' & '.join(header2) + r' \\')
    lines.append(r'\midrule')

    # Data rows: label columns first, then data columns in header order.
    for _, row in df.iterrows():
        row_vals = [escape(row[col]) for col in label_cols + data_cols]
        lines.append(' & '.join(row_vals) + r' \\')

    lines.append(r'\bottomrule')
    lines.append(r'\end{tabular}')
    return '\n'.join(lines)

# Export LaTeX table
latex_code = generate_latex_table(summary_df)
# Write with an explicit encoding so the file does not depend on the platform's
# default locale encoding (which differs between Windows and other systems).
with open('summary_stats_twolevel_header.tex', 'w', encoding='utf-8') as f:
    f.write(latex_code)
