In [1]:
%run _bootstrap.py
from src.paths import DATA_ROOT

In [3]:
import pandas as pd
import numpy as np
from pathlib import Path

# Load the combined permits file
combined_path = DATA_ROOT / "data" / "dewey_permits_builty" / "condo_permits_combined.parquet"
permits = pd.read_parquet(combined_path)

print(f"Loaded {len(permits):,} condo permits")
print(f"Columns: {permits.shape[1]}")

# Check FILE_DATE (may have mixed types)
valid_dates = permits['FILE_DATE'].dropna()
if len(valid_dates) > 0:
    print(f"Date range: {valid_dates.min()} to {valid_dates.max()}")
else:
    print("No valid dates found")

# Key fields we're interested in:
# 1. building_address_attom - already exists
# 2. FILE_DATE - permit application date (available)
# 3. DESCRIPTION - what the permit is for (available)

print("\n--- Key Fields for Analysis ---")
print(f"building_address_attom: {permits['building_address_attom'].notna().sum():,} non-null")
print(f"FILE_DATE: {permits['FILE_DATE'].notna().sum():,} non-null")
print(f"DESCRIPTION: {permits['DESCRIPTION'].notna().sum():,} non-null")

print("\n--- Sample Descriptions ---")
print(permits['DESCRIPTION'].value_counts().head(20))


Loaded 816,900 condo permits
Columns: 89
Date range: 1955-01-01 to 2025-10-14

--- Key Fields for Analysis ---
building_address_attom: 816,900 non-null
FILE_DATE: 786,242 non-null
DESCRIPTION: 764,398 non-null

--- Sample Descriptions ---
DESCRIPTION
PLUMBING-WATER HEATER CHANGE OUT                                           13056
MECHANICAL-HVAC                                                            12452
AC Change Out                                                               7175
5 UNITS OR MORE  - RESIDENTIAL                                              7009
Building Combination CONV                     EQUAL A/C CHG OUT     (M)     6079
MECH ALTER/REPAIR                                                           4310
RESIDENTIAL-ALTERATION                                                      3812
WATERHEATER REPL                                                            3250
A/C CHANGE OUT                                                              3249
A/C REPLACEMENT     

In [5]:
# Categorize permits: structural vs. non-structural
# Structural items: roof, floor, load-bearing walls, plumbing, fireproofing, foundation, etc.

# Define keywords for structural work
STRUCTURAL_KEYWORDS = {
    'roof': ['roof', 'reroof', 'shingle', 'membrane', 'slate', 'tile'],
    'floor': ['floor', 'subfloor', 'joist', 'beam'],
    'walls': ['wall', 'load-bearing', 'bearing wall', 'shear wall', 'firewall', 'exterior wall'],
    'plumbing': ['plumb', 'drain', 'sewer', 'water line', 'pipe', 'sanitary'],
    'fireproofing': ['fireproof', 'fire-proof', 'fire rating', 'fire-rated', 'fire stop', 'firestop'],
    'foundation': ['foundation', 'footings', 'pilings', 'piles'],
    'structural': ['structural', 'beam', 'column', 'truss', 'bracing', 'frame', 'framing'],
    'hvac': ['hvac', 'heating', 'cooling', 'air conditioning', 'ventilation', 'ductwork'],
    'electrical': ['electrical', 'panel', 'wiring', 'circuit', 'service upgrade', 'lighting'],
    'windows_doors': ['window', 'door', 'glass', 'glazing'],
}

def categorize_permit(description):
    """Categorize permit as structural or other based on description."""
    if pd.isna(description):
        return 'unknown'
    
    desc_lower = str(description).lower()
    
    # Check for structural keywords
    for category, keywords in STRUCTURAL_KEYWORDS.items():
        for keyword in keywords:
            if keyword in desc_lower:
                return f'structural_{category}'
    
    return 'other'

# Apply categorization
permits['permit_category'] = permits['DESCRIPTION'].apply(categorize_permit)

print("Permit Categories:")
print(permits['permit_category'].value_counts().sort_values(ascending=False))
print(f"\nTotal structural permits: {(permits['permit_category'].str.startswith('structural')).sum():,}")
print(f"Total other permits: {(permits['permit_category'] == 'other').sum():,}")
print(f"Unknown: {(permits['permit_category'] == 'unknown').sum():,}")

# Show examples of each category
print("\n--- Examples by Category ---")
for category in sorted(permits['permit_category'].unique()):
    category_data = permits[permits['permit_category'] == category]['DESCRIPTION'].dropna()
    sample_desc = category_data.iloc[0] if len(category_data) > 0 else "N/A"
    count = len(permits[permits['permit_category'] == category])
    print(f"{category:25s} ({count:6,}): {sample_desc[:70]}")


Permit Categories:
permit_category
other                       443804
structural_roof              86544
structural_windows_doors     83094
unknown                      52502
structural_hvac              39576
structural_plumbing          37594
structural_electrical        32602
structural_floor             18164
structural_walls             16443
structural_structural         5302
structural_foundation         1235
structural_fireproofing         40
Name: count, dtype: int64

Total structural permits: 320,594
Total other permits: 443,804
Unknown: 52,502

--- Examples by Category ---
other                     (443,804): Replace kitchen outlets
structural_electrical     (32,602): BCR2302538 // UNIT #: 606 // electrical installation for 1 and 1/2 bat
structural_fireproofing   (    40): MASTER B1401618 // mezzanine baths -  revision to add (fire stopping d
structural_floor          (18,164): Unit#310/ Replacement of damaged wood floors with natural stone (No ba
structural_foundation     (

In [6]:
# Create indicator for structural permits and keep only relevant columns
permits['is_structural'] = permits['permit_category'].str.startswith('structural')

# Keep only relevant columns
relevant_cols = [
    'building_address_attom',
    'FILE_DATE',
    'PERMIT_NUMBER',
    'DESCRIPTION',
    'permit_category',
    'is_structural',
    'COUNTY',
    'JURISDICTION',
    'JOB_VALUE'
]

permits_clean = permits[relevant_cols].copy()

print(f"Final dataset: {len(permits_clean):,} permits")
print(f"\nColumns kept: {permits_clean.columns.tolist()}")
print(f"\nStructural permits: {permits_clean['is_structural'].sum():,}")
print(f"Non-structural permits: {(~permits_clean['is_structural']).sum():,}")

print("\nSample data:")
print(permits_clean.head(3))


Final dataset: 816,900 permits

Columns kept: ['building_address_attom', 'FILE_DATE', 'PERMIT_NUMBER', 'DESCRIPTION', 'permit_category', 'is_structural', 'COUNTY', 'JURISDICTION', 'JOB_VALUE']

Structural permits: 320,594
Non-structural permits: 496,306

Sample data:
                     building_address_attom   FILE_DATE PERMIT_NUMBER  \
0   7832 COLLINS AVE, MIAMI BEACH, FL 33141  2022-02-16    BOA2215385   
1  2463 PINE TREE DR, MIAMI BEACH, FL 33140  2022-04-14    BCR2201109   
2   2625 COLLINS AVE, MIAMI BEACH, FL 33140  2023-05-10    BPI2324335   

                                         DESCRIPTION  \
0                            Replace kitchen outlets   
1  UNIT#PH-A-/ DEMO EXISTING SHOWER, MAKE 66' NEW...   
2      Unit#409/Replacement of 2 sliding glass doors   

            permit_category  is_structural             COUNTY JURISDICTION  \
0                     other          False  Miami-Dade County  Miami Beach   
1  structural_windows_doors           True  Miami-Dade Cou

In [13]:
# Export to Stata file
output_dir = DATA_ROOT / "final_datasets"
output_dir.mkdir(parents=True, exist_ok=True)

output_file = output_dir / "condo_permits_cleaned.dta"

print(f"Exporting {len(permits_clean):,} permits to Stata format...")

# Prepare data for export
export_data = permits_clean.copy()
export_data['FILE_DATE'] = export_data['FILE_DATE'].astype(str)

# Remove special characters from all string columns (Stata limitation)
for col in export_data.select_dtypes(include='object').columns:
    # Replace problematic characters
    export_data[col] = export_data[col].astype(str).str.replace('[^\x00-\x7F]+', '', regex=True)

export_data.to_stata(output_file, write_index=False, version=117)

print(f"✓ Exported to: {output_file}")
print(f"  File size: {output_file.stat().st_size / 1e6:.2f} MB")
print(f"  Note: Special/Unicode characters were removed for Stata compatibility")


Exporting 816,900 permits to Stata format...
✓ Exported to: C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\final_datasets\condo_permits_cleaned.dta
  File size: 200.39 MB
  Note: Special/Unicode characters were removed for Stata compatibility


In [12]:
# Export to Parquet (preserves full text including Unicode characters)
parquet_file = output_dir / "condo_permits_cleaned.parquet"

print(f"Exporting {len(permits_clean):,} permits to Parquet format...")
permits_clean.to_parquet(parquet_file, compression='snappy', index=False)

print(f"✓ Exported to: {parquet_file}")
print(f"  File size: {parquet_file.stat().st_size / 1e6:.2f} MB")
print(f"  Format: Parquet with snappy compression (preserves full text)")


Exporting 816,900 permits to Parquet format...
✓ Exported to: C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\final_datasets\condo_permits_cleaned.parquet
  File size: 41.40 MB
  Format: Parquet with snappy compression (preserves full text)
