In [1]:
%run _bootstrap.py
from src.paths import DATA_ROOT


In [None]:
from pathlib import Path
import pandas as pd
import numpy as np
import glob

# Define paths (using DATA_ROOT for portability)
PERMITS_BASE = DATA_ROOT / "data" / "dewey_permits_builty" / "dewey-downloads"
CONDO_REF_PATH = DATA_ROOT / "final_datasets" / "attom_all_condo_units_basic_info.dta"

# Folders holding the raw permit parquet files
folders = [
    PERMITS_BASE / "florida-permits-miami-broward",
    PERMITS_BASE / "florida-permits-ex-miami-broward"
]

# Count parquet files
total_files = sum(len(list(f.glob("*.parquet"))) for f in folders)
print(f"Total parquet files to process: {total_files}")

# Explore one file from each folder to understand schema
for folder in folders:
    print(f"\n--- {folder.name} ---")

    # Sort so the previewed file is the same on every run; glob order is
    # filesystem-dependent.
    parquet_candidates = sorted(folder.glob("*.parquet"))
    if not parquet_candidates:
        # Indexing [0] on an empty listing would raise IndexError; report and move on.
        print("No parquet files found in this folder; skipping schema preview")
        continue

    sample_file = parquet_candidates[0]
    print(f"Sample file: {sample_file.name}")

    df_sample = pd.read_parquet(sample_file)
    print(f"Shape: {df_sample.shape}")
    print(f"Columns: {df_sample.columns.tolist()}")
    print(f"First row sample (address columns):")
    # Columns whose lower-cased name contains an address-like keyword
    addr_cols = [c for c in df_sample.columns if any(x in c.lower() for x in ['addr', 'street', 'city', 'zip'])]
    if addr_cols:
        print(df_sample[addr_cols].head(1).to_string())


Total parquet files to process: 193

--- florida-permits-miami-broward ---
Sample file: building-permits-united-states_0_0_0.snappy.parquet
Shape: (231907, 88)
Columns: ['ACCESSORY_STRUCTURE', 'ACCESSORY_STRUCTURE_TYPE', 'AIR_SOURCE_HEAT_PUMP', 'APN', 'APPLICANT_CITY', 'APPLICANT_STATE', 'APPLICANT_ZIPCODE', 'ATTACHED_DETACHED', 'BATH_ITEMS_INSTALLED', 'BATH_REMODEL', 'CITY', 'COMMERCIAL', 'CONDITION', 'CONSTRUCTION_TYPE', 'COUNTY', 'COUNTY_FIPS', 'DATA', 'DEMOLITION', 'DEMO_FULL_PARTIAL', 'DESCRIPTION', 'DWELLING_TYPE', 'EFFICIENCY', 'ELECTRICAL_PANEL_UPGRADE', 'ELECTRICAL_SERVICE_UPGRADE', 'EV_CHARGER', 'EXTERIOR_REMODEL', 'EXTERIOR_REMODEL_TYPE', 'FILE_DATE', 'FINAL_DATE', 'FOUNDATION', 'FOUNDATION_TYPE', 'FOUNDATION_WORK_TYPE', 'GROUND_SOURCE_HEAT_PUMP', 'HEAT_PUMP_WATER_HEATER', 'INDUCTION_STOVE', 'JOB_VALUE', 'JURISDICTION', 'KITCHEN_ITEMS_INSTALLED', 'KITCHEN_REMODEL', 'MULTIFAMILY', 'NEW_ADU', 'NEW_DWELLING', 'NUM_BATHS', 'NUM_BEDS', 'NUM_BUILDINGS', 'NUM_DOORS', 'NUM_FLOORS', 

In [5]:
# Read the condo reference table (Stata .dta) and report its structure
print("--- Condo Reference Dataset ---")
condo_ref = pd.read_stata(CONDO_REF_PATH)
print(f"Shape: {condo_ref.shape}")
print(f"Columns: {condo_ref.columns.tolist()}")

# Keep every column whose lower-cased name contains an address-like keyword
addr_cols = [
    name
    for name in condo_ref.columns
    if any(token in name.lower() for token in ['addr', 'address', 'street', 'city', 'zip'])
]
print(f"\nAddress-related columns in condo ref: {addr_cols}")

# Preview the first three rows of those columns, if any were found
if len(addr_cols) > 0:
    print(f"\nSample address data:")
    address_preview = condo_ref[addr_cols].head(3)
    print(address_preview)


--- Condo Reference Dataset ---
Shape: (1556817, 10)
Columns: ['id_attom', 'mm_fips_county_name_attom', 'sa_nbr_bath_attom', 'sa_nbr_bedrms_attom', 'sa_sqft_attom', 'sa_x_coord_attom', 'sa_y_coord_attom', 'address_attom', 'building_address_attom', 'flag_80pct_same_coords_building']

Address-related columns in condo ref: ['address_attom', 'building_address_attom']

Sample address data:
                            address_attom               building_address_attom
0   008 CONFIDENTIAL 8, ORLANDO, FL 32812  008 CONFIDENTIAL, ORLANDO, FL 32812
1     010 CONFIDENTIAL, ORLANDO, FL 32806  010 CONFIDENTIAL, ORLANDO, FL 32806
2  024 CONFIDENTIAL 24, ORLANDO, FL 32822  024 CONFIDENTIAL, ORLANDO, FL 32822


In [7]:
# Read one permit file from the first folder and describe its layout
sample_permit = pd.read_parquet(list(folders[0].glob("*.parquet"))[0])
print("--- Permits Data Structure ---")
print(f"Shape: {sample_permit.shape}")
print(f"\nAll columns:")
# Numbered listing of every column, starting at 1
for position, column_name in enumerate(sample_permit.columns, start=1):
    print(f"  {position:2d}. {column_name}")

# Columns whose lower-cased name contains an address-like keyword
print(f"\n--- Address-related columns in permits ---")
addr_cols = [
    name
    for name in sample_permit.columns
    if any(token in name.lower() for token in ['addr', 'address', 'street', 'city', 'zip', 'location'])
]
print(f"Found: {addr_cols}")

if len(addr_cols) > 0:
    address_view = sample_permit[addr_cols]

    print(f"\nSample data (first 3 rows):")
    print(address_view.head(3).to_string())

    print(f"\n--- Data types ---")
    print(address_view.dtypes)


--- Permits Data Structure ---
Shape: (231907, 88)

All columns:
   1. ACCESSORY_STRUCTURE
   2. ACCESSORY_STRUCTURE_TYPE
   3. AIR_SOURCE_HEAT_PUMP
   4. APN
   5. APPLICANT_CITY
   6. APPLICANT_STATE
   7. APPLICANT_ZIPCODE
   8. ATTACHED_DETACHED
   9. BATH_ITEMS_INSTALLED
  10. BATH_REMODEL
  11. CITY
  12. COMMERCIAL
  13. CONDITION
  14. CONSTRUCTION_TYPE
  15. COUNTY
  16. COUNTY_FIPS
  17. DATA
  18. DEMOLITION
  19. DEMO_FULL_PARTIAL
  20. DESCRIPTION
  21. DWELLING_TYPE
  22. EFFICIENCY
  23. ELECTRICAL_PANEL_UPGRADE
  24. ELECTRICAL_SERVICE_UPGRADE
  25. EV_CHARGER
  26. EXTERIOR_REMODEL
  27. EXTERIOR_REMODEL_TYPE
  28. FILE_DATE
  29. FINAL_DATE
  30. FOUNDATION
  31. FOUNDATION_TYPE
  32. FOUNDATION_WORK_TYPE
  33. GROUND_SOURCE_HEAT_PUMP
  34. HEAT_PUMP_WATER_HEATER
  35. INDUCTION_STOVE
  36. JOB_VALUE
  37. JURISDICTION
  38. KITCHEN_ITEMS_INSTALLED
  39. KITCHEN_REMODEL
  40. MULTIFAMILY
  41. NEW_ADU
  42. NEW_DWELLING
  43. NUM_BATHS
  44. NUM_BEDS
  45. NUM_BUILDIN

In [8]:
# Build address from permits: STREET + CITY + STATE + ZIPCODE
def build_permit_address(row):
    """Join a permit row's STREET, CITY, STATE and ZIPCODE into one string.

    Args:
        row: Mapping-like object supporting ``.get`` (a DataFrame row or a dict).

    Returns:
        The non-missing components joined with ", " in the order
        STREET, CITY, STATE, ZIPCODE, or None when all four are missing.
        STREET, CITY and STATE are stripped and upper-cased; ZIPCODE is only
        stripped.

    Note:
        STATE and ZIPCODE end up comma-separated ("FL, 33141"), unlike the
        "FL 33139" form shown for the condo reference addresses.
    """
    pieces = []
    for field in ('STREET', 'CITY', 'STATE', 'ZIPCODE'):
        value = row.get(field)
        if pd.isna(value):
            continue
        text = str(value).strip()
        # ZIP codes are left as-is; the other components are upper-cased
        pieces.append(text if field == 'ZIPCODE' else text.upper())

    if not pieces:
        return None
    return ", ".join(pieces)

# Build the full address for every row of the sample permit file
sample_permit['permit_address'] = sample_permit.apply(build_permit_address, axis=1)
preview_columns = ['STREET', 'CITY', 'STATE', 'ZIPCODE', 'permit_address']
print("Sample permit addresses (first 5):")
print(sample_permit[preview_columns].head(5))

# Report how many records sit on each side of the comparison
print(f"\n--- Merge Strategy ---")
print(f"Condo ref: {len(condo_ref):,} records with 'building_address_attom'")
print(f"Permits: {len(sample_permit):,} records with constructed 'permit_address'")

# Exact string comparison: distinct non-null addresses present on both sides
condo_addrs = set(condo_ref['building_address_attom'].dropna())
permit_addrs = set(sample_permit['permit_address'].dropna())
overlap = condo_addrs & permit_addrs
print(f"\nAddresses in common (this sample file): {len(overlap)}")
if len(overlap) > 0:
    print(f"Examples: {list(overlap)[:3]}")


Sample permit addresses (first 5):
             STREET         CITY STATE ZIPCODE  \
0  7832 Collins Ave  Miami Beach    FL   33141   
1     824 W W 47 St  Miami Beach    FL   33140   
2     1244 Ocean Dr  Miami Beach    FL   33139   
3      300 Alton Rd  Miami Beach    FL   33139   
4  1001 Collins Ave  Miami Beach    FL   33139   

                             permit_address  
0  7832 COLLINS AVE, MIAMI BEACH, FL, 33141  
1     824 W W 47 ST, MIAMI BEACH, FL, 33140  
2     1244 OCEAN DR, MIAMI BEACH, FL, 33139  
3      300 ALTON RD, MIAMI BEACH, FL, 33139  
4  1001 COLLINS AVE, MIAMI BEACH, FL, 33139  

--- Merge Strategy ---
Condo ref: 1,556,817 records with 'building_address_attom'
Permits: 231,907 records with constructed 'permit_address'

Addresses in common (this sample file): 0


In [9]:
# Debug: compare raw address strings for one city (Miami Beach) on both sides
print("--- Condo addresses from Miami Beach ---")
is_mb_condo = condo_ref['building_address_attom'].str.contains('MIAMI BEACH', case=False, na=False)
mb_condos = condo_ref[is_mb_condo]
print(f"Found {len(mb_condos)} condos in Miami Beach")
print("\nSample condo addresses:")
print(mb_condos['building_address_attom'].head(10).values)

print("\n--- Permit addresses from Miami Beach (from sample file) ---")
is_mb_permit = sample_permit['CITY'].str.contains('Miami Beach', case=False, na=False)
mb_permits = sample_permit[is_mb_permit]
print(f"Found {len(mb_permits)} permits in Miami Beach")
print("\nSample permit addresses:")
print(mb_permits['permit_address'].head(10).values)

# Next attempt: compare on street + city only, leaving out state and ZIP
print("\n--- Testing Street + City match only ---")
def build_permit_address_simple(row):
    """Build an upper-cased "STREET, CITY" string from a permit row.

    Args:
        row: Mapping-like object supporting ``.get`` (a DataFrame row or a dict).

    Returns:
        The stripped, upper-cased STREET and CITY joined with ", "; only the
        non-missing one when the other is missing; None when both are missing.
    """
    candidates = (row.get('STREET'), row.get('CITY'))
    kept = [str(value).strip().upper() for value in candidates if pd.notna(value)]
    return ", ".join(kept) if kept else None

# Row-wise apply: stores one "STREET, CITY" string (or None) per permit row
# in a new 'permit_addr_simple' column of the sample frame.
sample_permit['permit_addr_simple'] = sample_permit.apply(build_permit_address_simple, axis=1)

# Extract street + city from condo addresses (before the comma + state + zip)
def extract_street_city_from_condo(addr):
    """Return the first two comma-separated pieces of an address, upper-cased.

    For an address shaped like "STREET, CITY, STATE ZIP" this yields
    "STREET, CITY".

    Args:
        addr: Address value; converted with ``str`` before splitting.

    Returns:
        "<first piece>, <second piece>" with each piece stripped and the whole
        string upper-cased, or None when ``addr`` is missing or contains no
        comma.
    """
    if pd.isna(addr):
        return None

    street, comma, remainder = str(addr).partition(',')
    if not comma:
        # No comma at all: there is no second piece to use as the city
        return None

    # Everything up to the next comma (or the end of the string) is the city
    city = remainder.split(',', 1)[0]
    return (street.strip() + ", " + city.strip()).upper()

# Derive the "STREET, CITY" form for every condo building address
condo_ref['addr_simple'] = condo_ref['building_address_attom'].apply(extract_street_city_from_condo)

# Re-select the Miami Beach rows so the new columns are part of the selection
is_mb_condo = condo_ref['building_address_attom'].str.contains('MIAMI BEACH', case=False, na=False)
mb_condos = condo_ref[is_mb_condo]
print(f"Condo simple addresses sample:")
print(mb_condos['addr_simple'].head(10).values)

is_mb_permit = sample_permit['CITY'].str.contains('Miami Beach', case=False, na=False)
mb_permits = sample_permit[is_mb_permit]
print(f"\nPermit simple addresses sample:")
print(mb_permits['permit_addr_simple'].head(10).values)

# Distinct non-null street+city strings present on both sides
condo_simple = set(condo_ref['addr_simple'].dropna())
permit_simple = set(sample_permit['permit_addr_simple'].dropna())
overlap_simple = condo_simple & permit_simple
print(f"\nMatches on street + city: {len(overlap_simple)}")
if len(overlap_simple) > 0:
    print(f"Examples: {list(overlap_simple)[:5]}")


--- Condo addresses from Miami Beach ---
Found 53630 condos in Miami Beach

Sample condo addresses:
['1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139'
 '1 CENTURY LN, MIAMI BEACH, FL 33139']

--- Permit addresses from Miami Beach (from sample file) ---
Found 26111 permits in Miami Beach

Sample permit addresses:
['7832 COLLINS AVE, MIAMI BEACH, FL, 33141'
 '824 W W 47 ST, MIAMI BEACH, FL, 33140'
 '1244 OCEAN DR, MIAMI BEACH, FL, 33139'
 '300 ALTON RD, MIAMI BEACH, FL, 33139'
 '1001 COLLINS AVE, MIAMI BEACH, FL, 33139'
 '738 LINCOLN RD, MIAMI BEACH, FL, 33139'
 '220 COLLINS AVE, MIAMI BEACH, FL, 33139'
 '3127 INDIAN CREEK DR, MIAMI BEACH, FL, 33140'
 '605 LINCOLN RD, MIAMI BEACH, FL, 33139'
 

In [None]:
# Create output directory for condo-only permit subsets.
# parents=True creates any missing parent folders; exist_ok=True means an
# already-existing directory is not an error, so the cell can be re-run.
OUTPUT_DIR = DATA_ROOT / "data" / "dewey_permits_builty" / "condo_filtered"
OUTPUT_DIR.mkdir(parents=True, exist_ok=True)

# Update address key to include ZIP code: street + city + zip
def build_addr_key_with_zip(addr_simple, zipcode):
    """Combine street+city with a ZIP code to form the matching key.

    The ZIP is normalised so the same location produces the same key whatever
    form the ZIP arrives in: "33141", " 33141 ", "33141-1234" (ZIP+4),
    33141 (int) and 33141.0 (float) all give "<addr_simple>, 33141".
    Values that do not start with a clean 5-digit ZIP are kept as their
    stripped text.

    Args:
        addr_simple: "STREET, CITY" string; may be missing.
        zipcode: ZIP code as str, int or float; may be missing.

    Returns:
        "<addr_simple>, <zip>" or None when either input is missing or the ZIP
        is blank after stripping.
    """
    if pd.isna(addr_simple) or pd.isna(zipcode):
        return None

    zip_text = str(zipcode).strip()
    if not zip_text:
        # A blank ZIP would otherwise yield a dangling "addr, " key
        return None

    # Cut at the first "-" (ZIP+4 suffix) or "." (float rendering such as "33141.0")
    zip_head = zip_text.replace('.', '-').split('-', 1)[0]
    if len(zip_head) == 5 and zip_head.isdigit():
        zip_text = zip_head

    return f"{addr_simple}, {zip_text}"

# Add ZIP to condo lookup key.
# The ZIP is taken as the last whitespace-separated token of the last
# comma-separated segment of building_address_attom (e.g. "FL 33139" -> "33139").
# The .str accessor yields NaN for missing, blank or comma-terminated
# addresses, where chained .split(...)[-1] indexing would raise IndexError.
condo_zip = (
    condo_ref['building_address_attom']
    .str.split(',').str[-1]
    .str.split().str[-1]
)
condo_ref['addr_key'] = [
    build_addr_key_with_zip(addr_simple, zipcode)
    for addr_simple, zipcode in zip(condo_ref['addr_simple'], condo_zip)
]

# Build lookup table from condo ref: street+city+zip -> list of condo records.
# Records come from one to_dict('records') call rather than iterrows(), which
# builds a Series per row and is very slow on a frame this large.
condo_lookup = {}
for addr_key, record in zip(condo_ref['addr_key'], condo_ref.to_dict('records')):
    if pd.notna(addr_key):
        condo_lookup.setdefault(addr_key, []).append(record)

print(f"Built lookup table: {len(condo_lookup):,} unique street+city+zip combinations")

# Merge function: takes permit dataframe, returns only rows that match condo addresses
def filter_permits_to_condos(permit_df):
    """Filter permit records to only those matching condo addresses (street+city+zip).

    Works on a copy of ``permit_df``, adds two helper columns and keeps the
    rows whose key is one of the keys of the module-level ``condo_lookup``.

    Args:
        permit_df: Permit records; must contain a 'ZIPCODE' column. 'STREET'
            and 'CITY' are read through ``build_permit_address_simple``.

    Returns:
        A new DataFrame holding the matching rows, with the extra columns
        'permit_addr_simple' ("STREET, CITY") and 'permit_addr_key'
        ("STREET, CITY, ZIP"). The input frame is not modified.
    """
    permit_df = permit_df.copy()
    permit_df['permit_addr_simple'] = permit_df.apply(build_permit_address_simple, axis=1)

    # Build the key from the two needed columns directly. This avoids a second
    # row-wise apply, which is slow and may not return a Series when the frame
    # has no rows, which would break the column assignment.
    permit_df['permit_addr_key'] = [
        build_addr_key_with_zip(addr_simple, zipcode)
        for addr_simple, zipcode in zip(permit_df['permit_addr_simple'], permit_df['ZIPCODE'])
    ]

    # Filter to only rows with matches; rows with a None key never match
    matched = permit_df[permit_df['permit_addr_key'].isin(condo_lookup.keys())].copy()
    return matched

# Walk every permit parquet file, keep the condo-matching rows, and write one
# filtered parquet per input file that has at least one match
print(f"\n--- Processing {total_files} permit files ---")
total_matched = 0
file_count = 0

for folder in folders:
    print(f"\n{folder.name}:")
    parquet_files = sorted(folder.glob("*.parquet"))

    for parquet_file in parquet_files:
        # Running counter across both folders; also used in the output file name
        file_count += 1

        permit_df = pd.read_parquet(parquet_file)
        matched_df = filter_permits_to_condos(permit_df)
        matched_count = len(matched_df)

        # Nothing matched: report and move on without writing a file
        if matched_count == 0:
            print(f"  File {file_count:3d}/{total_files} ({parquet_file.name}): 0 matched (skipped)")
            continue

        output_file = OUTPUT_DIR / f"condo_permits_{file_count:03d}_{parquet_file.stem}.parquet"
        matched_df.to_parquet(output_file)
        total_matched += matched_count
        print(f"  File {file_count:3d}/{total_files} ({parquet_file.name}): {matched_count:7,} matched → {output_file.name}")

# Totals; the last line counts every parquet currently present in OUTPUT_DIR
print(f"\n--- SUMMARY ---")
print(f"Processed: {file_count} files")
print(f"Total condo-matching permits: {total_matched:,}")
print(f"Output directory: {OUTPUT_DIR}")
print(f"Output files: {len(list(OUTPUT_DIR.glob('*.parquet')))}")


Built lookup table: 351,723 unique street+city+zip combinations

--- Processing 193 permit files ---

florida-permits-miami-broward:
  File   1/193 (building-permits-united-states_0_0_0.snappy.parquet):   8,788 matched → condo_permits_001_building-permits-united-states_0_0_0.snappy.parquet
  File   2/193 (building-permits-united-states_0_1_0.snappy.parquet):  14,738 matched → condo_permits_002_building-permits-united-states_0_1_0.snappy.parquet
  File   3/193 (building-permits-united-states_0_1_1.snappy.parquet):      11 matched → condo_permits_003_building-permits-united-states_0_1_1.snappy.parquet
  File   4/193 (building-permits-united-states_0_2_0.snappy.parquet):   9,111 matched → condo_permits_004_building-permits-united-states_0_2_0.snappy.parquet
  File   5/193 (building-permits-united-states_0_3_0.snappy.parquet):   7,504 matched → condo_permits_005_building-permits-united-states_0_3_0.snappy.parquet
  File   6/193 (building-permits-united-states_0_4_0.snappy.parquet):   4,818

In [12]:
# Report what the batch step left in OUTPUT_DIR
filtered_files = sorted(OUTPUT_DIR.glob("*.parquet"))
print(f"✓ Batch processing complete!")
print(f"\nFiltered output summary:")
print(f"  Output directory: {OUTPUT_DIR}")
print(f"  Files created: {len(filtered_files)}")

if len(filtered_files) > 0:
    # Read every filtered file and stack them to compute summary statistics
    all_filtered = [pd.read_parquet(path) for path in filtered_files]
    combined = pd.concat(all_filtered, ignore_index=True)
    print(f"  Total condo-matching permits: {len(combined):,}")

    # Date span, only when the FILE_DATE column exists; unparseable values become NaT
    if 'FILE_DATE' in combined.columns:
        dates = pd.to_datetime(combined['FILE_DATE'], errors='coerce')
        print(f"  Date range: {dates.min()} to {dates.max()}")

    print(f"  Columns retained: {combined.shape[1]}")
    print(f"\n  Geographic distribution:")
    # Ten most frequent COUNTY values
    county_counts = combined['COUNTY'].value_counts()
    print(county_counts.head(10))


✓ Batch processing complete!

Filtered output summary:
  Output directory: C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\data\dewey_permits_builty\condo_filtered
  Files created: 188
  Total condo-matching permits: 816,900
  Date range: 1955-01-01 00:00:00 to 2025-10-14 00:00:00
  Columns retained: 90

  Geographic distribution:
COUNTY
Broward County         133314
Pinellas County        123618
Miami-Dade County      116137
Lee County             102903
Palm Beach County       56884
Indian River County     43207
Martin County           33451
Sarasota County         21354
Hillsborough County     20682
Charlotte County        20286
Name: count, dtype: int64


In [13]:
# Stack every filtered permit parquet into one frame and write it as a single file
print("Combining all filtered permits into single file...")

filtered_files = sorted(OUTPUT_DIR.glob("*.parquet"))
n_filtered = len(filtered_files)
all_permits = []

for position, path in enumerate(filtered_files, start=1):
    all_permits.append(pd.read_parquet(path))
    # Progress line after every 50th file
    if not position % 50:
        print(f"  Loaded {position}/{n_filtered} files...")

combined_permits = pd.concat(all_permits, ignore_index=True)
n_rows, n_cols = combined_permits.shape
print(f"\nCombined data:")
print(f"  Total rows: {n_rows:,}")
print(f"  Columns: {n_cols}")
print(f"  Memory usage: {combined_permits.memory_usage(deep=True).sum() / 1e9:.2f} GB")

# The combined file is written next to OUTPUT_DIR (in its parent folder),
# so the "*.parquet" glob over OUTPUT_DIR does not pick it up
combined_output_path = OUTPUT_DIR.parent / "condo_permits_combined.parquet"
print(f"\nWriting combined file...")
combined_permits.to_parquet(combined_output_path, compression='snappy', index=False)
print(f"✓ Done: {combined_output_path}")
print(f"  File size: {combined_output_path.stat().st_size / 1e9:.2f} GB")


Combining all filtered permits into single file...
  Loaded 50/188 files...
  Loaded 100/188 files...
  Loaded 150/188 files...

Combined data:
  Total rows: 816,900
  Columns: 90
  Memory usage: 6.50 GB

Writing combined file...
✓ Done: C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\data\dewey_permits_builty\condo_permits_combined.parquet
  File size: 0.87 GB


In [None]:
# Reload the combined parquet and report whether it carries building_address_attom
combined_path = DATA_ROOT / "data" / "dewey_permits_builty" / "condo_permits_combined.parquet"
combined = pd.read_parquet(combined_path)

print("Columns in combined parquet:")
print(combined.columns.tolist())

has_building_address = 'building_address_attom' in combined.columns
if not has_building_address:
    print("\n✗ NO, 'building_address_attom' is NOT in the file")
    print("  (It was used only for filtering/matching, not included in output)")
else:
    print("\n✓ YES, 'building_address_attom' is in the file")
    print(f"  Non-null count: {combined['building_address_attom'].notna().sum():,}")
    print(f"  Example values:")
    print(combined[['STREET', 'CITY', 'ZIPCODE', 'building_address_attom']].head(3))


Columns in combined parquet:
['ACCESSORY_STRUCTURE', 'ACCESSORY_STRUCTURE_TYPE', 'AIR_SOURCE_HEAT_PUMP', 'APN', 'APPLICANT_CITY', 'APPLICANT_STATE', 'APPLICANT_ZIPCODE', 'ATTACHED_DETACHED', 'BATH_ITEMS_INSTALLED', 'BATH_REMODEL', 'CITY', 'COMMERCIAL', 'CONDITION', 'CONSTRUCTION_TYPE', 'COUNTY', 'COUNTY_FIPS', 'DATA', 'DEMOLITION', 'DEMO_FULL_PARTIAL', 'DESCRIPTION', 'DWELLING_TYPE', 'EFFICIENCY', 'ELECTRICAL_PANEL_UPGRADE', 'ELECTRICAL_SERVICE_UPGRADE', 'EV_CHARGER', 'EXTERIOR_REMODEL', 'EXTERIOR_REMODEL_TYPE', 'FILE_DATE', 'FINAL_DATE', 'FOUNDATION', 'FOUNDATION_TYPE', 'FOUNDATION_WORK_TYPE', 'GROUND_SOURCE_HEAT_PUMP', 'HEAT_PUMP_WATER_HEATER', 'INDUCTION_STOVE', 'JOB_VALUE', 'JURISDICTION', 'KITCHEN_ITEMS_INSTALLED', 'KITCHEN_REMODEL', 'MULTIFAMILY', 'NEW_ADU', 'NEW_DWELLING', 'NUM_BATHS', 'NUM_BEDS', 'NUM_BUILDINGS', 'NUM_DOORS', 'NUM_FLOORS', 'NUM_ROOMS', 'NUM_UNITS', 'NUM_WINDOWS', 'OWNER_CITY', 'OWNER_STATE', 'OWNER_ZIPCODE', 'PERMIT_DATE', 'PERMIT_NUMBER', 'POOL_SPA', 'POOL_TYP

In [None]:
# Add building_address_attom to the combined permits by merging with condo ref
print("Adding building_address_attom to permits...")

# Load combined permits
combined_path = DATA_ROOT / "data" / "dewey_permits_builty" / "condo_permits_combined.parquet"
permits = pd.read_parquet(combined_path)

if 'permit_addr_key' not in permits.columns and 'building_address_attom' in permits.columns:
    # This cell overwrites combined_path after dropping 'permit_addr_key', so on
    # a second run the file is already enriched and the merge key is gone.
    # Skip instead of failing with a KeyError in merge().
    print("building_address_attom already present in combined file; skipping merge and rewrite")
    permits_merged = permits
else:
    print(f"Permits before merge: {len(permits):,} rows")

    # Prepare condo ref with just the lookup columns needed: one row per key.
    # Rows with a missing key are dropped because merge() would pair NaN keys
    # with each other.
    condo_lookup_minimal = (
        condo_ref[['addr_key', 'building_address_attom']]
        .dropna(subset=['addr_key'])
        .drop_duplicates(subset=['addr_key'])
        .copy()
    )
    print(f"Condo lookup table: {len(condo_lookup_minimal):,} unique addresses")

    # Merge permits with condo address info. validate='many_to_one' makes pandas
    # raise if the right-hand keys are not unique, so the left join can never
    # silently duplicate permit rows.
    permits_merged = permits.merge(
        condo_lookup_minimal,
        left_on='permit_addr_key',
        right_on='addr_key',
        how='left',
        validate='many_to_one'
    )

    print(f"After merge: {len(permits_merged):,} rows")
    print(f"building_address_attom populated: {permits_merged['building_address_attom'].notna().sum():,} rows")

    # Drop helper columns and duplicate addr_key
    permits_merged = permits_merged.drop(columns=['addr_key', 'permit_addr_simple', 'permit_addr_key'])

    print(f"\nFinal columns: {permits_merged.shape[1]}")
    print(f"Sample:")
    print(permits_merged[['STREET', 'CITY', 'ZIPCODE', 'building_address_attom']].head(3))

    # Overwrite combined file
    print("\nWriting updated parquet...")
    permits_merged.to_parquet(combined_path, compression='snappy', index=False)
    print(f"✓ Updated: {combined_path}")
    print(f"  File size: {combined_path.stat().st_size / 1e9:.2f} GB")


Adding building_address_attom to permits...
Permits before merge: 816,900 rows
Condo lookup table: 351,723 unique addresses
After merge: 816,900 rows
building_address_attom populated: 816,900 rows

Final columns: 89
Sample:
              STREET         CITY ZIPCODE  \
0   7832 Collins Ave  Miami Beach   33141   
1  2463 Pine Tree Dr  Miami Beach   33140   
2   2625 Collins Ave  Miami Beach   33140   

                     building_address_attom  
0   7832 COLLINS AVE, MIAMI BEACH, FL 33141  
1  2463 PINE TREE DR, MIAMI BEACH, FL 33140  
2   2625 COLLINS AVE, MIAMI BEACH, FL 33140  

Writing updated parquet...
✓ Updated: C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\data\dewey_permits_builty\condo_permits_combined.parquet
  File size: 0.87 GB
