In [1]:
import pandas as pd
import re

# -----------------------------
# Address normalization function
# -----------------------------
# Long-form street-type words and the abbreviation each one is rewritten to.
_STREET_TYPE_ABBREVIATIONS = {
    'STREET': 'ST',
    'AVENUE': 'AVE',
    'ROAD': 'RD',
    'DRIVE': 'DR',
    'BOULEVARD': 'BLVD',
    'LANE': 'LN',
    'COURT': 'CT',
    'PLACE': 'PL',
    'TERRACE': 'TER',
    'HIGHWAY': 'HWY',
}

# Single whole-word alternation so every street type is rewritten in one pass.
# None of the abbreviations is itself a key, so one pass gives the same result
# as applying the rules one after another.
_STREET_TYPE_RE = re.compile(
    r'\b(?:' + '|'.join(_STREET_TYPE_ABBREVIATIONS) + r')\b'
)


def normalize_address(addr):
    """Return a canonical, upper-case form of an address for use as a merge key.

    Steps, in order:
      1. Upper-case the text.
      2. Replace every character that is neither a word character nor
         whitespace with a space (underscores count as word characters for
         the regex, so they are kept).
      3. Rewrite long-form street types (STREET, AVENUE, ...) to their
         abbreviations, matching whole words only.
      4. Collapse runs of whitespace to one space and strip both ends.

    Parameters
    ----------
    addr : scalar
        The raw address. Non-string values are converted with ``str()``
        instead of failing on ``.upper()``.

    Returns
    -------
    str or None
        The normalized address, or ``None`` when ``pd.isna(addr)`` is true.
        A blank input yields an empty string.
    """
    if pd.isna(addr):
        return None
    # Coerce so numeric or other non-string cells do not raise AttributeError.
    text = str(addr).upper()
    # Punctuation becomes a space (not removed) so "DR,APT" stays two tokens.
    text = re.sub(r'[^\w\s]', ' ', text)
    text = _STREET_TYPE_RE.sub(
        lambda match: _STREET_TYPE_ABBREVIATIONS[match.group(0)], text
    )
    return re.sub(r'\s+', ' ', text).strip()

# -----------------------------
# Step 1: Import master dataset
# -----------------------------
# NOTE(review): absolute, machine-specific path; this cell only runs where
# this Dropbox folder exists. Consider a configurable data directory.
master_path = r"C:\Users\ngodin\Dropbox\RESEARCH\active_projects\florida_condo\final_datasets\master_datasets\master_dataset_unit_crossection.dta"
master_df = pd.read_stata(master_path)

# Build the address -> association crosswalk in one chain:
#   1. keep the distinct (address, association) pairs,
#   2. add the normalized address used as the merge key,
#   3. keep only the first row seen for each normalized key.
crosswalk = (
    master_df.loc[:, ['building_address_attom', 'assoc_name_final']]
    .drop_duplicates()
    .assign(
        building_address_normalized=lambda pairs: pairs['building_address_attom'].apply(normalize_address)
    )
    .drop_duplicates(subset=['building_address_normalized'])
)

crosswalk.head()

Unnamed: 0,building_address_attom,assoc_name_final,building_address_normalized
0,"3100 N PALM AIRE DR, POMPANO BEACH, FL 33069","PALM-AIRE COUNTRY CLUB APTS. CONDOMINIUM, INC....",3100 N PALM AIRE DR POMPANO BEACH FL 33069
1,"92 SW 3RD ST, MIAMI, FL 33130","THE RIVER FRONT MASTER ASSOCIATION, INC. 33130",92 SW 3RD ST MIAMI FL 33130
2,"20225 NE 34TH CT, MIAMI, FL 33180","20225 NE 34TH CT, MIAMI, FL 33180",20225 NE 34TH CT MIAMI FL 33180
3,"2200 GLADYS ST, LARGO, FL 33774",AUTUMN CHASE CONDOMINIUM 33774,2200 GLADYS ST LARGO FL 33774
4,"10200 GANDY BLVD N, ST PETERSBURG, FL 33702",VANTAGE POINT A CONDO 33702,10200 GANDY BLVD N ST PETERSBURG FL 33702


In [3]:
# -----------------------------
# Step 2: Import violations CSV
# -----------------------------
fll_violations_path = r"data/fort_lauderdale_code_violations_raw.csv"
fll_violations_df = pd.read_csv(fll_violations_path)

# Drop missing addresses. The explicit .copy() makes the filtered frame
# independent of the raw read, so the column assignments below act on this
# frame only.
fll_violations_df = fll_violations_df[fll_violations_df['Address'].notna()].copy()

# Normalize addresses into the merge key
fll_violations_df['address_normalized'] = fll_violations_df['Address'].apply(normalize_address)

# Convert Date to datetime; values that cannot be parsed become NaT
fll_violations_df['Date'] = pd.to_datetime(fll_violations_df['Date'], errors='coerce')

# Sort by Date and keep the oldest row per Record Number + normalized address.
# A stable sort keeps rows with the same Date in file order, so the row that
# keep='first' retains is well defined. NaT sorts last by default, so a dated
# row is preferred over an undated duplicate.
fll_violations_df = (
    fll_violations_df.sort_values('Date', kind='mergesort')
    .drop_duplicates(subset=['Record Number', 'address_normalized'], keep='first')
)

# -----------------------------
# Output for checking
# -----------------------------
print("Crosswalk shape:", crosswalk.shape)
print("Violations shape:", fll_violations_df.shape)

fll_violations_df.head()

Crosswalk shape: (17929, 3)
Violations shape: (31767, 7)


Unnamed: 0,Date,Record Number,Record Type,Address,Status,Unnamed: 5,address_normalized
31967,1996-03-22,CE96031021,Building Code Case,"715 NW 3 AVE, FORT LAUDERDALE FL -",Closed,,715 NW 3 AVE FORT LAUDERDALE FL
31966,1996-08-20,CE96080899,Building Code Case,"623 NW 13 TER, FORT LAUDERDALE FL 33311",Closed,,623 NW 13 TER FORT LAUDERDALE FL 33311
31965,1996-09-16,CE96090726,Building Code Case,"1525 NW 15 ST, FORT LAUDERDALE FL -",Open,,1525 NW 15 ST FORT LAUDERDALE FL
31964,1996-11-04,CE96110080,Building Code Case,"1207 NE 12 AVE, FORT LAUDERDALE FL -",Open,,1207 NE 12 AVE FORT LAUDERDALE FL
31963,1997-01-09,CE97010219,Building Code Case,"926 NW 2 ST, FORT LAUDERDALE FL 33311-",Closed,,926 NW 2 ST FORT LAUDERDALE FL 33311
