In [1]:
import os
import json
import re
import pandas as pd
import numpy as np

from typing import Tuple
from pandas import Int64Dtype
from typing import Dict, Tuple, Sequence

In [2]:
all_real_estate_path = r'C:\Users\markd\hamilton-county-homes-scraper-main\data\raw\home_sales\homes_all.csv'
all_real_estate = pd.read_csv(all_real_estate_path, low_memory=False)

In [3]:
# Maps *every* commonly-seen spelling / abbreviation → canonical USPS suffix
street_type_map: dict[str, str] = {
    # ――― A ―――
    "ALY": "ALLEY", "ALLEE": "ALLEY", "ALLY": "ALLEY", "ALLEY": "ALLEY","AL": "ALLEY",
    "ANX": "ANNEX", "ANEX": "ANNEX", "ANNEX": "ANNEX",
    "ARC": "ARCADE", "ARCADE": "ARCADE",
    "AV": "AVENUE", "AVE": "AVENUE", "AVEN": "AVENUE", "AVENU": "AVENUE",
    "AVN": "AVENUE", "AVNUE": "AVENUE", "AVENUE": "AVENUE",

    # ――― B ―――
    "BYU": "BAYOU", "BAYOO": "BAYOU", "BAYOU": "BAYOU",
    "BCH": "BEACH", "BEACH": "BEACH",
    "BND": "BEND", "BEND": "BEND",
    "BLF": "BLUFF", "BLUF": "BLUFF", "BLUFF": "BLUFF",
    "BLFS": "BLUFFS", "BLUFFS": "BLUFFS",
    "BTM": "BOTTOM", "BOTTM": "BOTTOM", "BOT": "BOTTOM", "BOTTOM": "BOTTOM",
    "BLVD": "BOULEVARD", "BOUL": "BOULEVARD", "BOULV": "BOULEVARD","BV": "BOULEVARD",
    "BOULEVARD": "BOULEVARD",
    "BR": "BRANCH", "BRNCH": "BRANCH", "BRANCH": "BRANCH",
    "BRG": "BRIDGE", "BRDGE": "BRIDGE", "BRIDGE": "BRIDGE",
    "BRK": "BROOK", "BROOK": "BROOK",
    "BRKS": "BROOKS", "BROOKS": "BROOKS",
    "BG": "BURG", "BURG": "BURG",
    "BGS": "BURGS", "BURGS": "BURGS",
    "BYP": "BYPASS", "BYPA": "BYPASS", "BYPAS": "BYPASS",
    "BYPS": "BYPASS", "BYPASS": "BYPASS",
    # ――― C ―――
    "CP": "CAMP", "CMP": "CAMP", "CAMP": "CAMP",
    "CYN": "CANYON", "CNYN": "CANYON", "CANYN": "CANYON", "CANYON": "CANYON",
    "CPE": "CAPE", "CAPE": "CAPE",
    "CSWY": "CAUSEWAY", "CAUSEWAY": "CAUSEWAY", "CAUSWA": "CAUSEWAY",
    "CEN": "CENTER", "CENT": "CENTER", "CENTER": "CENTER", "CENTR": "CENTER",
    "CENTRE": "CENTER", "CNTER": "CENTER", "CNTR": "CENTER",
    "CTRS": "CENTERS", "CENTERS": "CENTERS",
    "CIR": "CIRCLE", "CR": "CIRCLE", "CIRC": "CIRCLE", "CIRCL": "CIRCLE",
    "CRCL": "CIRCLE", "CRCLE": "CIRCLE", "CIRCLE": "CIRCLE",
    "CIRS": "CIRCLES", "CIRCLES": "CIRCLES",
    "CLF": "CLIFF", "CLIFF": "CLIFF",
    "CLFS": "CLIFFS", "CLIFFS": "CLIFFS",
    "CLB": "CLUB", "CLUB": "CLUB",
    "CMN": "COMMON", "COMMON": "COMMON",
    "CMNS": "COMMONS", "COMMONS": "COMMONS",
    "COR": "CORNER", "CORNER": "CORNER",
    "CORS": "CORNERS", "CORNERS": "CORNERS",
    "CRSE": "COURSE", "COURSE": "COURSE",
    "CT": "COURT", "COURT": "COURT",
    "CTS": "COURTS", "COURTS": "COURTS",
    "CV": "COVE", "COVE": "COVE",
    "CVS": "COVES", "COVES": "COVES",
    "CRK": "CREEK", "CREEK": "CREEK",
    "CRES": "CRESCENT", "CRSENT": "CRESCENT", "CRSNT": "CRESCENT",
    "CRESCENT": "CRESCENT",
    "CRST": "CREST", "CREST": "CREST",
    "XING": "CROSSING", "CRSSNG": "CROSSING", "CROSSING": "CROSSING",
    "XRD": "CROSSROAD", "CROSSROAD": "CROSSROAD",
    "XRDS": "CROSSROADS", "CROSSROADS": "CROSSROADS",
    "CURV": "CURVE", "CURVE": "CURVE",
    # ――― D ―――
    "DL": "DALE", "DALE": "DALE",
    "DM": "DAM", "DAM": "DAM",
    "DV": "DIVIDE", "DIV": "DIVIDE", "DVD": "DIVIDE", "DIVIDE": "DIVIDE",
    "DR": "DRIVE", "DRIV": "DRIVE", "DRV": "DRIVE", "DRIVE": "DRIVE",
    "DRS": "DRIVES", "DRIVES": "DRIVES",
    # ――― E ―――
    "EST": "ESTATE", "ESTATE": "ESTATE",
    "ESTS": "ESTATES", "ESTATES": "ESTATES",
    "EXP": "EXPRESSWAY", "EXPY": "EXPRESSWAY", "EXPR": "EXPRESSWAY",
    "EXPRESS": "EXPRESSWAY", "EXPW": "EXPRESSWAY", "EXPRESSWAY": "EXPRESSWAY",
    "EXT": "EXTENSION", "EXTN": "EXTENSION", "EXTNSN": "EXTENSION",
    "EXTENSION": "EXTENSION",
    "EXTS": "EXTENSIONS", "EXTENSIONS": "EXTENSIONS",
    # ――― F ―――
    "FALL": "FALL", "FLS": "FALLS", "FALLS": "FALLS",
    "FRY": "FERRY", "FRRY": "FERRY", "FERRY": "FERRY",
    "FLD": "FIELD", "FIELD": "FIELD",
    "FLDS": "FIELDS", "FIELDS": "FIELDS",
    "FLT": "FLAT", "FLAT": "FLAT",
    "FLTS": "FLATS", "FLATS": "FLATS",
    "FRD": "FORD", "FORD": "FORD",
    "FRDS": "FORDS", "FORDS": "FORDS",
    "FRST": "FOREST", "FOREST": "FOREST",
    "FORG": "FORGE", "FRG": "FORGE", "FORGE": "FORGE",
    "FRGS": "FORGES", "FORGES": "FORGES",
    "FRK": "FORK", "FORK": "FORK",
    "FRKS": "FORKS", "FORKS": "FORKS",
    "FT": "FORT", "FRT": "FORT", "FORT": "FORT",
    "FWY": "FREEWAY", "FREEWAY": "FREEWAY", "FREEWY": "FREEWAY",
    "FRWAY": "FREEWAY", "FRWY": "FREEWAY",
    # ――― G ―――
    "GDN": "GARDEN", "GARDN": "GARDEN", "GRDEN": "GARDEN", "GRDN": "GARDEN",
    "GARDEN": "GARDEN",
    "GDNS": "GARDENS", "GRDNS": "GARDENS", "GARDENS": "GARDENS",
    "GTWY": "GATEWAY", "GATEWY": "GATEWAY", "GATWAY": "GATEWAY",
    "GTWAY": "GATEWAY", "GATEWAY": "GATEWAY",
    "GLN": "GLEN", "GLEN": "GLEN",
    "GLNS": "GLENS", "GLENS": "GLENS",
    "GRN": "GREEN", "GREEN": "GREEN",
    "GRNS": "GREENS", "GREENS": "GREENS",
    "GRV": "GROVE", "GROV": "GROVE", "GROVE": "GROVE",
    "GRVS": "GROVES", "GROVES": "GROVES",
    # ――― H ―――
    "HBR": "HARBOR", "HARB": "HARBOR", "HARBR": "HARBOR",
    "HRBOR": "HARBOR", "HARBOR": "HARBOR",
    "HBRS": "HARBORS", "HARBORS": "HARBORS",
    "HVN": "HAVEN", "HAVEN": "HAVEN",
    "HTS": "HEIGHTS", "HT": "HEIGHTS", "HEIGHTS": "HEIGHTS",
    "HWY": "HIGHWAY", "HIGHWAY": "HIGHWAY", "HIGHWY": "HIGHWAY",
    "HIWAY": "HIGHWAY", "HIWY": "HIGHWAY", "HWAY": "HIGHWAY",
    "HL": "HILL", "HILL": "HILL",
    "HLS": "HILLS", "HILLS": "HILLS",
    "HOLW": "HOLLOW", "HLLW": "HOLLOW", "HOLLOW": "HOLLOW",
    "HOLWS": "HOLLOW", "HOLLOWS": "HOLLOW",
    # ――― I ―――
    "INLT": "INLET", "INLET": "INLET",
    "IS": "ISLAND", "ISLND": "ISLAND", "ISLAND": "ISLAND",
    "ISS": "ISLANDS", "ISLNDS": "ISLANDS", "ISLANDS": "ISLANDS",
    "ISLE": "ISLE", "ISLES": "ISLES",
    # ――― J ―――
    "JCT": "JUNCTION", "JCTION": "JUNCTION", "JCTN": "JUNCTION",
    "JUNCTN": "JUNCTION", "JUNCTON": "JUNCTION", "JUNCTION": "JUNCTION",
    "JCTNS": "JUNCTIONS", "JCTS": "JUNCTIONS", "JUNCTIONS": "JUNCTIONS",
    # ――― K ―――
    "KY": "KEY", "KEY": "KEY",
    "KYS": "KEYS", "KEYS": "KEYS",
    "KNL": "KNOLL", "KNOL": "KNOLL", "KNOLL": "KNOLL",
    "KNLS": "KNOLLS", "KNOLLS": "KNOLLS",
    # ――― L ―――
    "LK": "LAKE", "LAKE": "LAKE",
    "LKS": "LAKES", "LAKES": "LAKES",
    "LAND": "LAND",
    "LNDG": "LANDING", "LNDNG": "LANDING", "LANDING": "LANDING",
    "LN": "LANE", "LANE": "LANE",
    "LGT": "LIGHT", "LIGHT": "LIGHT",
    "LGTS": "LIGHTS", "LIGHTS": "LIGHTS",
    "LF": "LOAF", "LOAF": "LOAF",
    "LCK": "LOCK", "LOCK": "LOCK",
    "LCKS": "LOCKS", "LOCKS": "LOCKS",
    "LDG": "LODGE", "LDGE": "LODGE", "LODG": "LODGE", "LODGE": "LODGE",
    "LOOP": "LOOP", "LOOPS": "LOOP",
    # ――― M ―――
    "MALL": "MALL",
    "MNR": "MANOR", "MANOR": "MANOR",
    "MNRS": "MANORS", "MANORS": "MANORS",
    "MDW": "MEADOW", "MEADOW": "MEADOW",
    "MDWS": "MEADOWS", "MEDOWS": "MEADOWS", "MEADOWS": "MEADOWS",
    "MEWS": "MEWS",
    "ML": "MILL", "MILL": "MILL",
    "MLS": "MILLS", "MILLS": "MILLS",
    "MSN": "MISSION", "MISSN": "MISSION", "MSSN": "MISSION",
    "MISSION": "MISSION",
    "MTWY": "MOTORWAY", "MOTORWAY": "MOTORWAY",
    "MT": "MOUNT", "MNT": "MOUNT", "MOUNT": "MOUNT",
    "MTN": "MOUNTAIN", "MNTAIN": "MOUNTAIN", "MNTN": "MOUNTAIN",
    "MOUNTIN": "MOUNTAIN", "MTIN": "MOUNTAIN", "MOUNTAIN": "MOUNTAIN",
    "MTNS": "MOUNTAINS", "MNTNS": "MOUNTAINS", "MOUNTAINS": "MOUNTAINS",
    # ――― N ―――
    "NCK": "NECK", "NECK": "NECK",
    # ――― O ―――
    "ORCH": "ORCHARD", "ORCHRD": "ORCHARD", "ORCHARD": "ORCHARD",
    "OVAL": "OVAL", "OVL": "OVAL",
    "OPAS": "OVERPASS", "OVERPASS": "OVERPASS",
    # ――― P ―――
    "PARK": "PARK", "PRK": "PARK",
    "PARKS": "PARKS",
    "PKWY": "PARKWAY", "PW": "PARKWAY", "PARKWAY": "PARKWAY", "PARKWY": "PARKWAY",
    "PKWAY": "PARKWAY", "PKY": "PARKWAY",
    "PKWYS": "PARKWAYS", "PARKWAYS": "PARKWAYS",
    "PASS": "PASS",
    "PSGE": "PASSAGE", "PASSAGE": "PASSAGE",
    "PATH": "PATH", "PATHS": "PATHS",
    "PIKE": "PIKE", "PK": "PIKE","PIKES": "PIKES",
    "PNE": "PINE", "PINE": "PINE",
    "PNES": "PINES", "PINES": "PINES",
    "PL": "PLACE", "PLACE": "PLACE",
    "PLN": "PLAIN", "PLAIN": "PLAIN",
    "PLNS": "PLAINS", "PLAINS": "PLAINS",
    "PLZ": "PLAZA", "PLAZA": "PLAZA", "PLZA": "PLAZA",
    "PT": "POINT", "POINT": "POINT",
    "PTS": "POINTS", "POINTS": "POINTS",
    "PRT": "PORT", "PORT": "PORT",
    "PRTS": "PORTS", "PORTS": "PORTS",
    "PR": "PRAIRIE", "PRAIRIE": "PRAIRIE", "PRR": "PRAIRIE",
    # ――― R ―――
    "RAD": "RADIAL", "RADL": "RADIAL", "RADIEL": "RADIAL", "RADIAL": "RADIAL",
    "RAMP": "RAMP",
    "RNCH": "RANCH", "RANCH": "RANCH",
    "RNCHS": "RANCHES", "RANCHES": "RANCHES",
    "RPD": "RAPID", "RAPID": "RAPID",
    "RPDS": "RAPIDS", "RAPIDS": "RAPIDS",
    "RST": "REST", "REST": "REST",
    "RDG": "RIDGE", "RDGE": "RIDGE", "RIDGE": "RIDGE",
    "RDGS": "RIDGES", "RIDGES": "RIDGES",
    "RIV": "RIVER", "RIVR": "RIVER", "RVR": "RIVER", "RIVER": "RIVER",
    "RD": "ROAD", "ROAD": "ROAD",
    "RDS": "ROADS", "ROADS": "ROADS",
    "RTE": "ROUTE", "ROUTE": "ROUTE",
    "ROW": "ROW",
    "RUE": "RUE",
    "RUN": "RUN",
    # ――― S ―――
    "SHL": "SHOAL", "SHOAL": "SHOAL",
    "SHLS": "SHOALS", "SHOALS": "SHOALS",
    "SHR": "SHORE", "SHOAR": "SHORE", "SHORE": "SHORE",
    "SHRS": "SHORES", "SHOARS": "SHORES", "SHORES": "SHORES",
    "SKWY": "SKYWAY", "SKYWAY": "SKYWAY",
    "SPG": "SPRING", "SPNG": "SPRING", "SPRNG": "SPRING", "SPRING": "SPRING",
    "SPGS": "SPRINGS", "SPNGS": "SPRINGS",
    "SPRNGS": "SPRINGS", "SPRINGS": "SPRINGS",
    "SPUR": "SPUR", "SPURS": "SPURS",
    "SQ": "SQUARE", "SQR": "SQUARE", "SQRE": "SQUARE",
    "SQU": "SQUARE", "SQUARE": "SQUARE",
    "SQRS": "SQUARES", "SQS": "SQUARES", "SQUARES": "SQUARES",
    "STA": "STATION", "STATN": "STATION", "STN": "STATION",
    "STATION": "STATION",
    "STRA": "STRAVENUE", "STRAV": "STRAVENUE", "STRAVEN": "STRAVENUE",
    "STRAVN": "STRAVENUE", "STRVN": "STRAVENUE",
    "STRVNUE": "STRAVENUE", "STRAVENUE": "STRAVENUE",
    "STRM": "STREAM", "STREME": "STREAM", "STREAM": "STREAM",
    "ST": "STREET", "STR": "STREET", "STRT": "STREET", "STREET": "STREET",
    "STS": "STREETS", "STREETS": "STREETS",
    "SMT": "SUMMIT", "SUMIT": "SUMMIT", "SUMITT": "SUMMIT", "SUMMIT": "SUMMIT",
    # ――― T ―――
    "TER": "TERRACE", "TERR": "TERRACE", "TERRACE": "TERRACE", "TE": "TERRACE","TR": "TERRACE",
    "TRWY": "THROUGHWAY", "THROUGHWAY": "THROUGHWAY",
    "TRCE": "TRACE", "TRACE": "TRACE",
    "TRAK": "TRACK", "TRACK": "TRACK",
    "TRKS": "TRACKS", "TRK": "TRACKS", "TRACKS": "TRACKS",
    "TRFY": "TRAFFICWAY", "TRAFFICWAY": "TRAFFICWAY",
    "TRL": "TRAIL", "TRAIL": "TRAIL","TL": "TRAIL",
    "TRLS": "TRAILS", "TRAILS": "TRAILS",
    "TRLR": "TRAILER", "TRLRS": "TRAILER", "TRAILER": "TRAILER",
    "TUNL": "TUNNEL", "TUNLS": "TUNNEL", "TUNEL": "TUNNEL",
    "TUNNL": "TUNNEL", "TUNNEL": "TUNNEL",
    "TPKE": "TURNPIKE", "TRNPK": "TURNPIKE", "TURNPK": "TURNPIKE",
    "TURNPIKE": "TURNPIKE","TURN":"TURN",
    # ――― U ―――
    "UPAS": "UNDERPASS", "UNDERPASS": "UNDERPASS",
    "UN": "UNION", "UNION": "UNION",
    "UNS": "UNIONS", "UNIONS": "UNIONS",
    # ――― V ―――
    "VLY": "VALLEY", "VALLY": "VALLEY", "VLLY": "VALLEY", "VALLEY": "VALLEY",
    "VLYS": "VALLEYS", "VALLEYS": "VALLEYS",
    "VIA": "VIADUCT", "VDCT": "VIADUCT", "VIADCT": "VIADUCT",
    "VIADUCT": "VIADUCT",
    "VW": "VIEW", "VIEW": "VIEW",
    "VWS": "VIEWS", "VIEWS": "VIEWS",
    "VLG": "VILLAGE", "VILL": "VILLAGE", "VILLAG": "VILLAGE",
    "VILLG": "VILLAGE", "VILLIAGE": "VILLAGE", "VILLAGE": "VILLAGE",
    "VLGS": "VILLAGES", "VILLAGES": "VILLAGES",
    "VL": "VILLE", "VILLE": "VILLE",
    "VIS": "VISTA", "VST": "VISTA", "VSTA": "VISTA", "VIST": "VISTA",
    "VISTA": "VISTA",
    # ――― W ―――
    "WALK": "WALK", "WALKS": "WALK",
    "WALL": "WALL",
    "WY": "WAY", "WAY": "WAY",
    "WAYS": "WAYS",
    "WL": "WELL", "WELL": "WELL",
    "WLS": "WELLS", "WELLS": "WELLS",
    "WOODS":"WOODS",
}

# Canonical street-suffix  ➜  USPS standard suffix abbreviation
canonical_to_abbrev: dict[str, str] = {
    # ――― A ―――
    "ALLEY":        "ALY",
    "ALLEY":        "AL",
    "ANNEX":        "ANX",
    "ARCADE":       "ARC",
    "AVENUE":       "AVE",

    # ――― B ―――
    "BAYOU":        "BYU",
    "BEACH":        "BCH",
    "BEND":         "BND",
    "BLUFF":        "BLF",
    "BLUFFS":       "BLFS",
    "BOTTOM":       "BTM",
    "BOULEVARD":    "BLVD",
    "BOULEVARD":    "BV",
    "BRANCH":       "BR",
    "BRIDGE":       "BRG",
    "BROOK":        "BRK",
    "BROOKS":       "BRKS",
    "BURG":         "BG",
    "BURGS":        "BGS",
    "BYPASS":       "BYP",

    # ――― C ―――
    "CAMP":         "CP",
    "CANYON":       "CYN",
    "CAPE":         "CPE",
    "CAUSEWAY":     "CSWY",
    "CENTER":       "CTR",
    "CENTERS":      "CTRS",
    "CIRCLE":       "CR",
    "CIRCLE":       "CIR",
    "CIRCLES":      "CIRS",
    "CLIFF":        "CLF",
    "CLIFFS":       "CLFS",
    "CLUB":         "CLB",
    "COMMON":       "CMN",
    "COMMONS":      "CMNS",
    "CORNER":       "COR",
    "CORNERS":      "CORS",
    "COURSE":       "CRSE",
    "COURT":        "CT",
    "COURTS":       "CTS",
    "COVE":         "CV",
    "COVES":        "CVS",
    "CREEK":        "CRK",
    "CRESCENT":     "CRES",
    "CREST":        "CRST",
    "CROSSING":     "XING",
    "CROSSROAD":    "XRD",
    "CROSSROADS":   "XRDS",
    "CURVE":        "CURV",

    # ――― D ―――
    "DALE":         "DL",
    "DAM":          "DM",
    "DIVIDE":       "DV",
    "DRIVE":        "DR",
    "DRIVES":       "DRS",

    # ――― E ―――
    "ESTATE":       "EST",
    "ESTATES":      "ESTS",
    "EXPRESSWAY":   "EXPY",
    "EXTENSION":    "EXT",
    "EXTENSIONS":   "EXTS",

    # ――― F ―――
    "FALL":         "FALL",
    "FALLS":        "FLS",
    "FERRY":        "FRY",
    "FIELD":        "FLD",
    "FIELDS":       "FLDS",
    "FLAT":         "FLT",
    "FLATS":        "FLTS",
    "FORD":         "FRD",
    "FORDS":        "FRDS",
    "FOREST":       "FRST",
    "FORGE":        "FRG",
    "FORGES":       "FRGS",
    "FORK":         "FRK",
    "FORKS":        "FRKS",
    "FORT":         "FT",
    "FREEWAY":      "FWY",

    # ――― G ―――
    "GARDEN":       "GDN",
    "GARDENS":      "GDNS",
    "GATEWAY":      "GTWY",
    "GLEN":         "GLN",
    "GLENS":        "GLNS",
    "GREEN":        "GRN",
    "GREENS":       "GRNS",
    "GROVE":        "GRV",
    "GROVES":       "GRVS",

    # ――― H ―――
    "HARBOR":       "HBR",
    "HARBORS":      "HBRS",
    "HAVEN":        "HVN",
    "HEIGHTS":      "HTS",
    "HIGHWAY":      "HWY",
    "HILL":         "HL",
    "HILLS":        "HLS",
    "HOLLOW":       "HOLW",

    # ――― I ―――
    "INLET":        "INLT",
    "ISLAND":       "IS",
    "ISLANDS":      "ISS",
    "ISLE":         "ISLE",

    # ――― J ―――
    "JUNCTION":     "JCT",
    "JUNCTIONS":    "JCTS",

    # ――― K ―――
    "KEY":          "KY",
    "KEYS":         "KYS",
    "KNOLL":        "KNL",
    "KNOLLS":       "KNLS",

    # ――― L ―――
    "LAKE":         "LK",
    "LAKES":        "LKS",
    "LAND":         "LAND",
    "LANDING":      "LNDG",
    "LANE":         "LN",
    "LIGHT":        "LGT",
    "LIGHTS":       "LGTS",
    "LOAF":         "LF",
    "LOCK":         "LCK",
    "LOCKS":        "LCKS",
    "LODGE":        "LDG",
    "LOOP":         "LOOP",

    # ――― M ―――
    "MALL":         "MALL",
    "MANOR":        "MNR",
    "MANORS":       "MNRS",
    "MEADOW":       "MDW",
    "MEADOWS":      "MDWS",
    "MEWS":         "MEWS",
    "MILL":         "ML",
    "MILLS":        "MLS",
    "MISSION":      "MSN",
    "MOTORWAY":     "MTWY",
    "MOUNT":        "MT",
    "MOUNTAIN":     "MTN",
    "MOUNTAINS":    "MTNS",

    # ――― N ―――
    "NECK":         "NCK",

    # ――― O ―――
    "ORCHARD":      "ORCH",
    "OVAL":         "OVAL",
    "OVERPASS":     "OPAS",

    # ――― P ―――
    "PARK":         "PARK",
    "PARKS":        "PARKS",
    "PARKWAY":      "PW",
    "PARKWAY":      "PKWY",
    "PARKWAYS":     "PKWY",
    "PASS":         "PASS",
    "PASSAGE":      "PSGE",
    "PATH":         "PATH",
    "PIKE":         "PK",
    "PIKE":         "PIKE",
    "PINE":         "PNE",
    "PINES":        "PNES",
    "PLACE":        "PL",
    "PLAIN":        "PLN",
    "PLAINS":       "PLNS",
    "PLAZA":        "PLZ",
    "POINT":        "PT",
    "POINTS":       "PTS",
    "PORT":         "PRT",
    "PORTS":        "PRTS",
    "PRAIRIE":      "PR",

    # ――― R ―――
    "RADIAL":       "RADL",
    "RAMP":         "RAMP",
    "RANCH":        "RNCH",
    "RANCHES":      "RNCHS",
    "RAPID":        "RPD",
    "RAPIDS":       "RPDS",
    "REST":         "RST",
    "RIDGE":        "RDG",
    "RIDGES":       "RDGS",
    "RIVER":        "RIV",
    "ROAD":         "RD",
    "ROADS":        "RDS",
    "ROUTE":        "RTE",
    "ROW":          "ROW",
    "RUE":          "RUE",
    "RUN":          "RUN",

    # ――― S ―――
    "SHOAL":        "SHL",
    "SHOALS":       "SHLS",
    "SHORE":        "SHR",
    "SHORES":       "SHRS",
    "SKYWAY":       "SKWY",
    "SPRING":       "SPG",
    "SPRINGS":      "SPGS",
    "SPUR":         "SPUR",
    "SQUARE":       "SQ",
    "SQUARES":      "SQRS",
    "STATION":      "STA",
    "STRAVENUE":    "STRA",
    "STREAM":       "STRM",
    "STREET":       "ST",
    "STREETS":      "STS",
    "SUMMIT":       "SMT",

    # ――― T ―――
    "TERRACE":      "TE",
    "TERRACE":      "TER",
    "TERRACE":      "TR",
    "THROUGHWAY":   "TRWY",
    "TRACE":        "TRCE",
    "TRACK":        "TRAK",
    "TRACKS":       "TRKS",
    "TRAFFICWAY":   "TRFY",
    "TRAIL":        "TRL",
    "TRAIL":        "TL",
    "TRAILS":       "TRLS",
    "TRAILER":      "TRLR",
    "TURN":         "TURN",
    "TUNNEL":       "TUNL",
    "TURNPIKE":     "TPKE",

    # ――― U ―――
    "UNDERPASS":    "UPAS",
    "UNION":        "UN",
    "UNIONS":       "UNS",

    # ――― V ―――
    "VALLEY":       "VLY",
    "VALLEYS":      "VLYS",
    "VIADUCT":      "VIA",
    "VIEW":         "VW",
    "VIEWS":        "VWS",
    "VILLAGE":      "VLG",
    "VILLAGES":     "VLGS",
    "VILLE":        "VL",
    "VISTA":        "VIS",


    # ――― W ―――
    "WALK":         "WALK",
    "WALL":         "WALL",
    "WAY":          "WAY",
    "WAYS":         "WAYS",
    "WELL":         "WL",
    "WELLS":        "WLS",
    "WOODS":        "WOODS",

}

secondary_unit_type_map: dict[str, str] = {
    # — Apartment —
    "APT": "APARTMENT", "APT.": "APARTMENT", "APARTMENT": "APARTMENT",
    "APART": "APARTMENT", "APTMT": "APARTMENT",

    # — Basement —
    "BSMT": "BASEMENT", "BSMT.": "BASEMENT", "BASEMENT": "BASEMENT",
    "BSMENT": "BASEMENT", "BASMT": "BASEMENT",

    # — Building —
    "BLDG": "BUILDING", "BLDG.": "BUILDING", "BUILDING": "BUILDING",
    "BLDNG": "BUILDING",

    # — Department —
    "DEPT": "DEPARTMENT", "DEPT.": "DEPARTMENT", "DEPARTMENT": "DEPARTMENT",

    # — Floor —
    "FL": "FLOOR", "FL.": "FLOOR", "FLR": "FLOOR", "FLOOR": "FLOOR",

    # — Front —
    "FRNT": "FRONT", "FRNT.": "FRONT", "FRONT": "FRONT",

    # — Hangar —
    "HNGR": "HANGAR", "HNGR.": "HANGAR", "HANGER": "HANGAR", "HANGAR": "HANGAR",

    # — Key —
    "KEY": "KEY", "KEY.": "KEY",

    # — Lobby —
    "LBBY": "LOBBY", "LBBY.": "LOBBY", "LOBBY": "LOBBY",

    # — Lot —
    "LOT": "LOT", "LOT.": "LOT",

    # — Lower —
    "LOWR": "LOWER", "LOWR.": "LOWER", "LOWER": "LOWER",

    # — Office —
    "OFC": "OFFICE", "OFC.": "OFFICE", "OFFICE": "OFFICE", "OFFC": "OFFICE",

    # — Penthouse —
    "PH": "PENTHOUSE", "PH.": "PENTHOUSE", "PENTHOUSE": "PENTHOUSE",

    # — Pier —
    "PIER": "PIER", "PIER.": "PIER",

    # — Rear —
    "REAR": "REAR", "REAR.": "REAR",

    # — Room —
    "RM": "ROOM", "RM.": "ROOM", "ROOM": "ROOM",

    # — Side —
    "SIDE": "SIDE", "SIDE.": "SIDE",

    # — Slip —
    "SLIP": "SLIP", "SLIP.": "SLIP",

    # — Space —
    "SPC": "SPACE", "SPC.": "SPACE", "SPACE": "SPACE",

    # — Stop —
    "STOP": "STOP", "STOP.": "STOP",

    # — Suite —
    "STE": "SUITE", "STE.": "SUITE", "SUITE": "SUITE",

    # — Trailer —
    "TRLR": "TRAILER", "TRLR.": "TRAILER", "TRAILER": "TRAILER",

    # — Unit —
    "UNIT": "UNIT", "UNIT.": "UNIT",

    # — Upper —
    "UPPR": "UPPER", "UPPR.": "UPPER", "UPPER": "UPPER", "UPR": "UPPER",
}

secondary_unit_abbrev: dict[str, str] = {
    "APARTMENT":  "APT",
    "BASEMENT":   "BSMT",
    "BUILDING":   "BLDG",
    "DEPARTMENT": "DEPT",
    "FLOOR":      "FL",
    "FRONT":      "FRNT",
    "HANGAR":     "HNGR",
    "KEY":        "KEY",
    "LOBBY":      "LBBY",
    "LOT":        "LOT",
    "LOWER":      "LOWR",
    "OFFICE":     "OFC",
    "PENTHOUSE":  "PH",
    "PIER":       "PIER",
    "REAR":       "REAR",
    "ROOM":       "RM",
    "SIDE":       "SIDE",
    "SLIP":       "SLIP",
    "SPACE":      "SPC",
    "STOP":       "STOP",
    "SUITE":      "STE",
    "TRAILER":    "TRLR",
    "UNIT":       "UNIT",
    "UPPER":      "UPPR",
}

school_city_map = {
    'CINCINNATI CSD':'Cincinnati', 
    'DEER PARK CSD':'Cincinnati',
    'FINNEYTOWN LSD':'Cincinnati',
    'FOREST HILLS LSD':'Cincinnati',
    'INDIAN HILL EVSD':'Indian Hills',
    'LOCKLAND CSD': 'Cincinnati',
    'LOVELAND CSD':'Loveland', 
    'NORTHWEST LSD (HAMILTON CO.)':'Cincinnati',
    'MADEIRA CSD':'Madeira', 
    'MARIEMONT CSD':'Mariemont', 
    'MILFORD CSD':'Milford',
    'MOUNT HEALTHY CSD':'Cincinnati',
    'NORTH COLLEGE HILL CSD':'Cincinnati',
    'NORWOOD CSD':'Norwood', 
    'OAK HILLS LSD':'Cincinnati', 
    'PRINCETON CSD':'Cincinnati',
    'READING CSD':'Reading',
    'SOUTHWEST LSD (HAMILTON CO.)':'Harrison',
    'ST. BERNARD-ELMWOOD PLACE CSD':'Cincinnati',
    'SYCAMORE CSD':'Montgomery', 
    'THREE RIVERS LSD':'Cleves',
    'WINTON WOODS CSD':'Cincinnati',
    'WYOMING CSD':'Wyoming'
}

zip_code_map = {
    'CINCINNATI CSD':[45202, 45203, 45204, 45205, 45206, 45207, 45208, 45209, 45211, 45212, 45213, 45214, 
                      45215, 45216, 45217, 45219, 45220, 45223, 45224, 45225, 45226, 45227, 45229, 45230, 
                      45231, 45232, 45233, 45236, 45237, 45238, 45239, 45243, 45244, 45248], 
    'DEER PARK CSD': [45242,45236],
    'FINNEYTOWN LSD':[45232, 45231,45224, 45216,45215],
    'FOREST HILLS LSD':[45226, 45230, 45244, 45255],
    'INDIAN HILL EVSD':[45111, 45140, 45147, 45150, 45236, 45242, 45243, 45249],
    'LOCKLAND CSD':[45216,45215],
    'LOVELAND CSD':[45140, 45249], 
    'NORTH COLLEGE HILL CSD':[45224, 45231, 45239],
    'NORTHWEST LSD (HAMILTON CO.)':[45002, 45014, 45211, 45223, 45231, 45239, 45240, 45247, 45251, 45252],
    'NORWOOD CSD':[45207, 45208, 45209, 45212, 45229], 
    'MADEIRA CSD':[45227, 45236, 45243], 
    'MARIEMONT CSD':[45174, 45226, 45227, 45243], 
    'MILFORD CSD':[45140, 45147, 45150, 45174, 45243, 45244],
    'MOUNT HEALTHY CSD':[45218, 45231, 45240, 45251],
    'OAK HILLS LSD':[45002, 45051, 45204, 45211, 45233, 45238, 45247, 45248], 
    'PRINCETON CSD': [45040, 45069, 45215, 45240, 45241, 45242, 45246, 45249],
    'READING CSD':[45215, 45236, 45237],
    'SOUTHWEST LSD (HAMILTON CO.)':[45002, 45013, 45030, 45033, 45041, 45052, 45053],
    'ST. BERNARD-ELMWOOD PLACE CSD':[45216, 45217, 45229],
    'SYCAMORE CSD': [45140, 45236, 45241, 45242, 45249], 
    'THREE RIVERS LSD':[45001, 45002, 45052, 45233, 45248],
    'WINTON WOODS CSD': [45215, 45218, 45231, 45240, 45246],
    'WYOMING CSD':[45215, 45216] 
}

direction_map = {
    "N": "NORTH",
    "S": "SOUTH",
    "E": "EAST",
    "W": "WEST",
    "NW": "NORTHWEST",
    "SW": "SOUTHWEST",
    "NE": "NORTHEAST",
    "SE": "SOUTHEAST",
    }

direction_map_tl = {i[1]:i[0] for i in direction_map.items()}

street_prefix_map = {
    # Saint / Sainte / Saints
    "ST":      "SAINT",
    "ST.":     "SAINT",
    "STE":     "SAINTE",
    "STE.":    "SAINTE",
    "STS":     "SAINTS",
    "STS.":    "SAINTS",

    # Spanish-language saints
    "SAN":     "SAN",
    "SANTA":   "SANTA",
    "SANTO":   "SANTO",
    "SANTOS":  "SANTOS",

    # Mount / Mountain
    "MT":      "MOUNT",
    "MT.":     "MOUNT",
    "MTN":     "MOUNTAIN",
    "MTN.":    "MOUNTAIN",

    # Fort
    "FT":      "FORT",
    "FT.":     "FORT",

    # Point
    "PT":      "POINT",
    "PT.":     "POINT",

    # Lake
    "LK":      "LAKE",
    "LK.":     "LAKE",

    # Peak / Park  (choose meaning at runtime if context matters)
    "PK":      "PEAK",
    "PK.":     "PEAK",

    # Port
    "PORT":    "PORT",
    "PRT":     "PORT",
}

apt_head_works = {
    "#",
    "APT", 
    "UNIT", 
    "STE", 
    "SUITE", 
    "ROOM", 
    "RM"
    }

In [3]:
import os
from pathlib import Path
from typing import Union
import pandas as pd

def get_file_path(base_dir: str, data_type: str, filename: str) -> Path:
    """
    Constructs a file path by combining base directory, data type subfolder, and filename.

    Args:
        base_dir (str): Base directory for saving the file.
        data_type (str): Subdirectory like 'raw' or 'processed'.
        filename (str): Name of the output file.

    Returns:
        Path: A Path object representing the complete file path.
    """
    return Path(base_dir) / "data" / data_type / filename

In [4]:
# API endpoint and cache location
CACHE_PATH = "data/processed/"

# Load cache from disk if it exists
def load_cache_from_disk(filepath=CACHE_PATH) -> dict:
    """
    Load previously saved geocoding results from a JSON file.

    Args:
        filepath (str): Path to the cache JSON file.

    Returns:
        dict: Dictionary mapping parcel numbers to geocoding results.
    """
    if os.path.exists(filepath):
        with open(filepath, "r") as f:
            return json.load(f)
    return {}

# Save updated cache to disk
def save_cache_to_disk(cache: dict, filepath=CACHE_PATH):
    """
    Save geocoding cache to disk as a JSON file.

    Args:
        cache (dict): Dictionary containing geocoded parcel data.
        filepath (str): File path to save the cache.
    """
    with open(filepath, "w") as f:
        json.dump(cache, f)

In [5]:
def hyphen_space_replace(address):
        try:
            if re.search(r"\s*-\s*",address):
                address = re.sub(r"\s*-\s*", "-", address)
                return address
            else:
                 return address
        except TypeError:
             return None

In [7]:
all_real_estate['address'] = all_real_estate['address'].apply(hyphen_space_replace)

In [8]:
all_real_estate['address'] = all_real_estate['address'].str.replace("  "," ")

In [10]:
os.chdir(r'C:\Users\markd\hamilton-county-homes-scraper-main\data\raw\home_sales')
all_real_estate.to_csv('homes_all.csv')

In [12]:
gold_path = r'C:\Users\markd\hamilton-county-homes-scraper-main\notebooks\GoldData.csv'
gold = pd.read_csv(gold_path)

In [46]:
from sklearn.model_selection import train_test_split
y = gold['tags']
X = gold[gold.columns.drop('tags')]

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=.1, random_state=42)

0 : AddressNumberPrefix
1 : AddressNumber
2 : AddressNumberSuffix
3 : StreetNamePreModifier
4 : StreetNamePreDirectional
5 : StreetNamePreType
6 : StreetName
7 : StreetNamePostType
8 : StreetNamePostDirectional
9 : SubaddressType
10 : SubaddressIdentifier
11 : BuildingName
12 : OccupancyType
13 : OccupancyIdentifier
14 : CornerOf
15 : LandmarkName
16 : PlaceName
17 : StateName
18 : ZipCode
19 : USPSBoxType
20 : USPSBoxID
21 : USPSBoxGroupType
22 : USPSBoxGroupID
23 : IntersectionSeparator
24 : Recipient
25 : NotAddress7

In [55]:
gold.tags.value_counts()

tags
AddressNumber|OccupancyIdentifier|StreetName|StreetNamePostType|EOS                                                   1167
AddressNumber|OccupancyIdentifier|StreetNamePreDirectional|StreetName|StreetNamePostType|EOS                          1019
AddressNumber|StreetName|StreetName|StreetNamePostType|EOS                                                             381
StreetName|StreetNamePostType|EOS                                                                                      211
AddressNumber|OccupancyIdentifier|StreetName|StreetName|StreetNamePostType|EOS                                         177
AddressNumber|AddressNumber|StreetName|StreetNamePostType|EOS                                                          159
AddressNumber|OccupancyIdentifier|StreetNamePreDirectional|StreetName|StreetName|StreetNamePostType|EOS                125
StreetName|StreetName|StreetNamePostType|EOS                                                                            50
AddressNumb

In [47]:
train = X_train.merge(y_train, right_index=True, left_index=True)

In [51]:
for j in X_test['address']:
    print(j)

15 106 W FOURTH ST
11025 REED HARTMAN HW
2401 7C INGLESIDE AVE
4333 WESTWOOD NORTHERN BV
15 413 W FOURTH ST
170 3 PALISADES POINTE
2412 6C INGLESIDE AVE
1308 1310 WILLIAM H TAFT RD
4229 WESTWOOD NORTHERN BV
3693 45 W GALBRAITH RD
8019 G1 OAKBRIDGE WY
4921 304 N ARBOR WOODS CT
1440 605 W KEMPER RD
11101 E ALLENHURST BV
3707 P62 W GALBRAITH RD
1525 WHITEWATER TRAILS BV
7987 G4 OAKBRIDGE WY
3110 WESTWOOD NORTHERN BV
26 301 E SIXTH ST
2200 1107 VICTORY PW
1201 1042 EDGECLIFF PL
3687 J37 W GALBRAITH RD
3041 1C PRESERVE LN
3503 10 WEST FORK RD
4 202 W FOURTEENTH ST
1087 120 WITTSHIRE LN
480 G3 HERITAGE SQUARE
3191 2D PRESERVE LN
165 G5 TIMEPIECE LN
8231 310 CORNELL RD
810 1108 MATSON PL
4510 G20 CLEARWATER PL
GLENDALE MILFORD RD
530 7 S STATE ST
20 403 W TWELFTH ST
CLOVERHILL TE
3608 65 WESTWOOD NORTHERN BV
2000 WESTWOOD NORTHERN BV
323 1N W FIFTH ST
FLORALRUN CT
415 8D BOND PL
KANAUGA CT
353 606 W FOURTH ST
5184 118 S EAGLESNEST DR
NIGHTINGALE CT
3097 WESTWOOD NORTHERN BV
1201 1181 EDGECLIF

In [14]:
gold['tags'] = gold['tags'].str.replace('|PreDirectional|','|StreetNamePreDirectional|')

In [15]:
def tokens_tags_to_xml(tokens, tags):
    """
    Convert two equal-length lists (tokens, tags) into an XML address line.
    Assumes one 'EOS' marker per record that should be ignored.
    """
    parts = [
        f"<{tag}>{token}</{tag}>"
        for token, tag in zip(tokens, tags)
        if tag != "EOS"          # drop the sentinel
    ]
    return "  <AddressString>" + " ".join(parts) + "</AddressString>"

def collection_to_xml(records):
    """
    records: iterable of (tokens, tags) tuples/lists
    """
    lines = [tokens_tags_to_xml(toks, tgs) for toks, tgs in records]
    return "<AddressCollection>\n" + "\n".join(lines) + "\n</AddressCollection>"

# def create_xml(df):
#     tags = df['tags'].str.split("|")
#     tokens = df['tokens'].str.split("|")
#     for 
#     records = collection_to_xml((tgs,toks))



In [16]:
pd.DataFrame(gold['tags'].str.split("|"))

Unnamed: 0,tags
0,"[AddressNumber, StreetNamePreDirectional, Stre..."
1,"[AddressNumber, AddressNumber, StreetNamePreDi..."
2,"[AddressNumber, StreetName, StreetName, Street..."
3,"[AddressNumber, AddressNumber, StreetName, Str..."
4,"[AddressNumber, OccupancyIdentifier, StreetNam..."
...,...
3449,"[StreetName, StreetNamePostType, EOS]"
3450,"[StreetName, StreetNamePostType, EOS]"
3451,"[StreetName, StreetNamePostType, EOS]"
3452,"[StreetName, StreetName, StreetNamePostType, EOS]"


In [52]:
all_records = [(toks, tgs) for toks, tgs in zip(train['tokens'].str.split("|"), train['tags'].str.split("|"))]



In [53]:
xml_lines = collection_to_xml(all_records)

In [54]:
print(xml_lines)

<AddressCollection>
  <AddressString><AddressNumber>207</AddressNumber> <OccupancyIdentifier>8A</OccupancyIdentifier> <StreetNamePreDirectional>E</StreetNamePreDirectional> <StreetName>UNIVERSITY</StreetName> <StreetNamePostType>AVE</StreetNamePostType></AddressString>
  <AddressString><AddressNumber>860</AddressNumber> <OccupancyIdentifier>G8</OccupancyIdentifier> <StreetName>SOUTHMEADOW</StreetName> <StreetNamePostType>CR</StreetNamePostType></AddressString>
  <AddressString><AddressNumber>1617</AddressNumber> <OccupancyIdentifier>405</OccupancyIdentifier> <StreetNamePreDirectional>E</StreetNamePreDirectional> <StreetName>MCMILLAN</StreetName> <StreetNamePostType>AVE</StreetNamePostType></AddressString>
  <AddressString><AddressNumber>11076</AddressNumber> <StreetNamePreDirectional>E</StreetNamePreDirectional> <StreetName>ALLENHURST</StreetName> <StreetNamePostType>BV</StreetNamePostType></AddressString>
  <AddressString><AddressNumber>1440</AddressNumber> <OccupancyIdentifier>1015</

In [None]:
(gold['tags'].str.split("|"),gold['tokens'].str.split("|"))

0       [AddressNumber, PreDirectional, StreetName, St...
1       [AddressNumber, AddressNumber, PreDirectional,...
2       [AddressNumber, StreetName, StreetName, Street...
3       [AddressNumber, AddressNumber, StreetName, Str...
4       [AddressNumber, OccupancyIdentifier, StreetNam...
                              ...                        
3449                [StreetName, StreetNamePostType, EOS]
3450                [StreetName, StreetNamePostType, EOS]
3451                [StreetName, StreetNamePostType, EOS]
3452    [StreetName, StreetName, StreetNamePostType, EOS]
3453                [StreetName, StreetNamePostType, EOS]
Name: tags, Length: 3454, dtype: object

In [None]:
import re
from dataclasses import dataclass
from typing import Optional, Tuple, List

import pandas as pd
import usaddress
from word2number import w2n

from hch_scraper.config.mappings.street_types import (
    street_suffix_normalization_map,
    direction_normalization_map,
)
from hch_scraper.config.mappings.secondary_units import (
    secondary_unit_normalization_map,
)

# ─────────────────────────────────────────────────────────────────────────────
# Pre-compiled regexes
# ─────────────────────────────────────────────────────────────────────────────

HYPHEN_RE   = re.compile(r"\b(\d+)\s*-\s*(\d+)\b")
FRACTION_RE = re.compile(r"\b(\d+)\s+(\d+)/(\d+)\b")
ORDINAL_RE  = re.compile(r"\b\d+(?:st|nd|rd|th)\b", re.IGNORECASE)

EXTRA_INFO_RE = re.compile(
    r"\s*\([A-Za-z]+\)\s*",
    re.IGNORECASE | re.VERBOSE,
)

_NUMERIC_RE = re.compile(r"^\d+$")

"""
Address Parsing and Enrichment Utility

This module provides functionality to extract structured address components (house number, street, apartment)
from raw address strings using spaCy, and enrich them with ZIP code and city information using Hamilton County's
centerline and ZIP code datasets.

Key Features:
- Parses raw addresses into structured components using NLP.

"""       

# ─────────────────────────────────────────────────────────────────────────────
# Dataclass for parsed addresses
# ─────────────────────────────────────────────────────────────────────────────

import re
from dataclasses import dataclass
from typing import Optional, Tuple, List

import pandas as pd
import usaddress
from word2number import w2n

from hch_scraper.config.mappings.street_types import (
    street_suffix_normalization_map,
    direction_normalization_map,
)
from hch_scraper.config.mappings.secondary_units import (
    secondary_unit_normalization_map,
)

# ─────────────────────────────────────────────────────────────────────────────
# Pre-compiled regexes
# ─────────────────────────────────────────────────────────────────────────────

HYPHEN_RE   = re.compile(r"\b(\d+)\s*-\s*(\d+)\b")
FRACTION_RE = re.compile(r"\b(\d+)\s+(\d+)/(\d+)\b")
ORDINAL_RE  = re.compile(r"\b\d+(?:st|nd|rd|th)\b", re.IGNORECASE)

EXTRA_INFO_RE = re.compile(
    r"\s*\([A-Za-z]+\)\s*",
    re.IGNORECASE | re.VERBOSE,
)

_NUMERIC_RE = re.compile(r"^\d+$")

# ─────────────────────────────────────────────────────────────────────────────
# Dataclass for parsed addresses
# ─────────────────────────────────────────────────────────────────────────────

@dataclass(slots=True, frozen=True)
class AddressParts:
    ParcelNumber:               Optional[str] = None
    Recipient:                  Optional[str] = None
    AddressNumber:              Optional[str] = None
    AddressNumberPrefix:        Optional[str] = None
    AddressNumberSuffix:        Optional[str] = None
    StreetName:                 Optional[str] = None
    StreetNamePreDirectional:   Optional[str] = None
    StreetNamePreModifier:      Optional[str] = None
    StreetNamePreType:          Optional[str] = None
    StreetNamePostDirectional:  Optional[str] = None
    StreetNamePostModifier:     Optional[str] = None
    StreetNamePostType:         Optional[str] = None
    CornerOf:                   Optional[str] = None
    IntersectionSeparator:      Optional[str] = None
    LandmarkName:               Optional[str] = None
    USPSBoxGroupID:             Optional[str] = None
    USPSBoxGroupType:           Optional[str] = None
    USPSUSPSBoxID:              Optional[str] = None
    USPSBoxType:                Optional[str] = None
    BuildingName:               Optional[str] = None
    OccupancyType:              Optional[str] = None
    OccupancyIdentifier:        Optional[str] = None
    SubaddressIdentifier:       Optional[str] = None
    SubaddressType:             Optional[str] = None
    PlaceName:                  Optional[str] = None
    StateName:                  Optional[str] = None
    AddressType:                Optional[str] = None


EMPTY_PARSE = AddressParts()  # optional convenience

# ─────────────────────────────────────────────────────────────────────────────
# Pre-clean + tagging
# ─────────────────────────────────────────────────────────────────────────────

def _preclean(addr: str) -> str:
    """
    Light, non-destructive cleanup before usaddress:
    - trim
    - collapse whitespace
    - remove simple parenthetical tags
    - normalize fractions  '915 1/2' -> '915.5'
    """
    if not isinstance(addr, str):
        return ""

    s = addr.strip()
    s = EXTRA_INFO_RE.sub(" ", s)
    s = re.sub(r"\s+", " ", s)
    s = FRACTION_RE.sub(_collapse_fraction, s)
    return s


def tag_address(
    row: pd.Series,
    addr_col: str,
    parcel_col: str,
) -> Tuple[Optional[AddressParts], List[str]]:
    """
    row         : one DataFrame row (Series)
    addr_col    : name of the address column in that row
    parcel_col  : name of the parcel-number column
    """
    issues: list[str] = []

    addr_raw = row[addr_col]

    if not isinstance(addr_raw, str) or not addr_raw.strip():
        issues.append("Empty or non-string input")
        return None, issues

    addr_clean = _preclean(addr_raw)
    parcel_id = row[parcel_col]

    try:
        usparsed, _ = usaddress.tag(addr_clean)
    except usaddress.RepeatedLabelError as err:
        issues.append(f"Repeated label: {err}")
        return None, issues
    except Exception as err:
        issues.append(str(err))
        return None, issues

    parts = AddressParts(
        ParcelNumber               = parcel_id,
        Recipient                  = usparsed.get("Recipient"),
        AddressNumber              = usparsed.get("AddressNumber"),
        AddressNumberPrefix        = usparsed.get("AddressNumberPrefix"),
        AddressNumberSuffix        = usparsed.get("AddressNumberSuffix"),
        StreetName                 = usparsed.get("StreetName"),
        StreetNamePreDirectional   = usparsed.get("StreetNamePreDirectional"),
        StreetNamePreModifier      = usparsed.get("StreetNamePreModifier"),
        StreetNamePreType          = usparsed.get("StreetNamePreType"),
        StreetNamePostDirectional  = usparsed.get("StreetNamePostDirectional"),
        StreetNamePostModifier     = usparsed.get("StreetNamePostModifier"),
        StreetNamePostType         = usparsed.get("StreetNamePostType"),
        CornerOf                   = usparsed.get("CornerOf"),
        IntersectionSeparator      = usparsed.get("IntersectionSeparator"),
        LandmarkName               = usparsed.get("LandmarkName"),
        USPSBoxGroupID             = usparsed.get("USPSBoxGroupID"),
        USPSBoxGroupType           = usparsed.get("USPSBoxGroupType"),
        USPSUSPSBoxID              = usparsed.get("USPSUSPSBoxID"),
        USPSBoxType                = usparsed.get("USPSBoxType"),
        BuildingName               = usparsed.get("BuildingName"),
        OccupancyType              = usparsed.get("OccupancyType"),
        OccupancyIdentifier        = usparsed.get("OccupancyIdentifier"),
        SubaddressIdentifier       = usparsed.get("SubaddressIdentifier"),
        SubaddressType             = usparsed.get("SubaddressType"),
        PlaceName                  = usparsed.get("PlaceName"),
        StateName                  = usparsed.get("StateName"),
        AddressType                = usparsed.get("AddressType"),
    )
    return parts, issues

# ─────────────────────────────────────────────────────────────────────────────
# Normalization (USPS abbreviations, numeric house number, etc.)
# ─────────────────────────────────────────────────────────────────────────────

def normalize_address_parts(parts: AddressParts) -> AddressParts:
    """
    Apply USPS-style normalization to a parsed AddressParts:
    - AddressNumber -> numeric where possible
    - Directions -> N, S, E, W, NE, etc.
    - Street suffix -> ST, AVE, RD, ...
    - Unit type -> APT, STE, UNIT, ...
    - Street/city/state to upper-case
    """
    data = parts.__dict__.copy()

    # House number normalization
    data["AddressNumber"] = _coerce_address_number(parts.AddressNumber)

    # Directions (N, S, E, W...)
    if parts.StreetNamePreDirectional:
        raw = parts.StreetNamePreDirectional.upper().rstrip(".")
        data["StreetNamePreDirectional"] = direction_normalization_map.get(raw, raw)

    if parts.StreetNamePostDirectional:
        raw = parts.StreetNamePostDirectional.upper().rstrip(".")
        data["StreetNamePostDirectional"] = direction_normalization_map.get(raw, raw)

    # Street Suffix (ST, AVE, RD, etc.)
    if parts.StreetNamePostType:
        raw = parts.StreetNamePostType.upper().rstrip(".")
        data["StreetNamePostType"] = street_suffix_normalization_map.get(raw, raw)

    # Unit type (APT, STE, UNIT)
    if parts.OccupancyType:
        raw = parts.OccupancyType.upper().rstrip(".")
        data["OccupancyType"] = secondary_unit_normalization_map.get(raw, raw)

    # Normalize street name capitalization
    if parts.StreetName:
        data["StreetName"] = parts.StreetName.upper()

    # Normalize city and state
    if parts.PlaceName:
        data["PlaceName"] = parts.PlaceName.upper()
    if parts.StateName:
        data["StateName"] = parts.StateName.upper()

    return AddressParts(**data)

# ─────────────────────────────────────────────────────────────────────────────
# Helpers
# ─────────────────────────────────────────────────────────────────────────────

def _collapse_fraction(m: re.Match) -> str:
    whole, num, den = m.groups()
    value = int(whole) + int(num) / int(den)      # 915 + 1/2 → 915.5
    return f"{value:g}".rstrip(".")


def _coerce_address_number(value: Optional[str]) -> Optional[str]:
    """
    Return a strictly numeric house number, or the original value if we
    can’t make a safe conversion. Handles 'one hundred twenty-three' → '123'.
    """
    if value is not None and HYPHEN_RE.match(value):
        value = HYPHEN_RE.sub(lambda m: m.group(1), value)

    if not value or _NUMERIC_RE.match(value):
        return value  # already OK or empty

    try:
        numeric = w2n.word_to_num(value.lower())
        return str(numeric)
    except (ValueError, TypeError):
        return value


ModuleNotFoundError: No module named 'hch_scraper'

In [None]:
# Constructing the test cases DataFrame
test_cases = [
    (1, "123 Main St", "canonical number + street + suffix"),
    (2, "456 N Maple Ave", "leading directional prefix"),
    (3, "6305 State Route 48", "long “Route” style suffix"),
    (4, "987 Elm St Apt 2B", "APT head-word with alphanum ID"),
    (5, "555 Oak Rd #12", "hash-sign apartment"),
    (6, "700 Commerce Blvd Suite 210", "SUITE keyword"),
    (7, "415 Roosevelt Blvd RM 203", "RM keyword"),
    (8, "456 W Center St Apt #3C", "compound direction + hash after APT"),
    (9, "344-346 Elm Street", "hyphen range – should keep first num"),
    (10, "915 1/2 Walnut Ave", "space-slash fraction"),
    (11, "915½ Walnut Ave", "Unicode ½ character"),
    (12, "One Park Place", "spelled-out house number"),
    (13, "123 THIRTY-SECOND ST", "spelled-out ordinal in street name"),
    (14, "122 6th Avenue", "numeric ordinal token inside street name"),
    (15, "25 St John St", '"St" prefix meaning Saint'),
    (16, "50 Santa Maria Ave", "Spanish saint word"),
    (17, "64 Mt Pleasant Rd", "Mt → Mount"),
    (18, "341 Ft Jackson Rd", "Ft → Fort"),
    (19, "99 PK Trail", "PK abbreviation (Peak / Park ambiguity)"),
    (20, "789 East 5th St", "spelled-out directional before number-street"),
    (21, "1200 S Mountain View AVE", "trailing AVE, leading S dir"),
    (22, "527 North-East 3rd Street", "compound spelled direction as part of name"),
    (23, "1001 Pine St B", "single-letter apartment without keyword"),
    (24, "121A Maple Lane", "alphanum right in house-number slot"),
    (25, "123 Broadway (Kroger)", "parenthetical trailer to be stripped"),
    (26, "330 1/2 E 14th St", "direction + fraction + ordinal street"),
    (27, "89 Port Washington Ct", "“Port” prefix"),
    (28, "225 NE 45th St Ste 4", "compound direction + suite"),
    (29, "742 10th St", "street name is ordinal number"),
    (30, "1000 Lakeshore Dr.", "suffix with trailing period"),
]

df_test_addresses = pd.DataFrame(test_cases, columns=["case_number", "raw_address", "feature"])

In [7]:
# pip install seqeval usaddress tqdm
from seqeval.metrics import classification_report, precision_score, recall_score, f1_score
import usaddress, json, tqdm
path = r"C:\Users\markd\hamilton-county-homes-scraper-main\data\raw\home_sales\homes_all.csv"
def yield_gold_and_pred(path):
    """
    Assumes each line in `path` is JSON: {"tokens": ["912", "1/2", ...],
                                          "tags":   ["B-AddressNumber", "I-AddressNumber", ...]}
    """
    with open(path) as fh:
        for rec in fh:
            print(rec)
            r = json.loads(rec)
            gold_tags = r["tags"]
            pred_tags = [tag for _, tag in usaddress.tag(" ".join(r["tokens"]))[0]]
            yield gold_tags, pred_tags

In [47]:
parts_usaddress = all_real_estate.apply(
    tag_address,
    axis=1,
    args=("address", "parcel_number")
)

In [48]:
parts_usaddress

0         AddressParts(ParcelNumber='001-0001-0001-00', ...
1         AddressParts(ParcelNumber='001-0001-0001-00', ...
2         AddressParts(ParcelNumber='001-0001-0001-00', ...
3         AddressParts(ParcelNumber='001-0001-0001-00', ...
4         AddressParts(ParcelNumber='001-0001-0002-00', ...
                                ...                        
727394    AddressParts(ParcelNumber='671-0030-0219-00', ...
727395    AddressParts(ParcelNumber='671-0030-0219-00', ...
727396    AddressParts(ParcelNumber='671-0030-0219-00', ...
727397    AddressParts(ParcelNumber='671-0030-0220-00', ...
727398    AddressParts(ParcelNumber='671-0030-0220-00', ...
Length: 727399, dtype: object

In [49]:
import dataclasses as dc
from dataclasses import asdict
import pandas as pd

bad = []       # collect indices of non-dataclass rows
records = []

for i, tok in parts_usaddress.items():          # parts is a Series
    if dc.is_dataclass(tok):
        records.append(asdict(tok))
    else:
        bad.append((i, type(tok).__name__, tok))

if bad:                                # let’s see what broke
    print("⚠️  non-dataclass rows:")
    for idx, typ, val in bad:
        print(f"  row {idx}: {typ} → {val!r}")

address_df = pd.DataFrame.from_records(records)

⚠️  non-dataclass rows:
  row 18654: tuple → (None, ['Empty or non-string input'])
  row 25861: tuple → (None, ['Empty or non-string input'])
  row 25862: tuple → (None, ['Empty or non-string input'])
  row 25864: tuple → (None, ['Empty or non-string input'])
  row 25865: tuple → (None, ['Empty or non-string input'])
  row 25867: tuple → (None, ['Empty or non-string input'])
  row 57641: tuple → (None, ['Empty or non-string input'])
  row 57655: tuple → (None, ['Empty or non-string input'])
  row 58426: tuple → (None, ['Empty or non-string input'])
  row 58427: tuple → (None, ['Empty or non-string input'])
  row 77195: tuple → (None, ["Repeated label: \nERROR: Unable to tag this string because more than one area of the string has the same label\n\nORIGINAL STRING:  713 #1 E MCMILLAN AVE\nPARSED TOKENS:    [('713', 'AddressNumber'), ('#', 'AddressNumberPrefix'), ('1', 'AddressNumber'), ('E', 'StreetNamePreDirectional'), ('MCMILLAN', 'StreetName'), ('AVE', 'StreetNamePostType')]\nUNCERTA

In [50]:
address_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 727028 entries, 0 to 727027
Data columns (total 27 columns):
 #   Column                     Non-Null Count   Dtype 
---  ------                     --------------   ----- 
 0   ParcelNumber               727028 non-null  object
 1   Recipient                  148 non-null     object
 2   AddressNumber              706608 non-null  object
 3   AddressNumberPrefix        0 non-null       object
 4   AddressNumberSuffix        1942 non-null    object
 5   StreetName                 725654 non-null  object
 6   StreetNamePreDirectional   35625 non-null   object
 7   StreetNamePreModifier      11 non-null      object
 8   StreetNamePreType          1080 non-null    object
 9   StreetNamePostDirectional  251 non-null     object
 10  StreetNamePostModifier     0 non-null       object
 11  StreetNamePostType         721160 non-null  object
 12  CornerOf                   189 non-null     object
 13  IntersectionSeparator      33 non-null      

In [51]:
address_df.loc[(~address_df['PlaceName'].isna()),'IssueType'] = 'StateNameIssue'

In [52]:
mask = ((~address_df['PlaceName'].isna()) & (address_df.StateName.isna()))
address_df.loc[mask,'IssueType'] =  'PlaceNameTypeIssue'

In [53]:
mask = ((~address_df['SubaddressIdentifier'].isna()))
address_df.loc[mask,'IssueType'] = 'SubaddressIdentifierTypeIssue'

In [54]:
mask = ((~address_df['OccupancyIdentifier'].isna())
        &(address_df['OccupancyIdentifier'].isin(['BV','3 LN','R','# 1A E'])))
address_df.loc[mask,'IssueType'] = 'OccupancyIdentifierTypeIssue'

In [55]:
mask = ((~address_df['BuildingName'].isna()))
address_df.loc[mask,"IssueType"] = "BuildingNameTypeIssue"

In [56]:
mask = ((~address_df['USPSBoxType'].isna()))
address_df.loc[mask,"IssueType"] = "USPSBoxTypeIssue"

In [57]:
mask = ((~address_df['LandmarkName'].isna()))
address_df.loc[mask,"IssueType"] = "LandmarkNameTypeIssue"

In [58]:
mask = ((~address_df['IntersectionSeparator'].isna())
        &(address_df['StreetName']!='GLENVALLEY'))
address_df.loc[mask,"IssueType"] = "IntersectionSeparatorTypeIssue"

In [59]:
mask = ((~address_df['CornerOf'].isna()))
address_df.loc[mask, "IssueType"] = "CornerOfIssueType"

In [60]:
mask = ((~address_df['StreetNamePreDirectional'].isna())
        &~((address_df['StreetNamePreDirectional'].isin(direction_map))
           |(address_df['StreetNamePreDirectional'].isin(direction_map_tl))))
address_df.loc[mask,'IssueType'] = "StreetNamePreDirectionalTypeIssue"

In [61]:
address_df.loc[(~address_df['StreetNamePreModifier'].isna()),'IssueType'] = 'StreetNamePreModifierIssueType'

In [62]:
address_df.loc[(address_df['AddressNumberSuffix'].str.contains("-")==True),'IssueType'] = 'AddressNumberPrefixIssue'

In [63]:
address_df.loc[address_df['OccupancyType']=='INTERSTATE','IssueType'] = 'OccupancyTypeIssue'

In [64]:
address_num_issues = ['NEW','GLENDALE','ST','DRY','MT','RED','CITYSCAPE','INDIAN','U','LAUREL','IVY','LOVELAND']

In [65]:
address_df.loc[address_df['AddressNumber'].isin(address_num_issues),'IssueType'] = 'AddressNumberTypeIssue'

In [66]:
address_df.loc[~address_df.Recipient.isna(), 'IssueType'] = 'RecipientTypeIssue'

In [67]:
address_df.loc[address_df['AddressNumber'].str.contains(" ")==True,'IssueType'] = 'AddressNumberTypeIssue'

In [68]:
issue_parcels = address_df[~address_df.IssueType.isna()]['ParcelNumber'].unique()

In [69]:
test_cases_df = all_real_estate[all_real_estate.parcel_number.isin(issue_parcels)][['parcel_number','address']].drop_duplicates().copy()

In [None]:
test_cases_df = test_cases_df.set_index('parcel_number')

In [81]:
test_cases_df = test_cases_df['address'].drop_duplicates().reset_index()

In [None]:
test_cases_df['address'] = test_cases_df['address'].apply(hyphen_space_replace)

In [94]:
test_cases_df["tokens"] = test_cases_df['address'].str.replace(" ","|").copy()
test_cases_df["tokens"] += "|_EOS_"

In [95]:
test_cases_df["tags"] = "AddressNumber|StreetName|StreetNamePostType|EOS"

In [96]:
test_cases_df.sort_values('address')['address'][271]

'1009-1013-019 E MCMILLAN AVE'

In [97]:
test_cases_df.to_csv('GoldData.csv')

In [None]:
AddressNumber|StreetName|PreDirectional|StreetName|StreetNamePostType|EOS



StreetName|StreetNamePostType|EOS
AddressNumber|AddressNumber|StreetName|StreetNamePostType|EOS
AddressNumber|StreetName|StreetNamePostType|EOS
StreetName|StreetName|StreetNamePostType|EOS
AddressNumber|AddressNumber|StreetName|StreetNamePostType|EOS
AddressNumber|OccupancyIdentifier|StreetName|StreetNamePostType|EOS
AddressNumber|OccupancyIdentifier|StreetNamePreType|StreetName|StreetNamePostType|EOS
    ParcelNumber:               Optional[str] = None
    Recipient:                  Optional[str] = None
    AddressNumber:              Optional[str] = None
    AddressNumberPrefix:        Optional[str] = None
    AddressNumberSuffix:        Optional[str] = None
    StreetName:                 Optional[str] = None
    StreetNamePreDirectional:   Optional[str] = None
    StreetNamePreModifier:      Optional[str] = None
    StreetNamePreType:          Optional[str] = None
    StreetNamePostDirectional:  Optional[str] = None
    StreetNamePostModifier:     Optional[str] = None
    StreetNamePostType:         Optional[str] = None
    CornerOf:                   Optional[str] = None
    IntersectionSeparator:      Optional[str] = None
    LandmarkName:               Optional[str] = None
    USPSBoxGroupID:             Optional[str] = None
    USPSBoxGroupType:           Optional[str] = None
    USPSUSPSBoxID:              Optional[str] = None
    USPSBoxType:                Optional[str] = None
    BuildingName:               Optional[str] = None
    OccupancyType:              Optional[str] = None
    OccupancyIdentifier:        Optional[str] = None
    SubaddressIdentifier:       Optional[str] = None
    SubaddressType:             Optional[str] = None



In [26]:
all_real_estate[all_real_estate.address.str.contains('ROSELAND')==True].address.drop_duplicates().to_list()

['5419 ROSELAND MOUND CINCINNATI, OH 44444',
 '5421 ROSELAND MOUND CINCINNATI, OH 44444',
 '5431 ROSELAND MOUND CINCINNATI, OH 44444',
 '5433 ROSELAND MOUND CINCINNATI, OH 44444',
 '5435 ROSELAND MOUND CINCINNATI, OH 44444',
 '5434 ROSELAND MOUND CINCINNATI, OH 44444',
 '5428 ROSELAND MOUND CINCINNATI, OH 44444',
 '5426 ROSELAND MOUND CINCINNATI, OH 44444',
 '5416 ROSELAND MOUND CINCINNATI, OH 44444',
 '5412 ROSELAND MOUND CINCINNATI, OH 44444',
 '5410 ROSELAND MOUND CINCINNATI, OH 44444',
 '5406 ROSELAND MOUND CINCINNATI, OH 44444']

In [None]:
all_real_estate["st_dir_name"] = all_real_estate["st_dir"].map(direction_map_tl)

In [None]:
all_real_estate["full_st_name"] = (all_real_estate['st_dir_name']
                                    .str.cat(all_real_estate[["street_name", "st_suffix"]], sep=" ", na_rep="")      
                                    .str.replace(r"\s+", " ", regex=True)                      
                                    .str.strip()   )

In [None]:
center_path = r'C:\Users\markd\hamilton-county-homes-scraper-main\data\raw\downloads\Countywide_Street_Centerlines.csv'
center = pd.read_csv(center_path)

  center = pd.read_csv(center_path)


In [None]:
grouped = center[center["CLASS"].isin([2,3,4,5])].groupby(["STREET_NORM"])

In [None]:
def build_interval_lookup(grouped: pd.core.groupby.GroupBy) -> dict[str, tuple[pd.IntervalIndex, np.ndarray, np.ndarray]]:
    """
    Pre-compute two IntervalIndexes (left & right) **per street**.

    Returns
    -------
    {
        "MAIN ST": (left_intv, left_zip, left_parity),
        ...
    }
    """
    lookups = {}

    for street, segs in grouped:
        street = street[0] if isinstance(street, tuple) else street
        left_intv   = _make_interval(segs["L_F_ADD"], segs["L_T_ADD"])
        right_intv  = _make_interval(segs["R_F_ADD"], segs["R_T_ADD"])

        left_parity  = (segs["L_F_ADD"].astype("Int64") & 1).to_numpy()
        right_parity = (segs["R_F_ADD"].astype("Int64") & 1).to_numpy()

        lookups[street] = (
            left_intv,
            segs["ZIPL"].to_numpy(),
            left_parity,
            right_intv,
            segs["ZIPR"].to_numpy(),
            right_parity,
        )

    return lookups

def _make_interval(start: pd.Series, end: pd.Series) -> pd.IntervalIndex:
    try:
        # cast to nullable Int so bitwise ops work and NAs are preserved
        s = start.astype("Int64")
        e = end.astype("Int64")

        lower = np.minimum(s, e)   # element-wise
        upper = np.maximum(s, e)
        return pd.IntervalIndex.from_arrays(lower, upper, closed="both")
    except ValueError:
        return None

In [None]:
lookups = build_interval_lookup(grouped=grouped)

In [None]:
LookupT = Tuple[
    pd.IntervalIndex, np.ndarray, np.ndarray,   # L_intv, L_zip, L_par
    pd.IntervalIndex, np.ndarray, np.ndarray    # R_intv, R_zip, R_par
]

def fast_zip(
    street:        pd.Series,          # e.g. "MAIN ST"
    addr_num:      pd.Series,          # nullable Int64
    lookups:       Dict[str, LookupT],
    ) -> pd.Series:
    """
    Vectorised ZIP lookup for a whole dataframe column.

    Parameters
    ----------
    street       : Series[str]        – street names (same length as addr_num)
    addr_num     : Series[Int64]      – nullable house numbers
    lookups      : {street: (L_intv, L_zip, L_par, R_intv, R_zip, R_par)}

    Returns
    -------
    Series[Int64]   – one ZIP (or <NA>) per input row
    """

    out = pd.array(np.repeat(pd.NA, len(street)), dtype="Int64")

    for st, (L_intv, L_zip, L_par,
             R_intv, R_zip, R_par) in lookups.items():

        rows = (street == st) & addr_num.notna()    # ← subset just once
        if not rows.any():
            continue
        nums = addr_num[rows].fillna(0).to_numpy("int64")    # 1-D int64 array
        out[rows] = _find_zips(
            nums, L_intv, L_par, L_zip,
                  R_intv, R_par, R_zip
        )
    return pd.Series(out, index=addr_num.index, name="ZIP")


def _find_zips(nums,
               L_intv, L_par, L_zip,
               R_intv, R_par, R_zip):

    # 1. Interval look-ups (vectorised)
    posL, okL = _safe_pos(L_intv, nums)
    posR, okR = _safe_pos(R_intv, nums)
    
    parity = nums & 1       # 0 = even, 1 = odd
    goodL  = okL & (L_par[posL] == parity)
    goodR  = okR & (R_par[posR] == parity)

    res = pd.array(np.repeat(pd.NA, nums.size), dtype="Int64")
    if goodL.any():
        res[goodL] = L_zip[posL[goodL]]
    needs_R = res.isna() & goodR
    if needs_R.any():
        res[needs_R] = R_zip[posR[needs_R]]
    return res

def _safe_pos(intv, nums: np.ndarray):
    """
    Robust interval lookup.

    Returns
    -------
    pos : int64 array   – position in *intv*  (-1 where no match / intv is None)
    ok  : bool  array   – True where pos is valid
    """
    if intv is None:  # ← NEW: handle the None case
        pos = np.full(nums.shape, -1, dtype=np.int64)
        ok  = pos != -1
        return pos, ok
    else:
        pos = intv.get_indexer(nums)
        ok  = pos != -1
        return pos, ok

In [7]:
import re
import os
import pandas as pd
import numpy as np

import difflib
import spacy
from spacy.lang.en import English
from spacy.util import compile_suffix_regex
from dataclasses import dataclass,asdict
from typing import Optional, Dict, Tuple


"""
Address Parsing and Enrichment Utility

This module provides functionality to extract structured address components (house number, street, apartment)
from raw address strings using spaCy, and enrich them with ZIP code and city information using Hamilton County's
centerline and ZIP code datasets.

Key Features:
- Parses raw addresses into structured components using NLP.
- Matches street names with fuzzy logic.
- Maps address ranges to ZIP codes and cities using centerline data.
- Provides an AddressEnricher class for convenient reuse.
"""

nlp = spacy.load("en_core_web_sm")
nlp = English()
suffixes = list(nlp.Defaults.suffixes)

# drop the “digit-unit” suffix pattern
suffixes = [s for s in suffixes if "(?<=[0-9])" not in s]
nlp.tokenizer.suffix_search = compile_suffix_regex(suffixes).search

# Words that spaCy typically tags as numbers but should be treated as part
# of the street name when parsing addresses.  This includes cardinal and
# ordinal forms so that streets like "THIRTY-SECOND" are not mistaken for
# house numbers.
SPELLED_OUT_NUMBERS = {
    # cardinal numbers
    "zero", "one", "two", "three", "four", "five", "six", "seven", "eight",
    "nine", "ten", "eleven", "twelve", "thirteen", "fourteen", "fifteen",
    "sixteen", "seventeen", "eighteen", "nineteen", "twenty", "thirty",
    "forty", "fifty", "sixty", "seventy", "eighty", "ninety",
    # ordinal numbers
    "first", "second", "third", "fourth", "fifth", "sixth", "seventh",
    "eighth", "ninth", "tenth", "eleventh", "twelfth", "thirteenth",
    "fourteenth", "fifteenth", "sixteenth", "seventeenth", "eighteenth",
    "nineteenth", "twentieth", "thirtieth", "fortieth", "fiftieth",
    "sixtieth", "seventieth", "eightieth", "ninetieth",
}

APT_HEAD_WORDS = {"#", "APT", "UNIT", "STE", "SUITE", "ROOM", "RM"}

# ❱ pre-compiled regexen (compiled once instead of every call)
HYPHEN_RE     = re.compile(r"\b(\d+)\s*-\s*(\d+)\b")
FRACTION_RE   = re.compile(r"\b(\d+)?\s+?(\d+)/(\d+)\b",re.IGNORECASE|re.VERBOSE)
ORDINAL_RE    = re.compile(r"\b\d+(?:st|nd|rd|th)\b", re.IGNORECASE)
APT_TAIL_RE = re.compile(r"(?:\s+|^)(?:\#\s*|(?:APT|UNIT|STE|SUITE|ROOM|RM)\s+)?([A-Z0-9]{1,6}(?:-[A-Z0-9]{1,4})?)\s*$",
    re.IGNORECASE | re.VERBOSE,
)
APT_LETTER_RE = re.compile(r"\b(?!(?:N|S|E|W|NE|NW|SE|SW))[a-zA-Z]{1,2}\b", re.IGNORECASE | re.VERBOSE)
APT_ALPHANUM_RE = re.compile(r"([a-zA-Z]{1,2}-?\d{1,4}|\d{1,4}-?[a-zA-Z]{1,2})", re.IGNORECASE | re.VERBOSE)
DIRECTION_RE = re.compile(r"\b\s*(?:N|S|E|W|NW|SW|NE|SE|SOUTH|NORTH|EAST|WEST|NORTHEAST|SOUTHEAST|NORTHWEST|SOUTHWEST)\s*\b", re.IGNORECASE)
EXTRA_INFO_RE = re.compile(r"\s*\([A-Za-z]+\)\s*",re.IGNORECASE | re.VERBOSE)

def build_interval_lookup(grouped: pd.core.groupby.GroupBy) -> dict[str, tuple[pd.IntervalIndex, np.ndarray, np.ndarray]]:
    """
    Pre-compute two IntervalIndexes (left & right) **per street**.

    Returns
    -------
    {
        "MAIN ST": (left_intv, left_zip, left_parity),
        ...
    }
    """
    lookups = {}

    for street, segs in grouped:
        street = street[0] if isinstance(street, tuple) else street
        left_intv   = _make_interval(segs["L_F_ADD"], segs["L_T_ADD"])
        right_intv  = _make_interval(segs["R_F_ADD"], segs["R_T_ADD"])

        left_parity  = (segs["L_F_ADD"].astype("Int64") & 1).to_numpy()
        right_parity = (segs["R_F_ADD"].astype("Int64") & 1).to_numpy()

        lookups[street] = (
            left_intv,
            segs["ZIPL"].to_numpy(),
            left_parity,
            right_intv,
            segs["ZIPR"].to_numpy(),
            right_parity,
        )

    return lookups

def _make_interval(start: pd.Series, end: pd.Series) -> pd.IntervalIndex:
    try:
        # cast to nullable Int so bitwise ops work and NAs are preserved
        s = start.astype("Int64")
        e = end.astype("Int64")

        lower = np.minimum(s, e)   # element-wise
        upper = np.maximum(s, e)
        return pd.IntervalIndex.from_arrays(lower, upper, closed="both")
    except ValueError:
        return None

def fuzzy_match_street_name(bad: str, valid_names: pd.Series, score_cut: float = 80) -> str:
    """
    Corrects misspelled street names.
    
    Args:
        bad (str): The street name that may not be correct.
        valid_names (pd.Series): A unique list of correct street names.
        score_cut (float) : A real number between 0 and 100 indicating the accuracy score cutoff.

    Returns:
        str: Either the original street name if cutoff below threshhold or the corrected street name.
 
    """
    cand, score, _ = process.extractOne(
        bad, valid_names, scorer=fuzz.token_set_ratio
    )
    return cand if cand and score >= score_cut else bad

@dataclass(slots=True)
class AddressParts:
    st_num:         Optional[str] = None
    apt_num:        Optional[str] = None
    st_dir:         Optional[str] = None
    street_name:    Optional[str] = None
    st_suffix:      Optional[str] = None

def tag_address(address: str) -> AddressParts:
    if not isinstance(address, str) or not address.strip():
        return AddressParts()

    apt_tail_match = APT_TAIL_RE.search(address)
    tagged    = AddressParts()
    
    if apt_tail_match and _apt_is_alphanumeric(apt_tail_match.group(1)):
        tagged.apt_num = apt_tail_match.group(1).lstrip("#")
        address = address[:apt_tail_match.start()].rstrip()

    address = HYPHEN_RE.sub(lambda m: m.group(1), address)

    direction_match = DIRECTION_RE.search(address)
    if direction_match:
        direction = direction_match[0].strip()
        if direction in direction_map :
            tagged.st_dir = direction_map [direction]
        else:
            tagged.st_dir = direction
        address = address.replace(direction_match[0],' ')

    address = address.replace('-', '')
    address = FRACTION_RE.sub(_collapse_fraction, address)
    
    extra_info_match = EXTRA_INFO_RE.search(address)
    if extra_info_match:
        address = address.replace(extra_info_match[0].rstrip(),'')

    doc        = nlp(address)
    start_idx  = 0      

    # ── PRIMARY NUMBER (same as before) ───────────────────────────────
    first = next((t for t in doc if not t.is_space), None)
    if first and first.like_num and first.lower_ not in SPELLED_OUT_NUMBERS:
        # keep the “don’t steal the street number if it’s ordinal” check
        if not _is_ordinal(first):
            tagged.st_num = first.text
            start_idx = first.i + 1
        else:
            start_idx = first.i          # ‘32nd’ will become part of street
    else:
        start_idx = 0
    # ── APARTMENT NUMBER (guarded by new helper) ─────────────────────
    tokens = list(doc)   
    is_apt, apt_val, consumed = _detect_apt(tokens, start_idx)

    if is_apt and tagged.apt_num is None:
        tagged.apt_num = apt_val
        start_idx += consumed

    # ── STREET & SUFFIX (ordinal tokens go in the “street” bucket) ───

    parts = []
    for tok in doc[start_idx:]:
        canon = _canonical_suffix(tok.text)
        if canon:
            tagged.st_suffix = canon
        elif tok.text in APT_HEAD_WORDS:
            break
        else:
            parts.append(tok.text)

    ALLOWED_PUNCT = "&"

    tagged.street_name =  " ".join(
        ch for ch in parts
            if ch.isalnum() or ch.isspace() or ch in ALLOWED_PUNCT
        ) or None
    return tagged

def _detect_apt(tokens: list[spacy.tokens.Token], idx: int) \
        -> tuple[bool, str | None, int]:
    """
    Examine tokens starting at **idx** to see whether they form an
    apartment / unit clause *immediately* after the house-number.

    Returns
    -------
    (is_apt?, apt_value_or_None, tokens_consumed)

    Handles
    -------
    #2              # 2            APT 2B            APT # 2B
    UNIT 4          SUITE 300-A    2B   (bare token)  5  (bare numeric)
    """
    if idx >= len(tokens):
        return False, None, 0

    tok = tokens[idx]

    if tok.text.startswith("#") and len(tok.text) > 1:
        return True, tok.text.lstrip("#"), 1
    
    if tok.text == "#" and idx + 1 < len(tokens):
        nxt = tokens[idx + 1]
        if _maybe_apt(nxt):
            return True, nxt.text, 2

    if tok.text.upper() in APT_HEAD_WORDS:
        j = idx + 1
        if j < len(tokens) and tokens[j].text == "#":   # skip optional '#'
            j += 1
        if j < len(tokens) and _maybe_apt(tokens[j]):
            return True, tokens[j].text.lstrip("#"), j - idx + 1

    if _maybe_apt(tok):
        return True, tok.text, 1

    return False, None, 0

def _is_alphanumeric(token):
    """Check if the token text is alphanumeric."""
    return re.match("^([a-zA-Z]{1,2}-?\d{1,4}|\d{1,4}-?[a-zA-Z]{1,2})", token.text) is not None

def _apt_is_alphanumeric(text):
    """Check if the token text is alphanumeric."""
    return re.match("([a-zA-Z]{1,2}-?\d{1,4}|\d{1,4}-?[a-zA-Z]{1,2})", text) is not None

def _is_alpha(token):
    """Check if the token text is alpha"""
    return re.match("^(?!(?:N|S|E|W|NE|NW|SE|SW))[a-zA-Z]{1,2}$", token.text) is not None


def _is_ordinal(tok) -> bool:
    """True for '4th', '32ND', '101st', …"""
    return bool(ORDINAL_RE.fullmatch(tok.text))

def _maybe_apt(tok) -> bool:
    """Heuristic for apartment / unit IDs (excludes ordinals)."""
    return (
        not _is_ordinal(tok)
        and tok.lower_ not in SPELLED_OUT_NUMBERS
        and (tok.like_num 
             or _is_alphanumeric(tok)
             or _is_alpha(tok))
    )

def _collapse_fraction(m: re.Match) -> str:
    whole, num, den = m.groups()
    value = int(whole) + int(num) / int(den)      # 915 + 1/2 → 915.5
    # stringify *once* so you don’t end up with 9150.5
    return f"{value:g}".rstrip(".")

def _canonical_suffix(raw: str, cutoff: float = 0.90) -> str | None:
    """Return canonical suffix ('STREET', 'AVENUE', …) or None."""
    tok = raw.upper().rstrip(".")          # "Av." -> "AV"
    if tok in street_type_map:
        return street_type_map[tok]

In [8]:
parts = all_real_estate['address'].apply(tag_address)

In [9]:
df = pd.DataFrame([asdict(r) for r in parts])

In [10]:
parts

0         AddressParts(st_num='2327', apt_num=None, st_d...
1         AddressParts(st_num='2327', apt_num=None, st_d...
2         AddressParts(st_num='2327', apt_num=None, st_d...
3         AddressParts(st_num='2327', apt_num=None, st_d...
4         AddressParts(st_num='2319', apt_num=None, st_d...
                                ...                        
727394    AddressParts(st_num=None, apt_num=None, st_dir...
727395    AddressParts(st_num=None, apt_num=None, st_dir...
727396    AddressParts(st_num=None, apt_num=None, st_dir...
727397    AddressParts(st_num='2500', apt_num=None, st_d...
727398    AddressParts(st_num='2500', apt_num=None, st_d...
Name: address, Length: 727399, dtype: object

In [11]:
df['st_dir_name'] = df['st_dir'].map(direction_map_tl)

In [24]:
df.apt_num.value_counts().to_clipboard()

In [18]:
all_real_estate[all_real_estate['address'].str.contains('MOUND')==True].address.drop_duplicates().to_list()

['5333 INDIAN MOUND AVE',
 '5312 INDIAN MOUND AVE',
 '5360 INDIAN MOUND AVE',
 '5342 INDIAN MOUND AVE',
 '5304 INDIAN MOUND AVE',
 '5331 INDIAN MOUND AVE',
 '5341 INDIAN MOUND AVE',
 '5306 INDIAN MOUND AVE',
 '5345 INDIAN MOUND AVE',
 '5353 INDIAN MOUND AVE',
 '5359 INDIAN MOUND AVE',
 '5328 INDIAN MOUND AVE',
 '5302 INDIAN MOUND AVE',
 'INDIAN MOUND AVE',
 '1087 MOUND ST',
 '1054 MOUND ST',
 '1052 MOUND ST',
 '1050 MOUND ST',
 '1048 MOUND ST',
 '1046 MOUND ST',
 '1036 MOUND ST',
 '1034 MOUND ST',
 '1032 MOUND ST',
 '1030 MOUND ST',
 '1028 MOUND ST',
 '1026 MOUND ST',
 '1024 MOUND ST',
 '940 MOUND ST',
 '938 MOUND ST',
 '936 MOUND ST',
 '934 MOUND ST',
 '932 MOUND ST',
 '930 MOUND ST',
 '928 MOUND ST',
 '926 MOUND ST',
 '924 ST MOUND ST',
 '922 MOUND ST',
 '920 MOUND ST',
 '918 MOUND ST',
 '943 MOUND ST',
 '941 MOUND ST',
 '939 MOUND ST',
 '937 MOUND ST',
 '935 MOUND ST',
 '933 MOUND ST',
 '931 MOUND ST',
 '929 MOUND ST',
 '927 MOUND ST',
 'MOUND ST',
 '919 MOUND ST',
 '917 MOUND ST',
