In [None]:


import os
os.chdir(r'C:\code\bedford-ubid')

import plotly.io as pio
import contextily as ctx  # For basemaps
from scipy import stats
from pathlib import Path

from urllib.parse import urlencode

import plotly.graph_objects as go
import plotly.express as px
from folium import plugins
import folium
import seaborn as sns
import matplotlib.pyplot as plt
from shapely.geometry import Point, Polygon
import geopandas as gpd
import pandas as pd
import numpy as np
import warnings
import pyproj
from pyproj import CRS

from pymodule.folium_plots import create_folium_polygon

warnings.filterwarnings('ignore')
plt.style.use('default')
sns.set_palette("husl")


# Set the default template to "plotly_white"

pio.templates.default = "simple_white"
%load_ext autoreload
%autoreload 2

In [2]:
buildings_proj = pd.read_pickle('./dataprocess/buildings_projected.pickle')

In [3]:
parcels_proj = pd.read_pickle('./dataprocess/parcels_projected.pickle')
parcels_proj = parcels_proj.loc[~parcels_proj.SBL.isna()]
assert len(parcels_proj['SBL'].unique()) == len(parcels_proj)
parcels_proj = parcels_proj.set_index('SBL', drop=False)

### DATA CLEANING FOR PARCELS

In [4]:
keep_parcel_cols = [
    'COUNTY_NAM', 
    'MUNI_NAME', 
    'SWIS', 
    'PARCEL_ADD', 
    'SBL',
    
    'CITYTOWN_N', 
    'CITYTOWN_S', 

    'LOC_ST_NBR', 
    'LOC_STREET', 
    'LOC_UNIT',
    'LOC_ZIP', 
    
    'PROP_CLASS', 
    'ROLL_SECTI', 
    'LAND_AV', 
    'TOTAL_AV',
    'FULL_MARKE',  
    'SQ_FT', 
    'ACRES',

    'SQFT_LIVIN',
    'GFA', 


    'PRIMARY_OW', 
    'MAIL_ADDR',
    'PO_BOX', 

    'MAIL_CITY', 
    'MAIL_STATE', 
    'MAIL_ZIP', 

    'BOOK', 
    'PAGE', 
    'GRID_EAST', 
    'GRID_NORTH', 
    'MUNI_PARCE', 
    

    'CALC_ACRES', 
    'geometry'
]

parcels_proj = parcels_proj[keep_parcel_cols]

### hybrid match

In [None]:
def find_best_parcel_hybrid(building_geom, parcels_gdf,
                            min_overlap_pct=0.8,
                            buffer_distance=10.0,
                            prefer_area_match=True):
    """
    Try area overlap first, then buffered centroid method
    """
    building_area = building_geom.area
    building_centroid = building_geom.centroid

    potential_matches = list(parcels_gdf.sindex.intersection(building_geom.bounds))

    best_match = None
    best_score = 0
    match_method = None

    for idx in potential_matches:
        parcel_geom = parcels_gdf.iloc[idx].geometry
        parcel_idx = parcels_gdf.iloc[idx].name

        try:
            # Method 1: Area overlap
            intersection = building_geom.intersection(parcel_geom)
            if not intersection.is_empty:
                overlap_pct = intersection.area / building_area
                if overlap_pct >= min_overlap_pct:
                    if overlap_pct > best_score:
                        best_score = overlap_pct
                        best_match = parcel_idx
                        match_method = f'area_{overlap_pct:.2f}'

            # Method 2: Buffered containment (if no good area match)
            if best_match is None or not prefer_area_match:
                if parcel_geom.buffer(buffer_distance).contains(building_centroid):
                    # Calculate a "score" based on distance to parcel edge
                    distance_to_edge = building_centroid.distance(parcel_geom.boundary)
                    score = 1.0 / (1.0 + distance_to_edge)  # Closer = higher score

                    if score > best_score or best_match is None:
                        best_score = score
                        best_match = parcel_idx
                        match_method = f'buffer_{distance_to_edge:.1f}ft'

        except Exception as e:
            continue

    return best_match, best_score, match_method


# Apply hybrid approach
results = []
for idx, building in buildings_proj.iterrows():
    parcel_idx, score, method = find_best_parcel_hybrid(
        building.geometry,
        parcels_proj,
        min_overlap_pct=0.80,
        buffer_distance=15.0
    )

    results.append({
        'building_idx': idx,
        'parcel_idx': parcel_idx,
        'match_score': score,
        'match_method': method
    })

matches_df = pd.DataFrame(results)

# Summary
method_counts = matches_df['match_method'].value_counts()
print("Match methods used:")
print(method_counts)
print(f"\nTotal matches: {len(matches_df[matches_df['parcel_idx'].notna()])}")



Match methods used:
match_method
area_1.00       6494
area_0.99         27
area_0.98         23
area_0.96         19
area_0.88         13
                ... 
buffer_3.8ft       1
buffer_6.1ft       1
buffer_3.2ft       1
area_0.81          1
buffer_7.5ft       1
Name: count, Length: 82, dtype: int64

Total matches: 6748


In [6]:
# Add parcel information to buildings
building_parcel_match = buildings_proj.merge(
    matches_df,
    left_index=True,
    right_on='building_idx',
    how='left'
)

# # Merge in parcel attributes
building_parcel_match = building_parcel_match.merge(
    parcels_proj,
    left_on='parcel_idx',
    right_index=True,
    how='left',
    suffixes=('', '_parcel')
)

In [7]:
building_parcel_match['parcel_idx']

0                        None
1        07100800020190000000
2                        None
3        06001400010050000000
4                        None
                 ...         
12223                    None
12224    04901300020130000000
12225                    None
12226    08401600020290000000
12227                    None
Name: parcel_idx, Length: 12228, dtype: object

### qc / stats

In [8]:

# parcels with at least one value
matched_parcels = [x for x in building_parcel_match['parcel_idx'].unique() if x is not None]

unmatched_parcels = [x for x in parcels_proj.index if x not in matched_parcels]

buildings_per_parcel = building_parcel_match['parcel_idx'].value_counts()
more_than_one_building = buildings_per_parcel.loc[buildings_per_parcel > 1]
outlier_buildings = buildings_per_parcel.loc[buildings_per_parcel > 5]

print(f'{len(matched_parcels)} matched parcels')
print(f'{len(unmatched_parcels)} unmatched parcels')
print(f'{len(more_than_one_building)} of {len(buildings_per_parcel)} parcels have more than one building')

print(f'{len(outlier_buildings)} of {len(buildings_per_parcel)} parcels > 5 buildings.')
print('highest 5:')
print(outlier_buildings.head())
print('lowest 5:')
print(outlier_buildings.tail())

fig = px.histogram(buildings_per_parcel.values)

fig.update_layout(
    title='distribution, buildings per parcel - BEDFORD',
    width=600,height=400
)

fig.show()

4943 matched parcels
1263 unmatched parcels
1105 of 4943 parcels have more than one building
41 of 4943 parcels > 5 buildings.
highest 5:
parcel_idx
06001500020410000000    37
05901200030010000000    30
06001200010010000000    27
06100700010010020000    22
05001300010080010000    17
Name: count, dtype: int64
lowest 5:
parcel_idx
06001500030300000000    6
07200600010130000000    6
07100800020010000000    6
05001800010110000000    6
06201700010100000000    6
Name: count, dtype: int64


### look at outlier specific buildings

In [None]:
outlier_buildings



coords = parcels_proj.loc[outlier_buildings.index[0:3]].geometry
coords

gdf = gpd.GeoDataFrame(coords, crs='EPSG:2262')
gdf_web = gdf.to_crs('EPSG:4326')

center_lat = gdf_web.geometry.centroid.y.iloc[0]
center_lon = gdf_web.geometry.centroid.x.iloc[0]

m = folium.Map(location=[center_lat, center_lon], zoom_start=15)

folium.GeoJson(
    gdf_web.geometry.iloc[0].__geo_interface__,
    style_function=lambda feature: {
        'fillColor': 'blue',
        'color': 'red',
        'weight': 2,
        'fillOpacity': 0.3,
    }
).add_to(m)

m.save('polygon_map.html')

gdf_web

Unnamed: 0_level_0,geometry
parcel_idx,Unnamed: 1_level_1
6001500020410000000,"POLYGON ((-73.68101 41.24138, -73.68096 41.241..."
5901200030010000000,"POLYGON ((-73.71527 41.24559, -73.71489 41.245..."
6001200010010000000,"POLYGON ((-73.6735 41.24679, -73.67414 41.2453..."


In [16]:
# import geopandas as gpd
# from shapely.geometry import Polygon



# # Create polygon and GeoDataFrame
# polygon = Polygon(coords)
# gdf = gpd.GeoDataFrame([1], geometry=[polygon], crs='EPSG:2262')

# # Simple plot
# gdf.plot(figsize=(10, 10))
# plt.title("Polygon in State Plane Coordinates")
# plt.show()

# # Or convert to lat/lon and plot
# gdf_latlon = gdf.to_crs('EPSG:4326')
# gdf_latlon.plot(figsize=(10, 10))
# plt.title("Polygon in Lat/Lon")
# plt.show()




In [20]:
import folium
import geopandas as gpd
from shapely.geometry import Polygon
from pymodule.folium_plots import create_folium_polygon


# Your polygon coordinates
# coords = [(2496682.0772464178, 490315.25984110986), (2496695.701732492, 490314.360436501), ...]  # your full coordinates

# Create the polygon
polygon = Polygon(coords)

# Create a GeoDataFrame
gdf = gpd.GeoDataFrame([1], geometry=[polygon], crs='EPSG:2262')  # NY State Plane East

# Convert to lat/lon for web mapping
gdf_web = gdf.to_crs('EPSG:4326')

# Get the centroid for map center
center_lat = gdf_web.geometry.centroid.y.iloc[0]
center_lon = gdf_web.geometry.centroid.x.iloc[0]

# Create folium map
m = folium.Map(location=[center_lat, center_lon], zoom_start=15)

# Add the polygon
folium.GeoJson(
    gdf_web.geometry.iloc[0].__geo_interface__,
    style_function=lambda feature: {
        'fillColor': 'blue',
        'color': 'red',
        'weight': 2,
        'fillOpacity': 0.3,
    }
).add_to(m)

# Display the map
m.save('polygon_map.html')  # Save to HTML file


TypeError: 'Polygon' object is not iterable

In [None]:
from pymodule.folium_plots import create_folium_polygon




create_folium_polygon(polygon).save('polygon_map_1.html')