In [None]:
"""
"""

import os, sys
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt

# Custom functions
# Make the local `code/` directory importable. The membership check keeps
# sys.path from accumulating duplicate entries when this cell is re-run.
code_dir = os.path.join(os.getcwd(),'code/')
if code_dir not in sys.path:
    sys.path.append(code_dir)
from __functions import *  # NOTE(review): star import hides which helpers the notebook actually uses

# Load the environment variables
# The project root can be overridden with the OPP_ROOFTOP_MAINDIR environment
# variable; when it is unset the original hard-coded location is used, so
# existing setups behave exactly as before.
maindir = os.environ.get(
    'OPP_ROOFTOP_MAINDIR',
    '/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping'
)

# Coordinate Ref. System
utm18n = 32618  # UTM Zone 18N (Washington, D.C.)
utm13n = 32613  # UTM Zone 13N (Denver, CO)

print("Ready !")

In [None]:
# Read the roof-cover code lookup table and preview its first ten rows
lookup_csv = os.path.join(
    maindir, 'data/tabular/raw/variable_lookup/RoofCoverStndCode_encoding.csv'
)
lookup = pd.read_csv(lookup_csv)
lookup.head(10)

In [None]:
# Region keys, their target UTM projections and their source files
# (three parallel lists, matched by position)
rois = ['dc','denver']
proj_utm = [utm18n, utm13n]
gdf_fps = [
    os.path.join(maindir, 'data/spatial/raw/dc_data/boundaries/dc_data_ocm_w_ztrax_matched.gpkg'),
    os.path.join(maindir, 'data/spatial/raw/denver_data/training/denver_data_reference.gpkg')
]

# Read each file and reproject it to the UTM zone listed for its region
gdfs = {
    roi: gpd.read_file(fp).to_crs(epsg_code)
    for roi, fp, epsg_code in zip(rois, gdf_fps, proj_utm)
}

print(f'Length of DC data: {len(gdfs["dc"])};\nLength of Denver data: {len(gdfs["denver"])}')

In [None]:
# Distinct values found in the Denver frame's `class_code` column
gdfs['denver']['class_code'].unique()

In [None]:
# Distinct values found in the Denver frame's `RoofCoverStndCode` column
denver_roof_codes = gdfs['denver']['RoofCoverStndCode'].unique()
print(denver_roof_codes)

In [None]:
# Filter each region's footprints and attach a categorical roof-class code.
# The filtered frames are collected in `gdfs_` (same keys as `gdfs`); without
# the final assignment in the loop every result is thrown away as soon as the
# loop moves to the next region and `gdfs_` stays empty.
gdfs_ = {}

for key, gdf in gdfs.items():
    print(f'Processing {key} data ...')

    # Create the area attribute (vectorised). The frames were reprojected to
    # UTM when loaded, so the area is in square metres; 10.7639 converts
    # square metres to square feet.
    gdf['areaUTM'] = gdf.geometry.area
    gdf['areaUTMsqft'] = gdf['areaUTM'] * 10.7639

    # filter the footprints by distance metric and area mismatch
    # (.copy() gives an independent frame, so adding a column below does not
    # write into a slice of the source frame / raise SettingWithCopyWarning)
    keep = (gdf['_distance'] <= 10) & (gdf['areaUTMsqft'] <= gdf['LotSizeSquareFeet'])
    gdf = gdf.loc[keep].copy()

    # add the class code categorical variable
    gdf['class_code'] = gdf['RoofCoverStndCode'].astype('category')  # category type is required for encoding
    print(list(gdf['class_code'].unique()))

    # --- Disabled processing steps, kept for reference -----------------------
    # NOTE(review): if the lookup join below is re-enabled as written, `lookup`
    # is overwritten inside the loop, so the second region would no longer find
    # the 'Description' / 'Code' columns.

    # # Filter out footprints below the 10th percentile of size for that class
    # gdfs_f = []
    # for cls in gdf['class_code'].unique():
    #     # Filter to that class
    #     gdf_cls = gdf[gdf['class_code'] == cls].copy()
    #     # Calculate the 10th percentile in building size
    #     p10 = np.percentile(gdf_cls['areaUTMsqft'], 10)
    #     gdf_cls = gdf_cls[gdf_cls['areaUTMsqft'] > p10].copy()  # filter based on the 10th percentile
    #     # append to the output list
    #     gdfs_f.append(gdf_cls)

    #     del p10, gdf_cls

    # # Merge them back
    # gdf = pd.concat(gdfs_f, ignore_index=True)

    # # Retain required columns
    # gdf = gdf[['class_code','areaUTMsqft','geometry']].reset_index(drop=True)

    # # Create a unique ID column
    # gdf['uid'] = gdf.index + 1
    # gdf['uid'] = gdf['uid'].astype(str) + gdf['class_code'].astype(str)

    # # Join to the description as well
    # lookup = lookup[['Description','Code']]
    # lookup = lookup.rename(columns={"Code": "class_code","Description": "description"})
    # gdf = gdf.merge(lookup, on='class_code')

    # Append to dictionary: keep the processed frame for this region
    gdfs_[key] = gdf

In [None]:
# Observe the class imbalance
# NOTE(review): `footprints` is not assigned in any cell visible above this
# one; confirm where it is defined so the notebook runs on a fresh kernel.
footprints['class_code'].value_counts()

In [None]:
# Combine SH and WS
# Both shingle codes (wood shingle 'WS' and shingle 'SH') map to the single code 'WSH'
merge = dict.fromkeys(('WS', 'SH'), 'WSH')

# Work on a copy so that `footprints` itself is left unmodified
ref = footprints.copy()
merged_codes = ref['class_code'].replace(merge)
ref['class_code'] = merged_codes

print(ref['class_code'].value_counts())  # check the counts

In [None]:
# Point version of the reference footprints for point sampling: a copy of
# `ref` in which every geometry is replaced by its centroid
reference_pt = ref.assign(geometry=ref['geometry'].centroid)

In [None]:
from shapely.geometry import box

# Side length of the square sampling window, in units of the projected CRS
window_size = 64
half_window = window_size / 2

# Ensure the GeoDataFrame is in the correct coordinate system
# (32618 is the same EPSG code as `utm18n`)
gdf = ref.to_crs(epsg=32618)

# Footprint centroids taken from the reprojected frame, so the points and the
# sampling windows built around them are guaranteed to share one CRS. Testing
# the windows against points held in a different CRS would silently match
# nothing.
centroids = gdf.geometry.centroid
class_codes = gdf['class_code'].to_numpy()

# Spatial index over the centroids: each window is answered with an index
# lookup instead of a full scan of every centroid per footprint.
centroid_index = centroids.sindex

# Create training locations with roof types
training_windows = []
training_roof_types = []

# Loop through each footprint individually
for centroid, roof_type in zip(centroids, gdf['class_code']):
    window = box(centroid.x - half_window, centroid.y - half_window,
                 centroid.x + half_window, centroid.y + half_window)

    # Integer positions of the centroids that intersect the window
    hits = centroid_index.query(window, predicate='intersects')

    # Get the total count and count for the class
    total_count = len(hits)
    class_count = int((class_codes[hits] == roof_type).sum())

    # Keep the location only when more than half of the footprints whose
    # centroid falls in the window share this footprint's class
    if total_count > 0 and (class_count / total_count) > 0.50:
        training_windows.append(centroid)
        training_roof_types.append(roof_type)

# Create a GeoDataFrame for the training windows with roof types
training_gdf = gpd.GeoDataFrame({'geometry': training_windows, 'class_code': training_roof_types}, crs=gdf.crs)

# Map of the result: all footprints in grey underneath, with the retained
# training points drawn on top and coloured by roof class
fig, ax = plt.subplots(figsize=(10, 10))
gdf.plot(
    ax=ax,
    color='grey',
    alpha=0.5,
    edgecolor='none',
)
training_gdf.plot(
    ax=ax,
    column='class_code',
    cmap='Set1',
    edgecolor='black',
    legend=True,
)
ax.set_title('Training Locations by Roof Material Type')
plt.show()

In [None]:
# Number of training locations per roof class
training_counts = training_gdf['class_code'].value_counts()
print(training_counts)

In [None]:
# Save this file out
out_fp = os.path.join(maindir,'data/spatial/mod/dc_data/training/reference_samples_pure.gpkg')

# Create the destination folder when it is missing so the write does not fail
os.makedirs(os.path.dirname(out_fp), exist_ok=True)

# Name the driver explicitly: some older geopandas releases default to
# ESRI Shapefile instead of inferring the format from the file extension
training_gdf.to_file(out_fp, driver='GPKG')