In [1]:
"""
Preparation of roof-material reference data for rooftop mapping (Denver, CO)
"""

import os, sys
import pandas as pd
import geopandas as gpd
import numpy as np
import matplotlib.pyplot as plt
import gc

# Custom functions
sys.path.append(os.path.join(os.getcwd(),'code/'))
from __functions import *

# Project root directory. It can be overridden with the OPP_ROOFTOP_MAINDIR
# environment variable so the notebook is not tied to one machine; when the
# variable is unset, the original local path is used unchanged.
maindir = os.environ.get(
    'OPP_ROOFTOP_MAINDIR',
    '/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping'
)

# Coordinate Ref. System
utm13n = 32613  # UTM Zone 13N (Denver, CO)

print("Ready !")

Ready !


In [2]:
# Read the Denver training footprints and reproject them to the local
# UTM zone (EPSG code held in `utm13n`), then report the row count.
gdf_fp = os.path.join(
    maindir,
    'data/spatial/raw/denver_data/training/denver_data_ocm_w_ztrax_matched_.gpkg'
)
gdf = gpd.read_file(gdf_fp).to_crs(utm13n)
print(f'Length of Denver data: {len(gdf)};')

Length of Denver data: 463865;


In [None]:
# Inspect the distinct roof cover codes present in the raw attribute
gdf['RoofCoverStndCode'].unique()

In [None]:
# Store the roof cover code as a categorical 'class_code' column,
# then list the distinct codes it contains.
roof_codes = gdf['RoofCoverStndCode'].astype('category')
gdf.loc[:, 'class_code'] = roof_codes
distinct_codes = gdf['class_code'].unique()
print(list(distinct_codes))

In [None]:
# Drop footprints with no roof material information. Both empty strings and
# missing values (NaN/None) count as "no information"; comparing against ''
# alone would keep the missing values. The explicit copy makes the result an
# independent frame, so later column assignments on `gdf` do not raise
# pandas' SettingWithCopyWarning.
has_roof_code = gdf['class_code'].notna() & (gdf['class_code'] != '')
gdf = gdf.loc[has_roof_code].copy()
gdf.class_code.value_counts()

In [None]:
# Remove footprints with no roof material code

In [None]:
# Create the area attributes and filter mismatches.
# Areas are computed with the vectorized GeoSeries accessor instead of a
# Python-level loop over every geometry.
SQM_TO_SQFT = 10.7639  # square feet per square metre
gdf['areaUTM'] = gdf.geometry.area
gdf['areaUTMsqft'] = gdf['areaUTM'] * SQM_TO_SQFT
# Percent difference between the footprint area and the reported lot size
gdf['areaPctDiff'] = (gdf['areaUTMsqft'] - gdf['LotSizeSquareFeet']).abs() / gdf['LotSizeSquareFeet'] * 100

# Keep footprints whose distance metric (output from fuzzy join) is at most 30
# and whose footprint area does not exceed the reported lot size.
# NOTE(review): 'areaPctDiff' is computed above but is not part of this
# filter — confirm whether a percent-difference threshold was intended.
gdf_ = gdf.loc[(gdf._distance <= 30) & (gdf.areaUTMsqft <= gdf.LotSizeSquareFeet)]
len(gdf_)

In [None]:
# Class counts remaining after the distance / lot-size filter
gdf_['class_code'].value_counts()

In [None]:
# Assemble the reference set: one subset per usable roof class.
# Codes excluded outright: empty, "Built Up", "other"
bad_classes = ['','BU','OT']

# NOTE(review): subsets are taken from `gdf`, not from the distance/area
# filtered `gdf_` built in the earlier cell, so that filter has no effect on
# `ref` — confirm this is intended.
class_subsets = []
for cl in gdf['class_code'].unique():
    if cl not in bad_classes:
        subset = gdf[gdf['class_code'] == cl]
        if len(subset) >= 10:
            class_subsets.append(subset)
        else:
            # too few samples to keep this class
            print(f'Class {cl} has shape {subset.shape} ... skipping ...')

# Stack the retained subsets into the reference frame and report class counts
ref = pd.concat(class_subsets)
print(ref['class_code'].value_counts())

# Release the temporaries
del class_subsets, subset
gc.collect()

In [None]:
# Read the roof cover code lookup table and preview its first rows
lookup_fp = os.path.join(maindir, 'data/tabular/raw/variable_lookup/RoofCoverStndCode_encoding.csv')
lookup = pd.read_csv(lookup_fp)
lookup.head(25)

In [None]:
# Create a unique ID column (1-based row label followed by the class code).
# Guarded so a re-run of this cell does not rebuild the IDs from the index,
# which the merge below replaces with a fresh positional index.
if 'uid' not in ref.columns:
    ref['uid'] = ref.index + 1
    ref['uid'] = ref['uid'].astype(str) + ref['class_code'].astype(str)

# Prepare the lookup table. Renaming before selecting makes this a no-op when
# the columns were already renamed by an earlier run; duplicate codes are
# removed so the join cannot multiply footprints.
lookup = lookup.rename(columns={"Code": "class_code", "Description": "description"})
lookup = lookup[['description', 'class_code']].drop_duplicates(subset='class_code')

# Join to the lookup table to get the description. This is an inner join, so
# report any footprints lost because their code is absent from the lookup.
n_before = len(ref)
ref = ref.drop(columns='description', errors='ignore').merge(lookup, on='class_code')
if len(ref) < n_before:
    print(f'Warning: dropped {n_before - len(ref)} footprints with no matching class code in the lookup table')

# Select the required columns (rename first so either column name is accepted)
ref = ref.rename(columns={"LotSizeSquareFeet": "lotSizeSqft"})
ref = ref[['uid', 'class_code', 'description', 'areaUTMsqft', 'lotSizeSqft', 'geometry']]
ref.head()

In [None]:
# Build a {class_code: description} mapping from the reference rows
code_desc_pairs = zip(ref['class_code'], ref['description'])
desc_mapping = {code: desc for code, desc in code_desc_pairs}
print(f'Description map: \n{desc_mapping}')

In [None]:
# Summary statistics for footprint area

In [None]:
# Per-class footprint area summary: apply `footprint_area_stats` to each
# class group (the helper is not defined in this notebook — presumably it
# comes from the `__functions` star import; see it for the output columns).
by_class = ref.groupby('class_code', observed=True)
area_stats = by_class.apply(footprint_area_stats, include_groups=False).reset_index()
area_stats

In [None]:
# Box plot of footprint area by class (log-scaled y axis).
# NOTE(review): `sns` (seaborn) is not imported explicitly in this notebook;
# it presumably arrives through `from __functions import *` — confirm, or add
# `import seaborn as sns` to the import cell.
fig, ax = plt.subplots(figsize=(5, 3))
sns.boxplot(x='class_code', y='areaUTMsqft', data=ref, ax=ax)
# Set the y-axis to log scale
ax.set_yscale('log')
# The box plot shows the distribution of individual footprint areas (not a
# mean), and 'areaUTMsqft' holds square feet, so the labels say exactly that.
ax.set_title('Footprint Area Across Classes (Log Scale)')
ax.set_ylabel('Footprint Area (sq ft)')
ax.set_xlabel('Building Class')
ax.tick_params(axis='x', rotation=45)
fig.tight_layout()
plt.show()

In [None]:
# Map of reference data

In [None]:
# Map the reference footprints, coloured by roof material description
fig, ax = plt.subplots(figsize=(10, 10))
plot_kwargs = dict(column='description', legend=True, cmap='Set1', edgecolor='none')
ref.plot(ax=ax, **plot_kwargs)
ax.set_title('Training Locations by Roof Material Type')
plt.show()

In [None]:
# Write the reference samples to a GeoPackage.
# NOTE(review): the output folder is 'dc_data' although the input was read
# from 'denver_data' — confirm this is the intended destination.
out_fp = os.path.join(maindir, 'data/spatial/mod/dc_data/training/reference_samples_pure.gpkg')
ref.to_file(out_fp)

In [None]:
# Run a final garbage-collection pass now that the outputs have been written
gc.collect()