In [1]:
# %load_ext autoreload
# %autoreload 2

In [2]:
from pathlib import Path
import pandas as pd
from tqdm import tqdm
import json
import ast
import shapely.wkt
import geopandas as gpd
import concurrent.futures
import os

from nearmap_ai.feature_api import FeatureApi
from nearmap_ai import parcels
from nearmap_ai.constants import (
    LAT_LONG_CRS,
    BUILDING_ID,
    ROOF_ID,
    TRAMPOLINE_ID,
    POOL_ID,
    CONSTRUCTION_ID,
    SOLAR_ID,
    VEG_IDS,
    SURFACES_IDS,
    ROOF_CHAR_IDS,
)

# Let pandas render up to 500 rows when a frame is displayed in the notebook.
pd.set_option('display.max_rows', 500)

# Run parameters.
# NOTE(review): `country` is not referenced in the cells visible here - confirm it is still needed.
country = "us"
# Pool size passed to ProcessPoolExecutor in the (currently commented-out) parallel cells.
workers = 30
# Unit family kept in the processed output: process_batch checks for "sqft" or "sqm"
# and drops the area/height columns of the other unit.
area_unit = "sqft"

# Data directories.
# source_path: raw CSVs that the (commented-out) batching cell globs and splits.
source_path = Path("/home/jovyan/data/source")
# batches_path: where the batching cell writes its chunks; input to process_batch.
batches_path = Path("/home/jovyan/data/batches")
# target_path: per-batch output written by process_batch.
target_path = Path("/home/jovyan/data/processed")
# NOTE(review): samples_path is not referenced in the cells visible here.
samples_path = Path("/home/jovyan/data/samples")
# final_path: concatenated per-flat CSVs written by the last cell.
final_path = Path("/home/jovyan/data/final")
# transformed_path: the same per-flat CSVs after map_to_old_schema.
transformed_path = Path("/home/jovyan/data/transformed")

# Column ordering applied by process_batch: these columns go first ...
first_columns = ["aoi_id", "date", "mesh_date", "link", "system_version"]
# ... and these go last; every other column sits in between in its existing order.
last_columns = ["geometry"]

In [3]:
# Class IDs (imported from nearmap_ai.constants) that the rollup should cover.
classes = [BUILDING_ID, ROOF_ID, TRAMPOLINE_ID, POOL_ID, CONSTRUCTION_ID, SOLAR_ID]

feature_api = FeatureApi()
classes_df = feature_api.get_feature_class_ids()

# Keep only rows whose type is "Feature" and whose index is one of the IDs above.
classes_df = classes_df[(classes_df["type"] == "Feature") & classes_df.index.isin(classes)]

display(classes_df)

Unnamed: 0_level_0,type,description,schema
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0339726f-081e-5a6e-b9a9-42d95c1b5c8a,Feature,Swimming Pool,
3680e1b8-8ae1-5a15-8ec7-820078ef3298,Feature,Solar Panel,
753621ee-0b9f-515e-9bcf-ea40b96612ab,Feature,Trampoline,
a2a81381-13c6-57dc-a967-af696e45f6c7,Feature,Construction Site,
a2e4ae39-8a61-5515-9d18-8900aa6e6072,Feature,Building,
c08255a4-ba9f-562b-932c-ff76f2faeeeb,Feature,Roof,


In [4]:
# n = 10000

# source_files = list(source_path.glob("*.csv"))

# def batch_csv(source_path):
#     with open(source_path, "r") as fs:
#         header = fs.readline()
#         counter = 0
#         while True:
#             target_path = batches_path / f"{source_path.stem}_{str(counter).zfill(3)}.csv"
#             counter += 1
#             with open(target_path, "w") as ft:
#                 ft.write(header)
#                 for _ in range(n):
#                     line = fs.readline()
#                     if line == "":
#                         break
#                     ft.write(line)
#             if line == "":
#                 break
                
# with concurrent.futures.ProcessPoolExecutor(workers) as executor:
#     for source_path in source_files:
#         executor.submit(batch_csv, source_path)

In [5]:
def process_batch(source_path, force=False):
    """Roll up one pipe-delimited batch CSV to one row per parcel and write it out.

    The batch is expected to have `parcelPtId`, `geometry` (WKT) and `payload`
    (a Python-literal string) columns. Each payload is expanded with
    `FeatureApi.payload_gdf`, the features are rolled up per parcel with
    `parcels.parcel_rollup`, and the result is merged with the parcel
    geometries and the payload metadata before being written to
    `target_path / source_path.name`.

    Args:
        source_path: Path of the batch CSV to process.
        force: If True, reprocess even when the output file already exists.

    Returns:
        The processed frame, or None when the batch was skipped (output
        already present and `force` is False, or the batch has no data rows).
    """
    outpath = target_path / source_path.name
    outpath_temp = target_path / f"{source_path.name}.tmp"
    if outpath.is_file() and not force:
        return None

    source_df = pd.read_csv(source_path, sep="|")
    if source_df.empty:
        # A header-only batch yields no payloads, and unpacking `zip(*[])` below
        # would raise ValueError - there is nothing to roll up, so skip it.
        return None

    # The payload column holds Python-literal text; parse it into objects.
    source_df.payload = source_df.payload.apply(ast.literal_eval)
    source_df = source_df.rename(columns={"parcelPtId": "aoi_id"})

    parcels_gdf = gpd.GeoDataFrame(source_df[['aoi_id']], geometry=source_df.geometry.apply(shapely.wkt.loads))
    parcels_gdf = parcels_gdf.set_crs(LAT_LONG_CRS)

    # Each payload expands to a (features, metadata) pair; split them into two sequences.
    payloads = [FeatureApi.payload_gdf(row.payload, row.aoi_id) for row in source_df.itertuples()]
    features, metadata = zip(*payloads)

    features_gdf = pd.concat(features)
    metadata_df = pd.DataFrame(metadata)

    rollup_df = parcels.parcel_rollup(parcels_gdf, features_gdf, classes_df)

    final_df = rollup_df.merge(parcels_gdf, on="aoi_id")
    final_df = final_df.merge(metadata_df, on="aoi_id")

    # Order columns: identifying columns first, geometry last, everything else in between.
    pinned_columns = set(first_columns + last_columns)
    middle_columns = [c for c in final_df.columns if c not in pinned_columns]
    final_df = final_df[first_columns + middle_columns + last_columns]

    final_df = final_df.rename(columns={"aoi_id": "parcelPtId"})

    # Drop the area/height columns that belong to the unit system not requested.
    if area_unit == "sqft":
        final_df = final_df[[c for c in final_df.columns if "_area_sqm" not in c and "height_m" not in c]]
    elif area_unit == "sqm":
        final_df = final_df[[c for c in final_df.columns if "_area_sqft" not in c and "height_ft" not in c]]

    # Write to a temp name and move into place afterwards, so an interrupted write
    # never leaves a partial file that the `is_file()` check above would accept.
    # os.replace overwrites an existing target on every platform (needed for force=True).
    final_df.to_csv(outpath_temp, index=False)
    os.replace(outpath_temp, outpath)
    return final_df

In [6]:
# with concurrent.futures.ProcessPoolExecutor(workers) as executor:    
#     jobs = []
#     for source_path in batches_path.glob("*.csv"):
#         jobs.append(executor.submit(process_batch, source_path))
#     for job in tqdm(jobs):
#         job.result()

In [None]:

def map_to_old_schema(dfr):
    """Translate a processed parcel frame into the legacy output schema.

    Derives `dominant_roof_material` and `storey_category` (with their
    confidences), adds the converted area/height columns, renames columns to
    their legacy names, maps "Y"/"N" flags to the strings "True"/"False",
    fills missing confidences with 1.0, and returns only the legacy columns
    in their legacy order.

    Args:
        dfr: Frame with the column names written by `process_batch`
            (e.g. `parcelPtId`, `roof_present`, `primary_roof_*`,
            `primary_building_*`). It is not modified.

    Returns:
        A new DataFrame containing exactly the legacy columns.

    Raises:
        KeyError: If a required input column is missing.
    """
    sqm_per_sqft = 0.092903
    ft_per_m = 3.28084

    # Work on a copy so the caller's frame does not gain the helper columns.
    dfr = dfr.copy()

    # Dominant roof materials
    dfr["dominant_roof_material"] = "unknown"
    dfr.loc[dfr["roof_present"] == "N", "dominant_roof_material"] = "not available"
    dfr["dominant_roof_material_confidence"] = 1.0

    for name, cname in [("Tile Roof", "tile"), ("Shingle Roof", "shingle"), ("Metal Roof", "metal")]:
        mask = dfr[f"primary_roof_{cname}_roof_dominant"] == "Y"
        dfr.loc[mask, "dominant_roof_material"] = name
        dfr.loc[mask, "dominant_roof_material_confidence"] = dfr.loc[mask, f"primary_roof_{cname}_roof_confidence"]

    # Unit conversions for area and height
    # NOTE(review): process_batch drops columns containing "height_m" when
    # area_unit == "sqft", so `primary_building_height_m` may be absent from the
    # processed CSVs - confirm which height column the input actually carries.
    dfr["area_under_roof_sqm"] = sqm_per_sqft * dfr["primary_building_area_sqft"]
    dfr["building_height_ft"] = ft_per_m * dfr["primary_building_height_m"]

    # Num storeys
    dfr["storey_category"] = "not available"
    dfr["storey_category_confidence"] = 1.0

    # Pick, per row, the storey column with the highest confidence. Rows with no
    # storey confidence at all keep the "not available" default; they are masked
    # out explicitly because idxmax on an all-NaN row is deprecated in pandas.
    storey_conf = dfr[[c for c in dfr.columns if "num_storeys" in c]]
    has_storeys = storey_conf.notna().any(axis=1).to_numpy()
    storey_categories = storey_conf.fillna(float("-inf")).idxmax(axis=1).where(has_storeys)

    for storeys in ["1", "2", "3+"]:
        mask = storey_categories == f"primary_building_num_storeys_{storeys}_confidence"
        dfr.loc[mask, "storey_category"] = storeys
        dfr.loc[mask, "storey_category_confidence"] = dfr.loc[mask, f"primary_building_num_storeys_{storeys}_confidence"]

    # Direct column mappings
    direct_mappings = {
         'parcelPtId': 'parcel_id',
         'geometry': 'wkt',
         'link': 'mapbrowser_url',
         'date': 'survey_date',
         'primary_roof_tree_overhang_present': 'tree_overhang_present',
         'primary_roof_tree_overhang_confidence': 'tree_overhang_confidence',
         'primary_roof_hip_present': 'hip_roof_type_present',
         'primary_roof_hip_confidence': 'hip_roof_type_confidence',
         'primary_roof_gable_present': 'gable_roof_type_present',
         'primary_roof_gable_confidence': 'gable_roof_type_confidence',
         'primary_roof_flat_present': 'flat_roof_type_present',
         'primary_roof_flat_confidence': 'flat_roof_type_confidence',
         'primary_roof_turret_present': 'turret_roof_type_present',
         'primary_roof_turret_confidence': 'turret_roof_type_confidence',
         'primary_roof_pitch': 'roof_pitch_degrees',
         'primary_building_area_sqft': 'area_under_roof_sqft',
         'primary_building_height_m': 'building_height_m'
    }

    dfr = dfr.rename(columns=direct_mappings)

    # Map bools: "Y"/"N" become the strings "True"/"False"; a missing flag counts
    # as "False" and a missing confidence as 1.0.
    for c in dfr.columns:
        if "_present" in c:
            dfr[c] = dfr[c].replace({"Y": "True", "N": "False"}).fillna("False")
        if "_confidence" in c:
            dfr[c] = dfr[c].fillna(1.0)

    # Filter columns
    final_columns = [
         'parcel_id',
         'wkt',
         'mapbrowser_url',
         'survey_date',
         'roof_present',
         'roof_confidence',
         'dominant_roof_material',
         'dominant_roof_material_confidence',
         'solar_panel_present',
         'solar_panel_confidence',
         'tree_overhang_present',
         'tree_overhang_confidence',
         'swimming_pool_present',
         'swimming_pool_confidence',
         'trampoline_present',
         'trampoline_confidence',
         'hip_roof_type_present',
         'hip_roof_type_confidence',
         'gable_roof_type_present',
         'gable_roof_type_confidence',
         'flat_roof_type_present',
         'flat_roof_type_confidence',
         'turret_roof_type_present',
         'turret_roof_type_confidence',
         'roof_pitch_degrees',
         'area_under_roof_sqm',
         'area_under_roof_sqft',
         'building_height_m',
         'building_height_ft',
         'storey_category',
         'storey_category_confidence'
    ]

    dfr = dfr[final_columns]
    return dfr



In [None]:

batched_results = list(target_path.glob("*.csv"))

# Make sure both output folders exist before anything is written to them.
for out_dir in (final_path, transformed_path):
    out_dir.mkdir(parents=True, exist_ok=True)

for i in tqdm(range(1,6)):
    # Sort the batch files so the row order of the combined CSV is the same on every run.
    batch_paths = sorted(target_path.glob(f"*flat{i}*.csv"))
    if not batch_paths:
        # pd.concat raises on an empty list; report the gap and carry on with the next flat.
        tqdm.write(f"No processed batches found for flat{i}; skipping")
        continue

    dfs = [pd.read_csv(p) for p in batch_paths]
    # Every batch frame starts at index 0; renumber so row labels are unique
    # before map_to_old_schema does its label-based assignments.
    full_df = pd.concat(dfs, ignore_index=True)
    full_df.to_csv(final_path / f"nearmap_flat{i}.csv", index=False)
    transformed_full_df = map_to_old_schema(full_df)
    transformed_full_df.to_csv(transformed_path / f"nearmap_flat{i}.csv", index=False)
