# **Training Data Processing**

1. Project Setup
2. Prepare UCP Data
3. Rasterize UCP Data
4. Train and Test Samples of LCZ Training Areas

# **1. Project Setup**

### 1.1. Import Libraries

In [1]:
%load_ext autoreload
%autoreload 

import sys
import os

# Add the module's parent directory to sys.path
module_path = os.path.abspath(os.path.join('..'))
if module_path not in sys.path:
    sys.path.append(module_path)

from lcz_classification.config import * 
from lcz_classification.dataset import fetch_metadata
from lcz_classification.util import merge_rasters, resample_da, read_lcz_legend, rasterize_vector, generate_raster, get_overlapping_tiles, meter_to_deg, kml_to_gdf
from lcz_classification.plots import map_training_areas
import fiona
import rioxarray as rio
import pandas as pd
import rasterio as r
from shapely.geometry import box
import geopandas as gpd
import xarray as xr
from rasterio.enums import Resampling
import numpy as np
from sklearn.model_selection import train_test_split


- Parent directory for Toronto already exists, retrievine folder paths...
- Folder paths for Toronto retrieved.


In [2]:
# Study-area geometry, as provided by the project metadata helper
study_area = fetch_metadata("STUDY_AREA")

# Bounding box as a plain list; unpacked later as x1, y1, x2, y2
bounds = list(study_area.total_bounds)

# UTM projection estimated from the study-area geometry
UTM_CRS = study_area.estimate_utm_crs()

# LCZ legend, as provided by the project metadata helper
lcz_legend = fetch_metadata('LCZ_LEGEND')


### 1.3. Setup Reference Raster

In [3]:

# Open the Sentinel-2 image; its first band serves as the reference grid
# that the layers below are matched to with `reproject_match`.
s2 = rio.open_rasterio(S2_FP)
s2_ref = s2.isel({"band": 0})

# **2. Urban Canopy Layer Processing**

## 2.1. Sky View Factor

#### 2.1.1. Merge DSM Layers

In [6]:
# Collect every entry of the raw DSM folder, skipping ".tif.aux.xml" sidecar files
dsm_files = []
for fname in os.listdir(DSM_RAW):
    if ".tif.aux.xml" in fname:
        continue
    dsm_files.append(f"{DSM_RAW}/{fname}")

# Merge the DSM tiles into a single raster written to `out_path`
out_path = f"{SF_PRC}/alos_dsm_{CELL_RESOLUTION}m.tif"
merge_rasters(raster_paths=dsm_files, out_path=out_path, transform=None)

#### 2.1.2. Compute Sky View Factor

Run `../lcz_classification/svf_qgis.py` in the QGIS v3.28 Python Console to compute the Sky View Factor.

**TODO:** add a screenshot of this step.

#### 2.1.3. Reproject Match to S2 Data [DELETE?]

In [5]:
# Resample the Sky View Factor raster onto the Sentinel-2 reference grid.
# The opened source gets its own name so that its file handle can be closed
# explicitly; rebinding `svf` to the reprojected array would leave `close()`
# acting on the in-memory result only.
svf_src = rio.open_rasterio(SVF_FP)
svf = svf_src.rio.reproject_match(s2_ref, resampling=Resampling.average)
attrs = svf.attrs  # NOTE(review): `attrs` is not used elsewhere in the visible notebook

svf.rio.to_raster(SF_FP)

svf_src.close()
svf.close()

## 2.2. Impervious Surface Fraction

In [6]:
# Impervious Surface Fraction
# Step 1 - build the merged impervious-surface mosaic if it is not on disk yet.
# Step 2 - always resample the mosaic onto the Sentinel-2 grid and write IS_FP,
#          so a single run of this cell produces the final layer (previously a
#          fresh run only merged the tiles and the cell had to be run twice).
if not os.path.exists(IS_MERGED_FP):
    tiles = get_overlapping_tiles(bounds)
    x1, y1, x2, y2 = bounds

    # Keep only the tiles that actually contain pixels inside the study-area bounds
    isf_files = []
    for tile in tiles:
        tile_fp = f"{IS_RAW}/GISA-10m_{tile}.tif"
        tile_da = rio.open_rasterio(tile_fp)
        try:
            # y is sliced from y2 down to y1, i.e. assumes a descending y axis
            window = tile_da.sel(x=slice(x1, x2), y=slice(y2, y1))
            if window.x.size > 0 and window.y.size > 0:
                isf_files.append(tile_fp)
        finally:
            tile_da.close()  # release the tile's file handle once inspected

    # Merge Impervious Area Rasters
    merge_rasters(
        raster_paths=isf_files,
        out_path=IS_MERGED_FP
    )

isf = rio.open_rasterio(IS_MERGED_FP)
# Cast to float64 before averaging - presumably so the resampled values can be fractional
isf.values = np.float64(isf.values)

# Reproject Match to S2 Data
isf_matched = isf.rio.reproject_match(s2_ref, resampling=Resampling.average)
isf_matched.rio.to_raster(IS_FP)

isf.close()
isf_matched.close()




## 2.3. Tree Canopy Height

In [7]:
# Tree canopy height: merge the raw tiles once, then resample to the Sentinel-2 grid.
CH_MERGED = f"{CH_PRC}/canopy_height_merged.tif"

if not os.path.exists(CH_MERGED):
    # Only real GeoTIFFs: a plain `".tif" in name` test would also pick up
    # ".tif.aux.xml" sidecar files, which are not rasters.
    ch_files = [
        f"{CH_RAW}/{tif}"
        for tif in os.listdir(CH_RAW)
        if tif.lower().endswith((".tif", ".tiff"))
    ]

    # Merge the canopy-height tiles into a single raster
    merge_rasters(
        raster_paths=ch_files,
        out_path=CH_MERGED,
        transform=None
    )

# Keep the opened source under its own name so its file handle can be closed
ch_src = rio.open_rasterio(CH_MERGED)

# Values of 255 and above are replaced by 0
# (presumably 255 is a no-data / sentinel code - TODO confirm)
ch = ch_src.where(ch_src < 255).fillna(0)
ch_resampled = ch.rio.reproject_match(s2_ref, resampling=Resampling.average)
ch_resampled.rio.to_raster(CH_FP)

ch_src.close()
ch.close()
ch_resampled.close()

## 2.4. Building Height

#### 2.4.1. Combine Study Area Building Layers into a Single GeoParquet File

In [8]:
# Combine the building file geodatabases into one GeoParquet file, clipped to
# the study-area bounding box. Skipped when the GeoParquet file already exists.
if not os.path.exists(BH_VECTOR_FP):
    # One .gdb path per extracted (non-zip) entry in the raw buildings folder
    BH_FILES = [
        f"{BH_RAW}/{gdb}/{gdb.replace('Autobuilding_ON_','').replace('_fgdb','.gdb')}"
        for gdb in os.listdir(BH_RAW)
        if '.zip' not in gdb
    ]

    ## Get CRS of Buildings Layer and Prepare bounding box with same projection
    with fiona.open(BH_FILES[0]) as src:
        bh_crs = src.crs
    bh_bbox = box(*study_area.to_crs(bh_crs).total_bounds)

    # Read only the features intersecting the bounding box, in a common CRS
    bh_gdf_list = [gpd.read_file(file, bbox=bh_bbox).to_crs(bh_crs) for file in BH_FILES]
    bh_gdf = pd.concat(bh_gdf_list).reset_index(drop=True)
    bh_gdf.to_crs(CRS).to_parquet(BH_VECTOR_FP)
else:
    # Only report "already found" when the file really was there before this run
    print("Building Heights Vector File Already Found")

Build Heights Raster File Already Found


#### 2.4.2. Rasterize Building Height Data

In [9]:

# Make sure the fine-resolution building-height raster exists on disk,
# rasterizing the GeoParquet building layer when it does not.
if not os.path.exists(BH_RASTER_FP):
    bh_gdf = gpd.read_parquet(BH_VECTOR_FP)

    print("-Generating 5m raster")
    ref = generate_raster(study_area.total_bounds, crs=CRS, resolution=5)
    print("- 5m Raster Generated")

    # Rasterize the `heightmax` attribute of the buildings onto the 5 m grid
    rasterize_vector(
        gdf=bh_gdf,
        ref=ref,
        attribute="heightmax",
        crs=CRS,
        out_path=BH_RASTER_FP,
    )
    print("- Building Heights Rasterized")
else:
    print("-READING")

bh = rio.open_rasterio(BH_RASTER_FP)

# Average the building heights onto the Sentinel-2 reference grid
bh_resampled = bh.rio.reproject_match(s2_ref, resampling=Resampling.average)
bh.close()

# Export the resampled raster
bh_resampled.rio.to_raster(BH_FP)
bh_resampled.close()

-READING


## 2.5. Building Surface Fraction

In [10]:
# Building Surface Fraction: turn the building-height raster into a 1/0
# building mask, then average that mask onto the Sentinel-2 reference grid.
if not os.path.exists(BH_RASTER_FP):
    print(f"No rasterized Building Height Data Found, rasterize {BH_VECTOR_FP} first")
else:
    bh = rio.open_rasterio(BH_RASTER_FP)

    arr = bh.values
    # True where a positive building height is present (NaN compares as False)
    mask = (bh > 0).values

    # Overwrite in place so the array keeps its original dtype
    arr[...] = np.where(mask, 1.0, 0.0)
    bh.values = arr

    bsf = bh.rio.reproject_match(s2_ref, resampling=Resampling.average)
    bsf.rio.to_raster(BS_FP)

    bh.close()
    bsf.close()

# **3. Train and Test Samples of Local Climate Zone (LCZ) Training Areas**

### 3.1. Split LCZ Vector Data to Train / Test

In [None]:
# NOTE(review): disabled one-off step. It appears to add `class`, `name` and `area_m2`
# columns from the LCZ legend, rename `Name`/`Description`, and save the result as a
# "training_areas_updated" copy of LCZ_FP - confirm whether it is still needed.
# lcz_gdf=gpd.read_file(LCZ_FP)
# lcz_gdf.dropna(subset='geometry',inplace=True)
# lcz_gdf['class'] = [lcz_legend.set_index('class_id').loc[cl]['class'] for cl in lcz_gdf.Name]
# # lcz_gdf['class_id'] = [lcz_legend.set_index('class_id').loc[cl]['class_id'] for cl in lcz_gdf.Name]
# lcz_gdf['name'] = [lcz_legend.set_index('class_id').loc[cl]['name'] for cl in lcz_gdf.Name]
# lcz_gdf['area_m2']=lcz_gdf.to_crs(UTM_CRS).area

# lcz_gdf.rename(columns=dict(Name ='class_id', Description='description')).to_file(LCZ_FP.replace("training_areas",'training_areas_updated'))

In [4]:
# Load the LCZ training-area polygons.
lcz_gdf=gpd.read_file(LCZ_FP)

# Train/test split (currently disabled): the call below stratifies on `class_id`.
# NOTE(review): no `random_state` is passed, so re-enabling it gives a different split on every run.
# lcz_train,lcz_test=train_test_split(lcz_gdf,test_size=0.3,stratify=lcz_gdf['class_id'])


# map_training_areas(lcz_train,lcz_test,boundary=study_area)


### 3.2. Rasterize and Export 

In [5]:


## Train Samples Rasterization
# Burn the `class` attribute of the LCZ polygons onto the Sentinel-2 reference grid.
# NOTE: the train/test split is disabled, so every polygon in `lcz_gdf` goes to TRAIN_FP.
train_raster_args = dict(
    gdf=lcz_gdf,
    ref=s2_ref,
    attribute="class",
    crs=s2.rio.crs,
    out_path=TRAIN_FP,
)
rasterize_vector(**train_raster_args)

# ## Test Samples Rasterization (disabled together with the split)
# rasterize_vector(gdf=lcz_test,
#                  ref=s2_ref,
#                  attribute="class",
#                  crs=s2.rio.crs,
#                  out_path=TEST_FP,
#                  )

