In [None]:
# Environment check: list the GPU(s) visible to this kernel.
!nvidia-smi
# NOTE(review): glxgears is an OpenGL demo that needs a display and normally keeps
# running until it is closed, so this line may stall or fail a headless
# "Restart & Run All" — confirm it is still wanted.
!glxgears

In [None]:
"""
Resnet-18 for classifying roof materials from PlanetScope SuperDove imagery
Case study in Washington, D.C. 
"""

import os, time, glob
import geopandas as gpd
import pandas as pd
import rioxarray as rxr
import xarray as xr
import numpy as np
import rasterio as rio
import matplotlib.pyplot as plt
import gc
import torch
import torch.nn as nn
import torch.optim as optim

from torch.utils.data import Dataset, DataLoader
from torch.utils.data.dataloader import default_collate
from torchvision import transforms, utils
from torchsat.models.classification import resnet18

from sklearn.model_selection import train_test_split
from sklearn.utils import resample
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay, classification_report
from sklearn.metrics import roc_auc_score, log_loss, roc_auc_score, roc_curve, auc, f1_score, precision_score, recall_score
from sklearn.model_selection import StratifiedKFold

from fiona.crs import from_epsg
from shapely.geometry import box

import warnings
warnings.filterwarnings("ignore")

plt.ion() # turn on matplotlib interactive mode

# Projection information: EPSG:4326 and EPSG:32618 (UTM Zone 18N, the CRS used
# for the centroid/window calculations later in the notebook)
wgs = from_epsg(4326)
proj = from_epsg(32618)
print(f'Projected CRS: {proj}')

# Root data directory. The commented-out line is an alternative local path; the
# active line is the one tagged "jetstream2".
# NOTE(review): `maindir` is not referenced by any cell visible in this notebook
# section (later cells use relative 'opp-data/...' paths) — confirm it is needed.
# maindir = '/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping'
maindir = '/home/jovyan/opp-data' # jetstream2

print("Successfully imported all packages!")

In [None]:
class RoofImageDataset_Planet(Dataset):
    """Torch dataset that cuts square image chips out of a single raster mosaic.

    Each item is the `img_dim` x `img_dim` pixel window centred on the centroid of
    one geometry, paired with that row's numeric class code.
    """

    # Label given to samples that could not be read when `skip_invalid` is on.
    # `make_good_batch` filters on this value, so real class codes must stay below it.
    INVALID_CODE = 255

    def __init__(self, gdf, img_path, n_bands, img_dim, transform=None, skip_invalid=False):
        """
        Args:
            gdf: Geodataframe containing a 'geometry' column and a numeric 'code' column
            img_path: the path to the PlanetScope SuperDove composite image (single mosaic file)
                - see 'psscene-prep.py' for spectral indices calculation
            n_bands (int): number of bands to read from the image (bands 1..n_bands)
            img_dim (int): side length, in pixels, of the square chip fed to the CNN
            transform (callable, optional): Optional transform to be applied on a sample
            skip_invalid (bool, optional): what to do when a chip cannot be produced
                (read error, wrong shape, failing transform, non-integer label).
                False (default) raises ValueError, as before. True prints a notice and
                returns an all-zero chip labelled `INVALID_CODE` so the batch can be
                cleaned afterwards with `make_good_batch`.

        Raises:
            ValueError: if `img_path` does not exist.
        """

        if not os.path.exists(img_path):
            raise ValueError(f'Image does not exists: {img_path}')

        self.geometries = [p.centroid for p in gdf.geometry.values] # gather centroid geoms
        self.img_path = img_path # path to image data
        self.img_dim = img_dim # chip side length in pixels
        self.n_bands = n_bands # number of bands in the input image
        self.Y = gdf.code.values # class codes (numeric)
        self.transform = transform
        self.skip_invalid = skip_invalid

    def __len__(self):
        """Number of samples (one per input geometry)."""
        return len(self.geometries)

    def __getitem__(self, idx):
        """Return {'image': FloatTensor chip, 'code': LongTensor label} for sample `idx`.

        Raises:
            ValueError: when the chip cannot be produced and `skip_invalid` is False.
        """
        if torch.is_tensor(idx):
            idx = idx.tolist()

        # Get the geometry of the idx (centroid)
        geom = self.geometries[idx]

        try:
            sample = self.sample_image(geom)  # run the sampling function

            # Explicit check instead of `assert`, which is stripped under `python -O`
            expected_shape = (self.n_bands, self.img_dim, self.img_dim)
            if sample.shape != expected_shape:
                raise ValueError(f'Invalid sample shape: {sample.shape}')

            if self.transform:
                sample = self.transform(sample)

            # Plain Python int: avoids the uint8 wrap-around the old cast had for codes > 255
            cc = int(self.Y[idx])

        except Exception as e:
            if not self.skip_invalid:
                raise ValueError(e) from e
            print(f"Skipping invalid sample at index: {idx}")
            # Keep the placeholder a numpy array so it follows the same conversion path below
            sample = np.zeros((self.n_bands, int(self.img_dim), int(self.img_dim)))
            cc = self.INVALID_CODE # flagged for removal downstream

        # Convert the sample array to a Torch object (a transform may already have done so)
        if not torch.is_tensor(sample):
            sample = torch.from_numpy(np.ascontiguousarray(sample))

        # Return the sample and the label as torch objects
        return {'image': sample.type(torch.FloatTensor),
                'code': torch.tensor(cc).type(torch.LongTensor)}


    def sample_image(self, geom):
        """Read the N x N pixel window centred on `geom` (N = img_dim) for bands 1..n_bands.

        Returns a numpy array as delivered by the raster read.
        NOTE(review): for centroids close to the raster edge the read presumably
        returns a smaller array; the shape check in `__getitem__` catches that case.
        """

        N = self.img_dim # window size to be used for cropping

        # The file is opened per call and closed again by the context manager
        with rio.open(self.img_path) as src:
            py, px = src.index(geom.x, geom.y)  # (row, col) of the centroid
            window = rio.windows.Window(px - N // 2, py - N // 2, N, N)

            # clip is a nbands * N * N numpy array
            clip = src.read(window=window, indexes=list(range(1, self.n_bands + 1)))

        # The previous "mask zeros, then fill with 0" step returned the data unchanged,
        # so the chip is handed back as read.
        return np.asarray(clip)


def make_good_batch(batch):
    """
    Drop the samples of a collated batch whose label is the invalid flag (255).
    Args:
        - batch: dictionary holding an 'image' tensor and a 'code' tensor, both
          indexed by sample along their first dimension
    returns: new dictionary with the same two keys containing only the samples
        whose 'code' differs from 255 (the input dictionary is left untouched)
    """

    # positions of the samples that are not flagged
    keep = torch.nonzero(batch['code'] != 255, as_tuple=True)[0]

    return {key: batch[key][keep] for key in ('image', 'code')}


def balance_sampling(df, ratio=5, strategy='undersample', class_col='class_code', random_state=42):
    """
    Generate a class-balanced sample from the training data.

    Every class is capped at `ratio` times the size of the rarest class; classes
    that are already smaller than the cap keep all of their rows.
    Args:
        - df: the dataframe with rows as training data
        - ratio: the sampling ratio (i.e., 5:1 relative to the rarest class by default)
        - strategy: 'undersample' draws without replacement, 'oversample' draws with
          replacement. NOTE(review): the per-class target never exceeds the class
          size, so 'oversample' does not grow small classes — it only changes how
          rows are drawn. Confirm this is the intended meaning.
        - class_col: name of the column holding the class label
        - random_state: seed passed to the sampler for reproducibility
    Returns:
        - dataframe with the sampled rows, concatenated class by class
    Raises:
        - ValueError: if `strategy` is not one of the two supported values
    """

    strategies = ('undersample', 'oversample')
    if strategy not in strategies:
        # Previously this fell through to an unbound local (or reused a stale frame)
        raise ValueError(f"Unknown strategy '{strategy}'; expected one of {strategies}")

    # Nothing to balance
    if len(df) == 0:
        return df.copy()

    # Get the class counts and the cap applied to every class
    class_counts = df[class_col].value_counts()
    cap = int(class_counts.min() * ratio)

    with_replacement = (strategy == 'oversample')

    balanced_dfs = []
    for class_label, n_rows in class_counts.items():
        class_df = df[df[class_col] == class_label]
        balanced_dfs.append(
            resample(
                class_df,
                replace=with_replacement,
                n_samples=min(cap, int(n_rows)),
                random_state=random_state,
            )
        )

    # Concatenate the results by class
    return pd.concat(balanced_dfs)


def split_training_data(gdf, ts, vs):
    """
    Per-class split of the samples into train, validation and test sets.

    For every distinct `class_code` the rows are first split into a training part
    and a hold-out part, and the hold-out part is split again.
    Args:
        - gdf: training samples (geo data frame) with a 'class_code' column
        - ts: fraction of each class placed in the hold-out (validation + test) part
        - vs: fraction of that hold-out part assigned to the TEST set; the
          remainder becomes the validation set
    Returns:
        (train, validation, test) geo data frames, in that order, each in gdf's CRS
    """

    parts = {'train': [], 'val': [], 'test': []}

    for label in gdf.class_code.unique():
        # rows belonging to this class only
        class_rows = gdf.loc[gdf.class_code == label]

        # first cut: training rows vs. hold-out rows
        kept, holdout = train_test_split(class_rows, random_state=27, test_size=ts)
        # second cut: hold-out rows into validation and test
        val_rows, test_rows = train_test_split(holdout, random_state=27, test_size=vs)

        parts['train'].append(kept)
        parts['val'].append(val_rows)
        parts['test'].append(test_rows)

    def _stack(frames):
        # concatenate the per-class pieces and restore the GeoDataFrame type / CRS
        return gpd.GeoDataFrame(pd.concat(frames), crs=gdf.crs)

    all_train_df = _stack(parts['train'])
    all_val_df = _stack(parts['val'])
    all_test_df = _stack(parts['test'])

    return all_train_df, all_val_df, all_test_df


print("Class and functions ready to use!")

In [None]:
# Make the home directory the working directory: the relative 'opp-data/...'
# paths used by the following cells resolve against it.
os.chdir('/home/jovyan')
print(os.getcwd())
print(os.listdir(os.getcwd()))

In [None]:
# Load the training data (reference building footprints).
# The path is relative, so it depends on the current working directory.
ref_path = 'opp-data/dc_data_reference_footprints.gpkg'
ref = gpd.read_file(ref_path)
ref.head()

In [None]:
# Observe the class imbalance in the reference data:
# number of footprints per value of the 'description' column
print(f"Class counts:\n\n{ref.description.value_counts()}\n")

In [None]:
# Add an integer 'code' column derived from 'class_code' via pd.factorize;
# the second return value (the unique labels) is discarded.
ref['code'], _ = pd.factorize(ref['class_code']) # create a factorized version
print(ref['class_code'].value_counts())  # check the counts per original class label

In [None]:
# Create lookup dictionaries keyed by class_code:
#   code_mapping: class_code -> integer code from the previous cell
#   desc_mapping: class_code -> description
# (built row by row, so for a repeated class_code the last row's value is kept)
code_mapping = dict(zip(ref['class_code'], ref['code']))
desc_mapping = dict(zip(ref['class_code'], ref['description']))
print(f'Code map: {code_mapping}\nDescription map: {desc_mapping}')

In [None]:
# Footprint area statistics, converted with the factor 0.092903
# (presumably square feet -> square metres, given the 'areaUTMsqft' column name).
# The mean is truncated to an integer before the conversion.
mean_area_sqm = int(ref.areaUTMsqft.values.mean()) * 0.092903
pct95_area_sqm = np.percentile(ref.areaUTMsqft, 95) * 0.092903
print(f'Mean footprint area (sqm): {mean_area_sqm}')
print(f'95th percentile footprint area (sqm): {pct95_area_sqm}')
# Side length of a square with the 95th-percentile area (not the mean area),
# truncated to an integer
pct95_side_length = int(np.sqrt(pct95_area_sqm))
print(f'95th percentile side length (m): {pct95_side_length}')

In [None]:
# Identify 'pure' training locations

In [None]:
# Show the CRS of the reference footprints (the next cell works in EPSG:32618)
print(ref.crs)

In [None]:
# Create footprint centroids in a projected CRS so that the window size below
# is expressed in the units of that CRS
ref_pt = ref.copy()
ref_pt = ref_pt.to_crs(epsg=32618) # UTM Zone 18N
ref_pt['geometry'] = ref_pt['geometry'].centroid

# Side length of the square neighbourhood drawn around every centroid, in CRS units
window_size = 48 # author's note: 4 times the average side length
half_window = window_size / 2

training_windows = [] # centroids whose neighbourhood is dominated (>50%) by their own roof type
training_roof_types = [] # roof type codes for the retained centroids

# Class label of every centroid, in the same order as ref_pt
class_codes = ref_pt['class_code'].values
# Spatial index over the centroids: avoids testing every centroid against every window
centroid_index = ref_pt.sindex

# Loop through each footprint individually. The windows are built from the
# reprojected centroids so that windows and centroids share one CRS; the
# previous version mixed ref.crs windows with EPSG:32618 centroids.
for centroid, roof_type in zip(ref_pt.geometry, class_codes):

    # footprints without a usable geometry cannot be placed
    if centroid is None or centroid.is_empty:
        continue

    # square window of side `window_size` centred on the centroid
    window = box(centroid.x - half_window, centroid.y - half_window,
                 centroid.x + half_window, centroid.y + half_window)

    # Positions of all centroids that fall inside (or on the edge of) the window
    hits = centroid_index.query(window, predicate='intersects')

    # Get the total count and count for the class
    total_count = len(hits)
    class_count = int((class_codes[hits] == roof_type).sum())

    # Keep the location only if strictly more than 50% of the centroids in the
    # window share its roof type
    if total_count > 0 and (class_count / total_count) > 0.50:
        training_windows.append(centroid)
        training_roof_types.append(roof_type)

# Create a GeoDataFrame for the training windows with roof types
# (CRS taken from the centroids the geometries actually come from)
ref_windows = gpd.GeoDataFrame({
    'geometry': training_windows, 
    'class_code': training_roof_types
}, crs=ref_pt.crs)

# Create a numeric code for the training data frame
ref_windows['code'], _ = pd.factorize(ref_windows['class_code'])
print("Spatial filtering complete.")

In [None]:
# Save the filtered training locations to disk (path relative to the working directory)
os.getcwd()  # return value is not used or displayed here
out_file = 'opp-data/dc_rooftop_materials_training_windows.gpkg'
ref_windows.to_file(out_file)

# Drop the intermediates of the spatial filtering step and reclaim memory
del training_windows, training_roof_types, ref_pt
gc.collect()

In [None]:
# Map of the retained training locations, coloured by roof material class
fig, ax = plt.subplots(figsize=(6, 6))
ref_windows.plot(
    ax=ax,
    column='class_code',
    cmap='Set1',
    edgecolor='black',
    legend=True,
)
ax.set_title('Training Locations by Roof Material Type')
plt.show()

In [None]:
# Class counts after the spatial filtering
print(ref_windows.class_code.value_counts())
# Create a dictionary mapping class_code to the integer code of the filtered
# training frame (factorized separately from the `ref` frame)
class_mapping = dict(zip(ref_windows['class_code'], ref_windows['code']))
print(class_mapping)

In [None]:
# Perform balanced sampling (random undersampling).
# ratio=50: every class is capped at 50x the size of the rarest class;
# classes smaller than that cap keep all of their rows.
ref_bal = balance_sampling(ref_windows, ratio=50, strategy='undersample')
ref_bal.code.value_counts()

In [None]:
# Split the data into train / validation / test sets, class by class.
# ts=0.4 holds out 40% of each class; vs=0.2 sends 20% of that hold-out to the
# test set and the remainder to the validation set (see split_training_data).
train_df, val_df, test_df = split_training_data(ref_bal, ts=0.4, vs=0.2)

# Print the class distribution in the training, validation and test sets to verify stratification
print("Train class distribution:\n", train_df['code'].value_counts())
print("Validation class distribution:\n", val_df['code'].value_counts())
print("Test class distribution:\n", test_df['code'].value_counts())

In [None]:
# Load our image data to check on the format
stack_da_fp = 'opp-data/dc_0623_psscene8b_final_norm.tif'
# `masked=True` is the rioxarray keyword that masks nodata cells; the previous
# `mask=True` is not an open_rasterio parameter, so no masking was applied.
stack_da = rxr.open_rasterio(stack_da_fp, masked=True, cache=False).squeeze()
# Size of the leading dimension, taken from the array metadata instead of
# `.values`, which would read the whole mosaic into memory just to count bands
n_bands = stack_da.shape[0]
print(
    f"shape: {stack_da.rio.shape}\n"
    f"bands: {n_bands}\n"
    f"resolution: {stack_da.rio.resolution()}\n"
    f"bounds: {stack_da.rio.bounds()}\n"
    f"sum: {stack_da.sum().item()}\n"
    f"CRS: {stack_da.rio.crs}\n"
    f"NoData: {stack_da.rio.nodata}\n"
    f"Array: {stack_da}"
)
# Only the path and band count are needed later; release the array handle
del stack_da

In [None]:
# Set up the Resnet-18 model

# Use the GPU when CUDA is available, otherwise stay on the CPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f'Using {device} for model dev ...')

# One output class per distinct roof type in the balanced sample
n_classes = len(ref_bal.class_code.unique())
print(f'There are {n_classes} roof type classes.')

# Untrained Resnet-18 (in_channels = number of bands in the image),
# wrapped in DataParallel in both the GPU and the CPU case
model = nn.DataParallel(resnet18(n_classes, in_channels=n_bands, pretrained=False))

# Move to the GPU(s) when there is at least one
if torch.cuda.device_count() < 1:
    print('Made cpu parallel')
else:
    print("Using ", torch.cuda.device_count(), "GPUs!")
    model.to(device)

In [None]:
# Number of training samples in each class, ordered by class code 0 .. n_classes-1.
# CrossEntropyLoss pairs weight[i] with class index i, whereas a bare
# value_counts() is sorted by frequency, so the counts are re-indexed by code.
val_counts = (
    train_df['code']
    .value_counts()
    .reindex(range(n_classes), fill_value=0)
    .tolist()
)
print(val_counts)

total_samples = sum(val_counts)

# Inverse-frequency class weights; a class with no training samples gets weight 0
class_weights = [total_samples / count if count > 0 else 0.0 for count in val_counts]
class_weights = torch.tensor(class_weights, dtype=torch.float).to(device)

# Print the calculated class weights for verification
print(f"Class weights: {class_weights}")

# Loss function
criterion = nn.CrossEntropyLoss(weight=class_weights).to(device)

In [None]:
# Reclaim memory before the tuning cells
gc.collect()

In [None]:
# Inspect the row index of the validation split (the label previously said "Train")
print(f"Validation DataFrame indices: {val_df.index}")

In [None]:
# NOTE(review): these imports sit mid-notebook rather than in the import cell, and
# `make_scorer` / `itertools` are not used by any cell visible here — confirm and tidy.
from sklearn.model_selection import ParameterGrid
from sklearn.metrics import make_scorer
import itertools

# Define the grid of hyperparameters (3 x 3 x 3 = 27 combinations)
param_grid = {
    'window_size': [36, 72, 144],      # passed to the dataset as img_dim by resnet_tuning
    'batch_size': [64, 128, 224],      # DataLoader batch size
    'learning_rate': [0.01, 0.001, 0.0001]  # initial SGD learning rate
}

# Expand the grid into a list of parameter dictionaries, one per combination
param_list = list(ParameterGrid(param_grid))
print(f'There are {len(param_list)} parameter combinations to test!')

In [None]:
def resnet_tuning(params, n_epochs=12):
    '''
    Train and validate a fresh Resnet-18 for one hyperparameter combination.

    Reads these notebook globals: train_df, val_df, stack_da_fp, n_bands,
    n_classes, criterion, device.

    Args:
        params: dict with the keys 'batch_size', 'window_size' (chip size passed
            to the dataset as img_dim) and 'learning_rate' (initial SGD rate)
        n_epochs: number of training epochs (default 12, the previous fixed value)

    Returns:
        dict of per-epoch lists: 'accuracy', 'precision', 'recall', 'f1'
        (validation set, weighted averages), 'train_losses' and 'val_losses'
        (mean batch loss of each epoch)

    Raises:
        ValueError: if n_epochs is smaller than 1

    Notes:
        - A new, randomly initialised network is built for every call. Reusing
          the global `model` made each trial continue from the previous
          trial's weights, so the trials were not comparable. The global
          `model` is no longer modified by this function.
        - Samples flagged invalid by the dataset are removed with
          make_good_batch and do not count towards any metric.
    '''

    if n_epochs < 1:
        raise ValueError(f'n_epochs must be at least 1, got {n_epochs}')

    batch_size = params['batch_size']
    window_size = params['window_size']
    learning_rate = params['learning_rate']

    # Create the training samples
    train_ds = RoofImageDataset_Planet(train_df[['geometry', 'code']], stack_da_fp, n_bands=n_bands, img_dim=window_size)
    train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True, num_workers=0, pin_memory=True)

    # Create the validation samples (order is irrelevant for evaluation, so no shuffling)
    val_ds = RoofImageDataset_Planet(val_df[['geometry', 'code']], stack_da_fp, n_bands=n_bands, img_dim=window_size)
    val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=False, num_workers=0, pin_memory=True)

    # Independent model for this trial, built the same way as the notebook's global model
    trial_model = nn.DataParallel(resnet18(n_classes, in_channels=n_bands, pretrained=False)).to(device)

    # Optimizer and learning-rate schedule. LR changes are reported by the explicit
    # print below, so the deprecated `verbose` flag is not used.
    optimizer = optim.SGD(trial_model.parameters(), lr=learning_rate, momentum=0.9, weight_decay=1e-5)
    lr_scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', factor=0.1, patience=3)

    # Per-epoch histories
    epoch_losses = []
    val_losses = []
    epoch_accuracy = []
    epoch_precision = []
    epoch_recall = []
    epoch_f1 = []
    epoch_time = []

    # Training loop
    for epoch in range(n_epochs):
        epoch_t0 = time.time()
        current_lr = optimizer.param_groups[0]['lr']

        # ---- Model training ----
        trial_model.train()
        batch_losses = []  # reset every epoch so the mean really is the epoch mean
        for batch in train_loader:
            # Ensure a good batch
            batch = make_good_batch(batch)
            if batch['code'].numel() == 0:
                continue  # every sample was flagged invalid

            # Extract samples
            image, target = batch['image'].to(device), batch['code'].to(device)

            optimizer.zero_grad()
            output = trial_model(image.float())
            loss = criterion(output, target.long())
            loss.backward()
            optimizer.step()

            batch_losses.append(loss.item())

            del image, target, batch, output, loss

        # average loss for the epoch
        epoch_losses.append(float(np.mean(batch_losses)) if batch_losses else float('nan'))

        # ---- Validation loop ----
        trial_model.eval()

        val_loss_sum = 0.0
        n_val_batches = 0
        correct = 0
        all_labels = []
        all_predictions = []

        with torch.no_grad():
            for batch in val_loader:
                # Ensure a good batch
                batch = make_good_batch(batch)
                if batch['code'].numel() == 0:
                    continue

                # Extract samples
                image, target = batch['image'].to(device), batch['code'].to(device)
                output = trial_model(image.float())

                # Get validation loss and predictions
                val_loss_sum += criterion(output, target).item()
                n_val_batches += 1
                predicted = output.argmax(dim=1)  # index of the largest logit
                correct += predicted.eq(target).sum().item()  # number of correct

                # Store the labels as flat lists of ints
                all_labels.extend(target.cpu().numpy().tolist())
                all_predictions.extend(predicted.cpu().numpy().tolist())

                del image, target, batch, output, predicted

        # Mean batch loss over the batches actually evaluated
        val_loss = val_loss_sum / n_val_batches if n_val_batches else float('nan')
        val_losses.append(val_loss)

        # Adjust the learning rate based on the validation loss
        lr_scheduler.step(val_loss)

        # Print the average training/validation loss for the current epoch
        print(f"Epoch [{epoch+1}], Average Train Loss: {epoch_losses[-1]:.4f}, Average Validation Loss: {val_loss:.4f}")

        # Report a learning-rate change made by the scheduler
        new_lr = optimizer.param_groups[0]['lr']
        if new_lr != current_lr:
            print(f"-- LR-Scheduler: ReduceLROnPlateau; new LR={new_lr} at epoch {epoch+1}")

        # Validation metrics, computed on the evaluated samples only
        if all_labels:
            epoch_accuracy.append(correct / len(all_labels))
            epoch_precision.append(precision_score(all_labels, all_predictions, average='weighted'))
            epoch_recall.append(recall_score(all_labels, all_predictions, average='weighted'))
            epoch_f1.append(f1_score(all_labels, all_predictions, average='weighted'))
        else:
            epoch_accuracy.append(float('nan'))
            epoch_precision.append(float('nan'))
            epoch_recall.append(float('nan'))
            epoch_f1.append(float('nan'))

        epoch_t1 = (time.time() - epoch_t0) / 60 # minutes
        epoch_time.append(round(epoch_t1, 2))

        # One collection per epoch instead of one per batch
        gc.collect()

    # Get the model averages
    avg_accuracy = sum(epoch_accuracy) / len(epoch_accuracy)
    avg_f1 = sum(epoch_f1) / len(epoch_f1)
    avg_precision = sum(epoch_precision) / len(epoch_precision)
    avg_recall = sum(epoch_recall) / len(epoch_recall)
    avg_time = sum(epoch_time) / len(epoch_time)

    print(f"Accuracy: {avg_accuracy:.4f}, Precision: {avg_precision:.2f}, Recall: {avg_recall:.2f}, F1 Score: {avg_f1:.4f}\nTime/epoch: {avg_time}")

    # Release the trial's network before the next combination is run
    del trial_model, optimizer, lr_scheduler
    gc.collect()

    # Return a combination of metrics to be used in tuning
    return {
        'accuracy': epoch_accuracy,
        'precision': epoch_precision,
        'recall': epoch_recall,
        'f1': epoch_f1,
        'train_losses': epoch_losses,
        'val_losses': val_losses
    }

print("Tuning function ready !")

In [None]:
# Implementation

t0 = time.time()

# Where the results go; create the folder up front so hours of tuning are not
# lost to a missing directory at save time
results_path = 'opp-data/results/resnet18_grid_search_results.csv'
os.makedirs(os.path.dirname(results_path), exist_ok=True)

# One row per (trial, epoch). Rows are collected in a list and turned into a
# frame in one go: DataFrame.append was removed in pandas 2.0 and copied the
# whole frame on every call.
result_columns = [
    'trial', 'batch_size', 'window_size', 'learning_rate', 'epoch',
    'accuracy', 'precision', 'recall', 'f1',
    'train_loss', 'val_loss', 'time'
]
result_rows = []
results_df = pd.DataFrame(columns=result_columns)

# Perform the grid search
for i, params in enumerate(param_list):
    t00 = time.time()

    print(f"Testing parameters: {params}")

    # Run the trial, store the metrics
    metrics = resnet_tuning(params)

    t1 = (time.time() - t00) / 60 # minutes
    print(f"Elapsed time for parameter combination {i}: {t1:.2f} minutes.")

    # Store epoch-wise metrics for each trial
    for epoch in range(len(metrics['accuracy'])):
        result_rows.append({
            'trial': i,
            'batch_size': params['batch_size'],
            'window_size': params['window_size'],
            'learning_rate': params['learning_rate'],
            'epoch': epoch + 1,
            'accuracy': metrics['accuracy'][epoch],
            'precision': metrics['precision'][epoch],
            'recall': metrics['recall'][epoch],
            'f1': metrics['f1'][epoch],
            'train_loss': metrics['train_losses'][epoch],
            'val_loss': metrics['val_losses'][epoch],
            'time': round(t1, 2),  # minutes for the whole trial, repeated on each epoch row
        })

    print(f"Final Accuracy: {metrics['accuracy'][-1]}\nFinal F1-score: {metrics['f1'][-1]}")

    # Checkpoint after every trial so a later failure keeps the finished trials
    results_df = pd.DataFrame(result_rows, columns=result_columns)
    results_df.to_csv(results_path, index=False)

    # Clear unused variables
    del metrics
    gc.collect()  # Run garbage collection
    torch.cuda.empty_cache()  # Clear GPU memory

t2 = ((time.time() - t0) / 60) / 60
print(f"Total elapsed time for hyperparameter tuning: {t2:.2f} hours.")

# save the results for further analysis
results_df.to_csv(results_path, index=False)

In [None]:
# Optionally, save the results for further analysis.
# This writes the same file as the last line of the grid-search cell, so it is
# only needed when `results_df` was changed after that cell ran.
results_df.to_csv('opp-data/results/resnet18_grid_search_results.csv', index=False)

In [None]:
# Reclaim memory after the grid search
gc.collect()

In [None]:
# Retrieve the best parameters