In [None]:
"""
XGBoost multiclassifier for roof materials in D.C.
"""

import os, sys, glob, time
import geopandas as gpd
import pandas as pd
import numpy as np
import rioxarray as rxr
import rasterio as rio

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, accuracy_score

import xgboost as xgb

# Project root directory. Set the OPP_ROOFTOP_MAINDIR environment variable to run the
# notebook on another machine; the original local path remains the default.
maindir = os.environ.get(
    'OPP_ROOFTOP_MAINDIR',
    '/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping'
)

print("Imports successful!")

In [None]:
# Functions!

def compute_band_stats(geom, band, stat, nodataval, img_path=None):
    """
    Computes one statistic of a single raster band for every input geometry.

    Polygon inputs are summarised with ``rasterstats.zonal_stats``; any other
    geometry type is treated as points and the band is sampled at each coordinate.

    Args:
        geom: GeoDataFrame of geometries from which to sample image data
              (either all polygons or all points)
        band: 1-based index of the band to calculate statistics for
        stat: which statistic to compute for polygons (e.g. 'mean'); unused for points
        nodataval: the No Data value to be used (may be None)
        img_path: path to the raster file. Optional in the signature only so the
                  original four positional parameters keep their order; it is
                  required in practice.

    Returns:
        dict mapping ``band`` to a list with one value per geometry, in row order.

    Raises:
        ValueError: if ``img_path`` is not provided.
    """
    if img_path is None:
        raise ValueError("img_path is required to sample the raster.")

    # Reduce the per-row check to a single boolean; a bare Series has no truth value
    is_polygon = geom.geometry.geom_type.isin(['Polygon', 'MultiPolygon'])

    if is_polygon.all():
        # Imported here because the notebook's import cell does not provide it
        from rasterstats import zonal_stats

        stats = zonal_stats(
            geom,
            img_path,
            stats=[stat],
            band=band,
            all_touched=True,
            nodata=nodataval,
            geojson_out=False
        )

        # With geojson_out=False each entry is a plain {stat: value} dict
        return {band: [zone[stat] for zone in stats]}

    # Point geometries: read the band value under each coordinate
    coord_list = list(zip(geom.geometry.x, geom.geometry.y))

    with rio.open(img_path) as src:
        values = []
        for sample in src.sample(coord_list, indexes=band):
            value = float(sample[0])
            # Report No Data cells as NaN so they are not mistaken for real values
            if nodataval is not None and value == nodataval:
                value = np.nan
            values.append(value)

    return {band: values}


def sample_image_da(img_path, geom, stat='mean'):
    """
    Samples every band of a raster at each geometry.

    Args:
        img_path: path to the raster file to sample
        geom: GeoDataFrame of polygons or points at which to sample
        stat: statistic to compute per polygon (default 'mean')

    Returns:
        A copy of ``geom`` with one added column per band. Columns are named after
        the band descriptions stored in the file, or 'band_<n>' where a band has
        no description.
    """
    from concurrent.futures import ThreadPoolExecutor

    # Read the band metadata once; each worker re-opens the file by path
    with rio.open(img_path) as src:
        n_bands = src.count
        nodataval = src.nodata
        band_names = [
            desc if desc else f'band_{idx}'
            for idx, desc in enumerate(src.descriptions, start=1)
        ]

    # Create a copy of the geometries to store the results
    stats_df = geom.copy()

    # Leave one core free, never use more workers than bands, and always at least 1
    # (os.cpu_count() can return None)
    num_cores = os.cpu_count() or 1
    max_workers = max(1, min(n_bands, num_cores - 1))

    # Threads rather than processes: functions defined in a notebook cell cannot be
    # pickled for worker processes under the 'spawn' start method.
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        futures = []
        for band in range(1, n_bands + 1):
            print(f"Processing for {band}.")
            futures.append(
                executor.submit(compute_band_stats, geom, band, stat, nodataval, img_path)
            )

        # Futures were submitted in band order, so they pair up with band_names
        for band_name, future in zip(band_names, futures):
            (values,) = future.result().values()
            stats_df[band_name] = values

    return stats_df

    
def print_raster(raster, open_file):
    """
    Prints a short summary of a raster.

    :param raster: path to a raster file (when open_file is True) or an already
        opened rioxarray object (otherwise)
    :param open_file: should the file be opened or not
    :return: None; prints shape, resolution, bounds, sum, CRS, NoData value and
        the array representation, one item per line
    """
    opened = None
    if open_file is True:
        # Keep a handle on the opened object so it can be closed afterwards
        opened = rxr.open_rasterio(raster, masked=True, cache=False)
        img = opened.squeeze()
    else:
        img = raster

    try:
        print(
            f"shape: {img.rio.shape}\n"
            f"resolution: {img.rio.resolution()}\n"
            f"bounds: {img.rio.bounds()}\n"
            f"sum: {img.sum().item()}\n"
            f"CRS: {img.rio.crs}\n"
            f"NoData: {img.rio.nodata}\n"
            f"Array: {img}"
        )
    finally:
        # Only close what this function opened; a caller-supplied raster is left alone
        if opened is not None:
            opened.close()


# Apply a minimum distance sample to training data
def min_dist_sample(gdf, min_distance):
    """
    Filters the GeoDataFrame to ensure samples are at least min_distance apart.

    Rows are visited in order; each row that is still kept removes every other
    row whose centroid lies within ``min_distance`` of its own centroid.

    Args:
        gdf: GeoDataFrame containing 'geometry' column.
        min_distance: Minimum distance between samples in the same units as the geometry.

    Returns:
        Filtered GeoDataFrame with the surviving rows in their original order.
    """
    # Imported locally because the notebook's import cell provides no KD-tree
    from scipy.spatial import cKDTree

    # Nothing to filter, and a KD-tree cannot be built from zero points
    if len(gdf) == 0:
        return gdf.iloc[[]]

    coords = np.array([[geom.centroid.x, geom.centroid.y] for geom in gdf.geometry])
    tree = cKDTree(coords)
    dropped = np.zeros(len(coords), dtype=bool)

    for i in range(len(coords)):
        if dropped[i]:
            continue
        # Every neighbour within the radius (other than the point itself) is removed
        for neighbour in tree.query_ball_point(coords[i], r=min_distance):
            if neighbour != i:
                dropped[neighbour] = True

    # flatnonzero returns ascending positions, so the row order is deterministic
    return gdf.iloc[np.flatnonzero(~dropped)]
    

In [None]:
# Load our image data to check on the format
# (opened with masked=True; squeeze() drops any length-1 dimensions)
stack_da_fp = os.path.join(maindir,'data/spatial/mod/dc_data/planet-data/dc_data_psscene15b.tif')
stack_da = rxr.open_rasterio(stack_da_fp, masked=True, cache=False).squeeze()
print_raster(stack_da, open_file=False)
# Keep the band names from the 'long_name' attribute; later cells use them to select columns
band_names = stack_da.long_name
# Only the names are needed from here on, so drop the reference to the image
del stack_da

In [None]:
# Build the list of columns to keep: the image bands plus the label and id columns.
# Written so that re-running the cell does not append the extra columns a second time.

# NOTE(review): presumably a single-band image exposes `long_name` as one string rather
# than a sequence; wrap it so list() does not split it into characters. Confirm.
if isinstance(band_names, str):
    band_names = [band_names]

extra_cols = ['class_code', 'uid']
band_names = [name for name in band_names if name not in extra_cols] + extra_cols
band_names

In [None]:
# # Load the training data (footprints)
# gdf_path = os.path.join(maindir,'data/spatial/mod/dc_data/training/dc_data_reference_footprints.gpkg')
# ref = gpd.read_file(gdf_path)
# ref.head()

In [None]:
# Load the training data (sampled building footprints)
######################################################

# Table of per-footprint samples (from "sample-stack.py")
ref_tbl_path_fp = os.path.join(maindir,'data/tabular/mod/dc_data/training/dc_data_reference_sampled_footprint.csv')

# Read the table and keep only the columns listed in band_names
ref = pd.read_csv(ref_tbl_path_fp)[band_names]

# Give each class label an integer code, numbered in order of first appearance
class_mapping = dict(
    (label, idx) for idx, label in enumerate(ref['class_code'].unique())
)
ref = ref.assign(Y=ref['class_code'].map(class_mapping))

ref.head()

In [None]:
# Class balance check: number of training samples per class label
print(ref['class_code'].value_counts())

In [None]:
t0 = time.time()

# Set up the model data: integer class codes as the target, everything except the
# label / id columns as predictors
y = ref['Y']
X = ref.drop(['class_code', 'uid', 'Y'], axis=1)

# Calculate class weights (inverse class frequency, so rare classes weigh more)
class_counts = y.value_counts()
total_samples = len(y)
class_weights = {cls: total_samples / count for cls, count in class_counts.items()}
print(f'Class weights: {class_weights}')

# Set up the stratified K-fold
skf = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)

# Per-fold pieces are collected in lists and concatenated once after the loop,
# instead of re-copying a growing DataFrame on every fold
fold_results_list = []  # model performance metrics
fold_imps_list = []  # feature importances
fold_probs_list = []  # class probabilities, for testing optimum cutoff

# Loop the folds
# NOTE: fold_idx is advanced manually (not with enumerate) so that it ends at
# n_splits + 1, which the confusion-matrix cell relies on via range(1, fold_idx).
fold_idx = 1
for train_index, test_index in skf.split(X, y):
    print(f'Fold: {fold_idx}')

    # Split into train/test sets
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

    # Map class weights to each sample in the training set
    sample_weights = y_train.map(class_weights).values

    # Initialize the XGBoost classifier for multi-class classification.
    # 'multi:softprob' is used because predict_proba() is called below; predict()
    # still returns the most probable class, the same label 'multi:softmax' would give.
    # NOTE(review): some xgboost releases reject predict_proba() under 'multi:softmax';
    # confirm against the installed version.
    xgb_model = xgb.XGBClassifier(
        objective='multi:softprob',
        num_class=len(np.unique(y)),
        n_estimators=1001,
        learning_rate=0.01,
        max_depth=8,
        random_state=42
    )

    # Fit the model
    xgb_model.fit(X_train, y_train, sample_weight=sample_weights)

    # Store feature importance
    fold_imps_list.append(pd.DataFrame({
        'Fold': fold_idx,
        'Feature': X.columns,
        'Importance': xgb_model.feature_importances_
    }))

    # Predict on the test set
    y_pred = xgb_model.predict(X_test)

    # Retrieve the accuracy/performance metrics (weighted by class support);
    # zero_division=0 is passed consistently so an unpredicted class scores 0 without warnings
    fold_results_list.append({
        'Fold': fold_idx,
        'Precision': precision_score(y_test, y_pred, average='weighted', zero_division=0),
        'Recall': recall_score(y_test, y_pred, average='weighted', zero_division=0),
        'F1': f1_score(y_test, y_pred, average='weighted', zero_division=0),
        'MCC': matthews_corrcoef(y_test, y_pred),
        'Accuracy': accuracy_score(y_test, y_pred)
    })

    # Store the probability values and true labels for cutoff testing
    y_pred_proba = xgb_model.predict_proba(X_test)
    fold_probs_list.append(pd.DataFrame({
        'TrueLabel': y_test,
        'PredictedProb': list(y_pred_proba),
        'Fold': fold_idx
    }))

    # Report the cumulative time before advancing the counter, so the message
    # names the fold that just finished
    t1 = (time.time() - t0) / 60
    print(f"Total elapsed time for fold {fold_idx}: {t1:.2f} minutes.")
    print("\n~~~~~~~~~~\n")

    fold_idx += 1

# Assemble the per-fold pieces into the result tables used by the later cells
results = pd.DataFrame(fold_results_list)
feat_imps = pd.concat(fold_imps_list, axis=0)
prob_preds = pd.concat(fold_probs_list, ignore_index=True)

t2 = (time.time() - t0) / 60
print(f"Total elapsed time: {t2:.2f} minutes.")

# # Append the feature set-specific results to the overall results dataframes
# all_results = pd.concat([all_results, results], ignore_index=True)
# all_feat_imps = pd.concat([all_feat_imps, feat_imps], ignore_index=True)
# all_prob_preds = pd.concat([all_prob_preds, prob_preds], ignore_index=True)

# del results, feat_imps, prob_preds

In [None]:
# Preview the first rows of the per-fold performance metrics
results.head()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import matplotlib.pyplot as plt

# Combine true labels and predicted labels across all folds.
# prob_preds already holds every held-out sample in fold order, so it is read directly
# rather than looping on the fold counter left over from the training cell.
all_true_labels = prob_preds['TrueLabel'].tolist()
all_pred_labels = np.argmax(
    np.vstack(prob_preds['PredictedProb'].tolist()), axis=1
).tolist()

# Integer codes and their class names, both ordered by code, so the matrix rows and
# columns line up with readable tick labels
class_ids = sorted(class_mapping.values())
class_names = [label for label, idx in sorted(class_mapping.items(), key=lambda item: item[1])]

# Create the confusion matrix
cm = confusion_matrix(all_true_labels, all_pred_labels, labels=class_ids)
disp = ConfusionMatrixDisplay(confusion_matrix=cm, display_labels=class_names)

# Plot the confusion matrix
fig, ax = plt.subplots(figsize=(10, 10))
disp.plot(ax=ax, xticks_rotation=45)
plt.title('Confusion Matrix')

# Make sure the output folder exists before saving
fig_path = os.path.join(maindir,'figures/FigX_xgboost_confusion_matrix.png')
os.makedirs(os.path.dirname(fig_path), exist_ok=True)
plt.savefig(fig_path, dpi=300, bbox_inches='tight')

plt.show()

In [None]:
from sklearn.metrics import classification_report

# Class names ordered by their integer code, so they line up with the encoded labels
cor_labels = sorted(class_mapping, key=class_mapping.get)

# Build the classification report as a dict, then lay it out with one row per entry
cr_df = pd.DataFrame(
    classification_report(
        all_true_labels,
        all_pred_labels,
        target_names=cor_labels,
        output_dict=True,
    )
).T

# Column-wise mean over the per-class rows of the report
# (an unweighted average across classes, not an average across folds)
average_metrics = cr_df.loc[cor_labels].mean()

# Display the DataFrame
cr_df

In [None]:
# Save out the results: one CSV per table, written to the project root.
# NOTE(review): 'prob_peds' looks like a typo for 'prob_preds', and the '_avg' file
# holds the full report (cr_df), not average_metrics. Names kept as-is so existing
# readers of these files keep working; confirm before renaming.
output_tables = {
    'xgboost_folds_results.csv': results,
    'xgboost_folds_feat_imps.csv': feat_imps,
    'xgboost_folds_prob_peds.csv': prob_preds,
    'xgboost_classification_report_avg.csv': cr_df,
}
for csv_name, table in output_tables.items():
    table.to_csv(os.path.join(maindir, csv_name))