In [16]:
"""
XGBoost multiclassifier for roof materials in D.C.
"""

import os, sys, glob, time
import geopandas as gpd
import pandas as pd
import numpy as np
import rioxarray as rxr
import rasterio as rio
import xgboost as xgb
import optuna
import gc

from sklearn.ensemble import IsolationForest
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import precision_score, recall_score, f1_score, matthews_corrcoef, accuracy_score

# Functions script import
sys.path.append(os.path.join(os.getcwd(),'code/'))
from __functions import *

print(os.getcwd())

maindir = '/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping'
# homedir = '/home/jovyan/data-store/data/iplant/home/maco4303/data/' # cyverse

print("Imports successful!")

/Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping/code
Imports successful!


In [2]:
results_dir = os.path.join(maindir, 'results/xgboost-cv/')
if not os.path.exists(results_dir):
    os.makedirs(results_dir)
print(f"Saving results to {results_dir}")

Saving results to /Users/max/Library/CloudStorage/OneDrive-Personal/mcook/earth-lab/opp-rooftop-mapping/results/xgboost-cv/


In [3]:
# Load the image data

In [4]:
# Load our image data to check on the format
stack_da_fp = os.path.join(maindir,'data/spatial/mod/dc_data/planet-data/dc_0623_psscene8b_final_norm.tif')
# stack_da_fp = os.path.join(homedir,'OPP/planet/dc/dc_0623_psscene8b_final_norm.tif') # cyverse
stack_da = rxr.open_rasterio(stack_da_fp, masked=True, cache=False).squeeze()
print_raster(stack_da, open_file=False)
band_names = list(stack_da.long_name)
print(f"Band names: {band_names}")
del stack_da

shape: (8223, 6714)
resolution: (3.0, -3.0)
bounds: (315267.0, 4294629.0, 335409.0, 4319298.0)
sum: 8.181640625
CRS: EPSG:32618
NoData: None
Band names: ['nir', 'NDBIbg', 'NDBIrg', 'NISI', 'MNF1', 'NISI5x5']


In [5]:
# Load the reference data

In [6]:
# Load the training data (footprints)
ref_fp = os.path.join(maindir,'data/spatial/mod/dc_data/training/dc_data_reference_footprints_sampled.gpkg')
# ref_fp = os.path.join(homedir,'OPP/training/dc/dc_data_reference_footprints_sampled.gpkg')
ref = gpd.read_file(ref_fp)
ref.head()

Unnamed: 0,uid,class_code,description,areaUTMsqft,lotSizeSqft,nir,NDBIbg,NDBIrg,NISI,MNF1,NISI5x5,geometry
0,1CS,CS,Composition Shingle,357.783709,5574.0,-0.246291,0.373826,0.472331,0.204085,0.080244,2.598158,"POLYGON ((324215.868 4313568.665, 324215.792 4..."
1,3CS,CS,Composition Shingle,918.640862,1111.0,-0.856021,0.632535,1.064584,1.001515,0.569262,0.367447,"POLYGON ((324602.816 4311717.247, 324604.322 4..."
2,9CS,CS,Composition Shingle,1383.41417,4222.0,-0.864054,0.813552,0.806408,0.73498,0.218044,-0.683204,"POLYGON ((327253.581 4300371.859, 327258.154 4..."
3,19CS,CS,Composition Shingle,836.410297,3450.0,-0.735984,0.305262,0.856115,0.50869,0.070037,0.680645,"POLYGON ((333608.13 4306267.691, 333607.957 43..."
4,27CS,CS,Composition Shingle,330.514264,4352.0,-0.418819,0.516599,0.897064,0.448048,0.185206,-0.394834,"POLYGON ((326482.699 4300939.466, 326487.386 4..."


In [7]:
print(ref['class_code'].value_counts())

class_code
CS    37133
ME    36864
SL    15267
UR     1279
WS     1152
TL      924
SH      783
Name: count, dtype: int64


In [8]:
# Create a numeric code column
ref['code'], _ = pd.factorize(ref['class_code'])

# Create a dictionary mapping class_code to code
code_mapping = dict(zip(ref['class_code'], ref['code']))
desc_mapping = dict(zip(ref['class_code'], ref['description']))

print(f'Code map: \n{code_mapping}\nDescription map: \n{desc_mapping}')

Code map: 
{'CS': 0, 'ME': 1, 'SL': 2, 'UR': 3, 'TL': 4, 'WS': 5, 'SH': 6}
Description map: 
{'CS': 'Composition Shingle', 'ME': 'Metal', 'SL': 'Slate', 'UR': 'Urethane', 'TL': 'Tile', 'WS': 'Wood shake/shingle', 'SH': 'Shingle'}


In [9]:
# Create a holdout sample for model evaluation later on

In [10]:
# Create the holdout data (independent from model training in CV)
holdout_fraction = 0.20 # excluding 20% of the data for holdout/independent testing
train_df, holdout_df, _ = split_training_data(ref, ts=holdout_fraction, vs=None) # no validation needed yet

print("Training set class distribution:\n", train_df[['class_code','code']].value_counts())
print("Holdout set class distribution:\n", holdout_df[['class_code','code']].value_counts())

Training set class distribution:
 class_code  code
CS          0       29706
ME          1       29491
SL          2       12213
UR          3        1023
WS          5         921
TL          4         739
SH          6         626
Name: count, dtype: int64
Holdout set class distribution:
 class_code  code
CS          0       7427
ME          1       7373
SL          2       3054
UR          3        256
WS          5        231
TL          4        185
SH          6        157
Name: count, dtype: int64


In [11]:
# Parameter tuning using Optuna

In [17]:
# Define the objective function for Optuna
def objective(trial):
    # Define the hyperparameter search space
    params = {
        'objective': 'multi:softmax',
        'num_class': len(np.unique(train_df['code'])),
        'learning_rate': trial.suggest_loguniform('learning_rate', 1e-6, 1e-2),
        'max_depth': trial.suggest_int('max_depth', 3, 15),
        'n_estimators': trial.suggest_int('n_estimators', 101, 1001),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'subsample': trial.suggest_uniform('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_uniform('colsample_bytree', 0.5, 1.0),
        'gamma': trial.suggest_loguniform('gamma', 1e-9, 1.0),
        'random_state': 44,
    }

    # Stratified 5-fold cross-validation
    skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    
    f1_scores = [] # to store the validation F1 for each fold
    
    for train_idx, val_idx in skf.split(train_df, train_df['class_code']):
        # Extract the train and validation sets
        fold_train = train_df.iloc[train_idx]
        fold_val = train_df.iloc[val_idx]

        # Undersample majority class in training set only
        fold_train_bal = balance_sampling(fold_train, ratio=5, strategy='undersample')

        # Calculate the class weights
        val_counts = fold_train_bal['code'].value_counts()
        total_samples = sum(val_counts)
        class_weights = np.array([total_samples / count for count in val_counts])
        class_weights_n = class_weights / class_weights.sum() # normalize the weights
        class_weights_n = {cls: weight for cls, weight in zip(np.unique(fold_train_bal['code']), class_weights)}

        # Define our train/validation sets
        X_train = fold_train_bal[band_names].astype(float)
        y_train = fold_train_bal['code'].astype(int)
        X_val = fold_val[band_names].astype(float)
        y_val = fold_val['code'].astype(int)
        
        del fold_train, fold_val, fold_train_bal # clear up space

        # Fit the XGBoost with params
        xgb_model = xgb.XGBClassifier(n_jobs=1, **params) # avoid over-using CPU resources
        xgb_model.fit(
            X_train, y_train,
            eval_set=[(X_val, y_val)],  # Specify validation set
            sample_weight=y_train.map(class_weights_n).values,
            eval_metric=['mlogloss'], # good for multiclass
            early_stopping_rounds=10, # stop if no improvement
            verbose=False
        )

        # Evaluate F1-score on the validation set
        y_pred = xgb_model.predict(X_val)
        f1 = f1_score(y_val, y_pred, average='weighted')
        f1_scores.append(f1)

        del y_pred, f1, X_train, y_train, X_val, y_val, xgb_model

    # Calculate mean metrics across folds
    f1_ = np.mean(f1_scores)

    del f1_scores
    gc.collect()

    return f1_  # Optuna will maximize this score

print("Optuna function ready to go !!!")

Optuna function ready to go !!!


In [18]:
print("~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
# Implement the optuna objective study
study = optuna.create_study(direction='maximize', study_name='dc_xgboost_tuning') # maximize the scoring criteria
study.optimize(objective, n_trials=50, n_jobs=5, show_progress_bar=True)  # Adjust the number of trials as needed
print("\n~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~\n")
# Retrieve the best hyperparameters
best_params = study.best_params
print(f"Best hyperparameters found: {best_params}")
# Save the Optuna study for later analysis
study.trials_dataframe().to_csv(os.path.join(results_dir, 'dc-xgboost-cv_optuna_trials.csv'), index=False)

[I 2024-09-24 14:46:30,769] A new study created in memory with name: no-name-cfe30995-3e10-49b8-8e9a-266a57eabb9c


~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~



Best trial: 2. Best value: 0.477633:   2%|▏        | 1/50 [01:28<1:12:19, 88.57s/it]

[I 2024-09-24 14:47:59,343] Trial 2 finished with value: 0.47763309059404513 and parameters: {'learning_rate': 0.00025230082034872765, 'max_depth': 5, 'n_estimators': 455, 'min_child_weight': 5, 'subsample': 0.525296899263471, 'colsample_bytree': 0.8701283142284606, 'gamma': 1.4687384197120863e-06}. Best is trial 2 with value: 0.47763309059404513.


Best trial: 4. Best value: 0.570337:   4%|▍          | 2/50 [01:57<42:41, 53.37s/it]

[I 2024-09-24 14:48:28,075] Trial 4 finished with value: 0.570336583388277 and parameters: {'learning_rate': 0.0032385602027968845, 'max_depth': 11, 'n_estimators': 319, 'min_child_weight': 8, 'subsample': 0.6509767906011497, 'colsample_bytree': 0.6422165463956916, 'gamma': 0.0031243933541480586}. Best is trial 4 with value: 0.570336583388277.


Best trial: 4. Best value: 0.570337:   6%|▋          | 3/50 [02:30<34:26, 43.97s/it]

[I 2024-09-24 14:49:00,853] Trial 5 finished with value: 0.4598921770703323 and parameters: {'learning_rate': 1.9369984715641454e-06, 'max_depth': 5, 'n_estimators': 297, 'min_child_weight': 8, 'subsample': 0.8622609615280588, 'colsample_bytree': 0.9336122487614599, 'gamma': 0.07042676451176706}. Best is trial 4 with value: 0.570336583388277.


Best trial: 4. Best value: 0.570337:   8%|▉          | 4/50 [02:45<25:08, 32.79s/it]

[I 2024-09-24 14:49:16,513] Trial 0 finished with value: 0.5157520602272279 and parameters: {'learning_rate': 4.7090575674310376e-06, 'max_depth': 8, 'n_estimators': 540, 'min_child_weight': 8, 'subsample': 0.7076176616135557, 'colsample_bytree': 0.7321062103012452, 'gamma': 0.6296116748596526}. Best is trial 4 with value: 0.570336583388277.


Best trial: 4. Best value: 0.570337:  10%|█          | 5/50 [03:32<28:29, 37.99s/it]

[I 2024-09-24 14:50:03,723] Trial 8 finished with value: 0.4343822456928298 and parameters: {'learning_rate': 2.073871560998475e-06, 'max_depth': 3, 'n_estimators': 506, 'min_child_weight': 1, 'subsample': 0.5539690597911306, 'colsample_bytree': 0.5335589569777517, 'gamma': 1.4865727190941404e-05}. Best is trial 4 with value: 0.570336583388277.


Best trial: 4. Best value: 0.570337:  12%|█▎         | 6/50 [04:22<30:47, 41.98s/it]

[I 2024-09-24 14:50:53,447] Trial 9 finished with value: 0.4839912673605237 and parameters: {'learning_rate': 0.004010116364099146, 'max_depth': 6, 'n_estimators': 196, 'min_child_weight': 1, 'subsample': 0.8208807892205442, 'colsample_bytree': 0.979047948663019, 'gamma': 0.0009764429941316682}. Best is trial 4 with value: 0.570336583388277.


Best trial: 7. Best value: 0.599452:  14%|█▌         | 7/50 [04:56<28:06, 39.21s/it]

[I 2024-09-24 14:51:26,963] Trial 7 finished with value: 0.5994517951329639 and parameters: {'learning_rate': 4.698392757944456e-06, 'max_depth': 15, 'n_estimators': 303, 'min_child_weight': 7, 'subsample': 0.5984966326003425, 'colsample_bytree': 0.5734535203766984, 'gamma': 0.00015302493721944648}. Best is trial 7 with value: 0.5994517951329639.


Best trial: 7. Best value: 0.599452:  16%|█▊         | 8/50 [05:43<29:10, 41.67s/it]

[I 2024-09-24 14:52:13,899] Trial 1 finished with value: 0.5345846931331715 and parameters: {'learning_rate': 0.00033172190757653003, 'max_depth': 10, 'n_estimators': 733, 'min_child_weight': 2, 'subsample': 0.9445005441558736, 'colsample_bytree': 0.9472351353760837, 'gamma': 0.1065094245460909}. Best is trial 7 with value: 0.5994517951329639.


Best trial: 7. Best value: 0.599452:  18%|█▉         | 9/50 [05:46<20:15, 29.65s/it]

[I 2024-09-24 14:52:17,112] Trial 3 finished with value: 0.5562503984917629 and parameters: {'learning_rate': 1.1335706358318117e-06, 'max_depth': 11, 'n_estimators': 960, 'min_child_weight': 9, 'subsample': 0.6853943284386506, 'colsample_bytree': 0.6450025001594687, 'gamma': 2.082085072317861e-07}. Best is trial 7 with value: 0.5994517951329639.


Best trial: 7. Best value: 0.599452:  20%|██        | 10/50 [05:59<16:19, 24.48s/it]

[I 2024-09-24 14:52:30,017] Trial 10 finished with value: 0.439590541781295 and parameters: {'learning_rate': 1.442718344816585e-06, 'max_depth': 4, 'n_estimators': 577, 'min_child_weight': 1, 'subsample': 0.9183577120967734, 'colsample_bytree': 0.9646330419924425, 'gamma': 0.0667085901407332}. Best is trial 7 with value: 0.5994517951329639.


Best trial: 7. Best value: 0.599452:  22%|██▏       | 11/50 [06:14<13:59, 21.53s/it]

[I 2024-09-24 14:52:44,861] Trial 11 finished with value: 0.5654174986641836 and parameters: {'learning_rate': 3.960773670328194e-06, 'max_depth': 10, 'n_estimators': 205, 'min_child_weight': 1, 'subsample': 0.5561765975591138, 'colsample_bytree': 0.7742903568594388, 'gamma': 1.5462844810299352e-05}. Best is trial 7 with value: 0.5994517951329639.


Best trial: 7. Best value: 0.599452:  24%|██▍       | 12/50 [06:58<18:00, 28.44s/it]

[I 2024-09-24 14:53:29,107] Trial 14 finished with value: 0.5968677006224246 and parameters: {'learning_rate': 2.1538154643153097e-05, 'max_depth': 14, 'n_estimators': 126, 'min_child_weight': 5, 'subsample': 0.6065133352018904, 'colsample_bytree': 0.5002200243270563, 'gamma': 5.645782653679948e-09}. Best is trial 7 with value: 0.5994517951329639.


Best trial: 16. Best value: 0.601675:  26%|██▎      | 13/50 [07:52<22:17, 36.16s/it]

[I 2024-09-24 14:54:23,026] Trial 16 finished with value: 0.6016745333760796 and parameters: {'learning_rate': 2.6689950996183083e-05, 'max_depth': 15, 'n_estimators': 108, 'min_child_weight': 5, 'subsample': 0.6179247878883178, 'colsample_bytree': 0.5005601166834468, 'gamma': 1.5005404514616368e-09}. Best is trial 16 with value: 0.6016745333760796.


Best trial: 16. Best value: 0.601675:  28%|██▌      | 14/50 [09:20<31:02, 51.75s/it]

[I 2024-09-24 14:55:50,799] Trial 15 finished with value: 0.6009000415730597 and parameters: {'learning_rate': 3.216717529954773e-05, 'max_depth': 15, 'n_estimators': 373, 'min_child_weight': 6, 'subsample': 0.6414434858554008, 'colsample_bytree': 0.5222821979966575, 'gamma': 4.598466138114392e-09}. Best is trial 16 with value: 0.6016745333760796.


Best trial: 16. Best value: 0.601675:  30%|██▋      | 15/50 [10:06<29:16, 50.19s/it]

[I 2024-09-24 14:56:37,364] Trial 12 finished with value: 0.5950599551663042 and parameters: {'learning_rate': 1.569605976077028e-06, 'max_depth': 14, 'n_estimators': 581, 'min_child_weight': 7, 'subsample': 0.574919444057491, 'colsample_bytree': 0.6100820139368276, 'gamma': 0.004389357816198275}. Best is trial 16 with value: 0.6016745333760796.


Best trial: 6. Best value: 0.608809:  32%|███▏      | 16/50 [10:07<20:05, 35.45s/it]

[I 2024-09-24 14:56:38,593] Trial 6 finished with value: 0.6088086156636194 and parameters: {'learning_rate': 0.001834017234586252, 'max_depth': 14, 'n_estimators': 880, 'min_child_weight': 1, 'subsample': 0.6611115744806062, 'colsample_bytree': 0.6798680467669853, 'gamma': 0.02056216615512264}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  34%|███▍      | 17/50 [10:27<16:54, 30.75s/it]

[I 2024-09-24 14:56:58,421] Trial 13 finished with value: 0.5687621440720755 and parameters: {'learning_rate': 0.0007221388624010373, 'max_depth': 12, 'n_estimators': 508, 'min_child_weight': 5, 'subsample': 0.830469101641741, 'colsample_bytree': 0.9915369257941999, 'gamma': 7.4461818814693755e-06}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  36%|███▌      | 18/50 [10:54<15:49, 29.68s/it]

[I 2024-09-24 14:57:25,599] Trial 17 finished with value: 0.5940046797828578 and parameters: {'learning_rate': 3.5287863686644105e-05, 'max_depth': 15, 'n_estimators': 349, 'min_child_weight': 6, 'subsample': 0.764551621320765, 'colsample_bytree': 0.5874272780505223, 'gamma': 2.1358249259908504e-09}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  38%|███▊      | 19/50 [12:18<23:44, 45.96s/it]

[I 2024-09-24 14:58:49,497] Trial 18 finished with value: 0.5857011025626676 and parameters: {'learning_rate': 5.295075551460318e-05, 'max_depth': 13, 'n_estimators': 397, 'min_child_weight': 4, 'subsample': 0.7439076469273457, 'colsample_bytree': 0.5976768231699445, 'gamma': 1.127839854133129e-09}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  40%|████      | 20/50 [13:31<27:00, 54.01s/it]

[I 2024-09-24 15:00:02,277] Trial 19 finished with value: 0.5854599607140831 and parameters: {'learning_rate': 4.0383533186025004e-05, 'max_depth': 13, 'n_estimators': 388, 'min_child_weight': 4, 'subsample': 0.7597273935562014, 'colsample_bytree': 0.7072655849879121, 'gamma': 1.9167097095724037e-09}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  42%|████▏     | 21/50 [16:54<47:39, 98.59s/it]

[I 2024-09-24 15:03:24,796] Trial 23 finished with value: 0.5404837539945552 and parameters: {'learning_rate': 0.0011901595777740707, 'max_depth': 8, 'n_estimators': 953, 'min_child_weight': 3, 'subsample': 0.5115988142003816, 'colsample_bytree': 0.7236393254414565, 'gamma': 1.1344460120551678e-07}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  44%|████▍     | 22/50 [18:08<42:37, 91.33s/it]

[I 2024-09-24 15:04:39,191] Trial 24 finished with value: 0.5735214582581724 and parameters: {'learning_rate': 0.009668944354931212, 'max_depth': 8, 'n_estimators': 966, 'min_child_weight': 3, 'subsample': 0.658541840601177, 'colsample_bytree': 0.7932721260156125, 'gamma': 1.2645062581129013e-07}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  46%|████▌     | 23/50 [18:17<29:55, 66.51s/it]

[I 2024-09-24 15:04:47,797] Trial 20 finished with value: 0.583398663535933 and parameters: {'learning_rate': 0.0008543224111569506, 'max_depth': 12, 'n_estimators': 993, 'min_child_weight': 3, 'subsample': 0.7828239576433171, 'colsample_bytree': 0.7175632928536593, 'gamma': 3.7328958138580734e-08}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  48%|████▊     | 24/50 [18:54<24:58, 57.65s/it]

[I 2024-09-24 15:05:24,780] Trial 21 finished with value: 0.5904567729811746 and parameters: {'learning_rate': 5.355938215554079e-05, 'max_depth': 13, 'n_estimators': 973, 'min_child_weight': 3, 'subsample': 0.7537227311233503, 'colsample_bytree': 0.7293361675869688, 'gamma': 8.327425682301171e-08}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 6. Best value: 0.608809:  50%|█████     | 25/50 [19:00<17:38, 42.32s/it]

[I 2024-09-24 15:05:31,352] Trial 22 finished with value: 0.6082350363766673 and parameters: {'learning_rate': 9.124353825833575e-05, 'max_depth': 13, 'n_estimators': 993, 'min_child_weight': 3, 'subsample': 0.5022632734010981, 'colsample_bytree': 0.7377726643420442, 'gamma': 1.0595547371041887e-07}. Best is trial 6 with value: 0.6088086156636194.


Best trial: 25. Best value: 0.614153:  52%|████▏   | 26/50 [23:09<41:46, 104.44s/it]

[I 2024-09-24 15:09:40,714] Trial 25 finished with value: 0.6141532042228375 and parameters: {'learning_rate': 0.00013217400639180558, 'max_depth': 15, 'n_estimators': 740, 'min_child_weight': 3, 'subsample': 0.6422321098793086, 'colsample_bytree': 0.5251378143447732, 'gamma': 2.2478111031064685e-08}. Best is trial 25 with value: 0.6141532042228375.


Best trial: 25. Best value: 0.614153:  54%|████▊    | 27/50 [24:03<34:10, 89.13s/it]

[I 2024-09-24 15:10:34,135] Trial 27 finished with value: 0.5887535928800685 and parameters: {'learning_rate': 1.7533900345273233e-05, 'max_depth': 15, 'n_estimators': 702, 'min_child_weight': 10, 'subsample': 0.6338366528667015, 'colsample_bytree': 0.5355325769882805, 'gamma': 1.8166222744405228e-08}. Best is trial 25 with value: 0.6141532042228375.


Best trial: 25. Best value: 0.614153:  56%|█████    | 28/50 [24:16<24:17, 66.25s/it]

[I 2024-09-24 15:10:46,971] Trial 26 finished with value: 0.5907870662483617 and parameters: {'learning_rate': 0.0001085291555632899, 'max_depth': 15, 'n_estimators': 758, 'min_child_weight': 10, 'subsample': 0.6345552088520585, 'colsample_bytree': 0.5074712271408562, 'gamma': 2.195920396208809e-08}. Best is trial 25 with value: 0.6141532042228375.


Best trial: 25. Best value: 0.614153:  58%|█████▏   | 29/50 [24:25<17:12, 49.18s/it]

[I 2024-09-24 15:10:56,353] Trial 28 finished with value: 0.591293065572654 and parameters: {'learning_rate': 0.00014555464874872073, 'max_depth': 15, 'n_estimators': 694, 'min_child_weight': 10, 'subsample': 0.6318117807125824, 'colsample_bytree': 0.5061237361629948, 'gamma': 1.5061521490944078e-06}. Best is trial 25 with value: 0.6141532042228375.


Best trial: 29. Best value: 0.615598:  60%|█████▍   | 30/50 [26:11<22:04, 66.22s/it]

[I 2024-09-24 15:12:42,310] Trial 29 finished with value: 0.6155980935998727 and parameters: {'learning_rate': 1.1170160434715529e-05, 'max_depth': 14, 'n_estimators': 821, 'min_child_weight': 2, 'subsample': 0.5077057096262747, 'colsample_bytree': 0.8191150936146058, 'gamma': 1.997795558541556e-06}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  62%|████▉   | 31/50 [31:22<44:15, 139.76s/it]

[I 2024-09-24 15:17:53,658] Trial 30 finished with value: 0.6036398495263589 and parameters: {'learning_rate': 0.0001532047186351882, 'max_depth': 14, 'n_estimators': 844, 'min_child_weight': 2, 'subsample': 0.7043504558436658, 'colsample_bytree': 0.8224955976935784, 'gamma': 1.2454579822129483e-06}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  64%|█████   | 32/50 [31:32<30:12, 100.67s/it]

[I 2024-09-24 15:18:03,131] Trial 33 finished with value: 0.5808265029228581 and parameters: {'learning_rate': 1.329471788090154e-05, 'max_depth': 12, 'n_estimators': 862, 'min_child_weight': 2, 'subsample': 0.7133574759271458, 'colsample_bytree': 0.8282972506292213, 'gamma': 0.8326493324106462}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  66%|█████▉   | 33/50 [32:26<24:32, 86.59s/it]

[I 2024-09-24 15:18:56,851] Trial 31 finished with value: 0.6037511864993526 and parameters: {'learning_rate': 0.00015875127154728276, 'max_depth': 14, 'n_estimators': 855, 'min_child_weight': 2, 'subsample': 0.7007172417873162, 'colsample_bytree': 0.8275090783047417, 'gamma': 1.5680552902743697e-06}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  68%|██████   | 34/50 [32:34<16:48, 63.01s/it]

[I 2024-09-24 15:19:04,865] Trial 32 finished with value: 0.603551844527334 and parameters: {'learning_rate': 0.00014307490192843076, 'max_depth': 14, 'n_estimators': 857, 'min_child_weight': 2, 'subsample': 0.7045821278085591, 'colsample_bytree': 0.8270196660387702, 'gamma': 1.3632349827838023e-06}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  70%|██████▎  | 35/50 [34:41<20:37, 82.47s/it]

[I 2024-09-24 15:21:12,736] Trial 34 finished with value: 0.6028591434249322 and parameters: {'learning_rate': 7.516100774277996e-06, 'max_depth': 14, 'n_estimators': 863, 'min_child_weight': 2, 'subsample': 0.7030688157651666, 'colsample_bytree': 0.8331008076799936, 'gamma': 0.00011663835446717447}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  72%|█████▊  | 36/50 [37:51<26:45, 114.70s/it]

[I 2024-09-24 15:24:22,646] Trial 35 finished with value: 0.59765088698093 and parameters: {'learning_rate': 8.163908493050215e-06, 'max_depth': 12, 'n_estimators': 868, 'min_child_weight': 2, 'subsample': 0.5093448524087211, 'colsample_bytree': 0.675987610348388, 'gamma': 8.994323215643769e-05}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  74%|██████▋  | 37/50 [37:56<17:43, 81.80s/it]

[I 2024-09-24 15:24:27,686] Trial 37 finished with value: 0.6084096666189535 and parameters: {'learning_rate': 0.0004916467182087801, 'max_depth': 13, 'n_estimators': 774, 'min_child_weight': 2, 'subsample': 0.5298457079597837, 'colsample_bytree': 0.6649832433452564, 'gamma': 4.986841191128937e-07}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  76%|██████▊  | 38/50 [38:18<12:45, 63.81s/it]

[I 2024-09-24 15:24:49,501] Trial 36 finished with value: 0.6083411836923898 and parameters: {'learning_rate': 8.511213808752397e-06, 'max_depth': 13, 'n_estimators': 860, 'min_child_weight': 2, 'subsample': 0.5016950381769191, 'colsample_bytree': 0.6793691315685486, 'gamma': 8.263234657176268e-05}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  78%|███████  | 39/50 [38:45<09:40, 52.77s/it]

[I 2024-09-24 15:25:16,525] Trial 38 finished with value: 0.6048186211828129 and parameters: {'learning_rate': 0.0004213376944005989, 'max_depth': 13, 'n_estimators': 776, 'min_child_weight': 4, 'subsample': 0.5072993066724781, 'colsample_bytree': 0.6719262695173973, 'gamma': 9.740709219275927e-05}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  80%|███████▏ | 40/50 [40:13<10:33, 63.36s/it]

[I 2024-09-24 15:26:44,577] Trial 39 finished with value: 0.5838908167423151 and parameters: {'learning_rate': 0.00035622918835830113, 'max_depth': 11, 'n_estimators': 798, 'min_child_weight': 4, 'subsample': 0.5060948024159692, 'colsample_bytree': 0.6815355681625108, 'gamma': 3.1198049102316646e-07}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  82%|███████▍ | 41/50 [42:51<13:44, 91.57s/it]

[I 2024-09-24 15:29:21,966] Trial 42 finished with value: 0.5804505454007957 and parameters: {'learning_rate': 0.00038429493296454663, 'max_depth': 11, 'n_estimators': 779, 'min_child_weight': 4, 'subsample': 0.541641842309497, 'colsample_bytree': 0.639306390410496, 'gamma': 3.9800840809400684e-07}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  84%|███████▌ | 42/50 [43:20<09:43, 72.89s/it]

[I 2024-09-24 15:29:51,291] Trial 43 finished with value: 0.5700988384812878 and parameters: {'learning_rate': 0.0017192189057966705, 'max_depth': 10, 'n_estimators': 642, 'min_child_weight': 1, 'subsample': 0.544252324872384, 'colsample_bytree': 0.9182966974246087, 'gamma': 5.471910248786425e-07}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  86%|███████▋ | 43/50 [43:34<06:27, 55.30s/it]

[I 2024-09-24 15:30:05,526] Trial 41 finished with value: 0.5653863282244761 and parameters: {'learning_rate': 0.00035478794099572797, 'max_depth': 10, 'n_estimators': 777, 'min_child_weight': 4, 'subsample': 0.5480073568376054, 'colsample_bytree': 0.9030399791216918, 'gamma': 4.772895649826315e-06}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  88%|███████▉ | 44/50 [43:35<03:53, 38.85s/it]

[I 2024-09-24 15:30:05,991] Trial 40 finished with value: 0.5659566573814518 and parameters: {'learning_rate': 0.0005549188258269871, 'max_depth': 10, 'n_estimators': 798, 'min_child_weight': 4, 'subsample': 0.5396950651160585, 'colsample_bytree': 0.8850361206788597, 'gamma': 2.263968546089011e-07}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  90%|████████ | 45/50 [45:06<04:33, 54.63s/it]

[I 2024-09-24 15:31:37,466] Trial 44 finished with value: 0.5700011947873864 and parameters: {'learning_rate': 0.002354303219061526, 'max_depth': 10, 'n_estimators': 661, 'min_child_weight': 1, 'subsample': 0.5787367574119537, 'colsample_bytree': 0.8812963394965047, 'gamma': 0.21125496930189527}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  92%|████████▎| 46/50 [46:53<04:41, 70.37s/it]

[I 2024-09-24 15:33:24,561] Trial 48 finished with value: 0.5034338291220719 and parameters: {'learning_rate': 0.0021403949737679013, 'max_depth': 6, 'n_estimators': 927, 'min_child_weight': 1, 'subsample': 0.5794457665333211, 'colsample_bytree': 0.7594602428250996, 'gamma': 0.022497883505471036}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  94%|████████▍| 47/50 [47:22<02:53, 57.81s/it]

[I 2024-09-24 15:33:53,041] Trial 45 finished with value: 0.5680032861527794 and parameters: {'learning_rate': 0.0018814848136698289, 'max_depth': 10, 'n_estimators': 631, 'min_child_weight': 1, 'subsample': 0.5824828757808236, 'colsample_bytree': 0.9253496053867474, 'gamma': 0.01633436578520149}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  96%|████████▋| 48/50 [47:32<01:27, 43.62s/it]

[I 2024-09-24 15:34:03,556] Trial 47 finished with value: 0.5234573145503971 and parameters: {'learning_rate': 0.00271650359022917, 'max_depth': 7, 'n_estimators': 926, 'min_child_weight': 1, 'subsample': 0.5808423423889431, 'colsample_bytree': 0.7603459202803521, 'gamma': 0.01801747679816116}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598:  98%|████████▊| 49/50 [49:14<01:01, 61.08s/it]

[I 2024-09-24 15:35:45,388] Trial 46 finished with value: 0.5549872150635996 and parameters: {'learning_rate': 0.0019323413493062568, 'max_depth': 9, 'n_estimators': 928, 'min_child_weight': 1, 'subsample': 0.5806025829791039, 'colsample_bytree': 0.878637434726981, 'gamma': 0.021038079988934283}. Best is trial 29 with value: 0.6155980935998727.


Best trial: 29. Best value: 0.615598: 100%|█████████| 50/50 [50:08<00:00, 60.18s/it]

[I 2024-09-24 15:36:39,749] Trial 49 finished with value: 0.5676420141271533 and parameters: {'learning_rate': 0.004765351464785309, 'max_depth': 9, 'n_estimators': 905, 'min_child_weight': 1, 'subsample': 0.5774900710593927, 'colsample_bytree': 0.7649836120110108, 'gamma': 0.0006878710540806738}. Best is trial 29 with value: 0.6155980935998727.

~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

Best hyperparameters found: {'learning_rate': 1.1170160434715529e-05, 'max_depth': 14, 'n_estimators': 821, 'min_child_weight': 2, 'subsample': 0.5077057096262747, 'colsample_bytree': 0.8191150936146058, 'gamma': 1.997795558541556e-06}





In [24]:
best_params

{'learning_rate': 1.1170160434715529e-05,
 'max_depth': 14,
 'n_estimators': 821,
 'min_child_weight': 2,
 'subsample': 0.5077057096262747,
 'colsample_bytree': 0.8191150936146058,
 'gamma': 1.997795558541556e-06}

In [25]:
best_params = {
    'learning_rate': 1.1170160434715529e-05,
    'max_depth': 14,
    'n_estimators': 821,
    'min_child_weight': 2,
    'subsample': 0.5077057096262747,
    'colsample_bytree': 0.8191150936146058,
    'gamma': 1.997795558541556e-06
}

In [None]:
# Model run w/ best parameters

In [26]:
# # Create a dictionary from "best_params"
# best_params_fp = os.path.join(results_dir, 'dc-xgboost-cv_optuna_trials.csv')
# best_params_df = pd.read_csv(best_params_fp)
# print(best_params_df.head())

In [None]:
t0 = time.time()

# Define dataframes to store results for this feature set
results = pd.DataFrame()  # to store the model performance metrics

# Set up the stratified K-fold CV
n_folds = 5
skf = StratifiedKFold(n_splits=n_folds, shuffle=True, random_state=44)

# Loop the folds
fold_idx = 1
for fold_idx, (train_idx, val_idx) in enumerate(skf.split(train_df, train_df['class_code'])):
    t00 = time.time()
    
    print(f"Processing fold {fold_id + 1}/{n_folds}...")

    # Get the train and validation splits for this fold
    fold_train = train_df.iloc[train_idx]
    fold_val = train_df.iloc[val_idx]

    # Apply random undersampling to the training data within this fold
    fold_train_bal = balance_sampling(fold_train, ratio=5, strategy='undersample')
    # Display the class distribution in the undersampled training data for this fold
    print(f"Train class distribution (fold {fold_idx + 1}):\n", fold_train_bal['code'].value_counts().to_dict())
    print(f"Validation class distribution (fold {fold_idx + 1}):\n", fold_val['code'].value_counts().to_dict())
    del fold_train

    # Calculate the class weights
    val_counts = fold_train_bal['code'].value_counts()
    total_samples = sum(val_counts)
    class_weights = np.array([total_samples / count for count in val_counts])
    class_weights_n = class_weights / class_weights.sum() # normalize the weights
    class_weights_n = {cls: weight for cls, weight in zip(np.unique(fold_train_bal['code']), class_weights)}
    # print(f"Class weights: {class_weights_n}")
    del val_counts, total_samples, class_weights # clean up some

    # Initialize the XGBoost classifier for multi-class classification
    xgb_model = xgb.XGBClassifier(
        objective='multi:softmax',
        num_class=len(np.unique(fold_train_bal['code'])),
        n_estimators=1001,
        learning_rate=0.01,
        max_depth=8,
        random_state=44,
        eval_metric=['mlogloss', 'rmse']
    )

    # Fit the model
    X_train = fold_train_bal[band_names].astype(float)
    y_train = fold_train_bal['code'].astype(int)
    X_val = fold_val[band_names].astype(float)
    y_val = fold_val['code'].astype(int)
    
    xgb_model.fit(
        X_train, y_train,
        eval_set=[(X_val, y_val)],  # Specify validation set
        sample_weight=y.map(class_weights_n).values,
        eval_metric=['mlogloss'], # good for multiclass
        early_stopping_rounds=10, # stop if no improvement
        verbose=False
    )

    # Retrieve the evaluation metrics for the fold
    eval_results = xgb_model.evals_result()
    print(f"Evaluation metrics for fold {fold_idx + 1}:", eval_results)

    # Predict on the validation set
    
    y_pred = xgb_model.predict(X_val)

    # Calculate the accuracy and f1-score (weighted)
    f1 = f1_score(y_val, y_pred, average='weighted')
    acc = accuracy_score(y_val, y_pred, average='weighted')

    # Store the metrics into the results data frame
    fold_results = pd.DataFrame({
        'fold': [fold_idx],
        'mlogloss': [eval_metrics['mlogloss']],
        'rmse': [eval_metrics['rmse']]
        'f1_val': [f1],
        'acc_val': [acc],
        'val_labs': [y_val],
        'val_preds': [y_pred]
    })
    # Concatenate with the results data frame
    results = pd.concat([results, fold_results], ignore_index=True)

    # Save the model for further evaluation
    model_filename = f"dc-xgb_fold{fold_idx + 1}.model"
    xgb_model.save_model(os.path.join(results_dir, model_fn))

    t1 = (time.time() - t00) / 60
    print(f"Total elapsed time for fold {fold_idx}: {t1:.2f} minutes.")

    del X, y, X_val, y_val, y_pred
    gc.collect()

    fold_idx += 1 # fold counter
    print("\n~~~~~~~~~~\n")

t2 = (time.time() - t0) / 60
print(f"Total elapsed time: {t2:.2f} minutes.")

In [None]:
# # Append the feature set-specific results to the overall results dataframes
# all_results = pd.concat([all_results, results], ignore_index=True)
# all_feat_imps = pd.concat([all_feat_imps, feat_imps], ignore_index=True)
# all_prob_preds = pd.concat([all_prob_preds, prob_preds], ignore_index=True)

# del results, feat_imps, prob_preds

In [None]:
    # Store feature importance
    fold_imps = pd.DataFrame({
        'Fold': fold_idx,
        'Feature': X.columns,
        'Importance': xgb_model.feature_importances_
    })
    # Append to the running data frame
    feat_imps = pd.concat([feat_imps, fold_imps], axis=0)
    
    # Store the probability values for cutoff testing
    y_pred_probs = xgb_model.predict_proba(X_test)

    # Store probabilities and true labels
    fold_probs = pd.DataFrame({
        'Fold': fold_idx,
        'TrueLabel': y_val,
        'PredictedProb': list(y_pred_probs)
    })

    # Concatenate the feature importances and probabilities
    prob_preds = pd.concat([prob_preds, fold_probs], ignore_index=True)

In [None]:
results.head()

In [None]:
from sklearn.metrics import confusion_matrix, ConfusionMatrixDisplay
import numpy as np
import matplotlib.pyplot as plt

# Initialize the confusion matrix accumulator
n_classes = len(np.unique(y))  # Number of classes
cm_accumulator = np.zeros((n_classes, n_classes))

all_true_labels = []
all_pred_labels = []

class_acc_reports = []

# Loop over the folds and compute confusion matrices
for fold in range(1, fold_idx):
    fold_data = prob_preds[prob_preds['Fold'] == fold]
    true_labels = fold_data['TrueLabel']
    pred_labels = np.argmax(np.vstack(fold_data['PredictedProb']), axis=1)

    all_true_labels.extend(true_labels)
    all_pred_labels.extend(pred_labels)

    # Compute the classification report for this fold
    cr_fold = classification_report(true_labels, pred_labels, target_names=cor_labels, output_dict=True)
    # Convert the classification report to a DataFrame and store it
    cr_df_fold = pd.DataFrame(cr_fold).transpose()
    class_acc_reports.append(cr_df_fold)
    
    # Compute the confusion matrix for this fold
    cm_fold = confusion_matrix(true_labels, pred_labels, labels=np.unique(y))

    # Accumulate the confusion matrix
    cm_accumulator += cm_fold

# Average the confusion matrix by the number of folds
cm_avg = cm_accumulator / (fold_idx - 1)

# Round to the nearest integer (optional, as you mentioned it's fine)
cm_avg = np.rint(cm_avg).astype(int)

# Display the averaged confusion matrix
disp = ConfusionMatrixDisplay(confusion_matrix=cm_avg, display_labels=np.unique(y))

# Plot the confusion matrix
fig, ax = plt.subplots(figsize=(10, 10))
disp.plot(ax=ax)
plt.title('Averaged Confusion Matrix Across Folds')

# Save the figure
plt.savefig(os.path.join('/home/jovyan/data-store/results/FigX_xgboost_confusion_matrix_avg.png'), dpi=300, bbox_inches='tight')

plt.show()

In [None]:
cr_avg = pd.concat(class_acc_reports).groupby(level=0).mean()
print(cr_avg)

In [None]:
from sklearn.metrics import classification_report

# Create a classification report and convert it to a DataFrame
cor_labels = [label for label, idx in sorted(class_mapping.items(), key=lambda item: item[1])]
cr_df = pd.DataFrame(classification_report(all_true_labels, all_pred_labels, target_names=cor_labels, output_dict=True)).transpose()

# Compute the average accuracy metrics across the 10 folds
average_metrics = cr_df.loc[cor_labels].mean()

# Display the DataFrame
cr_df

In [None]:
# Save out the results
results_dir = '/home/jovyan/data-store/results/'
results.to_csv(os.path.join(results_dir,'dc_xgboost_folds_results.csv'))
feat_imps.to_csv(os.path.join(results_dir,'dc_xgboost_folds_feat_imps.csv'))
prob_preds.to_csv(os.path.join(results_dir,'dc_xgboost_folds_prob_peds.csv'))
cr_df.to_csv(os.path.join(results_dir,'dc_xgboost_classification_report_avg.csv'))