#  MODIS Water Cluster Training

Version: 0.1.0

Date modified: 05.01.2023

Modified by: Amanda Burke

In [1]:
import csv
import datetime
import glob
import joblib
import pickle
import numpy as np
import os
import pandas as pd
from pathlib import Path   
from sklearn.cluster import KMeans
from sklearn.model_selection import train_test_split 

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
%matplotlib inline


import optuna
from sklearn.ensemble import RandomForestClassifier as skRF
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, f1_score
from sklearn.metrics import classification_report, roc_curve, auc, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
#from sklearn.inspection import permutation_importance


# #GDAL Stuff
# from osgeo import gdalconst
# from osgeo import gdal
# from pprint import pprint

# GPU-based frameworks

# import cudf
# import cupy as cp
# from cuml.ensemble import RandomForestClassifier as cuRFC

## Parameters and Functions

In [2]:
# Run-wide configuration constants.
GPU = False  # GPU toggle; not referenced by the cells visible in this notebook (GPU imports are commented out)
MODEL = 'rf'  # model tag; not referenced by the cells visible in this notebook
TEST_RATIO = 0.2  # intended fraction of rows held out by the train/test split
RANDOM_STATE = 42  # seed intended for reproducible sampling
LABEL_NAME = 'water'  # name of the label column; not referenced by the cells visible in this notebook
DATA_TYPE = np.int16  # dtype passed to load_cpu_data as dataType
FRAC_LAND=0.5  # not referenced by the cells visible in this notebook
num_datapoints = 10000000  # upper bound on rows kept from each train/test frame after loading

Uncomment exactly one of the dataset-version blocks below (and comment out the others) to choose the input data.

In [3]:
# Exactly one of the version blocks below should be active (uncommented).

# #############################
# # VERSION 4.2.1 (targeted 500k points)
# TILE_IN = 'Golden'#v4.2.1
# DATA_VERSION='v4.2.1'
# offsets_indexes = ['x_offset', 'y_offset', 'year', 'julian_day','tileID']
# #############################

##############################
#VERSION 2.0.1 (5 million points)
TILE_IN = 'GLOBAL'#v2.0.1
DATA_VERSION='v2.0.1'
offsets_indexes = ['x_offset', 'y_offset', 'year', 'julian_day']
##############################

# #############################
# #VERSION 0.0.0 (2billion data points)
# TILE_IN = 'cleaned'#v2.0.1
# DATA_VERSION='AGU'
# offsets_indexes = []#'x_offset', 'y_offset', 'year', 'julian_day']
# ##############################

# Training files live in a per-version directory and are matched by tile name.
training_data_basepath = f'/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/{DATA_VERSION}'
glob_string = os.path.join(training_data_basepath,'MOD*{}*.parquet.gzip'.format(TILE_IN))
data_paths = sorted(glob.glob(glob_string))

print(data_paths)
# Fail with an explicit message instead of an IndexError when nothing matches.
if not data_paths:
    raise FileNotFoundError(f'No training data files match {glob_string}')
# Only the first match (in sorted order) is used downstream.
data_path = data_paths[0]
print(data_path)

['/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/v2.0.1/MOD09_GLOBAL_5469777_2_0_1.parquet.gzip']
/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/v2.0.1/MOD09_GLOBAL_5469777_2_0_1.parquet.gzip


In [4]:
def load_cpu_data(fpath, colsToDrop, yCol='water', testSize=0.2, randomState=42, 
            dataType=np.float32, cpu=True, splitXY=False, trainTestSplit=False,
            applyLog=False, imbalance=False, frac=0.1, land=False, multi=False, 
            multisample=1000000):
    """
    Load a training table, clean it, and optionally rebalance / split it.

    :param fpath: Path to the data to be ingested. When ``multi`` is True this
        is an iterable of paths. A path containing '.parquet' is read as
        parquet, anything else as CSV.
    :param colsToDrop: Columns which are not necessary, dropped after the
        zero-sum row filter has run.
    :param yCol: Name of the label column.
    :param testSize: Fraction of rows placed in the test split
        (only used when ``trainTestSplit`` is True).
    :param randomState: Seed used for every random step in this function
        (multi-file sampling, imbalance sampling/shuffle, train/test split).
    :param dataType: Data type to convert ingested data to.
    :param cpu: Unused; kept so existing callers keep working.
    :param splitXY: If False, return the cleaned DataFrame; if True, return
        features and label separately.
    :param trainTestSplit: With ``splitXY``, return the 4-tuple produced by
        ``train_test_split`` instead of ``(X, y)``.
    :param applyLog: Apply ``np.log1p`` to every feature column and drop rows
        that become NaN/inf.
    :param imbalance: Down-sample one label group to ``frac`` of its rows and
        keep the other group whole.
    :param frac: Fraction retained from the down-sampled group.
    :param land: Selects which group is down-sampled: the second label group
        (in sorted label order) when True, the first when False.
    :param multi: Treat ``fpath`` as several files, concatenate them and keep
        a random ``multisample`` rows.
    :param multisample: Number of rows sampled in ``multi`` mode.
    :return: A DataFrame, ``(X, y)``, or ``(X_train, X_test, y_train, y_test)``
        depending on ``splitXY`` / ``trainTestSplit``.
    """
    def _read_table(path):
        # Format is decided by name so '*.parquet.gzip' files are recognised.
        return pd.read_parquet(path) if '.parquet' in str(path) else pd.read_csv(path)

    def _drop_non_finite(frame):
        # Remove every row that holds a NaN or +/-inf in any column.
        bad_rows = frame.isin([np.nan, np.inf, -np.inf]).any(axis=1)
        return frame[~bad_rows].dropna(axis=0)

    if multi:
        df = pd.concat([_read_table(path_) for path_ in fpath])
        df = df.sample(n=multisample, random_state=randomState)
        print('DF length: {}'.format(len(df.index)))
    else:
        df = _read_table(fpath)

    # Drop rows where a band plus band 2 sums to zero.
    # NOTE(review): presumably these sums are denominators of the normalised
    # indices (ndvi / ndwi) - confirm.
    for band in ('sur_refl_b01_1', 'sur_refl_b07_1', 'sur_refl_b06_1'):
        df = df[df[band] + df['sur_refl_b02_1'] != 0]

    df = df.drop(columns=colsToDrop)
    cleanedDF = _drop_non_finite(df).astype(dataType)
    if applyLog:
        for col in cleanedDF.drop([yCol], axis=1).columns:
            print('Applying log1p func to {}'.format(col))
            cleanedDF[col] = np.log1p(cleanedDF[col])
        # log1p yields NaN/-inf for values <= -1, so clean once more.
        cleanedDF = _drop_non_finite(cleanedDF)
    df = None  # release the raw frame

    if imbalance:
        if land:
            print('Imbalancing data, sampling {} from water'.format(frac))
        else:
            print(f'Imbalancing data, sampling {frac} from land, {1-frac} from water')
        groupedDF = cleanedDF.groupby(yCol)
        # Groups come back in sorted label order, so index 0 is the lowest label.
        dfs = [groupedDF.get_group(label) for label in groupedDF.groups]
        to_sample, to_keep = (dfs[1], dfs[0]) if land else (dfs[0], dfs[1])
        sampledDF = to_sample.sample(frac=frac, random_state=randomState)
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        cleanedDF = (pd.concat([sampledDF, to_keep])
                     .sample(frac=1, random_state=randomState)
                     .reset_index(drop=True))

    if not splitXY:
        return cleanedDF
    X = cleanedDF.drop([yCol], axis=1).astype(dataType)
    y = cleanedDF[yCol].astype(dataType)
    if trainTestSplit:
        # Honour the function's own arguments rather than notebook globals.
        return train_test_split(X, y, test_size=testSize, random_state=randomState)
    return X, y

Change the input features below

In [5]:
# Features removed before modelling; comment an entry out to keep that feature as a model input.
colsToDrop = [
    # 'sur_refl_b01_1',
    # 'sur_refl_b02_1',
    'sur_refl_b03_1',
    'sur_refl_b04_1',
    'sur_refl_b05_1',
    'sur_refl_b06_1',
    # 'sur_refl_b07_1',
    # 'ndvi',
    'ndwi1',
    'ndwi2',
]

# For training, the offset/index columns of the selected data version are dropped as well.
colsToDropTraining = [*colsToDrop, *offsets_indexes]

# Every feature name, whether or not it is currently dropped.
v_names = [
    'sur_refl_b01_1', 'sur_refl_b02_1', 'sur_refl_b03_1', 'sur_refl_b04_1',
    'sur_refl_b05_1', 'sur_refl_b06_1', 'sur_refl_b07_1', 'ndvi',
    'ndwi1', 'ndwi2',
]

In [6]:
colsToDrop

['sur_refl_b03_1',
 'sur_refl_b04_1',
 'sur_refl_b05_1',
 'sur_refl_b06_1',
 'ndwi1',
 'ndwi2']

## Input data

In [7]:
%%time
# Load the table, split it into train/test parts, then cap every part at
# num_datapoints rows. Unpacking order follows the loader's 4-tuple.
X, X_test, y, y_test = (
    part.iloc[:num_datapoints]
    for part in load_cpu_data(
        fpath=data_path,
        colsToDrop=colsToDropTraining,
        dataType=DATA_TYPE,
        splitXY=True,
        imbalance=False,
        trainTestSplit=True,
    )
)

print(f'data shape: {X.shape}, {y.shape}')

data shape: (4375821, 4), (4375821,)
CPU times: user 4.09 s, sys: 986 ms, total: 5.08 s
Wall time: 4.59 s


In [8]:
#Getting the indices that are associated with land (0) and water (1)
y_water_ind = np.where(y>0.5)[0]
y_land_ind = np.where(y<0.5)[0]

#Subset the X AND y data to later/ subset with the clusters and then combine for RFA
X_water = X.iloc[y_water_ind,:]
y_water = y.iloc[y_water_ind]

X_land = X.iloc[y_land_ind,:]
y_land = y.iloc[y_land_ind]
print(len(X_water),len(X_land))

1976893 2398928


In [9]:
# List the feature columns that survived the drop list, one per line.
for column in X.columns:
    print(column)

sur_refl_b01_1
sur_refl_b02_1
sur_refl_b07_1
ndvi


## Clustering

Based on the cluster analysis above (run on 5.03.23), 15 clusters appear to retain the most data while excluding outliers, so that number is used for the selection.

In [10]:
# Filenames (relative to the working directory) for the pickled k-means fits.
kmean_land_fit_file = 'kmeans_land_fit.pkl'
kmean_water_fit_file = 'kmeans_water_fit.pkl'
# Disabled: reload previously pickled fits instead of re-fitting.
# NOTE(review): pickle.load executes arbitrary code from the file; only
# re-enable this for files this notebook wrote itself.
# if len(glob.glob(kmean_land_fit_file)) == 1:
#     print(f"Opening {kmean_land_fit_file}")
#     kme_land_random = pickle.load(open(kmean_land_fit_file, 'rb'))
# if len(glob.glob(kmean_water_fit_file)) == 1:
#     print(f"Opening {kmean_water_fit_file}")
#     kme_water_random = pickle.load(open(kmean_water_fit_file, 'rb'))

In [11]:
# Number of k-means clusters fitted per class.
CLUSTER_NUM = 15

# Keyword arguments shared by both KMeans fits.
common_params = dict(n_init="auto", random_state=42, init="random")

In [12]:
%%time
# Fit k-means on the water samples, then label each water sample with its cluster.
kme_water_random = KMeans(n_clusters=CLUSTER_NUM, **common_params)
kme_water_random.fit(X_water)
kmeans_output_water_random = kme_water_random.predict(X_water)

CPU times: user 58.1 s, sys: 1.35 s, total: 59.4 s
Wall time: 15.3 s


In [13]:
%%time
# Fit k-means on the land samples, then label each land sample with its cluster.
kme_land_random = KMeans(n_clusters=CLUSTER_NUM, **common_params)
kme_land_random.fit(X_land)
kmeans_output_land_random = kme_land_random.predict(X_land)

CPU times: user 2min 18s, sys: 2.6 s, total: 2min 21s
Wall time: 35.4 s


In [15]:
# Persist both fitted k-means models. The context manager closes each file
# handle as soon as its dump completes.
for fit_file, fitted_model in ((kmean_land_fit_file, kme_land_random),
                               (kmean_water_fit_file, kme_water_random)):
    with open(fit_file, "wb") as fit_handle:
        pickle.dump(fitted_model, fit_handle)

### Evenly balanced cluster data

In [14]:
# Size of the smallest cluster in each class, scanning the cluster ids that
# occur among the water labels.
water_cluster_ids = np.unique(kmeans_output_water_random)
COUNT_EVEN_BALANCE_LAND = min(
    (int(np.count_nonzero(kmeans_output_land_random == cluster_id))
     for cluster_id in water_cluster_ids),
    default=np.inf,
)
COUNT_EVEN_BALANCE_WATER = min(
    (int(np.count_nonzero(kmeans_output_water_random == cluster_id))
     for cluster_id in water_cluster_ids),
    default=np.inf,
)

print(COUNT_EVEN_BALANCE_LAND, COUNT_EVEN_BALANCE_WATER)
# Every cluster contributes the same number of points: the smaller of the two minima.
COUNT = min(COUNT_EVEN_BALANCE_LAND, COUNT_EVEN_BALANCE_WATER)
print(COUNT,COUNT_EVEN_BALANCE_LAND,COUNT_EVEN_BALANCE_WATER)

919 33589
919 919 33589


In [16]:
np.random.seed(42)
water_picks = []
land_picks = []

# Draw COUNT points from every cluster id. A cluster id is skipped for BOTH
# classes when its water draw reaches a band-1 value of 10000 or more.
for cluster in np.unique(kmeans_output_water_random):
    print(f'cluster {cluster}')
    cluster_water = np.flatnonzero(kmeans_output_water_random == cluster)
    sample_water = np.random.choice(cluster_water, COUNT, replace=False)
    max_X_random_water = np.nanmax(X_water['sur_refl_b01_1'].iloc[sample_water])
    if not (max_X_random_water < 10000):
        print(f'contains outliers')
        continue
    water_picks.append(sample_water)

    cluster_land = np.flatnonzero(kmeans_output_land_random == cluster)
    sample_land = np.random.choice(cluster_land, COUNT, replace=False)
    land_picks.append(sample_land)

# The empty seed array keeps the concatenation valid when nothing was kept.
cluster_sample_water = np.concatenate([np.array([]), *water_picks]).astype('int')
cluster_sample_land = np.concatenate([np.array([]), *land_picks]).astype('int')

print(len(cluster_sample_water),len(cluster_sample_land))

cluster 0
cluster 1
cluster 2
cluster 3
cluster 4
cluster 5
cluster 6
cluster 7
cluster 8
cluster 9
cluster 10
cluster 11
cluster 12
cluster 13
cluster 14
13785 13785


#### Combining even balance cluster data

In [17]:
# Stack the land rows on top of the water rows, features and labels alike.
X_seperate_cluster = pd.concat([
    X_land.iloc[cluster_sample_land],X_water.iloc[cluster_sample_water]
    ])
    
y_seperate_cluster = pd.concat([
    y_land.iloc[cluster_sample_land],y_water.iloc[cluster_sample_water]
    ])

# Join features and label so one shuffle keeps each row's label attached.
# The shuffle is seeded so re-running the cell reproduces the same row order.
all_cluster = (
    pd.concat([X_seperate_cluster,y_seperate_cluster],axis=1)
    .sample(frac=1, random_state=RANDOM_STATE)
)
X_cluster = all_cluster[X_seperate_cluster.columns]
y_cluster = all_cluster['water']

print(all_cluster)
print(X_cluster)
print(y_cluster)


         sur_refl_b01_1  sur_refl_b02_1  sur_refl_b07_1   ndvi  water
1860342             477            2980             958   7240      0
2227721             162            2774             352   8896      0
325991               21               6              65  -5555      1
2253569             -32               2              12 -11333      1
5381081              23              -5              19 -15555      1
...                 ...             ...             ...    ...    ...
1996160              29             -37              63  16964      1
2942419            -100              93             329 -13570      0
901337              491            1740            1129   5598      0
3174851             318             112             178  -4790      1
94498                -5               3              11  25536      1

[27570 rows x 5 columns]
         sur_refl_b01_1  sur_refl_b02_1  sur_refl_b07_1   ndvi
1860342             477            2980             958   7240
2227721 

### Proportional cluster data

In [None]:
# Cluster labels per sample: kmeans_output_land_random and kmeans_output_water_random
# Data: X_water, X_land, y_water, y_land

# Fraction of each cluster's points drawn by the proportional sampling cell.
PERCENT_RANDOM_PULL = 0.15

In [None]:
# np.random.seed(42)
# Per-cluster draws are collected in lists and concatenated once at the end,
# so both results are integer arrays even when no cluster is kept.
water_pulls = []
land_pulls = []

for cluster in np.unique(kmeans_output_water_random):
    print(f'cluster {cluster}')
    cluster_ind_water = np.where(kmeans_output_water_random == cluster)[0]
    # cluster_ind_water = np.where(bgm_water == cluster)[0]
    COUNT_RANDOM_PULL_WATER = int(PERCENT_RANDOM_PULL*len(cluster_ind_water))
    random_pts_water = np.random.choice(cluster_ind_water,COUNT_RANDOM_PULL_WATER,replace=False)
    max_X_random_water = np.nanmax(X_water['sur_refl_b01_1'].iloc[random_pts_water])
    # A water draw whose band-1 maximum reaches 10000 is discarded as containing outliers.
    if max_X_random_water < 10000:
        water_pulls.append(random_pts_water)
    else: print(f'Cluster {cluster} contains outliers')
    
    # Land points are pulled for this cluster id whether or not the water draw was kept.
    cluster_ind_land = np.where(kmeans_output_land_random == cluster)[0]
    # cluster_ind_land = np.where(bgm_land == cluster)[0]
    COUNT_RANDOM_PULL_LAND = int(PERCENT_RANDOM_PULL*len(cluster_ind_land))
    random_pts_land = np.random.choice(cluster_ind_land,COUNT_RANDOM_PULL_LAND,replace=False)
    land_pulls.append(random_pts_land)
    print(f'Pulling {COUNT_RANDOM_PULL_WATER} Water pts and {COUNT_RANDOM_PULL_LAND} Land pts')
    print()

random_ind_water = np.concatenate([np.array([], dtype='int'), *water_pulls]).astype('int')
random_ind_land = np.concatenate([np.array([], dtype='int'), *land_pulls]).astype('int')

print(random_ind_water,random_ind_land)

### Creating random sample, same size as clusters

In [18]:
# A dedicated, seeded generator makes this cell reproducible no matter which
# cells consumed the global NumPy random state before it.
match_rng = np.random.default_rng(RANDOM_STATE)

# Draw as many purely random rows per class as the cluster-based sample holds.
match_sample_land = match_rng.choice(len(X_land), size=len(cluster_sample_land), replace=False)
match_sample_water = match_rng.choice(len(X_water), size=len(cluster_sample_water), replace=False)

X_seperate_match= pd.concat([
    X_land.iloc[match_sample_land],X_water.iloc[match_sample_water]
        ])
y_seperate_match = pd.concat([
    y_land.iloc[match_sample_land],y_water.iloc[match_sample_water]
        ])

# Join features and label so one seeded shuffle keeps each row's label attached.
all_match = (
    pd.concat([X_seperate_match,y_seperate_match],axis=1)
    .sample(frac=1, random_state=RANDOM_STATE)
    .reset_index(drop=True)
)
X_match= all_match[X_seperate_match.columns]
y_match = all_match['water']

print(all_match)
print(X_match)
print(y_match)

       sur_refl_b01_1  sur_refl_b02_1  sur_refl_b07_1   ndvi  water
0                 176            3652             498   9080      0
1                1043            3682            1292   5585      0
2                 497            2324            1346   6476      0
3                  47               2              30  -9183      1
4                 111            1862             229   8874      0
...               ...             ...             ...    ...    ...
27565             358            1683             835   6491      0
27566             971            2881            1883   4958      0
27567              35              -3              12 -11875      1
27568            1130            2442            1474   3673      0
27569            2513            1731             830  -1842      0

[27570 rows x 5 columns]
       sur_refl_b01_1  sur_refl_b02_1  sur_refl_b07_1   ndvi
0                 176            3652             498   9080
1                1043            368

### Plotting parameter space

In [None]:
# fig, ax = plt.subplots(2, 2,figsize=(20, 10))
# var=0
# for col in range(2):
#     ax[col, 0].set_ylabel('Frequency') 
#     for row in range(2):
#         variable=X_land.columns[var]
#         if 'ndvi' in variable: 
#             # var_bins = bin_boundaries
#             log_values = False
#         else: 
#             # var_bins = None
#             log_values = True
#         ax[row, col].hist(
#             [  
#             X_cluster_eb[variable].values,
#             X_match_eb[variable].values,
#             X_cluster_p[variable].values,
#             X_match_p[variable].values,
#             ],
#             label=[
#             f"EB Cluster {len(X_cluster_eb)}",
#             "EB Match",
#             f"P Cluster {len(X_match_p)}",
#             f"P Match"
#             ],
#             #bins=var_bins,
#         color=['darkgreen','lightgreen','darkblue','lightblue'], log=log_values) 
#         ax[row, col].set_xlabel(f'{variable}')
#         var+=1
#     ax[0,0].legend(loc='upper right',fontsize=20)
# plt.show()

## Random forest

In [19]:
def cpu_rf_objective(trial, X=None, y=None, n_splits=5):
    """
    Optuna objective: mean F1 score of a random forest under stratified K-fold CV.

    :param trial: Optuna trial used to pick the hyperparameters.
    :param X: Feature DataFrame to cross-validate on. Defaults to the
        notebook-level ``X_match`` when omitted.
    :param y: Label Series matching ``X``. Defaults to the notebook-level
        ``y_match`` when omitted.
    :param n_splits: Number of stratified folds.
    :return: Mean F1 across the folds, or 0.0 as soon as any fold scores 0.0.
    :raises ValueError: If only one of ``X`` / ``y`` is supplied.

    To tune on another dataset without editing this function, bind it, e.g.
    ``study.optimize(lambda t: cpu_rf_objective(t, X_cluster, y_cluster), ...)``.
    """
    if (X is None) != (y is None):
        raise ValueError('Pass both X and y, or neither.')
    if X is None:
        #####################################################################
        # HERE IS WHERE TO CHANGE THE X,Y DATASET USED FOR TRAINING
        #####################################################################
        X, y = X_match, y_match
        # X, y = X_cluster, y_cluster

    # Candidate values; these mirror the notebook-level search_space used by
    # the grid sampler.
    list_trees = [75, 100, 125, 150, 175, 200, 250, 300, 400, 500]
    max_depth = [5, 10, 30, 50, 80, 90, 100, 110]
    min_samples_leaf = [1, 2, 3, 4, 5]
    min_samples_split = [2, 4, 8, 10]
    bootstrap = [True, False]
    max_features = ['auto', 'sqrt', 'log2']

    # Suggestions are requested in the same order as before so seeded
    # samplers draw the same sequence.
    n_estimators_choice = trial.suggest_categorical('n_estimators', list_trees)
    max_depth_choice = trial.suggest_categorical('max_depth', max_depth)
    min_samples_split_choice = trial.suggest_categorical('min_samples_split', min_samples_split)
    min_samples_leaf_choice = trial.suggest_categorical('min_samples_leaf', min_samples_leaf)
    bootstrap_choice = trial.suggest_categorical('bootstrap', bootstrap)
    max_features_choice = trial.suggest_categorical('max_features', max_features)
    # 'auto' meant 'sqrt' for classifiers and is rejected by scikit-learn >= 1.3,
    # so translate it; the choice stays in the list to keep the search space unchanged.
    if max_features_choice == 'auto':
        max_features_choice = 'sqrt'

    param = {'n_estimators': n_estimators_choice,
             'max_depth': max_depth_choice,
             'min_samples_split': min_samples_split_choice,
             'min_samples_leaf': min_samples_leaf_choice,
             'bootstrap': bootstrap_choice,
             'criterion': 'gini',
             #'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 1e-8, 1.0, log=True),
             'max_features': max_features_choice,
             'max_leaf_nodes': None,
             'min_impurity_decrease': 0.0,
             'oob_score': False,
             'n_jobs': -1,
             # 'random_state':42,
             'verbose': 0,
             'warm_start': False,
             'class_weight': None,
             'ccp_alpha': 0.0,
             'max_samples': None
             }
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    cv_scores = np.empty(n_splits)

    for idx, (train_idx, val_idx) in enumerate(cv.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model = skRF(**param)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        cv_scores[idx] = f1_score(y_val, preds)
        # A zero F1 marks the configuration as useless; stop without running
        # the remaining folds.
        if cv_scores[idx] == 0.0:
            print('Pruning because of 0.0 score.')
            return 0.0
        print('Fold {}: {}'.format(idx, cv_scores[idx]))
    return np.mean(cv_scores)

# Grid handed to optuna.samplers.GridSampler. The keys and candidate values
# match the suggest_categorical calls inside cpu_rf_objective.
search_space={
    "n_estimators": [75, 100, 125, 150, 175, 200, 250, 300, 400, 500],
    "max_depth" : [5,10, 30, 50, 80, 90, 100, 110],
    "min_samples_leaf" : [1, 2, 3, 4, 5],
    "min_samples_split" : [2, 4, 8, 10],
    "bootstrap" : [True, False],
    "max_features" : ['auto', 'sqrt', 'log2'],
    
}
TREES_AND_DEPTH_ONLY = False  # not referenced by the cells visible in this notebook
GRID_SEARCH = True  # True: use the grid sampler over search_space; False: sampler left at Optuna's default

### Training RF

Change modeling type below e.g.,

study.optimize(cpu_***rf***_objective, n_trials=25, timeout=30*600)

vs

study.optimize(cpu_***xgb***_objective, n_trials=25, timeout=30*600)


In [None]:
%%time

optuna.logging.set_verbosity(optuna.logging.INFO)

# GRID_SEARCH selects the grid sampler over search_space; otherwise no sampler
# is given and Optuna falls back to its default.
study_name = 'RF Tuning Grid Search' if GRID_SEARCH else 'RF Tuning'
study_sampler = optuna.samplers.GridSampler(search_space) if GRID_SEARCH else None
study = optuna.create_study(study_name=study_name,
                            direction='maximize',
                            sampler=study_sampler)

# Stop after 25 trials or once the timeout (30*600 = 18000) is exceeded,
# whichever comes first.
study.optimize(cpu_rf_objective, n_trials=25, timeout=30*600)

[32m[I 2024-02-06 15:12:30,300][0m A new study created in memory with name: RF Tuning Grid Search[0m


Fold 0: 0.9782766111513396
Fold 1: 0.9779225479551213
Fold 2: 0.9772768587529541
Fold 3: 0.9798730734360834


[32m[I 2024-02-06 15:12:34,460][0m Trial 0 finished with value: 0.9784730020507061 and parameters: {'n_estimators': 75, 'max_depth': 50, 'min_samples_split': 2, 'min_samples_leaf': 5, 'bootstrap': False, 'max_features': 'sqrt'}. Best is trial 0 with value: 0.9784730020507061.[0m


Fold 4: 0.9790159189580318
Fold 0: 0.9780916168748868
Fold 1: 0.9775849602313811
Fold 2: 0.9761861479730959
Fold 3: 0.9795029929258118


[32m[I 2024-02-06 15:12:43,021][0m Trial 1 finished with value: 0.977970124255528 and parameters: {'n_estimators': 150, 'max_depth': 90, 'min_samples_split': 8, 'min_samples_leaf': 2, 'bootstrap': False, 'max_features': 'auto'}. Best is trial 0 with value: 0.9784730020507061.[0m


Fold 4: 0.9784849032724642
Fold 0: 0.9782687432089823
Fold 1: 0.9781153915717128
Fold 2: 0.9770909090909091
Fold 3: 0.9802213754309563


[32m[I 2024-02-06 15:12:58,857][0m Trial 2 finished with value: 0.9786133338062232 and parameters: {'n_estimators': 400, 'max_depth': 100, 'min_samples_split': 4, 'min_samples_leaf': 2, 'bootstrap': True, 'max_features': 'auto'}. Best is trial 2 with value: 0.9786133338062232.[0m


Fold 4: 0.9793702497285559
Fold 0: 0.9782766111513396
Fold 1: 0.9783001808318265
Fold 2: 0.976203451407811
Fold 3: 0.9793253536452666


[32m[I 2024-02-06 15:13:04,389][0m Trial 3 finished with value: 0.9780457804088758 and parameters: {'n_estimators': 100, 'max_depth': 80, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': False, 'max_features': 'log2'}. Best is trial 2 with value: 0.9786133338062232.[0m


Fold 4: 0.9781233050081359
Fold 0: 0.9786231884057972
Fold 1: 0.9786463988418385
Fold 2: 0.977632296781233
Fold 3: 0.9805842859735076


[32m[I 2024-02-06 15:13:11,487][0m Trial 4 finished with value: 0.9790805884423062 and parameters: {'n_estimators': 175, 'max_depth': 90, 'min_samples_split': 2, 'min_samples_leaf': 4, 'bootstrap': True, 'max_features': 'auto'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.979916772209155
Fold 0: 0.9791704401376562
Fold 1: 0.9779385171790236
Fold 2: 0.9774709302325582
Fold 3: 0.9804063860667634


[32m[I 2024-02-06 15:13:14,609][0m Trial 5 finished with value: 0.9788713046689115 and parameters: {'n_estimators': 75, 'max_depth': 30, 'min_samples_split': 4, 'min_samples_leaf': 1, 'bootstrap': True, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9793702497285559
Fold 0: 0.9780836804926645
Fold 1: 0.9775606225117626
Fold 2: 0.9765411893071468
Fold 3: 0.9806054014863151


[32m[I 2024-02-06 15:13:25,518][0m Trial 6 finished with value: 0.9782890036510394 and parameters: {'n_estimators': 200, 'max_depth': 80, 'min_samples_split': 8, 'min_samples_leaf': 3, 'bootstrap': False, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9786541244573084
Fold 0: 0.9787928221859707
Fold 1: 0.9784381228483421
Fold 2: 0.9765411893071468
Fold 3: 0.9794955543458538


[32m[I 2024-02-06 15:13:30,200][0m Trial 7 finished with value: 0.9781220894866017 and parameters: {'n_estimators': 100, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 1, 'bootstrap': False, 'max_features': 'auto'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9773427587456952
Fold 0: 0.9782766111513396
Fold 1: 0.9781233050081359
Fold 2: 0.9774381368267832
Fold 3: 0.9802213754309563


[32m[I 2024-02-06 15:13:46,162][0m Trial 8 finished with value: 0.9787258252965328 and parameters: {'n_estimators': 400, 'max_depth': 110, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': True, 'max_features': 'log2'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9795696980654494
Fold 0: 0.9784537389100127
Fold 1: 0.9786463988418385
Fold 2: 0.9774463441251364
Fold 3: 0.9807622504537206


[32m[I 2024-02-06 15:14:01,947][0m Trial 9 finished with value: 0.9789712487285852 and parameters: {'n_estimators': 400, 'max_depth': 80, 'min_samples_split': 10, 'min_samples_leaf': 2, 'bootstrap': True, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9795475113122173
Fold 0: 0.9789778905400508
Fold 1: 0.9784459337076616
Fold 2: 0.9768964889939967
Fold 3: 0.9802285506983495


[32m[I 2024-02-06 15:14:08,746][0m Trial 10 finished with value: 0.9788148280516464 and parameters: {'n_estimators': 150, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 2, 'bootstrap': False, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9795252763181735
Fold 0: 0.9784693323683734
Fold 1: 0.9783080260303688
Fold 2: 0.9778020378457061
Fold 3: 0.9798657718120805


[32m[I 2024-02-06 15:14:16,676][0m Trial 11 finished with value: 0.9786613913348828 and parameters: {'n_estimators': 200, 'max_depth': 90, 'min_samples_split': 2, 'min_samples_leaf': 2, 'bootstrap': True, 'max_features': 'sqrt'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9788617886178861
Fold 0: 0.9786309308221659
Fold 1: 0.9773837524877873
Fold 2: 0.9767272727272727
Fold 3: 0.9796733212341198


[32m[I 2024-02-06 15:14:24,747][0m Trial 12 finished with value: 0.9781784795815976 and parameters: {'n_estimators': 150, 'max_depth': 30, 'min_samples_split': 2, 'min_samples_leaf': 3, 'bootstrap': False, 'max_features': 'auto'}. Best is trial 4 with value: 0.9790805884423062.[0m


Fold 4: 0.9784771206366432
Fold 0: 0.9784537389100127


In [None]:
# This is a single-objective study, so the best trial is available directly.
best_trial = study.best_trial
max_trial_score = best_trial.value
# Copy so that adding keys below does not modify the trial's own params.
max_trial_params = dict(best_trial.params)
max_trial_params['n_jobs'] = -1
# Integer score tag used in the output filename: the score rounded to four
# decimals, scaled by 1000 and truncated (e.g. 0.97908 -> 979).
score_print = int(np.round(max_trial_score,4)*1000)
print(score_print)

In [None]:
# Work on a copy so max_trial_params keeps exactly what the study reported.
hyperparameters = dict(max_trial_params)
hyperparameters['n_jobs'] = -1
# 'auto' meant 'sqrt' for classifiers and is rejected by scikit-learn >= 1.3.
if hyperparameters.get('max_features') == 'auto':
    hyperparameters['max_features'] = 'sqrt'
print('Using these params:')
print(hyperparameters)
tuned_classifier = skRF(**hyperparameters)

Change the data in the .fit() function below e.g. 

tuned_classifier.fit(***X_cluster, y_cluster***)

vs

tuned_classifier.fit(***X_match, y_match***)

In [None]:
%%time 
# NOTE(review): cpu_rf_objective cross-validates on X_match / y_match, while
# this final fit uses X_cluster / y_cluster - confirm that tuning and training
# are meant to use different datasets.
tuned_classifier.fit(X_cluster, y_cluster)

Change the filename below e.g., 

out_file = f'rfa_models/MODIS_RFA_v201_***EBcluster***_MaxScore{score_print}_sfcref127ndvi.pkl'

vs

out_file = f'rfa_models/MODIS_RFA_v201_***EBmatch***_MaxScore{score_print}_sfcref127ndvi.pkl'

In [None]:
# Output path for the pickled model; the name embeds the integer score tag.
# The dataset tag ('EBcluster') should correspond to the data passed to .fit().
out_file = f'rfa_models/MODIS_RFA_v201_EBcluster_MaxScore{score_print}_sfcref127ndvi.pkl'
print(out_file)

In [None]:
# Create the output directory on first use, and close the file handle as soon
# as the dump completes.
Path(out_file).parent.mkdir(parents=True, exist_ok=True)
with open(out_file, 'wb') as model_file:
    pickle.dump(tuned_classifier, model_file)

In [None]:
# pickled_model = pickle.load(open(out_file, 'rb'))
# print(pickled_model)