#  MODIS Water Cluster Training

Version: 0.1.0

Date modified: 05.01.2023

Modified by: Amanda Burke

In [1]:
import csv
import datetime
import glob
import joblib
import numpy as np
import os
import math 
import pandas as pd
from pathlib import Path   
from sklearn.cluster import KMeans
from sklearn.cluster import Birch
from sklearn.cluster import SpectralClustering
from sklearn.model_selection import train_test_split 

# Visualization
import seaborn as sns
import matplotlib.pyplot as plt
import warnings


# plt.style.use('fivethirtyeight')
warnings.filterwarnings('ignore')
%matplotlib inline


import optuna
from sklearn.ensemble import RandomForestClassifier as skRF
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score, precision_score, f1_score
from sklearn.metrics import classification_report, roc_curve, auc, matthews_corrcoef
from sklearn.model_selection import RandomizedSearchCV, KFold, StratifiedKFold
#from sklearn.inspection import permutation_importance


# #GDAL Stuff
# from osgeo import gdalconst
# from osgeo import gdal
# from pprint import pprint

# # GPU-based frameworks

# import cudf
# import cupy as cp
# from cuml.ensemble import RandomForestClassifier as cuRFC

In [2]:
# df = pd.read_pickle('MODIS_RFA_v201_NoCluster_sfcref127ndvi.pkl')
# df

In [3]:
# Switch read by later cells to choose between the CPU branch (load_cpu_data,
# cpu_rf_objective) and the GPU branch (load_gpu_data, gpu_rf_objective).
GPU = False

In [4]:
# Run-wide settings shared by the cells below.
MODEL = 'rf'
LABEL_NAME = 'water'
RANDOM_STATE = 42
TEST_RATIO = 0.2
FRAC_LAND = 0.5
num_datapoints = 10_000_000

# Element type for the loaded data. Only the selected side of the conditional
# is evaluated, so `cp` is looked up only when the GPU path is enabled.
DATA_TYPE = np.int16 if GPU is False else cp.float32

In [5]:
# Dataset selection: keep exactly one of the VERSION sections below active.

# #############################
# # VERSION 4.2.1 (targeted 500k points)
# TILE_IN = 'Golden'#v4.2.1
# DATA_VERSION='v4.2.1'
# offsets_indexes = ['x_offset', 'y_offset', 'year', 'julian_day','tileID']
# #############################

##############################
#VERSION 2.0.1 (5 million points)
TILE_IN = 'GLOBAL'#v2.0.1
DATA_VERSION='v2.0.1'
offsets_indexes = ['x_offset', 'y_offset', 'year', 'julian_day']
##############################

# #############################
# #VERSION 0.0.0 (2billion data points)
# TILE_IN = 'cleaned'#v0.0.0
# DATA_VERSION='AGU'
# offsets_indexes = []#'x_offset', 'y_offset', 'year', 'julian_day']
# ##############################

training_data_basepath = f'/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/{DATA_VERSION}'
glob_string = os.path.join(training_data_basepath, f'MOD*{TILE_IN}*.parquet.gzip')
data_paths = sorted(glob.glob(glob_string))

# Fail with a readable message instead of an IndexError when nothing matches.
if not data_paths:
    raise FileNotFoundError(f'No training data matched {glob_string!r}')

# When several files match, the first one in sorted order is the one used.
print(data_paths)
data_path = data_paths[0]
print(data_path)

['/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/v2.0.1/MOD09_GLOBAL_5469777_2_0_1.parquet.gzip']
/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/v2.0.1/MOD09_GLOBAL_5469777_2_0_1.parquet.gzip


In [6]:
def load_cpu_data(fpath, colsToDrop, yCol='water', testSize=0.2, randomState=42, 
            dataType=np.float32, cpu=True, splitXY=False, trainTestSplit=False,
            applyLog=False, imbalance=False, frac=0.1, land=False, multi=False, 
            multisample=1000000):
    """
    Load a training table, clean it and optionally split it for modelling.

    :param fpath: Path to the data to be ingested (parquet when the path
        contains '.parquet', CSV otherwise). With ``multi=True`` this is an
        iterable of CSV paths.
    :param colsToDrop: Columns which are not necessary, from which to drop.
    :param yCol: Name of the label column.
    :param testSize: Fraction of rows placed in the test split when
        ``trainTestSplit`` is True.
    :param randomState: Seed used for the multi-file sample, the imbalance
        sampling/shuffle and the train/test split.
    :param dataType: Data type to convert ingested data to.
    :param cpu: Unused; kept so existing callers keep working.
    :param splitXY: When False the cleaned DataFrame is returned as-is; when
        True features and label are returned separately.
    :param trainTestSplit: With ``splitXY=True``, return the four-way
        ``train_test_split`` result instead of ``(X, y)``.
    :param applyLog: Apply ``np.log1p`` to every feature column and drop rows
        that become NaN or infinite.
    :param imbalance: Down-sample one label group to ``frac`` of its rows and
        keep the other group whole, then shuffle.
    :param frac: Fraction kept from the down-sampled group.
    :param land: When True the second label group is down-sampled, otherwise
        the first one (groups are ordered by label value).
    :param multi: Read and concatenate several CSV files, then sample
        ``multisample`` rows.
    :param multisample: Number of rows sampled when ``multi`` is True.
    :return: A DataFrame, ``(X, y)``, or ``(X_train, X_test, y_train, y_test)``
        depending on ``splitXY`` / ``trainTestSplit``.
    """
    if multi:
        frames = [pd.read_csv(path_) for path_ in fpath]
        df = pd.concat(frames).sample(n=multisample, random_state=randomState)
        print('DF length: {}'.format(len(df.index)))
    elif '.parquet' in str(fpath):
        # str() lets pathlib.Path inputs through the substring check.
        df = pd.read_parquet(fpath)
    else:
        df = pd.read_csv(fpath)

    # Drop rows where band 2 plus band 1, 7 or 6 sums to zero (presumably the
    # denominators of the normalised-difference indices -- confirm).
    for band in ('sur_refl_b01_1', 'sur_refl_b07_1', 'sur_refl_b06_1'):
        df = df[df[band] + df['sur_refl_b02_1'] != 0]

    df = df.drop(columns=colsToDrop)
    bad_values = [np.nan, np.inf, -np.inf]
    # axis must be passed by keyword: pandas 2.0 made it keyword-only.
    cleanedDF = df[~df.isin(bad_values).any(axis=1)].dropna(axis=0).astype(dataType)
    if applyLog:
        for col in cleanedDF.drop([yCol], axis=1).columns:
            print('Applying log1p func to {}'.format(col))
            cleanedDF[col] = np.log1p(cleanedDF[col])
        cleanedDF = cleanedDF[~cleanedDF.isin(bad_values).any(axis=1)].dropna(axis=0)
    del df  # release the raw frame before any further copies are made

    if imbalance:
        if land:
            print('Imbalancing data, sampling {} from water'.format(frac))
        else:
            print(f'Imbalancing data, sampling {frac} from land, keeping all water')
        groupedDF = cleanedDF.groupby(yCol)
        dfs = [groupedDF.get_group(label) for label in groupedDF.groups]
        if land:
            sampledDF, keptDF = dfs[1].sample(frac=frac, random_state=randomState), dfs[0]
        else:
            sampledDF, keptDF = dfs[0].sample(frac=frac, random_state=randomState), dfs[1]
        # pd.concat replaces DataFrame.append, which pandas 2.0 removed.
        cleanedDF = (pd.concat([sampledDF, keptDF])
                     .sample(frac=1, random_state=randomState)
                     .reset_index(drop=True))

    if not splitXY:
        return cleanedDF

    X = cleanedDF.drop([yCol], axis=1).astype(dataType)
    y = cleanedDF[yCol].astype(dataType)
    if trainTestSplit:
        # Honour the function's own arguments rather than the notebook globals.
        return train_test_split(X, y, test_size=testSize, random_state=randomState)
    return X, y

In [7]:
# def load_gpu_data(fpath, colsToDrop, yCol='water', testSize=0.2, randomState=42, 
#             dataType=cp.float32, cpu=False, splitXY=True, trainTestSplit=True,
#             applyLog=False, imbalance=False, frac=0.1, land=False, multi=False, 
#             multisample=1000000):
#     """
#     Simple helper function for loading data to be used by models
#     :param fpath: Path to the data to be ingested.
#     :param dataType: Data type to convert ingested data to.
#     :param colsToDrop: Columns which are not necessary, from which to drop.
#     :param testSize: Ration to
#     """
#     if multi:
#         all_dfs = [pd.read_csv(path_) for path_ in fpath]
#         df = pd.concat(all_dfs).sample(n=multisample, random_state=randomState)
#         print('DF length: {}'.format(len(df.index)))
#     else:   
#         df = pd.read_parquet(fpath) if '.parquet' in fpath else pd.read_csv(fpath)
#     df = df[df['sur_refl_b01_1'] + df['sur_refl_b02_1'] != 0]
#     df = df[df['sur_refl_b07_1'] + df['sur_refl_b02_1'] != 0]
#     df = df[df['sur_refl_b06_1'] + df['sur_refl_b02_1'] != 0]
#     df = df.drop(columns=colsToDrop)
#     cleanedDF = df[~df.isin([np.NaN, np.inf, -np.inf]).any(1)].dropna(axis=0).astype(dataType)
#     cleanedDF = cudf.from_pandas(cleanedDF) if not cpu else cleanedDF
#     if applyLog:
#         for col in cleanedDF.drop([yCol], axis=1).columns:
#             print('Applying log1p func to {}'.format(col))
#             cleanedDF[col] = np.log1p(cleanedDF[col])
#         cleanedDF = cleanedDF[~cleanedDF.isin([np.NaN, np.inf, -np.inf]).any(1)].dropna(axis=0)
#     df = None
#     if imbalance:
#         if land:
#             print('Imbalancing data, sampling {} from water'.format(frac))
#         else:
#             print('Imbalancing data, sampling {} from land'.format(frac))
#         groupedDF = cleanedDF.groupby('water')
#         dfs = [groupedDF.get_group(y) for y in groupedDF.groups]
#         sampledDF = dfs[1].sample(frac=frac)if land else dfs[0].sample(frac=frac)
#         concatDF = sampledDF.append(dfs[0]) if land else sampledDF.append(dfs[1])
#         concatDF = concatDF.sample(frac=1)
#         concatDF = concatDF.reset_index()
#         cleanedDF = concatDF.drop(columns=['index'])
#     if not splitXY:
#         return cleanedDF
#     X = cleanedDF.drop([yCol], axis=1).astype(dataType)
#     y = cleanedDF[yCol].astype(dataType)
#     cleanedX = cleanedDF.drop([yCol], axis=1).astype(dataType)
#     cleanedy = cleanedDF[yCol].astype(dataType)
#     if trainTestSplit:
#         return train_test_split(cleanedX, cleanedy, test_size=TEST_RATIO)
#     else:
#         return cleanedX, cleanedy

In [8]:
# Columns removed before modelling; comment a name in or out to change the
# feature set.
colsToDrop = [
    # 'sur_refl_b01_1',
    # 'sur_refl_b02_1',
    'sur_refl_b03_1',
    'sur_refl_b04_1',
    'sur_refl_b05_1',
    'sur_refl_b06_1',
    # 'sur_refl_b07_1',
    # 'ndvi',
    'ndwi1',
    'ndwi2',
]

# The training load additionally drops the offset / date columns.
colsToDropTraining = [*colsToDrop, *offsets_indexes]

# Every variable name: the seven surface-reflectance bands plus the indices.
v_names = [f'sur_refl_b0{band}_1' for band in range(1, 8)] + ['ndvi', 'ndwi1', 'ndwi2']

### Input data

In [9]:
# Show the columns that are excluded from the feature set.
colsToDrop

['sur_refl_b03_1',
 'sur_refl_b04_1',
 'sur_refl_b05_1',
 'sur_refl_b06_1',
 'ndwi1',
 'ndwi2']

In [10]:
%%time
load_data_params = {'fpath':data_path,'colsToDrop':colsToDropTraining,'splitXY':True,
                    'imbalance':False,'trainTestSplit':True}

# print(load_data_params)

if GPU is False: 
    X, X_test, y, y_test = load_cpu_data(**load_data_params)
else: 
    X, X_test, y, y_test = load_gpu_data(**load_data_params)
        
# X = X.iloc[:num_datapoints,:] 
# y = y.iloc[:num_datapoints] 

# X_test = X_test.iloc[:num_datapoints,:] 
# y_test = y_test.iloc[:num_datapoints] 

print(f'data shape: {X.shape}, {y.shape}')

data shape: (4375821, 4), (4375821,)
CPU times: user 3.65 s, sys: 1.04 s, total: 4.69 s
Wall time: 4.23 s


In [11]:
# Display the training feature table.
X

Unnamed: 0,sur_refl_b01_1,sur_refl_b02_1,sur_refl_b07_1,ndvi
4423372,58.0,-1.0,9.0,-10350.876953
4037063,2196.0,2455.0,2345.0,556.869507
1888872,240.0,442.0,87.0,2961.876953
4284286,816.0,2099.0,1258.0,4401.372070
1018719,44.0,-13.0,94.0,-18387.097656
...,...,...,...,...
5512927,420.0,2003.0,713.0,6533.223145
3403713,553.0,2213.0,684.0,6001.446289
3415244,419.0,1474.0,810.0,5573.164062
3582120,54.0,-20.0,37.0,-21764.705078


# Clustering

In [12]:
# Row positions within X / y of the water (label > 0.5) and land (label < 0.5)
# samples.
y_water_ind = np.flatnonzero(y > 0.5)
y_land_ind = np.flatnonzero(y < 0.5)

# Per-class features and labels; these are clustered separately and later
# recombined as random-forest input.
X_water, y_water = X.iloc[y_water_ind, :], y.iloc[y_water_ind]
X_land, y_land = X.iloc[y_land_ind, :], y.iloc[y_land_ind]

print(f'data shape: {X_water.shape}, {X_land.shape}')

data shape: (1976866, 4), (2398955, 4)


In [13]:
# List the feature columns, one per line. A plain loop is used because the
# printing is a side effect, not a list being built.
for column in X.columns:
    print(column)

sur_refl_b01_1
sur_refl_b02_1
sur_refl_b07_1
ndvi


In [14]:
# DATA_VERSION='v4.2.1'
# training_data_basepath = f'/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/{DATA_VERSION}'

# #VERSION 4.2.1
# TILE_IN = 'Golden'#v4.2.1
# offsets_indexes = ['x_offset', 'y_offset', 'year', 'julian_day','tileID']

# glob_string = os.path.join(training_data_basepath,'MOD*{}*.parquet.gzip'.format(TILE_IN))
# data_paths = sorted([fv for fv in glob.glob(glob_string)])

# #Only want the one with 4.2.0 because the other file doesnt work. 
# print(data_paths)
# data_path = data_paths[0]
# print(data_path)
# colsToDropTraining = colsToDrop.copy()
# colsToDropTraining.extend(offsets_indexes)


# X_target, X_test_target, y_target, y_test_target = load_data(fpath=data_path,
#                                 colsToDrop=colsToDropTraining,
#                                 dataType=DATA_TYPE,
#                                 cpu=True,
#                                 splitXY=True,
#                                 trainTestSplit=True
#                                 )

# X_target = X_target.iloc[:num_datapoints,:] 
# y_target = y_target.iloc[:num_datapoints] 

# X_test_target = X_test_target.iloc[:num_datapoints,:] 
# y_test_target = y_test_target.iloc[:num_datapoints] 

# print(f'\n\ntarget subset data shape: {X_target.shape}, {y_target.shape}')

# #Getting the indices that are associated with land (0) and water (1)
# #Subset the X AND y data to later subset with the clusters and then combine for RFA
# X_water_target = X_target.iloc[np.where(y_target>0.5)[0],:]
# X_land_target = X_target.iloc[np.where(y_target<0.5)[0],:]

## Clustering Data for Input to Random Forest

Based on the cluster analysis above (run on 5.03.23), 15 clusters appear to retain the most data while excluding outliers, so that number is used for the selection.

In [15]:
# Number of KMeans clusters fitted for each class.
CLUSTER_NUM = 15

# Keyword arguments shared by every KMeans model in this notebook.
# random_state (42) is currently disabled, so each fit starts from a
# different random initialisation.
common_params = dict(n_init="auto", init="random")

In [16]:
%%time
kme_land_random =  KMeans(n_clusters=CLUSTER_NUM, **common_params).fit(X_land)
kmeans_output_land_random = kme_land_random.predict(X_land)

CPU times: user 2min 14s, sys: 2.66 s, total: 2min 17s
Wall time: 34.8 s


In [17]:
%%time
kme_water_random = KMeans(n_clusters=CLUSTER_NUM, **common_params).fit(X_water)
kmeans_output_water_random = kme_water_random.predict(X_water)

CPU times: user 1min 8s, sys: 1.36 s, total: 1min 9s
Wall time: 17.5 s


### Even Balanced Random pulled datapoints

In [18]:
# Size of the smallest cluster in each class, considering the cluster ids that
# occur in the water labels.
cluster_ids = np.unique(kmeans_output_water_random)
land_sizes = [int(np.count_nonzero(kmeans_output_land_random == cid)) for cid in cluster_ids]
water_sizes = [int(np.count_nonzero(kmeans_output_water_random == cid)) for cid in cluster_ids]
COUNT_EVEN_BALANCE_LAND = min(land_sizes, default=np.inf)
COUNT_EVEN_BALANCE_WATER = min(water_sizes, default=np.inf)

print(COUNT_EVEN_BALANCE_LAND, COUNT_EVEN_BALANCE_WATER)

# COUNT is the number of points drawn from every cluster: the smaller of the
# two per-class minima.
COUNT = min(COUNT_EVEN_BALANCE_LAND, COUNT_EVEN_BALANCE_WATER)
print(COUNT,COUNT_EVEN_BALANCE_LAND,COUNT_EVEN_BALANCE_WATER)

4 49
4 4 49


In [19]:
# np.random.seed(42)
# Draw COUNT random points from every cluster of each class. Draws are
# collected in lists and concatenated once, so the result is always an integer
# array even when every water draw is rejected.
water_draws = []
land_draws = []

for cluster in np.unique(kmeans_output_water_random):
    print(f'cluster {cluster}')
    cluster_ind_water = np.where(kmeans_output_water_random == cluster)[0]
    random_pts_water = np.random.choice(cluster_ind_water,COUNT,replace=False)
    # A water draw is discarded when any of its band-1 values reaches 10000.
    max_X_random_water = np.nanmax(X_water['sur_refl_b01_1'].iloc[random_pts_water])
    if max_X_random_water < 10000:
        water_draws.append(random_pts_water)
    else:
        print(f'Cluster {cluster} contains outliers')

    cluster_ind_land = np.where(kmeans_output_land_random == cluster)[0]
    random_pts_land = np.random.choice(cluster_ind_land,COUNT,replace=False)
    land_draws.append(random_pts_land)

random_ind_water = (np.concatenate(water_draws).astype('int')
                    if water_draws else np.array([], dtype='int'))
random_ind_land = (np.concatenate(land_draws).astype('int')
                   if land_draws else np.array([], dtype='int'))

print(np.shape(random_ind_water),np.shape(random_ind_land))

cluster 0
cluster 1
cluster 2
cluster 3
cluster 4
cluster 5
cluster 6
cluster 7
cluster 8
cluster 9
cluster 10
cluster 11
cluster 12
cluster 13
cluster 14
(60,) (60,)


# THE DATES TO PULL IN VIIRS DATA

In [30]:
%%time
# This set of parameters has the date/lat lon encoded 

load_data_params = {'fpath':data_path,'colsToDrop':colsToDrop,'splitXY':True,
                    'imbalance':False,'trainTestSplit':True}

# print(load_data_params)

if GPU is False: 
    X_meta, X_meta, y_meta, y_meta = load_cpu_data(**load_data_params)
else: 
    X_meta, X_meta, y_meta, y_meta = load_gpu_data(**load_data_params)

print(f'data shape: {X_w_date.shape}, {y_w_date.shape}')

data shape: (4375821, 8), (4375821,)
CPU times: user 6.03 s, sys: 2.29 s, total: 8.33 s
Wall time: 7.68 s


In [27]:
# Show the sampled row positions, water first and then land. The water
# positions refer to rows of X_water and the land positions to rows of X_land.
np.concatenate([random_ind_water,random_ind_land])

array([1439478,  438068,  730604, 1503219, 1509976, 1770749, 1190977,
       1599469,  594781, 1102015, 1817470, 1745113, 1197818, 1212199,
         23772,  390059,   14795, 1209277,  314210,  694903,  752054,
        537682, 1712979,  516959, 1319815,   22178, 1673176, 1786381,
        774458,  248623,  617242,  624232, 1865703,  442469,  107038,
        914474,  954533,  939537, 1264737, 1333356,  353275,  721534,
        249302,   59454,  715028,    4168,  533153, 1138907, 1705743,
       1546762,  397961, 1519468,  688548,  781819,  209172, 1624426,
        665362, 1582558,  181003,  627999, 1076402, 1611394, 1517693,
        797320, 1572290, 1946907, 1866284, 1623656, 2301389,  759022,
       1828056,   74701,  636832, 1463362, 1107487,  238254, 1868573,
        813388, 2199747,  452622,  402285,  626479,  519152, 1365355,
        884207,  313800,  436181, 1565850,  745461, 2380084,  541789,
        265226, 2322677, 1461569, 1764940, 1804591, 1393325, 1145260,
        773825, 1898

In [29]:
total_cluster_inds = np.concatenate([random_ind_water,random_ind_land])

# The sampled positions are relative to X_water / X_land, so translate them to
# index labels before looking the rows up in a different frame.
total_cluster_labels = X_water.index[random_ind_water].append(X_land.index[random_ind_land])

# X_meta comes from its own train/test split, so only labels that are present
# in it can be looked up.
labels_in_meta = total_cluster_labels[total_cluster_labels.isin(X_meta.index)]
print(f'{len(labels_in_meta)} of {len(total_cluster_labels)} sampled rows found in X_meta')

# Columns from position 4 onward are the ones left after the four feature
# columns.
cluster_meta_data = X_meta.loc[labels_in_meta].iloc[:, 4:]
print(cluster_meta_data[cluster_meta_data['year'] > 2011.0])

NameError: name 'X_meta' is not defined

# THE OTHER STUFF

### Percentage Random pulled datapoints

In [None]:
# Cluster labels: kmeans_output_land_random and kmeans_output_water_random
# Data: X_water, X_land, y_water, y_land

# Fraction of each cluster's points that is drawn at random.
PERCENT_RANDOM_PULL = 0.15

In [None]:

np.random.seed(42)
# Draw PERCENT_RANDOM_PULL of the points of every cluster of each class. Draws
# are collected in lists and concatenated once, so the result is always an
# integer array even when every water draw is rejected.
water_draws = []
land_draws = []

for cluster in np.unique(kmeans_output_water_random):
    print(f'cluster {cluster}')
    cluster_ind_water = np.where(kmeans_output_water_random == cluster)[0]
    # cluster_ind_water = np.where(bgm_water == cluster)[0]
    COUNT_RANDOM_PULL_WATER = int(PERCENT_RANDOM_PULL*len(cluster_ind_water))
    random_pts_water = np.random.choice(cluster_ind_water,COUNT_RANDOM_PULL_WATER,replace=False)
    # A very small cluster can yield zero points; np.nanmax raises on an empty
    # selection, so the outlier check only runs when something was drawn.
    if random_pts_water.size > 0:
        max_X_random_water = np.nanmax(X_water['sur_refl_b01_1'].iloc[random_pts_water])
        if max_X_random_water < 10000:
            water_draws.append(random_pts_water)
        else:
            print(f'Cluster {cluster} contains outliers')

    cluster_ind_land = np.where(kmeans_output_land_random == cluster)[0]
    # cluster_ind_land = np.where(bgm_land == cluster)[0]
    COUNT_RANDOM_PULL_LAND = int(PERCENT_RANDOM_PULL*len(cluster_ind_land))
    random_pts_land = np.random.choice(cluster_ind_land,COUNT_RANDOM_PULL_LAND,replace=False)
    land_draws.append(random_pts_land)
    # print(f'Pulling {COUNT_RANDOM_PULL_WATER} Water pts and {COUNT_RANDOM_PULL_LAND} Land pts')
    # print()

random_ind_water = (np.concatenate(water_draws).astype('int')
                    if water_draws else np.array([], dtype='int'))
random_ind_land = (np.concatenate(land_draws).astype('int')
                   if land_draws else np.array([], dtype='int'))

print(len(random_ind_water),len(random_ind_land))


In [None]:
# fig, ax = plt.subplots(2, 2,figsize=(20, 10))
# var=0
# for col in range(2):
#     ax[col, 0].set_ylabel('Frequency') 
#     for row in range(2):
#         variable=X_cpu.columns[var]
#         if 'ndvi' in variable: 
#             continue
#             var_bins = bin_boundaries
#             log_values = False
#         else: 
#             var_bins = None
#             log_values = True
#         ax[row, col].hist(
#             [   
#             X_cpu[variable][not_same_point.index].values
#             ],
#             label=[
#             "data"
#             ],
#             bins=var_bins,
#         color=['brown'], log=log_values) 
#         ax[row, col].set_xlabel(f'{variable}')
#         var+=1
#     ax[0,0].legend(loc='upper right',fontsize=20)
# plt.show()

### Total random dataset used for training random forest

In [None]:
# Rows picked by the per-cluster random pull, for each class.
X_cluster_land_random, y_cluster_land_random = X_land.iloc[random_ind_land], y_land.iloc[random_ind_land]
X_cluster_water_random, y_cluster_water_random = X_water.iloc[random_ind_water], y_water.iloc[random_ind_water]

# Stack land on top of water.
X_cluster_random = pd.concat([X_cluster_land_random, X_cluster_water_random])
y_cluster_random = pd.concat([y_cluster_land_random, y_cluster_water_random])

# Features and label are shuffled as one frame so each row keeps its label.
All_data_random = pd.concat([X_cluster_random, y_cluster_random], axis=1)
All_data_random = All_data_random.sample(frac=1)

feature_columns = X_cluster_random.columns
X_cluster_rfa_random = All_data_random[feature_columns]
y_cluster_rfa_random = All_data_random['water']

print(X_cluster_rfa_random)
print(y_cluster_rfa_random)

In [None]:
# Baseline ("match") sample: as many rows per class as the cluster-based pull,
# but drawn uniformly at random from the whole class.
match_ind_land = np.random.choice(
    np.arange(len(X_land)),len(random_ind_land),replace=False)
print(random_ind_land)
print(match_ind_land)

match_ind_water = np.random.choice(
    np.arange(len(X_water)),len(random_ind_water),replace=False)
print(len(random_ind_water))
print(len(match_ind_water))

# Use the match indices here. Indexing with random_ind_* would make this
# dataset a copy of the cluster sample and leave match_ind_* unused.
X_match_land_random = X_land.iloc[match_ind_land]
y_match_land_random = y_land.iloc[match_ind_land]
X_match_water_random = X_water.iloc[match_ind_water]
y_match_water_random = y_water.iloc[match_ind_water]

X_match_random = pd.concat([X_match_land_random,X_match_water_random])
y_match_random = pd.concat([y_match_land_random,y_match_water_random])

# Features and label are shuffled as one frame so each row keeps its label.
All_data_match_random = pd.concat([X_match_random,y_match_random],axis=1).sample(frac=1).reset_index(drop=True)

X_match_rfa_random = All_data_match_random[X_match_random.columns]
y_match_rfa_random = All_data_match_random['water']

In [None]:
# Inspect the shuffled match dataset (features, then labels).
# print(All_data_random)
print(X_match_rfa_random)
print(y_match_rfa_random)

### Plotting parameter space

In [None]:
%%time

kme_land_random =  KMeans(n_clusters=CLUSTER_NUM, **common_params).fit(X_land)
kmeans_output_land_random = kme_land_random.predict(X_land)
kme_water_random = KMeans(n_clusters=CLUSTER_NUM, **common_params).fit(X_water)
kmeans_output_water_random = kme_water_random.predict(X_water)

In [None]:
%%time

np.random.seed(42)
random_ind_land_eb = np.array([])
random_ind_water_eb = []

for cluster in np.unique(kmeans_output_water_random):
    print(f'cluster {cluster}')
    cluster_ind_water = np.where(kmeans_output_water_random == cluster)[0]
    random_pts_water = np.random.choice(cluster_ind_water,COUNT,replace=False)
    max_X_random_water = np.nanmax(X_water['sur_refl_b01_1'].iloc[random_pts_water])
    if max_X_random_water < 10000:
        random_ind_water_eb = np.append(random_ind_water_eb, random_pts_water)
    else: print(f'Cluster {cluster} contains outliers')
    
    cluster_ind_land = np.where(kmeans_output_land_random == cluster)[0]
    random_pts_land = np.random.choice(cluster_ind_land,COUNT,replace=False)
    random_ind_land_eb = np.append(random_ind_land_eb, random_pts_land)
    
random_ind_water_eb = random_ind_water_eb.astype('int')
random_ind_land_eb = random_ind_land_eb.astype('int')

print(random_ind_water_eb,random_ind_land_eb)


#############

match_ind_land_eb = np.random.choice(np.arange(len(X_land)),len(random_ind_land_eb),replace=False)
match_ind_water_eb = np.random.choice(np.arange(len(X_water)),len(random_ind_water_eb),replace=False)

X_match_land_eb = X_land.iloc[match_ind_land_eb]
X_match_water_eb = X_water.iloc[match_ind_water_eb]
X_match_eb = pd.concat([X_match_land_eb,X_match_water_eb])

X_cluster_land_eb = X_land.iloc[random_ind_land_eb]
X_cluster_water_eb = X_water.iloc[random_ind_water_eb]
X_cluster_eb = pd.concat([X_cluster_land_eb,X_cluster_water_eb])




In [None]:
%%time


#############
np.random.seed(42)
random_ind_land_p = np.array([])
random_ind_water_p= []

for cluster in np.unique(kmeans_output_water_random):
    print(f'cluster {cluster}')
    cluster_ind_water = np.where(kmeans_output_water_random == cluster)[0]
    # cluster_ind_water = np.where(bgm_water == cluster)[0]
    COUNT_RANDOM_PULL_WATER = int(PERCENT_RANDOM_PULL*len(cluster_ind_water))
    random_pts_water = np.random.choice(cluster_ind_water,COUNT_RANDOM_PULL_WATER,replace=False)
    max_X_random_water = np.nanmax(X_water['sur_refl_b01_1'].iloc[random_pts_water])
    if max_X_random_water < 10000:
        random_ind_water_p = np.append(random_ind_water_p, random_pts_water)
    else: print(f'Cluster {cluster} contains outliers')
    
    cluster_ind_land = np.where(kmeans_output_land_random == cluster)[0]
    # cluster_ind_land = np.where(bgm_land == cluster)[0]
    COUNT_RANDOM_PULL_LAND = int(PERCENT_RANDOM_PULL*len(cluster_ind_land))
    random_pts_land = np.random.choice(cluster_ind_land,COUNT_RANDOM_PULL_LAND,replace=False)
    random_ind_land_p = np.append(random_ind_land_p, random_pts_land)
    print(f'Pulling {COUNT_RANDOM_PULL_WATER} Water pts and {COUNT_RANDOM_PULL_LAND} Land pts')
    print()
random_ind_water_p = random_ind_water_p.astype('int')
random_ind_land_p = random_ind_land_p.astype('int')

print(random_ind_water_p,random_ind_land_p)

#############

match_ind_land_p = np.random.choice(np.arange(len(X_land)),len(random_ind_land_p),replace=False)
match_ind_water_p = np.random.choice(np.arange(len(X_water)),len(random_ind_water_p),replace=False)

X_match_land_p = X_land.iloc[match_ind_land_p]
X_match_water_p = X_water.iloc[match_ind_water_p]
X_match_p = pd.concat([X_match_land_p,X_match_water_p])

X_cluster_land_p = X_land.iloc[random_ind_land_p]
X_cluster_water_p = X_water.iloc[random_ind_water_p]
X_cluster_p = pd.concat([X_cluster_land_p,X_cluster_water_p])

In [None]:
# 2x2 grid of histograms comparing the percentage-pull cluster sample with the
# random match sample, one panel per feature column.
fig, ax = plt.subplots(2, 2, figsize=(20, 10))

for left_panel in ax[:, 0]:
    left_panel.set_ylabel('Frequency')

for var in range(4):
    variable = X_land.columns[var]
    # Panels are filled down the first column, then down the second.
    col, row = divmod(var, 2)
    # NDVI is drawn on a linear count axis, every other variable on a log axis.
    log_values = 'ndvi' not in variable
    samples = [
        # X_cluster_eb[variable].values,
        # X_match_eb[variable].values,
        X_cluster_p[variable].values,
        X_match_p[variable].values,
    ]
    labels = [
        # f"EB Cluster {len(X_cluster_eb)}",
        # "EB Match",
        f"P Cluster {len(X_match_p)}",
        f"P Match",
    ]
    panel = ax[row, col]
    # color=['darkgreen','lightgreen','darkblue','lightblue']
    panel.hist(samples, label=labels, color=['plum','darkorchid'], log=log_values)
    panel.set_xlabel(f'{variable}')

ax[0,0].legend(loc='upper right',fontsize=20)
plt.show()

In [None]:
# Single histogram of the first feature column: cluster sample vs. match sample.
fig, ax = plt.subplots(1, 1, figsize=(10, 5))
variable = X_land.columns[0]

samples = [X_cluster_p[variable].values, X_match_p[variable].values]
labels = [f"P Cluster {len(X_match_p)}", f"P Match"]
ax.hist(samples, label=labels, color=['plum','darkorchid'], log=True)

ax.set_ylabel('Frequency')
ax.set_xlabel(f'{variable}')
ax.legend(loc='upper right',fontsize=20)
plt.show()


# Random forest

In [None]:
def cpu_rf_objective(trial):
    list_trees = [75, 100, 125, 150, 175, 200, 250, 300, 400, 500]
    max_depth = [5, 10, 30, 50, 80, 90, 100, 110]
    min_samples_leaf = [1, 2, 3, 4, 5]
    min_samples_split = [2, 4, 8, 10]
    bootstrap = [True, False]
    max_features = ['auto', 'sqrt', 'log2']
    
    param = {'n_estimators': trial.suggest_categorical('n_estimators', list_trees), 
       'max_depth':trial.suggest_categorical('max_depth', max_depth), 
       'min_samples_split':trial.suggest_categorical('min_samples_split', min_samples_split), 
       'min_samples_leaf':trial.suggest_categorical('min_samples_leaf', min_samples_leaf), 
       'bootstrap': trial.suggest_categorical('bootstrap', bootstrap),
       'criterion':'gini', 
       #'min_weight_fraction_leaf': trial.suggest_float('min_weight_fraction_leaf', 1e-8, 1.0, log=True), 
       'max_features':trial.suggest_categorical('max_features', max_features), 
       'max_leaf_nodes':None, 
       'min_impurity_decrease':0.0, 
       'oob_score':False, 
       'n_jobs':-1, 
       # 'random_state':42, 
       'verbose':0, 
       'warm_start':False, 
       'class_weight':None, 
       'ccp_alpha':0.0, 
       'max_samples':None
        }
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    #######################
    # HERE IS WHERE TO CHANGE THE X,Y DATASET USED FOR TRAINING
    #######################
   
    cv_scores = np.empty(5)
    for idx, (train_idx, val_idx) in enumerate(cv.split(X,y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

    # for idx, (train_idx, val_idx) in enumerate(cv.split(X_cluster_rfa_random,  y_cluster_rfa_random)):    
    #     X_train, X_val = X_cluster_rfa_random.iloc[train_idx], X_cluster_rfa_random.iloc[val_idx]
    #     y_train, y_val = y_cluster_rfa_random.iloc[train_idx],  y_cluster_rfa_random.iloc[val_idx]

    # for idx, (train_idx, val_idx) in enumerate(cv.split(X_match_rfa_random,  y_match_rfa_random)):    
    #     X_train, X_val = X_match_rfa_random.iloc[train_idx], X_match_rfa_random.iloc[val_idx]
    #     y_train, y_val = y_match_rfa_random.iloc[train_idx],  y_match_rfa_random.iloc[val_idx]     

        model = skRF(**param)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        cv_scores[idx] = f1_score(y_val, preds)
        if cv_scores[idx] == 0.0:
            print('Pruning because of 0.0 score.')
            return 0.0
        print('Fold {}: {}'.format(idx, cv_scores[idx]))
    return np.mean(cv_scores)

# Grid of candidate values handed to Optuna's GridSampler.
search_space = dict(
    n_estimators=[75, 100, 125, 150, 175, 200, 250, 300, 400, 500],
    max_depth=[5, 10, 30, 50, 80, 90, 100, 110],
    min_samples_leaf=[1, 2, 3, 4, 5],
    min_samples_split=[2, 4, 8, 10],
    bootstrap=[True, False],
    max_features=['auto', 'sqrt', 'log2'],
)

# Tuning switches read when the study is created.
TREES_AND_DEPTH_ONLY, GRID_SEARCH = False, True

In [None]:
def gpu_rf_objective(trial):
    """
    Optuna objective for the GPU random forest (cuRFC): mean 5-fold F1 score.

    :param trial: Optuna trial used to pick the hyperparameters.
    :return: Mean F1 score over the folds, or 0.0 as soon as a fold scores 0.0.

    Reads the notebook-level X and y, which must provide ``to_pandas()``.
    """
    # Candidate values per hyperparameter, in the order they are suggested.
    candidates = {
        'n_estimators': [75, 100, 125, 150, 175, 200, 250, 300, 400, 500],
        'max_depth': [5, 10, 30, 50, 80, 90, 100, 110],
        'min_samples_split': [2, 4, 8, 10],
        'min_samples_leaf': [1, 2, 3, 4, 5],
        'max_features': ['auto', 'sqrt', 'log2'],
    }
    param = {name: trial.suggest_categorical(name, options)
             for name, options in candidates.items()}
    folds = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

    #######################
    # HERE IS WHERE TO CHANGE THE X,Y DATASET USED FOR TRAINING
    #######################

    cv_scores = np.empty(5)
    # The fold indices are computed on pandas copies and applied to X and y.
    for idx, (train_idx, val_idx) in enumerate(folds.split(X.to_pandas(), y.to_pandas())):
        X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
        X_val, y_val = X.iloc[val_idx], y.iloc[val_idx]

        model = cuRFC(**param)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        cv_scores[idx] = f1_score(y_val.to_numpy(), preds.to_numpy())
        del model, preds
        # A fold that scores exactly 0.0 ends the trial early.
        if cv_scores[idx] == 0.0:
            print('Pruning because of 0.0 score.')
            return 0.0
        print('Fold {}: {}'.format(idx, cv_scores[idx]))
    return np.mean(cv_scores)
    
# Grid of candidate values handed to Optuna's GridSampler.
search_space = dict(
    n_estimators=[75, 100, 125, 150, 175, 200, 250, 300, 400, 500],
    max_depth=[5, 10, 30, 50, 80, 90, 100, 110],
    min_samples_leaf=[1, 2, 3, 4, 5],
    min_samples_split=[2, 4, 8, 10],
    bootstrap=[True, False],
    max_features=['auto', 'sqrt', 'log2'],
)

# Tuning switches read when the study is created.
TREES_AND_DEPTH_ONLY, GRID_SEARCH = False, True

In [None]:
%%time

optuna.logging.set_verbosity(optuna.logging.INFO)
if GRID_SEARCH:
    study = optuna.create_study(study_name='RF Tuning Grid Search', 
                                direction='maximize',
                                sampler=optuna.samplers.GridSampler(search_space))
    
else:
    study = optuna.create_study(study_name='RF Tuning',
                                direction='maximize')
#Objective is under the functions area

#####################################################################
#CHANGE HERE FOR DIFFERENT MODELING TYPE
#rf_objective or xgb_objective
#####################################################################
if GPU is False:
    study.optimize(cpu_rf_objective, n_trials=25, timeout=30*600)
else: 
    study.optimize(gpu_rf_objective, n_trials=25, timeout=30*600)

#### Training and output best model

In [None]:
# Take the first of the best trials with the highest objective value.
trials = study.best_trials
top_trial = max(trials, key=lambda candidate: candidate.values[0])
max_trial_score = top_trial.values[0]
max_trial_params = top_trial.params
max_trial_params['n_jobs'] = -1

# Integer form of the score, used in the saved model's file name.
score_print = int(np.round(max_trial_score, 4) * 1000)
print(max_trial_score)
print(score_print)

In [None]:
# Work on a copy so the adjustments below do not modify max_trial_params.
hyperparameters = dict(max_trial_params)
hyperparameters['n_jobs'] = -1
# The search space still offers 'auto', which newer scikit-learn releases
# reject; for a random-forest classifier it meant 'sqrt'.
if hyperparameters.get('max_features') == 'auto':
    hyperparameters['max_features'] = 'sqrt'
print('Using these params:')
print(hyperparameters)
tuned_classifier = skRF(**hyperparameters)

In [None]:
%%time 
# Train the tuned classifier on X and y; the trailing comment on the call
# names the alternative dataset (X_match_rfa_random, y_match_rfa_random).
tuned_classifier.fit(X,y) #_match_rfa_random , y_match_rfa_random)

In [None]:
import os
import pickle
# save the model to disk
filename = f'rfa_models/MODIS_RFA_Targeted_v000_MaxScore{score_print}_sfcref127ndvi.pkl'
print(filename)
# Create the output folder if needed, and close the file handle once the dump
# has finished.
os.makedirs(os.path.dirname(filename), exist_ok=True)
with open(filename, 'wb') as model_file:
    pickle.dump(tuned_classifier, model_file)

In [None]:
import pickle
# pickled_model = pickle.load(open('rfa_models/MODIS_RFA_v201_EBCluster_sfcref127ndvi_4.pkl', 'rb'))
# print(pickled_model)