#  MODIS Water Cluster Training

Version: 0.1.0

Date modified: 05.01.2023

Modified by: Amanda Burke

In [5]:
from pathlib import Path  
import numpy as np
import pandas as pd
import datetime
import joblib
import optuna
import pickle
import time
import glob
import csv
import os

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import warnings

warnings.filterwarnings('ignore')
plt.style.use('fivethirtyeight')
%matplotlib inline

from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.ensemble import RandomForestClassifier as skRF
from sklearn.cluster import KMeans, MiniBatchKMeans
from sklearn.metrics import f1_score

# np.random.seed(42)

## Parameters

In [6]:
GPU = False
MODEL = 'rf'
TEST_RATIO = 0.2
RANDOM_STATE = 42
LABEL_NAME = 'water'
DATA_TYPE = np.int16
FRAC_LAND=0.5
num_datapoints = 100000000

v_names = [
    'sur_refl_b01_1','sur_refl_b02_1','sur_refl_b03_1',
    'sur_refl_b04_1','sur_refl_b05_1','sur_refl_b06_1',
    'sur_refl_b07_1','ndvi','ndwi1','ndwi2'
    ]

common_params = {
    "n_init": "auto"
}
input_vars = ['sur_refl_b01_1','sur_refl_b02_1','sur_refl_b07_1']
droped_vars = [v for v in v_names if v not in input_vars]

#RF Training
search_space={
    "n_estimators": [75, 100, 125, 150, 175, 200, 250, 300, 400, 500],
    "max_depth" : [5, 10, 30, 50, 80, 90, 100, 110],
    "min_samples_leaf" : [1, 2, 3, 4, 5],
    "min_samples_split" : [2, 4, 8, 10],
    "bootstrap" : [True, False],
    "max_features" : ['auto', 'sqrt', 'log2'] 
}

## Functions

### Plotting

In [7]:
def plotting_clusters(X_w,X_l,cluster_output_w,cluster_output_l, n_cluster,
                      kme_w=None,kme_l=None):
    fig = plt.figure(figsize = (25, 10))

    plt.suptitle(f'Kmeans Clustering {DATA_VERSION} Data, {n_cluster} Clusters')

    ax1 = plt.subplot(131)
    ax1.set_title(f'Land and Water Datapoints')
    ax1.scatter(X_w.values[:,0], X_w.values[:,1],label='Water')
    ax1.scatter(X_l.values[:,0], X_l.values[:,1],label='Land')
    ax1.tick_params(axis='both', which='major', labelsize=10)
    ax1.tick_params(axis='both', which='minor', labelsize=10)
    ax1.set_xlabel(X_w.columns[0])
    ax1.set_ylabel(X_w.columns[1])
    ax1.legend(loc='lower right',fontsize="20")

    ax2 = plt.subplot(132)
    ax2.set_title(f'Water Datapoints Clustered: {len(X_water)} Examples')
    ax2.scatter(X_w.values[:,0], X_w.values[:,1],c=cluster_output_w,cmap='tab10')
    if kme_w is not None:
        ax2.scatter(kme_w.cluster_centers_[:,0],kme_w.cluster_centers_[:,1],
            label='Center Point',c='k',s=150)
        ax2.legend(loc='lower right',fontsize="20")
    ax2.tick_params(axis='both', which='major', labelsize=10)
    ax2.tick_params(axis='both', which='minor', labelsize=10)
    ax2.set_xlabel(X_w.columns[0])
    ax2.set_ylabel(X_w.columns[1])
    

    ax3 = plt.subplot(133)
 
    ax3.set_title(f'Land Datapoints Clustered: {len(X_land)} Examples')
    ax3.scatter(X_l.values[:,0], X_l.values[:,1],c=cluster_output_l,cmap='tab10')
    if kme_l is not None:
        ax3.scatter(kme_l.cluster_centers_[:,0],kme_l.cluster_centers_[:,1],
                    label='Center Point',c='k',s=150)
        ax3.legend(loc='lower right',fontsize="20")
    ax3.tick_params(axis='both', which='major', labelsize=10)
    ax3.tick_params(axis='both', which='minor', labelsize=10)
    ax3.set_xlabel(X_l.columns[0])
    ax3.set_ylabel(X_l.columns[1])
   
    plt.show()
    plt.close()

### Loading and extracting data

In [8]:
def load_data(fpath, colsToDrop, 
              yCol='water', testSize=0.2, randomState=42,
              dataType=np.float32, cpu=True, splitXY=False, trainTestSplit=False,
              applyLog=False, imbalance=False, frac=0.1, land=False, multi=False, 
              multisample=1000000, ndvi_change=False):
    """
    Simple helper function for loading data to be used by models
    :param fpath: Path to the data to be ingested.
    :param dataType: Data type to convert ingested data to.
    :param colsToDrop: Columns which are not necessary, from which to drop.
    :param testSize: Ration to
    """
    if multi:
        all_dfs = [pd.read_csv(path_) for
                   path_ in fpath]
        df = pd.concat(all_dfs).sample(n=multisample, random_state=randomState)
        print('DF length: {}'.format(len(df.index)))
    else:   
        df = pd.read_parquet(fpath) if '.parquet' in fpath else pd.read_csv(fpath)
    df = df[df['sur_refl_b01_1'] + df['sur_refl_b02_1'] != 0]
    df = df[df['sur_refl_b07_1'] + df['sur_refl_b02_1'] != 0]
    df = df[df['sur_refl_b06_1'] + df['sur_refl_b02_1'] != 0]

    df = df.drop(columns=colsToDrop)
    cleanedDF = df[~df.isin([np.NaN, np.inf, -np.inf]).any(1)].dropna(axis=0).astype(dataType)
    if applyLog:
        for col in cleanedDF.drop([yCol], axis=1).columns:
            print('Applying log1p func to {}'.format(col))
            cleanedDF[col] = np.log1p(cleanedDF[col])
        cleanedDF = cleanedDF[~cleanedDF.isin([np.NaN, np.inf, -np.inf]).any(1)].dropna(axis=0)
    df = None
    if imbalance:
        if land:
            print('Imbalancing data, sampling {} from water'.format(frac))
        else:
            print(f'Imbalancing data, sampling {frac} from land, {1-frac} from water')
        groupedDF = cleanedDF.groupby('water')
        dfs = [groupedDF.get_group(y) for y in groupedDF.groups]
        sampledDF = dfs[1].sample(frac=frac)if land else dfs[0].sample(frac=frac)
        concatDF = sampledDF.append(dfs[0]) if land else sampledDF.append(dfs[1])
        concatDF = concatDF.sample(frac=1)
        concatDF = concatDF.reset_index()
        cleanedDF = concatDF.drop(columns=['index'])
    if not splitXY:
        return cleanedDF
    cleanedX = cleanedDF.drop([yCol], axis=1).astype(dataType)
    cleanedy = cleanedDF[yCol].astype(dataType)
    
    ############
    #Added calculation of NDVI instead of the file point
    ############
    if ndvi_change is True:
        top_math_ndvi = (cleanedX['sur_refl_b02_1'].values - cleanedX['sur_refl_b01_1'].values)
        bot_math_ndvi = (cleanedX['sur_refl_b02_1'].values + cleanedX['sur_refl_b01_1'].values)
        calculated_ndvi = top_math_ndvi/bot_math_ndvi
        calculated_ndvi[calculated_ndvi > 1.0] = 1.0
        calculated_ndvi[calculated_ndvi < -1.0] = -1.0
        scaled_ndvi = (10000*calculated_ndvi).astype(int)
        cleanedX['ndvi'] = scaled_ndvi
        
    if trainTestSplit:
        return train_test_split(cleanedX, cleanedy, test_size=TEST_RATIO)
    else:
        return cleanedX, cleanedy

In [9]:
def pre_process_data(tile, data_version, offsets_indexes, 
                     ndvi_calc=True, colsToDrop=droped_vars):
    
    training_data_basepath = f'/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/{data_version}'
    glob_string = os.path.join(training_data_basepath,'MOD*{}*.parquet.gzip'.format(tile))
    data_paths = sorted([fv for fv in glob.glob(glob_string)])
 
    data_path = data_paths[0]
    print(data_path)

    colsToDropTraining = colsToDrop.copy()
    colsToDropTraining.extend(offsets_indexes)
    
    X, X_test, y, y_test = load_data(
        fpath=data_path,
        colsToDrop=colsToDropTraining,
        dataType=DATA_TYPE,
        cpu=True,
        splitXY=True,
        trainTestSplit=True,
        ndvi_change=ndvi_calc)

    print('Input Variables', X.columns)
    print(f'data shape: {X.shape}, {y.shape}')
    
    #Getting the indices that are associated with land (0) and water (1)
    water_indx = np.where(y>0.5)[0]
    land_indx = np.where(y<0.5)[0]
    # print(y.iloc[water_indx])
    # print('Min water value:',np.nanmin(y.iloc[water_indx]),', Min land value:',np.nanmin(y.iloc[land_indx]))
    print()

    return X, y, water_indx, land_indx

### Kmeans Clustering and Matching Size dataset

Based on the cluster analysis above on 5.03.23, 15 clusters appears to have the most data and exclude outliers so will use that number for selection 

In [10]:
def kmeans_clustering(cluster_type, InX, InY, water_i, land_i, 
                      kwargs=common_params, plotting=False, CLUSTER_NUM=15, 
                      PERCENT_RANDOM_PULL=0.15, match=True):

    InY_w = InY.iloc[water_i].reset_index(drop=True)
    InY_l = InY.iloc[land_i].reset_index(drop=True)
    InX_w = InX.iloc[water_i].reset_index(drop=True)
    InX_l = InX.iloc[land_i].reset_index(drop=True)
    
    kme_land_model = MiniBatchKMeans(n_clusters=CLUSTER_NUM, **kwargs).fit(InX_l)
    kme_land = kme_land_model.predict(InX_l)

    kme_water_model = MiniBatchKMeans(n_clusters=CLUSTER_NUM, **kwargs).fit(InX_w)
    kme_water = kme_water_model.predict(InX_w)
    
    if plotting:
        plotting_clusters(
                    InX_w, InX_l,
                    kme_water,kme_land,
                    CLUSTER_NUM,kme_water_model,kme_land_model)
    if 'Even' in cluster_type: 
        eb_count_water  = np.inf
        eb_count_land = np.inf
        for c in np.arange(CLUSTER_NUM):
            water_num = len(np.where(kme_water == c)[0])
            if water_num < eb_count_water: eb_count_water = water_num
            land_num = len(np.where(kme_land == c)[0])
            if land_num < eb_count_land: eb_count_land = land_num
        if eb_count_land < eb_count_water: COUNT = eb_count_land
        else: COUNT = eb_count_water
    
    w_l_cluster_indx = []
    for label in [kme_water,kme_land]:
        cluster_indx = []
        for c in np.arange(CLUSTER_NUM):
            indx = np.where(label == c)[0]  
            if 'Even' in cluster_type: selection_count = COUNT
            else: selection_count = int(PERCENT_RANDOM_PULL*len(indx))
            rand_indx = np.random.choice(indx,selection_count,replace=False)
            cluster_indx.extend(list(rand_indx))
        w_l_cluster_indx.append(cluster_indx)
    
    clusterY = pd.concat(
        [InY_w.iloc[w_l_cluster_indx[0]],InY_l.iloc[w_l_cluster_indx[1]]]
         ).reset_index(drop=True).sample(frac=1)
    clusterX = pd.concat(
        [InX_w.iloc[w_l_cluster_indx[0]],InX_l.iloc[w_l_cluster_indx[1]]]
         ).reset_index(drop=True).iloc[clusterY.index]
    if match is False:
        print("Using clustered data")
        return clusterX, clusterY
    else:
        print("Using Randomly Matched Data")
        matchY_w = InY_w.sample(n=len(w_l_cluster_indx[0]),replace=False)
        matchY_l = InY_l.sample(n=len(w_l_cluster_indx[1]),replace=False)
    
        matchY = pd.concat([matchY_w,matchY_l]).reset_index(drop=True).sample(frac=1)  
        matchX = pd.concat(
            [InX_w.iloc[matchY_w.index], InX_l.iloc[matchY_l.index]]
            ).reset_index(drop=True).iloc[matchY.index]
        return matchX, matchY

### Random Forest functions

In [11]:
def rf_objective(trial, rfaX, rfaY):
    list_trees = [75, 100, 125, 150, 175, 200, 250, 300, 400, 500]
    max_depth = [5, 10, 30, 50, 80, 90, 100, 110]
    min_samples_leaf = [1, 2, 3, 4, 5]
    min_samples_split = [2, 4, 8, 10]
    bootstrap = [True, False]
    max_features = ['auto', 'sqrt', 'log2']
    
    param = {'n_estimators': trial.suggest_categorical('n_estimators', list_trees), 
        'max_depth':trial.suggest_categorical('max_depth', max_depth), 
        'min_samples_split':trial.suggest_categorical('min_samples_split', min_samples_split), 
        'min_samples_leaf':trial.suggest_categorical('min_samples_leaf', min_samples_leaf), 
        'bootstrap': trial.suggest_categorical('bootstrap', bootstrap),
        'criterion':'gini', 
        'max_features':trial.suggest_categorical('max_features', max_features), 
        'max_leaf_nodes':None, 
        'min_impurity_decrease':0.0, 
        'oob_score':False, 
        'n_jobs':-1, 
        'verbose':0, 
        'warm_start':False, 
        'class_weight':None, 
        'ccp_alpha':0.0, 
        'max_samples':None
                      }
    cv = StratifiedKFold(n_splits=5, shuffle=True)
    cv_scores = np.empty(5)
    
    for idx, (train_idx, val_idx) in enumerate(cv.split(rfaX,rfaY)):
        X_train, X_val = rfaX.iloc[train_idx], rfaX.iloc[val_idx]
        y_train, y_val = rfaY.iloc[train_idx],  rfaY.iloc[val_idx]

        model = skRF(**param)
        model.fit(X_train, y_train)
        preds = model.predict(X_val)
        cv_scores[idx] = f1_score(y_val, preds)
        if cv_scores[idx] == 0.0:
            print('Pruning because of 0.0 score.')
            return 0.0
    return np.mean(cv_scores)

In [12]:
def running_rfa(num_trials, rfaX, rfaY):
    optuna.logging.set_verbosity(optuna.logging.INFO)
    study = optuna.create_study(
        study_name='RF Tuning Grid Search', direction='maximize',
        sampler=optuna.samplers.GridSampler(search_space))
    study.optimize(
        lambda trial: rf_objective(trial, rfaX, rfaY), 
        n_trials=num_trials, timeout=30*600)
    trials = study.best_trials            
    max_trial_score = max([trial.values[0] for trial in trials])
    max_trial_params = [trial.params for trial in trials 
                        if trial.values[0] == max_trial_score][0]
    max_trial_params['n_jobs'] = -1
    return max_trial_score, max_trial_params

### Running the Functions

In [13]:
%%time
# ############################
# # VERSION 4.2.1 (targeted 500k points)
# v421_X, v421_y, v421_water_i, v421_land_i  = pre_process_data('Golden','v4.2.1',['x_offset','y_offset','year','julian_day','tileID'])
# #############################

##############################
# VERSION 2.0.1 (5 million points)
v201_X, v201_y, v201_water_i, v201_land_i = pre_process_data('GLOBAL','v2.0.1',['x_offset','y_offset','year','julian_day'])
##############################

# ##############################
# VERSION 0.0.0 (2billion data points)
# v000_X, v000_y = read_in_file('cleaned','AGU',[])
# ###############################


/explore/nobackup/projects/ilab/data/MODIS/MODIS_WATER_ML/training_data/v2.0.1/MOD09_GLOBAL_5469777_2_0_1.parquet.gzip
Input Variables Index(['sur_refl_b01_1', 'sur_refl_b02_1', 'sur_refl_b07_1', 'ndvi'], dtype='object')
data shape: (4375821, 4), (4375821,)

CPU times: user 3.63 s, sys: 940 ms, total: 4.57 s
Wall time: 4.6 s


#### Train random forest

In [14]:
num_iterations = 10
num_trials = 25
cluster_name = ['EBmatch']

In [15]:
best_trial_data = {}

for cluster_type in cluster_name:
    start_time = time.time()
    print(f'\n{cluster_type}')
    trial_scores, trial_data_size, rfa_trials = np.array([]), np.array([]),np.array([])
    rfa_preds, rfa_labels = np.array([]), np.array([])
    for i in np.arange(num_iterations):
        print(f'Iteration: {i}')
        if 'EB' in cluster_type:
            if 'match' in cluster_type: 
                rfa_pred, rfa_label = kmeans_clustering(
                    'Even Balance', v201_X, v201_y, v201_water_i, v201_land_i, match=True)
            else: 
                rfa_pred, rfa_label = kmeans_clustering(
                    'Even Balance', v201_X, v201_y, v201_water_i, v201_land_i, match=False)
        if 'P' in cluster_type:
            if 'match' in cluster_type: 
                rfa_pred, rfa_label = kmeans_clustering(
                    'Percent', v201_X, v201_y, v201_water_i, v201_land_i, match=True)
            else: 
                rfa_pred, rfa_pred = kmeans_clustering(
                    'Percent', v201_X, v201_y, v201_water_i, v201_land_i, match=False)       
        
        score, param = running_rfa(num_trials, rfa_pred, rfa_label)
        tuned_classifier = skRF(**param)
        tuned_classifier.fit(rfa_pred , rfa_label)
       
        # Append info to the lists
        rfa_preds = np.append(rfa_preds, rfa_pred)
        rfa_labels = np.append(rfa_labels, rfa_label)
        rfa_trials = np.append(rfa_trials, tuned_classifier)
        trial_scores = np.append(trial_scores, score)
        del tuned_classifier, rfa_pred, rfa_pred
    
    #Get max score of iterations
    best_score_indx = np.argmax(trial_scores)
    best_score = int(np.round(trial_scores[max_score_indx],3)*100)
    best_rfa = rfa_trials[max_score_indx]       
    best_trial_data[cluster_type] = (rfa_preds[max_score_indx], rfa_labels[max_score_indx]) 
    
    #Print out info 
    print(f'\nMax score for {cluster_type}: {best_score},'+/ 
           f'{len(rfa_preds[best_score_indx])} Samples')
    rfa_filename = f'rfa_models/MODIS_RFA_v201_{cluster_type}_MaxScore{best_score}_SfcRef127ndvi.pkl'
    print(f'Saving random forest to: {rfa_filename}')
    
    #Output rfa file
    pickle.dump(best_rfa, open(rfa_filename, 'wb'))
 
    #Print out time length 
    elapsed_time = time.time() - start_time
    print(f'Execution time: {time.strftime("%H:%M:%S", time.gmtime(elapsed_time))}\n')
    print(best_rfa)

SyntaxError: invalid syntax (3247026218.py, line 43)

### Plots

In [None]:
print(best_trial_data)

In [None]:
# fig, ax = plt.subplots(2, 2,figsize=(20, 10))
# var=0
# for col in range(2):
#     ax[col, 0].set_ylabel('Frequency') 
#     for row in range(2):
#         variable=P_cluster_X.columns[var]
#         if 'ndvi' in variable: var_bins = bin_boundaries
#         else: var_bins=None
#         ax[row, col].hist(
#             [   
#                 v201_X[variable].values,
#                 P_cluster_X[variable].values,
#                 # P_match_X[variable].values,
#                 EB_cluster_X[variable].values,
#                 # EB_match_X[variable].values,
#             ],
#             label=[
#                 #Change these 
#                 'v2.0.1',
#                 'Percent Cluster',
#                 # 'Percent Match',
#                 'EB Cluster'
#                 # 'EB Match'
#             ],
#             bins=var_bins,
#         color=['orange',
#                'brown',
#                # 'steelblue',
#                'lightgreen'
#                # 'pink'
#               ]) #, log=True) 
#         ax[row, col].set_xlabel(f'{variable}')
#         var+=1
#     ax[0,0].legend(loc='upper right',fontsize=20)
# plt.show()

# Old

In [None]:
# EB_cluster_X, EB_cluster_y, EB_match_X, EB_match_y = kmeans_clustering(
#     'Even Balance', v201_X, v201_y, v201_water_i, v201_land_i)

In [None]:
# P_cluster_X, P_cluster_y, P_match_X, P_match_y = kmeans_clustering(
#     'Percent', v201_X, v201_y, v201_water_i, v201_land_i)

In [None]:
# bin_boundaries =  [*range(-10000,0,1000)] + [*range(0,10001,1000)]
# plt.figure(figsize=(10,5))
# # plt.hist(v201_X['ndvi'].values, label='Total Values',rwidth=0.5,bins=bin_boundaries)
# plt.hist(P_cluster_X['ndvi'].values, label='Percent Cluster',
#          bins=bin_boundaries,color='red',histtype ='bar')
# plt.hist(EB_cluster_X['ndvi'].values, label='Even Balanced Cluster',
#          bins=bin_boundaries,color='black',histtype ='bar')

# plt.ylabel('Frequency',fontsize=12)
# plt.legend()
# plt.show()

#### Recalculating NDVI manually

In [None]:
# neg_ndvi = np.where(X['ndvi'].values < 0.0)[0]
# percent_neg_ndvi = len(neg_ndvi)/len(X['ndvi'].values)
# how_many_water = np.where(y.iloc[neg_ndvi] > 0.5)[0]
# how_many_land = np.where(y.iloc[neg_ndvi] < 0.5)[0]
# print(len(how_many_water))
# print(len(how_many_land))
# print(len(neg_ndvi))

# print(percent_neg_ndvi)
# print(X.iloc[neg_ndvi,:].head())
# print(y.iloc[neg_ndvi].head())

In [None]:

# bin_boundaries =  [*range(-10000,0,1000)] + [*range(0,10001,1000)]



In [None]:
# print("Number of finished trials: {}".format(len(study.trials)))
# trials = study.best_trials
# trial_score = max([trial.values[0]
#                    for trial in trials])
# best_trial_params = [trial.params for trial in trials if trial.values[0] == trial_score][0]
# print(best_trial_params)
# print(trial_score)
# score_print = np.round(trial_score,4)
# print(score_print)

In [None]:
# hyperparameters = best_trial_params
# hyperparameters['n_jobs'] = -1
# print('Using these params:')
# print(hyperparameters)
# tuned_classifier = skRF(**hyperparameters)

In [None]:
# pickled_model = pickle.load(open('rfa_models/MODIS_RFA_v201_EBCluster_sfcref127ndvi_4.pkl', 'rb'))
# print(pickled_model)

In [None]:
#EB_rand_X_rfa: {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 4, 'bootstrap': True, 'max_features': 'log2'} 0.9785175070775922
#Per_rand_X_rfa: {'n_estimators': 200, 'max_depth': 10, 'min_samples_split': 10, 'min_samples_leaf': 3, 'bootstrap': True, 'max_features': 'auto', 'n_jobs': -1} 0.9773275135496542
#EB_cluster_X_rfa: {'n_estimators': 200, 'max_depth': 100, 'min_samples_split': 2, 'min_samples_leaf': 5, 'bootstrap': True, 'max_features': 'sqrt'} 0.9668007117784662
#Per_cluster_X_rfa: {'n_estimators': 300, 'max_depth': 10, 'min_samples_split': 2, 'min_samples_leaf': 3, 'bootstrap': True, 'max_features': 'auto', 'n_jobs': -1}

In [None]:
# EB1: 0.9121361508267432
# EB2: 0.9152120215067565
# EB3: 0.9153888116216589
# EB4: 0.9334065797136774
# EB5: 0.9330483639924072
# EB6: 0.9325007563612594
# EB7: 13875 12950 0.9330562509477165
# EB8: 14220 13272 0.9119937102685194 
# EB9: 13875 12950 0.9318627771973583
# EB10: 14220 13272 0.9128319097402475

# %1: 359648 295422 0.9778067788170863
# %2: 359649 295421 0.977828848863204
# %3: 359649 295421 0.977814397217147
# %4: 359648 295421 0.9777926266177195
# %5: 359650 295422 0.9778923365411023
# %6: 359843 295233 0.9779668774041262
# %7: 359841 295230 0.977761799842581
# %8: 359840 296518 0.977231470922011
# %9: 359835 295223 0.977650365098458
# %10: 359837 295222 0.9778535497660246
