In [10]:
import numpy as np
import skimage.io
import matplotlib.pyplot as plt
import pathlib
import tqdm
import annotation, misc, hyspec_io, image_render
import skimage.exposure
from sklearn.experimental import enable_halving_search_cv  # noqa
import sklearn.ensemble
import sklearn.model_selection
import sklearn.metrics
import sklearn.utils
import sys

In [2]:
# Paths
# Both dataset files sit in the same folder, so the folder is written once and
# the train/validation and test file names are appended to it.
pca_dataset_dir = pathlib.Path('/media/mha114/Massimal/Larvik_Olberg/Hyperspectral/20210825/OlbergAreaS/5b_Rad_Georef_SGC_PCA-spectra')
pca_dataset_train_path = pca_dataset_dir / '20210825_Olberg_PCA_TrainValDataset.npz'
pca_dataset_test_path = pca_dataset_dir / '20210825_Olberg_PCA_TestDataset.npz'

In [3]:
# Load training data
# NOTE(review): allow_pickle=True is presumably needed because 'class_dict' is
# stored as an object array; only load archives from a trusted source.
with np.load(pca_dataset_train_path, allow_pickle=True) as train_archive:
    X = train_archive['pca_scores']
    y_orig = train_archive['labels']
    # .item() unwraps the 0-dim array back into the dict it holds
    class_dict_orig = train_archive['class_dict'].item()

In [4]:
# Show the class name -> integer label mapping that was loaded with the dataset
class_dict_orig

{'Sand': 1,
 'Zostera marina': 2,
 'Zostera marina with turf algae': 3,
 'Rockweed': 4,
 'Other algae': 5,
 'Zostera marina - NGT': 6,
 'Rockweed - NGT': 7,
 'Other algae - NGT': 8}

In [5]:
# Merge NGT labels with other labels
# Each key is a merged class name; its value lists the original class names
# that are folded into it. Insertion order defines the order of both lists.
merge_spec = {
    'Sand': ['Sand'],
    'Zostera marina': ['Zostera marina', 'Zostera marina - NGT'],
    'Zostera marina with turf algae': ['Zostera marina with turf algae'],
    'Rockweed': ['Rockweed', 'Rockweed - NGT'],
    'Other algae': ['Other algae', 'Other algae - NGT'],
}
merged_class_names = list(merge_spec.keys())
classes_to_merge = list(merge_spec.values())
class_dict, y = annotation.merge_classes_in_label_vector(
    class_dict_orig,
    y_orig,
    classes_to_merge,
    merged_class_names,
)

In [11]:
# Shuffle data
# A fixed seed makes the permutation repeatable across kernel restarts. The
# cross-validation calls below use their default (unshuffled) splitters, so
# the row order set here decides which samples land in which fold.
SHUFFLE_SEED = 0
X,y = sklearn.utils.shuffle(X,y,random_state=SHUFFLE_SEED)

In [13]:
# Define example (relatively small) classifier
# A larger configuration that was also considered:
#   n_estimators=30, min_samples_split=0.001, max_samples=0.2
clf = sklearn.ensemble.RandomForestClassifier(
    n_estimators=10,
    min_samples_split=0.001,
    max_samples=0.05,
)

In [14]:
# Try K-fold crossvalidation
# No cv argument is given, so the library default number of folds is used.
scores = sklearn.model_selection.cross_validate(
    estimator=clf,
    X=X,
    y=y,
    verbose=3,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................., score=0.854 total time=   6.9s


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    7.2s remaining:    0.0s


[CV] END ......................................., score=0.851 total time=   6.5s


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   13.7s remaining:    0.0s


[CV] END ......................................., score=0.858 total time=   6.5s
[CV] END ......................................., score=0.854 total time=   6.4s
[CV] END ......................................., score=0.851 total time=   6.4s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   33.1s finished


In [16]:
# Show per-fold fit time, score time and test score from the cross-validation
scores

{'fit_time': array([6.2904098 , 5.94609928, 5.97345638, 5.86045647, 5.82967448]),
 'score_time': array([0.56757188, 0.54695773, 0.56136656, 0.54800224, 0.56211162]),
 'test_score': array([0.85406931, 0.85066991, 0.85829385, 0.85445631, 0.85074296])}

In [17]:
# Try regular grid search
# Vary forest size and minimum leaf size (as a fraction of the samples) while
# keeping the per-tree bootstrap fraction fixed.
base_estimator = sklearn.ensemble.RandomForestClassifier()
param_grid = dict(
    n_estimators=[5, 10, 20],
    min_samples_leaf=[0.01, 0.001, 0.0001],
    max_samples=[0.02],
)
clf_search = sklearn.model_selection.GridSearchCV(
    estimator=base_estimator,
    param_grid=param_grid,
    verbose=3,
)
clf_search.fit(X, y)

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[CV 1/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=5;, score=0.403 total time=   1.6s
[CV 2/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=5;, score=0.403 total time=   1.5s
[CV 3/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=5;, score=0.403 total time=   1.5s
[CV 4/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=5;, score=0.403 total time=   1.5s
[CV 5/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=5;, score=0.403 total time=   1.5s
[CV 1/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=10;, score=0.403 total time=   2.7s
[CV 2/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=10;, score=0.403 total time=   2.7s
[CV 3/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=10;, score=0.403 total time=   2.8s
[CV 4/5] END max_samples=0.02, min_samples_leaf=0.01, n_estimators=10;, score=0.403 total time=   2.7s
[CV 5/5] END max_s

In [21]:
# Inspect the min_samples_leaf value recorded for each grid candidate
clf_search.cv_results_['param_min_samples_leaf']

{'mean_fit_time': array([1.37243614, 2.47771115, 4.70315485, 1.64931893, 3.03706994,
        5.76247373, 1.87092333, 3.4524508 , 6.63158207]),
 'std_fit_time': array([0.01672694, 0.0173367 , 0.05628114, 0.00826842, 0.03392109,
        0.03741965, 0.00697333, 0.02222552, 0.03772322]),
 'mean_score_time': array([0.15878987, 0.26501122, 0.48125925, 0.24082751, 0.42099624,
        0.77746096, 0.31592073, 0.56606364, 1.06944594]),
 'std_score_time': array([0.00140018, 0.00167507, 0.01728946, 0.00262298, 0.00231256,
        0.00477839, 0.00925568, 0.00111583, 0.00219904]),
 'param_max_samples': masked_array(data=[0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02, 0.02],
              mask=[False, False, False, False, False, False, False, False,
                    False],
        fill_value='?',
             dtype=object),
 'param_min_samples_leaf': masked_array(data=[0.01, 0.01, 0.01, 0.001, 0.001, 0.001, 0.0001, 0.0001,
                    0.0001],
              mask=[False, False, False, Fal

In [25]:
# Print mean test score and its spread over the folds for every grid candidate
search_results = clf_search.cv_results_
candidate_rows = zip(
    search_results['param_min_samples_leaf'],
    search_results['param_n_estimators'],
    search_results['mean_test_score'],
    search_results['std_test_score'],
)
for min_samp, n_est, score, std in candidate_rows:
    print(f'Min samples {min_samp}, n_estimators {n_est}: score {score} +/- {std}')

Min samples 0.01, n_estimators 5: score 0.4027489852920307
Min samples 0.01, n_estimators 10: score 0.4027489852920307
Min samples 0.01, n_estimators 20: score 0.4027489852920307
Min samples 0.001, n_estimators 5: score 0.7683690781705212
Min samples 0.001, n_estimators 10: score 0.7793109403049164
Min samples 0.001, n_estimators 20: score 0.7847401051676716
Min samples 0.0001, n_estimators 5: score 0.8504390232602319
Min samples 0.0001, n_estimators 10: score 0.861392482242761
Min samples 0.0001, n_estimators 20: score 0.8667070824820919


We can see that the model performance is most affected by the minimum number of samples per leaf, which regulates the size of each individual tree. It is also affected by the number of estimators, but to a much smaller degree. Also, the variation between each of the 5 folds is quite low, probably because the number of samples is fairly high, and the samples have been shuffled well.

Let's try fixing the number of estimators and varying the minimum number of samples per leaf more.

In [26]:
# Grid search 2: fix the number of estimators and extend min_samples_leaf
# towards smaller fractions.
base_estimator2 = sklearn.ensemble.RandomForestClassifier()
param_grid2 = dict(
    n_estimators=[10],
    min_samples_leaf=[0.001, 0.0001, 0.00001, 0.000001],
    max_samples=[0.02],
)
clf_search2 = sklearn.model_selection.GridSearchCV(
    estimator=base_estimator2,
    param_grid=param_grid2,
    verbose=3,
)
clf_search2.fit(X, y)

Fitting 5 folds for each of 4 candidates, totalling 20 fits
[CV 1/5] END max_samples=0.02, min_samples_leaf=0.001, n_estimators=10;, score=0.782 total time=   3.4s
[CV 2/5] END max_samples=0.02, min_samples_leaf=0.001, n_estimators=10;, score=0.764 total time=   3.4s
[CV 3/5] END max_samples=0.02, min_samples_leaf=0.001, n_estimators=10;, score=0.781 total time=   3.5s
[CV 4/5] END max_samples=0.02, min_samples_leaf=0.001, n_estimators=10;, score=0.786 total time=   3.4s
[CV 5/5] END max_samples=0.02, min_samples_leaf=0.001, n_estimators=10;, score=0.771 total time=   3.4s
[CV 1/5] END max_samples=0.02, min_samples_leaf=0.0001, n_estimators=10;, score=0.861 total time=   4.0s
[CV 2/5] END max_samples=0.02, min_samples_leaf=0.0001, n_estimators=10;, score=0.866 total time=   4.0s
[CV 3/5] END max_samples=0.02, min_samples_leaf=0.0001, n_estimators=10;, score=0.860 total time=   4.0s
[CV 4/5] END max_samples=0.02, min_samples_leaf=0.0001, n_estimators=10;, score=0.862 total time=   4.0s


We can see that model performance increases as the minimum number of samples per leaf decreases. Interestingly, the time spent fitting each model increases only slightly as the minimum number of samples per leaf decreases. There is only a small increase in performance between 0.00001 and 0.000001.

In [29]:
# Grid search 3: vary the per-tree sample fraction together with the
# fractional min_samples_leaf, using 3 folds.
base_estimator3 = sklearn.ensemble.RandomForestClassifier()
param_grid3 = dict(
    n_estimators=[10],
    min_samples_leaf=[0.0001, 0.00001, 0.000001],
    max_samples=[0.03, 0.1, 0.3],
)
clf_search3 = sklearn.model_selection.GridSearchCV(
    estimator=base_estimator3,
    param_grid=param_grid3,
    verbose=3,
    cv=3,
)
clf_search3.fit(X, y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END max_samples=0.03, min_samples_leaf=0.0001, n_estimators=10;, score=0.874 total time=   5.0s
[CV 2/3] END max_samples=0.03, min_samples_leaf=0.0001, n_estimators=10;, score=0.868 total time=   4.5s
[CV 3/3] END max_samples=0.03, min_samples_leaf=0.0001, n_estimators=10;, score=0.870 total time=   4.3s
[CV 1/3] END max_samples=0.03, min_samples_leaf=1e-05, n_estimators=10;, score=0.899 total time=   5.0s
[CV 2/3] END max_samples=0.03, min_samples_leaf=1e-05, n_estimators=10;, score=0.900 total time=   5.0s
[CV 3/3] END max_samples=0.03, min_samples_leaf=1e-05, n_estimators=10;, score=0.897 total time=   5.0s
[CV 1/3] END max_samples=0.03, min_samples_leaf=1e-06, n_estimators=10;, score=0.906 total time=   5.5s
[CV 2/3] END max_samples=0.03, min_samples_leaf=1e-06, n_estimators=10;, score=0.907 total time=   5.5s
[CV 3/3] END max_samples=0.03, min_samples_leaf=1e-06, n_estimators=10;, score=0.907 total time=   5.4s
[

In [34]:
# Print mean test score and its spread over the folds for every candidate of
# the third grid search
search3_results = clf_search3.cv_results_
candidate_rows3 = zip(
    search3_results['param_max_samples'],
    search3_results['param_min_samples_leaf'],
    search3_results['mean_test_score'],
    search3_results['std_test_score'],
)
for max_samp, min_samp_leaf, score, std in candidate_rows3:
    print(f'Max samples per tree {max_samp:5}, min samples per leaf {min_samp_leaf:7}: score {score} +/- {std}')

Max samples per tree  0.03, min samples per leaf  0.0001: score 0.8709411677400404 +/- 0.0023027822274634398
Max samples per tree  0.03, min samples per leaf   1e-05: score 0.8984623903281647 +/- 0.001181539812770455
Max samples per tree  0.03, min samples per leaf   1e-06: score 0.9065462975814826 +/- 0.0003854762148200035
Max samples per tree   0.1, min samples per leaf  0.0001: score 0.8880289024388718 +/- 0.0007668575071351447
Max samples per tree   0.1, min samples per leaf   1e-05: score 0.9082855526514733 +/- 0.0006423304648482094
Max samples per tree   0.1, min samples per leaf   1e-06: score 0.913187246265481 +/- 0.0003007408214533279
Max samples per tree   0.3, min samples per leaf  0.0001: score 0.8978725230130099 +/- 0.00048317372128492674
Max samples per tree   0.3, min samples per leaf   1e-05: score 0.9141225093135498 +/- 0.0008279088292669118
Max samples per tree   0.3, min samples per leaf   1e-06: score 0.917551921879682 +/- 0.000265468881717931


Performance increases with increasing number of samples per tree and decreasing minimum number of samples per leaf. When max_samples = 0.3, there is only a small difference between allowing 1e-5 and 1e-6 as minimum number of samples per leaf. Note however that the absolute minimum number of samples per leaf _scales_ with the maximum number of samples per tree. We should test the effect of the absolute minimum number of samples.

In [37]:
# Number of samples that the fraction 1e-6 corresponds to on the full dataset
X.shape[0]*1e-6

2.932864

In [38]:
# Grid search 4: same per-tree sample fractions as before, but with
# min_samples_leaf given as absolute sample counts instead of fractions.
base_estimator4 = sklearn.ensemble.RandomForestClassifier()
param_grid4 = dict(
    n_estimators=[10],
    min_samples_leaf=[100, 10, 1],
    max_samples=[0.03, 0.1, 0.3],
)
clf_search4 = sklearn.model_selection.GridSearchCV(
    estimator=base_estimator4,
    param_grid=param_grid4,
    verbose=3,
    cv=3,
)
clf_search4.fit(X, y)

Fitting 3 folds for each of 9 candidates, totalling 27 fits
[CV 1/3] END max_samples=0.03, min_samples_leaf=100, n_estimators=10;, score=0.882 total time=   4.6s
[CV 2/3] END max_samples=0.03, min_samples_leaf=100, n_estimators=10;, score=0.882 total time=   4.6s
[CV 3/3] END max_samples=0.03, min_samples_leaf=100, n_estimators=10;, score=0.880 total time=   4.7s
[CV 1/3] END max_samples=0.03, min_samples_leaf=10, n_estimators=10;, score=0.904 total time=   5.2s
[CV 2/3] END max_samples=0.03, min_samples_leaf=10, n_estimators=10;, score=0.903 total time=   5.1s
[CV 3/3] END max_samples=0.03, min_samples_leaf=10, n_estimators=10;, score=0.904 total time=   5.1s
[CV 1/3] END max_samples=0.03, min_samples_leaf=1, n_estimators=10;, score=0.905 total time=   5.6s
[CV 2/3] END max_samples=0.03, min_samples_leaf=1, n_estimators=10;, score=0.905 total time=   5.6s
[CV 3/3] END max_samples=0.03, min_samples_leaf=1, n_estimators=10;, score=0.904 total time=   5.6s
[CV 1/3] END max_samples=0.1, m

We can see that there is now very little variation between the different parameter combinations. The highest achievable score is slightly above 91%. There is very little difference between using 10 or 1 as the minimum number of samples per leaf, and between using 0.1 and 0.3 as the maximum number of samples per tree. Let's do another test on the number of estimators.

In [40]:
# Grid search 5: fix leaf size and per-tree sample fraction, and vary only
# the number of estimators.
base_estimator5 = sklearn.ensemble.RandomForestClassifier()
param_grid5 = dict(
    n_estimators=[10, 20, 40, 80],
    min_samples_leaf=[10],
    max_samples=[0.1],
)
clf_search5 = sklearn.model_selection.GridSearchCV(
    estimator=base_estimator5,
    param_grid=param_grid5,
    verbose=3,
    cv=3,
)
clf_search5.fit(X, y)

Fitting 3 folds for each of 4 candidates, totalling 12 fits
[CV 1/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=10;, score=0.912 total time=  11.7s
[CV 2/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=10;, score=0.912 total time=  11.8s
[CV 3/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=10;, score=0.911 total time=  11.8s
[CV 1/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=20;, score=0.915 total time=  22.8s
[CV 2/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=20;, score=0.914 total time=  22.9s
[CV 3/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=20;, score=0.914 total time=  23.0s
[CV 1/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=40;, score=0.916 total time=  45.2s
[CV 2/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=40;, score=0.916 total time=  44.9s
[CV 3/3] END max_samples=0.1, min_samples_leaf=10, n_estimators=40;, score=0.915 total time=  45.2s
[CV 1/3] END max_samples=0.1, min_sample

Increasing the number of estimators yields a slight boost in score, but it is on the order of 0.1%. Such differences may not carry over into a hold-out test set.  

In [41]:
# Train a random forest with no constraints and default (100) number of estimators
# NOTE: in the recorded run a single fold took about 20 minutes and the cell
# was interrupted before finishing.
clf_full = sklearn.ensemble.RandomForestClassifier()
scores = sklearn.model_selection.cross_validate(
    estimator=clf_full,
    X=X,
    y=y,
    verbose=3,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................., score=0.928 total time=20.3min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed: 20.3min remaining:    0.0s


KeyboardInterrupt: 

In [42]:
# Train a random forest with 10 estimators but full dataset
# (no max_samples limit, so each tree is not restricted to a fraction of the data)
clf_full = sklearn.ensemble.RandomForestClassifier(n_estimators=10)
scores = sklearn.model_selection.cross_validate(
    estimator=clf_full,
    X=X,
    y=y,
    verbose=3,
)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[CV] END ......................................., score=0.919 total time= 2.1min


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  2.1min remaining:    0.0s


[CV] END ......................................., score=0.919 total time= 2.2min


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  4.2min remaining:    0.0s


[CV] END ......................................., score=0.919 total time= 2.0min
[CV] END ......................................., score=0.918 total time= 2.1min
[CV] END ......................................., score=0.920 total time= 2.1min


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 10.5min finished
