In [1]:
import sys
sys.path.append('../../')

import os
import gdal
from pathlib import Path
import pandas as pd
import geopandas as gpd
from pyeumap.mapper import LandMapper

In [2]:
from pyeumap import datasets

# Work with the first entry of the tile list exposed by pyeumap.datasets.
tile = datasets.TILES[0]
data_root = datasets.DATA_ROOT_NAME

# Tile data is resolved as <current working directory>/<data_root>/<tile>.
data_dir = Path(os.getcwd()) / data_root / tile

In [3]:
# GeoPackage holding the land-cover samples for this tile
# (presumably already overlaid with the covariates — TODO confirm).
fn_points = data_dir / f'{tile}_landcover_samples_overlayed.gpkg'

# Estimators

In [4]:
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from tensorflow.keras.wrappers.scikit_learn import KerasClassifier
from sklearn.linear_model import LogisticRegression
from pyeumap.misc import build_ann

# Base learners used by the ensemble.
estimator_rf = RandomForestClassifier(n_estimators=85, n_jobs=-1)
estimator_bgtree = xgb.XGBClassifier(
    booster='gbtree',
    objective='multi:softmax',
    eval_metric='mlogloss',
    n_estimators=28,
    n_jobs=-1,
)

# Network dimensions handed to build_ann.
# NOTE(review): presumably input_shape must equal the number of feature columns
# and n_classes the number of target classes remaining after sample filtering —
# confirm against the sample file before reusing this notebook on another tile.
input_shape = 87
n_classes = 8
estimator_ann = Pipeline([
    # Features are standardized before they reach the neural network.
    ('standardize', StandardScaler()),
    ('estimator', KerasClassifier(
        build_ann,
        input_shape=input_shape,
        output_shape=n_classes,
        epochs=5,
        batch_size=64,
        learning_rate=0.0005,
        dropout_rate=0.15,
        n_layers=4,
        n_neurons=64,
        shuffle=True,
        verbose=False,
    )),
])

# Meta-estimator. n_jobs=-1 replaces a hard-coded 96 so the notebook does not
# assume a specific machine size and stays consistent with the base learners.
meta_estimator = LogisticRegression(solver='saga', multi_class='multinomial', n_jobs=-1)

# Hyperparameter Optimization

In [5]:
from sklearn.metrics import log_loss

def log_loss_scorer(clf, X, y_true):
    """Score a fitted classifier by its negated log loss.

    The sign is flipped so that larger return values correspond to a lower
    log loss.

    Args:
        clf: Fitted classifier exposing ``classes_`` and ``predict_proba``.
        X: Samples passed to ``clf.predict_proba``.
        y_true: True labels for ``X``.

    Returns:
        The ``log_loss`` of the predicted probabilities, multiplied by -1.
    """
    known_labels = clf.classes_
    probabilities = clf.predict_proba(X)
    return -log_loss(y_true, probabilities, labels=known_labels)

In [6]:
from sklearn.model_selection import GridSearchCV

# One grid search per estimator, each scored by accuracy.
# NOTE(review): log_loss_scorer is not referenced in the visible cells — confirm
# whether accuracy is the intended selection metric.

# Random forest: tree depth and fraction of features considered per split.
hyperpar_rf = GridSearchCV(
    estimator_rf,
    {'max_depth': [5, None], 'max_features': [0.5, None]},
    scoring='accuracy',
)

# XGBoost: 'eta' and 'alpha' booster parameters.
hyperpar_bgtree = GridSearchCV(
    estimator_bgtree,
    {'eta': [0.001, 0.9], 'alpha': [0, 10]},
    scoring='accuracy',
)

# ANN pipeline: the 'estimator__' prefix routes parameters to the pipeline's
# 'estimator' step.
hyperpar_ann = GridSearchCV(
    estimator_ann,
    {'estimator__dropout_rate': [0, 0.15], 'estimator__n_layers': [2, 4]},
    scoring='accuracy',
)

# Meta-estimator (logistic regression): intercept on/off and the C parameter.
hyperpar_meta = GridSearchCV(
    meta_estimator,
    {'fit_intercept': [False, True], 'C': [0.5, 1]},
    scoring='accuracy',
)

# Training (single estimator)

In [7]:
#from sklearn.ensemble import RandomForestClassifier

#feat_col_prfxs = ['landsat', 'dtm', 'night_lights']
#target_col = 'lc_class'

#landmapper = LandMapper(points=fn_points, 
#                        feat_col_prfxs = feat_col_prfxs, 
#                        target_col = target_col, 
#                        estimator = estimator_rf, 
#                        hyperpar_selection = hyperpar_rf,
#                        min_samples_per_class=0.05,
#                        cv=2,
#                        verbose = True)

In [8]:
#landmapper.train()

In [9]:
#print(landmapper.eval_report)

# Training (single estimator - prob)

In [10]:
#from sklearn.ensemble import RandomForestClassifier

#feat_col_prfxs = ['landsat', 'dtm', 'night_lights']
#target_col = 'lc_class'

#landmapper = LandMapper(points=fn_points, 
#                        feat_col_prfxs = feat_col_prfxs, 
#                        target_col = target_col, 
#                        estimator = estimator_rf, 
#                        hyperpar_selection = hyperpar_rf,
#                        min_samples_per_class=0.05,
#                        cv=2,
#                        pred_method='predict_proba',
#                        verbose = True)

In [11]:
#landmapper.train()

In [12]:
#print(landmapper.eval_report)

# Training (Ensemble Machine Learning)

In [13]:
from sklearn.ensemble import RandomForestClassifier

# Passed to LandMapper as target_col / feat_col_prfxs (presumably the label
# column and the name prefixes of the feature columns — TODO confirm).
target_col = 'lc_class'
feat_col_prfxs = ['landsat', 'dtm', 'night_lights']

# Base learners and their grid searches, listed in the same order.
estimator_list = [estimator_rf, estimator_bgtree, estimator_ann]
hyperpar_selection_list = [hyperpar_rf, hyperpar_bgtree, hyperpar_ann]

landmapper = LandMapper(
    points=fn_points,
    target_col=target_col,
    feat_col_prfxs=feat_col_prfxs,
    estimator_list=estimator_list,
    hyperpar_selection_list=hyperpar_selection_list,
    meta_estimator=meta_estimator,
    hyperpar_selection_meta=hyperpar_meta,
    min_samples_per_class=0.05,
    cv=2,
    verbose=True,
)

[10:16:30] Removing 74 samples due min_samples_per_class condition (< 0.05)


In [14]:
# Run the training workflow. Per the log below, this optimizes hyperparameters
# for each estimator and the meta-estimator, calculates evaluation metrics and
# then trains every model using all samples.
landmapper.train()

[10:16:30] Optimizing hyperparameters for RandomForestClassifier
[10:16:35]  -0.63143 (+/-0.02476) from {'max_depth': 5, 'max_features': 0.5}
[10:16:35]  -0.63238 (+/-0.00381) from {'max_depth': 5, 'max_features': None}
[10:16:35]  -0.72190 (+/-0.00381) from {'max_depth': None, 'max_features': 0.5}
[10:16:35]  -0.73048 (+/-0.02476) from {'max_depth': None, 'max_features': None}
[10:16:35] Best: 0.73048 using {'max_depth': None, 'max_features': None}
[10:16:35] Optimizing hyperparameters for XGBClassifier
[10:16:40]  -0.59048 (+/-0.03048) from {'alpha': 0, 'eta': 0.001}
[10:16:40]  -0.70571 (+/-0.05905) from {'alpha': 0, 'eta': 0.9}
[10:16:40]  -0.55238 (+/-0.04571) from {'alpha': 10, 'eta': 0.001}
[10:16:40]  -0.60857 (+/-0.01333) from {'alpha': 10, 'eta': 0.9}
[10:16:40] Best: 0.70571 using {'alpha': 0, 'eta': 0.9}
[10:16:40] Optimizing hyperparameters for Pipeline
[10:17:13]  -0.42476 (+/-0.01143) from {'estimator__dropout_rate': 0, 'estimator__n_layers': 2}
[10:17:13]  -0.22762 (+/-

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.8s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    1.2s finished
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


[10:17:22]  Meta-features shape: (1050, 24)
[10:17:22] Optimizing hyperparameters for LogisticRegression


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    6.5s finished


[10:17:24]  -0.71333 (+/-0.00952) from {'C': 0.5, 'fit_intercept': False}
[10:17:24]  -0.71429 (+/-0.00762) from {'C': 0.5, 'fit_intercept': True}
[10:17:24]  -0.71238 (+/-0.01143) from {'C': 1, 'fit_intercept': False}
[10:17:24]  -0.70952 (+/-0.00571) from {'C': 1, 'fit_intercept': True}
[10:17:24] Best: 0.71429 using {'C': 0.5, 'fit_intercept': True}
[10:17:24] Calculating evaluation metrics
[10:17:24] Training RandomForestClassifier using all samples


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:    0.1s finished


[10:17:24] Training XGBClassifier using all samples
[10:17:25] Training Pipeline using all samples
[10:17:28] Training meta-estimator using all samples


In [15]:
# Show the evaluation metrics held by the LandMapper after training.
print(landmapper.eval_metrics)

{'log_loss': 0.8713499966119077}


In [16]:
# Save the LandMapper instance inside the tile's data directory.
fn_model = data_dir / f'{tile}_landmapper.mod'
landmapper.save_instance(fn_model)

['/home/jupyter/leandro/Code/eumap/demo/python/eumap_data/10636_switzerland/10636_switzerland_landmapper.mod']