# Find best model

## Imports

In [1]:
import __main__ as main

from helpers.training_classifier import *
from helpers.feature_selection import select_features
from helpers.paths import Paths
from joblib import load, dump
from sklearn.model_selection import train_test_split
from datetime import datetime
from helpers.is_interactive import is_interactive

## Run dependency notebooks

In [2]:
# Execute the two upstream notebooks (data wrangling, then scaling) before
# continuing, but only when `is_interactive(main)` is truthy.
# NOTE(review): presumably `is_interactive` detects that this notebook is the
# top-level session rather than being %run from another notebook -- confirm in
# helpers.is_interactive.
# NOTE(review): `-p` follows the notebook filename, so it is handed to %run
# after the target rather than before it -- confirm what it is intended to do.
if is_interactive(main):
    %run 01_2_data_wrangling_classifier.ipynb -p
    %run 02_2_scaling_classifier.ipynb -p

Running previous notebooks...


## Load DataFrame

In [3]:
# Read the persisted scaling artefact and pull out the dataset it carries.
training_data = load(Paths.CLASSIFIER_SCALING_DATA)
df = training_data['dataset']

# 'type_unified' is the label to predict; every other column is a feature.
y = df['type_unified']
X = df.drop(columns=['type_unified'])

# Preview the first rows of the loaded frame.
df.head(10)

Unnamed: 0,Living space,Plot area,Floor,Latitude,Longitude,Zip,distanceToTrainStation,gde_area_agriculture_percentage,gde_area_forest_percentage,gde_area_nonproductive_percentage,...,WorkplaceDensity_2,WorkplaceDensity_3,WorkplaceDensity_4,WorkplaceDensity_5,WorkplaceDensity_6,ForestDensity_2,ForestDensity_3,ForestDensity_4,ForestDensity_5,ForestDensity_6
0,0.01033,0.002748,0.363636,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
1,0.016114,0.000633,0.31457,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
2,0.009606,0.002575,0.272727,0.799258,0.468164,0.462,0.034049,0.129342,0.366948,0.08056,...,0.036712,0.007034,0.001348,0.0002582432,4.948057e-05,0.008129,0.000733,6.608701e-05,5.958613e-06,5.372472e-07
3,0.015907,0.001054,0.181818,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
4,0.014668,0.003276,0.181818,0.803051,0.470341,0.464541,0.054848,0.377475,0.566478,0.013291,...,0.002871,0.000154,8e-06,4.415364e-07,2.3657e-08,0.068665,0.017993,0.00471484,0.001235474,0.000323743
5,0.019626,0.003029,0.181818,0.801165,0.461133,0.464079,0.118598,0.375855,0.606279,0.013751,...,0.006233,0.000492,3.9e-05,3.067084e-06,2.421424e-07,0.017961,0.002407,0.0003226117,4.323646e-05,5.794556e-06
6,0.012809,0.00057,0.302524,0.808677,0.477811,0.464657,0.114461,0.349443,0.586349,0.051803,...,0.004779,0.00033,2.3e-05,1.578986e-06,1.091576e-07,0.094108,0.028869,0.008856293,0.002716846,0.000833447
7,0.014045,0.001995,0.318182,0.794885,0.467948,0.462,0.027599,0.129342,0.366948,0.08056,...,0.120739,0.041954,0.014578,0.005065411,0.001760101,0.000314,6e-06,9.866563e-08,1.748668e-09,3.099195e-11
8,0.007747,0.000353,0.27142,0.801046,0.474388,0.465696,0.038177,0.129342,0.366948,0.08056,...,0.010812,0.001124,0.000117,1.215577e-05,1.263976e-06,0.129677,0.046698,0.01681615,0.006055616,0.00218067
9,0.011362,0.00603,0.181818,0.798179,0.461295,0.464079,0.106152,0.375855,0.606279,0.013751,...,0.005998,0.000465,3.6e-05,2.78604e-06,2.157669e-07,0.022775,0.003437,0.0005186871,7.827657e-05,1.181294e-05


## Train/test split

In [4]:
# Keep 80% of the rows for training and hold out the rest for evaluation;
# the fixed random_state makes the split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.8,
    random_state=42,
)

## Train all Models

In [5]:
# Trainer callables to compare. Each one is invoked below as
# trainer(X_train, X_test, y_train, y_test), in the order listed here.
TRAINING_FUNCTIONS = [
    train_k_neighbors,
    train_gradient_boosting,
    train_random_forest,
    train_mlp_classifier,
]

In [6]:
# Run every registered trainer on the same split and collect what each returns,
# preserving the order of TRAINING_FUNCTIONS.
results = [
    train_model(X_train, X_test, y_train, y_test)
    for train_model in TRAINING_FUNCTIONS
]

Training KNeighborsClassifier with -1 jobs
Parameters: {'n_neighbors': [5], 'weights': ['distance'], 'leaf_size': [10], 'p': [2]}
Training GradientBoostingClassifier with -1 jobs
Parameters: {'max_depth': [2], 'min_samples_split': [10], 'max_features': [0.5], 'min_samples_leaf': [10], 'n_estimators': [130], 'random_state': [42]}
Training RandomForestClassifier with -1 jobs
Parameters: {'min_samples_split': [10], 'max_features': [0.5], 'min_samples_leaf': [10], 'n_estimators': [130], 'random_state': [42]}
Training MLPClassifier with 1 jobs
Parameters: {'hidden_layer_sizes': [(14, 14, 14, 14)], 'activation': ['relu'], 'learning_rate': ['adaptive'], 'learning_rate_init': [0.005], 'max_iter': [10000]}


In [12]:
# Tabulate the per-model results (one row per trainer) for side-by-side comparison.
# NOTE(review): `pd` is not imported explicitly in the imports cell; presumably it
# arrives via `from helpers.training_classifier import *` -- confirm, or import it directly.
pd.DataFrame(results)

Unnamed: 0,num_columns,score,best_params,model
0,222,0.633226,"{'n_neighbors': 5, 'weights': 'distance', 'lea...","KNeighborsClassifier(leaf_size=10, weights='di..."
1,222,0.713184,"{'max_depth': 2, 'min_samples_split': 10, 'max...",([DecisionTreeRegressor(criterion='friedman_ms...
2,222,0.762381,"{'min_samples_split': 10, 'max_features': 0.5,...","(DecisionTreeClassifier(max_features=0.5, min_..."
3,222,0.649037,"{'hidden_layer_sizes': (14, 14, 14, 14), 'acti...","MLPClassifier(hidden_layer_sizes=(14, 14, 14, ..."


## Select the best model

In [13]:
# Order the results by ascending score; the final entry is therefore the
# highest-scoring one.
best_model_row = sorted(
    results,
    key=lambda result: result['score'],
)[-1]

# Keep a direct handle on the winning estimator and display it.
best_model = best_model_row['model']
best_model

In [16]:
# Show the parameters stored under 'best_params' for the highest-scoring result.
best_model_row['best_params']

{'min_samples_split': 10,
 'max_features': 0.5,
 'min_samples_leaf': 10,
 'n_estimators': 130,
 'random_state': 42}

## Save Results

In [17]:
def getClassName(obj):
    """Return the name of ``obj``'s class as a string (e.g. ``'int'`` for ``1``)."""
    cls = type(obj)
    return cls.__name__

def getFormattedDate(date=None):
    """Format a datetime as a compact ``YYYYMMDD_HHMM`` string.

    Args:
        date: The ``datetime`` to format. When omitted (or ``None``), the
            current local time at the moment of the call is used.

    Returns:
        str: ``date`` rendered with the ``'%Y%m%d_%H%M'`` format.
    """
    # Resolve the default at call time. A signature default of
    # ``datetime.now()`` is evaluated only once, when the function is defined,
    # so every later no-argument call would return that stale timestamp.
    if date is None:
        date = datetime.now()
    return date.strftime('%Y%m%d_%H%M')

In [18]:
# Persist every trained model, using a path derived from the model's class name.
for result in results:
    fitted_model = result['model']
    model_path = Paths.CLASSIFIER_MODEL_DATA(getClassName(fitted_model))
    dump(fitted_model, model_path)