In [None]:
import pandas as pd

# Read the raw data set from a CSV file located next to the notebook.
dataset = pd.read_csv(filepath_or_buffer='sklearn-multi.csv')

# Last expression of the cell: shows (n_rows, n_columns) as the cell output.
dataset.shape

In [32]:
import numpy as np

# Split the frame positionally: columns 0..16 are the model inputs and
# column 17 holds the class label.
features = dataset.iloc[:, 0:17]
labels = dataset.iloc[:, 17]

# Clean the inputs in one pass:
#   1. one-hot encode categorical columns (if present),
#   2. turn infinities into NaN -- BOTH +inf and -inf, because a remaining
#      -inf would survive fillna() and be passed on to the estimator,
#   3. replace every missing value with 0.
features = (
    pd.get_dummies(features)
    .replace([np.inf, -np.inf], np.nan)
    .fillna(0)
)

from sklearn.preprocessing import LabelEncoder

# Encode the class labels as integers.
# fit_transform learns the class mapping and applies it in one step, so the
# separate fit() call that used to precede it (fitting the encoder twice on
# the same data) is not needed.
le = LabelEncoder()
labels = le.fit_transform(labels)

# Kept for backward compatibility: LabelEncoder.fit returns the encoder
# itself, so this name has always been an alias of `le`.
le_fitted_labels = le

In [9]:
from sklearn.model_selection import train_test_split

# Split into a training subset (used for the randomized hyperparameter search)
# and a held-out test subset (used only for the final evaluation at the end).
# random_state is pinned so that the same rows land in each subset on every
# run; without it the reported scores change from run to run.
features_train, features_test, labels_train, labels_test = train_test_split(
    features, labels, test_size=0.25, random_state=42
)

In [None]:
# from sklearn.ensemble import RandomForestClassifier

# # simple baseline RandomForestClassifier (only for first testing)
# clf = RandomForestClassifier(n_estimators=100, n_jobs=-1, verbose=1)
# clf.fit(features_train, labels_train)
# labels_pred = clf.predict(features_test)

In [16]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import RandomizedSearchCV

# Search space for the randomized hyperparameter search: every key is a
# RandomForestClassifier parameter, every value the list of candidates to
# sample from.

# Number of trees: 10, 20, ..., 100 (exact integers, no float round-trip).
n_estimators = list(range(10, 101, 10))

# Number of features considered at each split.
# 'auto' is intentionally not listed: for classifiers it was only an alias of
# 'sqrt' (so it added a duplicate candidate) and current scikit-learn releases
# reject it. 'log2' gives the search a genuinely different alternative.
max_features = ['sqrt', 'log2']

# Maximum tree depth: 10, 20, ..., 100, plus None for "no depth limit".
max_depth = list(range(10, 101, 10))
max_depth.append(None)

# Minimum number of samples required to split an internal node.
min_samples_split = [2, 5, 10]

# Minimum number of samples required at a leaf node.
min_samples_leaf = [1, 2, 4]

# Whether each tree is trained on a bootstrap sample (True) or on the whole
# training set (False).
bootstrap = [True, False]

random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}

In [None]:
# Base estimator for the search. random_state is pinned so that every
# candidate forest is built reproducibly.
# NOTE: verbose=100 is inherited by the refit best estimator, so fitting and
# predicting both emit progress logs.
rf = RandomForestClassifier(verbose = 100, random_state = 42)

# Find the best model using a randomized search:
#   - 100 parameter combinations are sampled from random_grid,
#   - each one is scored with 5-fold cross-validation,
#   - n_jobs=-2 runs the fits in parallel on all CPUs but one,
#   - random_state pins which combinations are sampled, so the search can be
#     repeated with the same candidates.
rf_random = RandomizedSearchCV(estimator = rf, param_distributions=random_grid,
                               n_iter = 100, cv = 5, verbose = 100, n_jobs = -2,
                               random_state = 42)

rf_random.fit(features_train, labels_train)

In [None]:
from joblib import dump, load

# Print the best estimator found by the randomized search.
# NOTE: saving the model is currently disabled; uncomment the dump(...) line
# at the end of this cell to persist it to 'best_estimator.joblib'.
print(rf_random.best_estimator_)

# Recorded from an earlier run (the result may differ when the search is re-run):
# RandomizedSearchCV delivered the following model as best result
# RandomForestClassifier(bootstrap=False, max_depth=70, max_features='sqrt', min_samples_leaf=2, n_estimators=50, verbose=100)
# dump(rf_random.best_estimator_, 'best_estimator.joblib')

In [None]:
# Predict the held-out test split with the tuned model.
# The fitted search object forwards predict() to its refit best_estimator_.
labels_pred = rf_random.predict(features_test)

In [None]:
from sklearn import metrics
from sklearn.metrics import confusion_matrix

# Report the evaluation scores on the test split.

# Map the integer-encoded true and predicted labels back to their original
# class names so the report is readable.
labels_test_tf, labels_pred_tf = map(le.inverse_transform, (labels_test, labels_pred))

# Per-class precision / recall / F1, printed with six decimal places.
print(metrics.classification_report(y_true=labels_test_tf, y_pred=labels_pred_tf, digits=6))

# Chance-adjusted balanced accuracy, computed on the encoded labels.
print(metrics.balanced_accuracy_score(y_true=labels_test, y_pred=labels_pred, adjusted=True))

In [None]:
from pycm import ConfusionMatrix
# Build the pycm confusion matrix from the decoded (class-name) labels and
# print its full report.
cm = ConfusionMatrix(
    actual_vector=labels_test_tf,
    predict_vector=labels_pred_tf,
)
print(cm)

In [None]:
# Print the feature importances of the best model, labelled with the
# corresponding feature column names.
feature_importances = pd.Series(
    data=rf_random.best_estimator_.feature_importances_,
    index=features.columns,
)
print(feature_importances)