## Import libraries

In [1]:
print()

import warnings; warnings.simplefilter('ignore')
import pandas                  as pd
import numpy                   as np
import sklearn.model_selection as ms
import sklearn.metrics         as mt
from imblearn.under_sampling   import RandomUnderSampler
from sklearn.ensemble          import RandomForestClassifier
from sklearn.model_selection   import RandomizedSearchCV
from joblib                    import dump, load

print('All fine!')


All fine!


## Prepare datasets

In [2]:
print()

rus = RandomUnderSampler()

instances_pe = pd.read_csv('./datasets/pe-dataset.csv')
print('PE raw dataset:', instances_pe.shape)

cln_instances_pe = instances_pe.drop(columns=['panel_info', 'panel_eplet'])
print('PE cleaned dataset:', cln_instances_pe.shape)

imb_train_labels_pe = np.array(cln_instances_pe['reactive'])
imb_instances_pe = cln_instances_pe.drop(columns=['reactive'])
train_instances_pe, train_labels_pe = rus.fit_sample(imb_instances_pe, imb_train_labels_pe)

print('PE train labels', train_instances_pe.shape)
print('PE train instances:', train_labels_pe.shape)

#Commented for performance reasons
#print()
#instances_sp = pd.read_csv('./datasets/sp-dataset.csv')
#print('SP raw dataset:', instances_sp.shape)
#cln_instances_sp = instances_sp.drop(columns=['panel_info', 'panel_eplet'])
#print('SP cleaned dataset:', cln_instances_sp.shape)
#imb_train_labels_sp = np.array(cln_instances_sp['reactive'])
#imb_instances_sp = cln_instances_sp.drop(columns=['reactive'])
#train_instances_sp, train_labels_sp = rus.fit_sample(imb_instances_sp, imb_train_labels_sp)
#print('SP train labels', train_instances_sp.shape)
#print('SP train instances:', train_labels_sp.shape)


PE raw dataset: (81399, 12)
PE cleaned dataset: (81399, 10)
PE train labels (6836, 9)
PE train instances: (6836,)


## Build production model

In [3]:
print()

clf = RandomForestClassifier()

param_grid = {
  'n_estimators': [100, 200, 400],
  'criterion': ['gini', 'entropy'],
  'min_samples_split': [2, 4, 8],
  'min_samples_leaf': [1, 2, 4],
  'max_features': [None, 'auto', 'sqrt', 'log2']
}

params_search = RandomizedSearchCV(estimator=clf, param_distributions=param_grid, scoring='roc_auc', n_jobs=7)
params_search.fit(train_instances_pe, train_labels_pe)

production_model = params_search.best_estimator_

print('All fine!')


All fine!


## Validate production model with PE dataset (internal)

In [4]:
#Commented for performance reasons
#print()
#predicted_labels_pe = ms.cross_val_predict(production_model, train_instances_pe, train_labels_pe, n_jobs=3)
#print('PE Confusion matrix (TN FP FN TP):', mt.confusion_matrix(train_labels_pe, predicted_labels_pe).ravel())
#print("PE AUC-ROC: %0.2f" % (mt.roc_auc_score(train_labels_pe, predicted_labels_pe)*100))

## Validate production model with SP dataset (external)

In [5]:
#Commented for performance reasons
#print()
#predicted_labels_sp = ms.cross_val_predict(production_model, train_instances_sp, train_labels_sp, n_jobs=-3)
#print('SP Confusion matrix (TN FP FN TP):', mt.confusion_matrix(train_labels_sp, predicted_labels_sp).ravel())
#print("SP AUC-ROC: %0.2f" % (mt.roc_auc_score(train_labels_sp, predicted_labels_sp)*100))

## Persist production model

In [10]:
print()

dump(production_model, 'eplogic.joblib')

print('All fine!')


All fine!


## Test persisted model

In [8]:
print()

test_model = load('eplogic.joblib')

test_data = [[0, 1, 0, 0, 0, 0, 10, 1000, 1500]]

result = test_model.predict(test_data)
print('Test classification (should be 1): ' + str(result[0]))

probabilities = test_model.predict_proba(test_data)
print('Probability of beeing 0: ' + str(probabilities[0][0]))
print('Probability of beeing 1: ' + str(probabilities[0][1]))


Test classification (should be 1): 1
Probability of beeing 0: 0.17859250124875128
Probability of beeing 1: 0.8214074987512485
