In [14]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 
from sklearn.metrics import fbeta_score, make_scorer
import model_utils as utils

# Notebook display setup: render matplotlib figures inline in the cell output.
%matplotlib inline
# -1 asks pandas not to truncate long cell contents when rendering frames.
# NOTE(review): newer pandas releases expect None instead of -1 here — confirm
# against the installed pandas version before changing.
pd.set_option('display.max_colwidth', -1)

# Load the data through the project helper; its 3-tuple result is unpacked as
# the full dataset, the features and the labels.
# NOTE(review): presumably the helper also encodes/scales the columns — verify in model_utils.
dataset, features, labels = utils.getDataSet("datasets/dataset.csv")

Unnamed: 0,AFEC_DPTO,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_MPIO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,...,PATOLOGIA_TIPO,PET_COD_DEPTO,PET_MPIO,PET_TIPOPER,PQR_CANAL,PQR_CLASE_SNS,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM,RIESGO_VIDA
0,0.956957,0.7687688,0.9999999,0.5765766,0.7572573,0.938939,0.6041041,0.7367367,0.6211211,0.9999999,...,0.646647,0.943443,0.898899,1.0,1.0,1.0,1.0,0.743744,0.246246,1
1,1e-07,0.9999999,1e-07,0.5765766,1e-07,0.206707,0.981982,1e-07,0.6211211,0.9999999,...,0.646647,0.662162,0.47047,1.0,0.638138,1.0,1.0,0.743744,1.0,0
2,1e-07,1e-07,1e-07,1e-07,1e-07,0.206707,1e-07,1e-07,1e-07,1e-07,...,0.646647,0.212212,0.162162,1.0,0.638138,0.487237,1.0,0.367367,0.684184,0
3,0.6231231,0.4489489,1e-07,0.5765766,1e-07,0.68018,0.6041041,1e-07,0.6211211,0.9999999,...,0.338338,0.462963,0.327828,1.0,0.286787,1.0,0.152152,0.743744,1.0,0
4,0.4184184,0.7687688,0.6376376,0.9999999,0.7572573,0.896897,0.6231231,0.7367367,0.6211211,0.9999999,...,0.985485,0.316817,0.953954,1.0,0.286787,1.0,0.152152,0.743744,0.246246,1


## Shuffle and Split Data

In [15]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# First split: hold out 15% of the samples; the remaining 85% is the training set.
X_train, X_test, y_train, y_test = train_test_split(features, labels, test_size = 0.15, random_state = 10)

# Second split: carve 10% of that hold-out into a validation set; the other 90%
# stays as the test set. Both splits use a fixed random_state for reproducibility.
X_test, X_validation, y_test, y_validation = train_test_split(X_test, y_test, test_size = 0.1, random_state = 10)

# Show the results of the split.
# A single-argument print(...) prints exactly what the Python 2 print statement
# did and is also valid Python 3.
print("features_final set has {} samples.".format(features.shape[0]))
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
print("Validation set has {} samples.".format(X_validation.shape[0]))

features_final set has 2256602 samples.
Training set has 1918111 samples.
Testing set has 304641 samples.
Validation set has 33850 samples.


In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Initialize the three baseline models with default hyperparameters
# (the ensembles are seeded so their results are repeatable).
clf_A = GaussianNB()
clf_B = RandomForestClassifier(random_state = 300)
clf_C = AdaBoostClassifier(random_state = 300)


# Collect results on the learners: one frame shared by every train_predict call.
dfResults = pd.DataFrame(columns=['learner', 'train_time', 'pred_time', 'f_test', 'f_train'])

# train_predict returns the classifier together with the updated results frame.
# NOTE(review): presumably it fits on the training data and records timings and
# F-scores per learner — verify in model_utils.
for learner in [clf_A, clf_B, clf_C]:
    clf, dfResults = utils.train_predict(learner, X_train, y_train, X_test, y_test, dfResults)


GaussianNB trained.
RandomForestClassifier trained.
AdaBoostClassifier trained.


In [17]:
# Rank the learners by their test-set F-score, best first, showing only the
# learner name and that score.
display(
    dfResults[['learner', 'f_test']].sort_values(by='f_test', ascending=False)
)


Unnamed: 0,learner,f_test
1,RandomForestClassifier,0.725811
2,AdaBoostClassifier,0.660444
0,GaussianNB,0.634728


# Tuning Models

## Tuning RandomForestClassifier

In [18]:

# Base estimator whose hyperparameters are searched below.
rfClassifier = RandomForestClassifier(random_state = 300)

# Candidate values handed to utils.tuneClassifier.
# 'auto' is intentionally not listed: for RandomForestClassifier it is an alias
# of 'sqrt', so having both only repeats identical fits on a very large training set.
rfParameters = {
  'criterion': ['gini', 'entropy'],
  'max_depth': [5, 10],
  'max_features': ['sqrt', 'log2', None],
  'class_weight': ['balanced', 'balanced_subsample'],
}

# The estimator returned by the tuning helper replaces the untuned one.
# NOTE(review): presumably tuneClassifier runs a parameter search on the training
# data and reports F-scores on the test data — verify in model_utils.
rfClassifier = utils.tuneClassifier(rfClassifier, rfParameters, X_train, X_test, y_train, y_test)

Unoptimized model
------
F-score on testing data: 0.7258

Optimized Model
------
Final F-score on the testing data: 0.8018


In [19]:
# Inspect the hyperparameters of the estimator returned by the tuning step.
rfClassifier.get_params()

{'bootstrap': True,
 'class_weight': 'balanced_subsample',
 'criterion': 'gini',
 'max_depth': 10,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 10,
 'n_jobs': None,
 'oob_score': False,
 'random_state': 300,
 'verbose': 0,
 'warm_start': False}

## Tuning AdaBoostClassifier

In [20]:

from sklearn.ensemble import AdaBoostClassifier

# Base estimator whose hyperparameters are searched below.
adaClassifier = AdaBoostClassifier(random_state = 300)

# Candidate values handed to utils.tuneClassifier.
adaParameters = {
  'learning_rate': [0.1, 0.5, 1],
  'algorithm': ['SAMME', 'SAMME.R'],
}

# Keep the tuned estimator under the classifier name (mirroring the random
# forest tuning cell). Assigning it to adaParameters would overwrite the grid
# and leave adaClassifier untuned for every later cell that uses it.
adaClassifier = utils.tuneClassifier(adaClassifier, adaParameters, X_train, X_test, y_train, y_test)

Unoptimized model
------
F-score on testing data: 0.6604

Optimized Model
------
Final F-score on the testing data: 0.6604


# Stacking
## Stacking with the two best classifiers

In [21]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer

# Stack the random forest and AdaBoost learners. The meta-classifier is a
# random forest (not GaussianNB); use_features_in_secondary=True lets it train
# on the original features in addition to the base learners' predictions.
sclf = StackingClassifier(classifiers=[rfClassifier, adaClassifier],
                          use_features_in_secondary = True,
                          meta_classifier=RandomForestClassifier(random_state = 20))
sclf = sclf.fit(X_train, y_train)
sclf_predictions = sclf.predict(X_test)

# Score with F-beta (beta = 2) on the test set.
# Single-argument print(...) is equivalent under Python 2 and valid in Python 3.
print("F-score on StackingClassifier: {:.4f}".format(fbeta_score(y_test, sclf_predictions, beta = 2)))

  clf.fit(X, y)
  self.meta_clf_.fit(meta_features, y)


F-score on StackingClassifier: 0.7310


## Stacking with all classifiers

In [22]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer

# Stack all three learner types: random forest, AdaBoost and a fresh GaussianNB.
# The meta-classifier is a random forest (not GaussianNB);
# use_features_in_secondary=True lets it train on the original features in
# addition to the base learners' predictions.
sclf2 = StackingClassifier(classifiers=[rfClassifier, adaClassifier, GaussianNB()],
                           use_features_in_secondary = True,
                           meta_classifier=RandomForestClassifier(random_state = 20))
sclf2 = sclf2.fit(X_train, y_train)
sclf_predictions2 = sclf2.predict(X_test)

# Score with F-beta (beta = 2) on the test set.
# Single-argument print(...) is equivalent under Python 2 and valid in Python 3.
print("F-score on StackingClassifier: {:.4f}".format(fbeta_score(y_test, sclf_predictions2, beta = 2)))

F-score on StackingClassifier: 0.7289


In [23]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.base import clone

# Randomly duplicate samples of every class except the majority one. The sampler
# is seeded so the resampled training set is repeatable across runs.
# fit_resample is the current name of the method formerly called fit_sample.
ros = RandomOverSampler(sampling_strategy = 'not majority', random_state = 300)
X_oversampled_train, y_oversampled_train = ros.fit_resample(X_train, y_train)

# Fit an unfitted copy that carries the tuned hyperparameters. Calling
# rfClassifier.fit(...) directly would refit the tuned model in place, silently
# changing what every later cell that uses rfClassifier sees.
clfRFC = clone(rfClassifier)
clfRFC = clfRFC.fit(X_oversampled_train, y_oversampled_train)
predictions_test = clfRFC.predict(X_test)

# beta is passed by keyword, matching the other scoring cells.
f_test = fbeta_score(y_test, predictions_test, beta = 2)
print("Oversampled score: %f"%(f_test))

Oversampled score: 0.797041


In [24]:
from hpsklearn import HyperoptEstimator, random_forest

# Let hyperopt-sklearn search the random forest configuration space.
# NOTE(review): no loss function is passed, so the search presumably optimises
# the library's default loss rather than the F-beta score reported below — confirm.
estim = HyperoptEstimator( classifier=random_forest('myRFC') )
estim.fit( X_train.values, y_train.values )

# Predict on the raw array as well, so training and prediction receive the same
# input type.
predictions_hyper_test = estim.predict(X_test.values)

# beta is passed by keyword, matching the other scoring cells.
f_hyper_test = fbeta_score(y_test, predictions_hyper_test, beta = 2)
print("HyperoptEstimator score: %f"%(f_hyper_test))

WARN: OMP_NUM_THREADS=None =>
... If you are using openblas if you are using openblas set OMP_NUM_THREADS=1 or risk subprocess calls hanging indefinitely


  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  learner.fit(XEXfit, yfit)
  self._best_learner.fit(XEX, y)


HyperoptEstimator score: 0.761332


In [25]:
# sklearn.externals.joblib was deprecated and later removed from scikit-learn.
# Prefer the standalone joblib package and fall back to the vendored copy on
# older installations that do not ship it separately.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

# Persist the random forest to disk; dump returns the list of written file names,
# which the notebook shows as the cell output.
joblib.dump(rfClassifier, 'rfClassifier.joblib')

['rfClassifier.joblib']