In [13]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 
from sklearn.metrics import fbeta_score
import model_utils as utils
from sklearn.externals import joblib

# Pretty display for notebooks
%matplotlib inline
pd.set_option('display.max_colwidth', -1)

dataset, features, labels = utils.getDataSet("datasets/dataset.csv")


Unnamed: 0,AFEC_DPTO,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_MPIO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,...,PET_DPTO,PET_MPIO,PET_TIPOPER,PQR_CANAL,PQR_CLASE_SNS,PQR_ESTADO,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM,RIESGO_VIDA
0,0.483984,0.524024,0.908408,0.613614,0.625125,0.673173,0.335335,0.594094,0.459459,1.0,...,0.568569,0.707708,1.0,0.038539,1e-07,0.488989,0.414414,1e-07,1.0,0
1,0.932432,0.542543,0.845345,0.613614,0.625125,0.948448,0.382883,0.594094,0.459459,1.0,...,0.923924,0.962462,1.0,0.628128,0.9999999,0.488989,1.0,0.987988,0.355856,0
2,0.848849,0.267768,0.438438,0.613614,0.625125,0.955956,0.335335,0.594094,0.459459,1.0,...,0.855355,0.941942,1.0,0.433934,0.9999999,0.488989,0.209209,0.987988,1.0,0
3,0.113614,1.0,0.908408,1.0,0.625125,0.859787,0.382883,0.594094,1.0,1.0,...,0.02002,0.483483,1.0,1.0,0.9999999,1.0,1.0,0.987988,1.0,1
4,0.483984,0.75976,0.809309,0.613614,0.625125,0.673173,0.723724,0.594094,0.459459,1.0,...,0.573574,0.707708,1.0,0.628128,0.9999999,0.488989,1.0,0.987988,0.551051,0


## Shuffle and Split Data

In [14]:
# Preview the first five rows of the feature matrix
display(features.head(5))

Unnamed: 0,AFEC_DPTO,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_MPIO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,...,PET_COD_DEPTO,PET_DPTO,PET_MPIO,PET_TIPOPER,PQR_CANAL,PQR_CLASE_SNS,PQR_ESTADO,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM
0,0.483984,0.524024,0.908408,0.613614,0.625125,0.673173,0.335335,0.594094,0.459459,1.0,...,1e-07,0.568569,0.707708,1.0,0.038539,1e-07,0.488989,0.414414,1e-07,1.0
1,0.932432,0.542543,0.845345,0.613614,0.625125,0.948448,0.382883,0.594094,0.459459,1.0,...,1e-07,0.923924,0.962462,1.0,0.628128,0.9999999,0.488989,1.0,0.987988,0.355856
2,0.848849,0.267768,0.438438,0.613614,0.625125,0.955956,0.335335,0.594094,0.459459,1.0,...,1e-07,0.855355,0.941942,1.0,0.433934,0.9999999,0.488989,0.209209,0.987988,1.0
3,0.113614,1.0,0.908408,1.0,0.625125,0.859787,0.382883,0.594094,1.0,1.0,...,1e-07,0.02002,0.483483,1.0,1.0,0.9999999,1.0,1.0,0.987988,1.0
4,0.483984,0.75976,0.809309,0.613614,0.625125,0.673173,0.723724,0.594094,0.459459,1.0,...,1e-07,0.573574,0.707708,1.0,0.628128,0.9999999,0.488989,1.0,0.987988,0.551051


In [15]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 15% of the samples; stratify on the labels so both parts keep the
# same class proportions. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.15, random_state=10, stratify=labels)

# Split the held-out part again: 10% of it becomes a separate validation set.
# NOTE(review): X_validation / y_validation are not referenced by any other
# visible cell -- confirm whether final evaluation should use them.
X_test, X_validation, y_test, y_validation = train_test_split(
    X_test, y_test, test_size=0.1, random_state=10, stratify=y_test)

# Show the results of the split.
# print(...) with a single argument behaves the same on Python 2 and Python 3.
print("features_final set has {} samples.".format(features.shape[0]))
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))
print("Validation set has {} samples.".format(X_validation.shape[0]))

features_final set has 1416472 samples.
Training set has 1204001 samples.
Testing set has 191223 samples.
Validation set has 21248 samples.


In [16]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Initialize the three models (seeded where the estimator accepts a seed)
clf_A = GaussianNB()
clf_B = RandomForestClassifier(random_state=300)
clf_C = AdaBoostClassifier(random_state=300)

# Collect results on the learners; one row per learner is expected to be
# added by utils.train_predict, which returns the updated frame.
dfResults = pd.DataFrame(columns=['learner', 'train_time', 'pred_time', 'f_test', 'f_train'])

# Train and score every learner on the same split.
# NOTE(review): the meaning of the literal 1 (second argument) is defined in
# model_utils.train_predict -- confirm before changing it.
for clf in (clf_A, clf_B, clf_C):
    clf, dfResults = utils.train_predict(clf, 1, X_train, y_train, X_test, y_test, dfResults)


GaussianNB trained.


  learner = learner.fit(X_train, y_train)


RandomForestClassifier trained.
AdaBoostClassifier trained.


In [17]:
# Rank the learners by their test-set F-score, best first
display(dfResults[['learner', 'f_test']].sort_values(by='f_test', ascending=False))


Unnamed: 0,learner,f_test
1,RandomForestClassifier,0.919225
2,AdaBoostClassifier,0.898555
0,GaussianNB,0.570912


# Tuning Models

## Tuning RandomForestClassifier

In [None]:
from sklearn.metrics import make_scorer

# Base estimator handed to the tuning helper
rfClassifier = RandomForestClassifier(random_state=300)

# Candidate hyper-parameters.
# 'auto' was dropped from max_features: for RandomForestClassifier it is the
# same setting as 'sqrt', so listing both only duplicated candidates.
rfParameters = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [5, 10],
    'max_features': ['sqrt', 'log2', None],
    'class_weight': ['balanced', 'balanced_subsample'],
}

# F-beta scorer with beta=2.
# NOTE(review): `scorers` is not passed to utils.tuneClassifier in this cell;
# confirm whether the helper builds its own scorer or this one should be used.
scorers = {
    'fb_score': make_scorer(fbeta_score, beta=2)
}

# NOTE(review): the helper receives X_test / y_test, which later cells also use
# for the final model validation -- confirm this is intended.
rfClassifier = utils.tuneClassifier(rfClassifier, rfParameters, X_train, X_test, y_train, y_test)

# Persist the tuned model for later cells
joblib.dump(rfClassifier, 'rfClassifier.joblib')


## Tuning AdaBoostClassifier

In [None]:

from sklearn.ensemble import AdaBoostClassifier

# Base estimator handed to the tuning helper
adaClassifier = AdaBoostClassifier(random_state=300)

# Candidate hyper-parameters for AdaBoost
adaParameters = {
    'learning_rate': [0.1, 0.5, 1],
    'algorithm': ['SAMME', 'SAMME.R'],
}

adaClassifier = utils.tuneClassifier(adaClassifier, adaParameters, X_train, X_test, y_train, y_test)

# Persist the tuned AdaBoost model. The original dumped rfClassifier here,
# which stored the random forest under the AdaBoost file name.
joblib.dump(adaClassifier, 'adaClassifier.joblib')


# Stacking
## Stacking with the two best classifiers

In [None]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer

# Stack the two tuned learners under a random-forest meta-classifier and fit
# the ensemble on the training split. use_features_in_secondary=True is passed
# so the meta-classifier is configured to use the original features as well.
sclf = StackingClassifier(
    classifiers=[rfClassifier, adaClassifier],
    meta_classifier=RandomForestClassifier(random_state=20),
    use_features_in_secondary=True
).fit(X_train, y_train)

# Persist the fitted ensemble, then validate it from the saved file
joblib.dump(sclf, 'sclf.joblib')

utils.modelValidation('sclf.joblib', X_test, y_test)

## Stacking with all classifiers

In [None]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer

# Stack both tuned learners plus a fresh GaussianNB under a random-forest
# meta-classifier and fit the ensemble on the training split.
# use_features_in_secondary=True is passed so the meta-classifier is
# configured to use the original features as well.
sclf2 = StackingClassifier(
    classifiers=[rfClassifier, adaClassifier, GaussianNB()],
    meta_classifier=RandomForestClassifier(random_state=20),
    use_features_in_secondary=True
).fit(X_train, y_train)

# Persist the fitted ensemble, then validate it from the saved file
joblib.dump(sclf2, 'sclf2.joblib')

utils.modelValidation('sclf2.joblib', X_test, y_test)

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Oversample every class except the majority one. The sampler is seeded so the
# duplicated rows (and therefore the trained model) are repeatable.
ros = RandomOverSampler(sampling_strategy='not majority', random_state=300)
# fit_resample is the current name of the deprecated fit_sample alias.
X_oversampled_train, y_oversampled_train = ros.fit_resample(X_train, y_train)

# Fit the fresh forest on the oversampled data. The original called
# rfClassifier.fit(...) here, which threw this new estimator away and refit
# the tuned rfClassifier in place.
# NOTE(review): if the tuned hyper-parameters were meant to be reused, fit a
# copy of rfClassifier instead -- confirm the intent.
clfRFC = RandomForestClassifier(random_state=300)
clfRFC = clfRFC.fit(X_oversampled_train, y_oversampled_train)
joblib.dump(clfRFC, 'clfRFC.joblib')

utils.modelValidation('clfRFC.joblib', X_test, y_test)

In [None]:
# Disabled experiment: everything below is one string literal, so none of the
# hyperopt-sklearn code inside it is executed.
# NOTE(review): before re-enabling, confirm that recall_score is imported (no
# visible cell imports it) and that predict() should get X_test as-is while
# fit() gets the .values arrays.
'''
from hpsklearn import HyperoptEstimator, random_forest

estim = HyperoptEstimator( classifier=random_forest('myRFC') )
estim.fit( X_train.values, y_train.values )

predictions_hyper_test = estim.predict(X_test)
f_hyper_test =  recall_score(y_test, predictions_hyper_test)
print("HyperoptEstimator score: %f"%(f_hyper_test))
'''