In [1]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 

import model_utils as utils

# Pretty display for notebooks
%matplotlib inline
pd.set_option('display.max_colwidth', -1)

dataset, features, labels = utils.getDataSet("datasets/dataset.csv")

Unnamed: 0,AFEC_DPTO,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_MPIO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,...,PATOLOGIA_TIPO,PET_COD_DEPTO,PET_MPIO,PET_TIPOPER,PQR_CANAL,PQR_CLASE_SNS,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM,RIESGO_VIDA
0,0.623624,0.849349,0.633634,0.573574,0.756757,0.592092,0.872372,0.737237,1.0,1.0,...,0.646647,0.460961,0.842342,0.130631,0.13964,1.0,0.153654,0.744244,0.485485,0
1,0.758759,0.873874,0.633634,1.0,0.756757,0.517607,0.872372,0.737237,1.0,1.0,...,0.646647,0.733734,0.467961,1.0,0.639139,1.0,1.0,0.744244,0.485485,0
2,0.875375,0.849349,0.810811,1.0,0.756757,0.842843,0.927427,0.737237,0.617618,1.0,...,0.646647,0.660661,0.67968,1.0,0.639139,1.0,1.0,0.744244,0.485485,0
3,0.982482,0.849349,1.0,1.0,0.756757,0.981982,0.921421,0.737237,0.617618,1.0,...,0.646647,0.983984,0.977477,1.0,1.0,1.0,1.0,0.744244,0.485485,0
4,0.771271,0.849349,0.633634,1.0,0.756757,0.872873,0.921421,0.98048,0.617618,1.0,...,0.964464,0.791792,0.741241,1.0,1.0,1.0,1.0,0.744244,0.485485,1


## Shuffle and Split Data

In [2]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 15% of the rows as the test set. The fixed random_state makes the
# shuffle repeatable, so the sample counts and scores below can be reproduced.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size = 0.15,
                                                    random_state = 10)

# Show the results of the split.
# print(...) with a single argument behaves the same on Python 2 and 3 and
# matches the call style already used by the oversampling cell.
print("features_final set has {} samples.".format(features.shape[0]))
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

features_final set has 2375371 samples.
Training set has 2019065 samples.
Testing set has 356306 samples.


In [3]:
from sklearn.metrics import fbeta_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Baseline learners, all with default hyper-parameters; the two ensembles
# share a seed so their results are repeatable.
clf_A = GaussianNB()
clf_B = RandomForestClassifier(random_state=300)
clf_C = AdaBoostClassifier(random_state=300)

# Training-set sizes: the full set, a tenth of it, and a tenth of that.
# Only the full size is used by the loop below.
samples_100 = len(y_train)
samples_10 = int(samples_100 * 0.1)
samples_1 = int(samples_10 * 0.1)

# Empty results table; the helper hands back an updated table on each call.
result_columns = ['learner', 'learner_index', 'size_index',
                  'train_time', 'pred_time', 'f_test', 'f_train']
dfResults = pd.DataFrame(columns=result_columns)

# Train and score each learner on the whole training split.
# NOTE(review): the positional arguments `k` and `0` look like the
# learner_index / size_index columns -- confirm against model_utils.
for k, clf in enumerate((clf_A, clf_B, clf_C)):
    clf_name = type(clf).__name__
    dfResults = utils.train_predict(
        clf, k, 0, samples_100,
        X_train, y_train, X_test, y_test,
        dfResults)


  y = column_or_1d(y, warn=True)


GaussianNB trained on 2019065 samples.


  learner = learner.fit(X_train[:sample_size], y_train[:sample_size])


RandomForestClassifier trained on 2019065 samples.
AdaBoostClassifier trained on 2019065 samples.


In [4]:
display( dfResults.sort_values(by=['f_test'], ascending = False)[['learner', 'f_test']])


Unnamed: 0,learner,f_test
1,RandomForestClassifier,0.727557
2,AdaBoostClassifier,0.657321
0,GaussianNB,0.633024


# Tuning Models

## Tuning RandomForestClassifier

In [None]:
from sklearn.externals import joblib

# Untuned estimator handed to the search; the seed fixes the forest's randomness.
rfClassifier = RandomForestClassifier(random_state = 20)

# Hyper-parameter candidates passed to the tuning helper.
# 'auto' is intentionally not listed under max_features: for
# RandomForestClassifier it is an alias of 'sqrt', so having both only
# duplicated candidates (and their training time) without adding a new model.
rfParameters = {
  'criterion':['gini', 'entropy'],
  'max_depth':[5, 10],
  'max_features':['sqrt', 'log2', None],
  'class_weight': ['balanced', 'balanced_subsample'],
}

# Rebind the name to whatever the helper returns; the following cells treat it
# as the tuned estimator (get_params(), stacking).
# NOTE(review): the test split is passed to the helper as well -- confirm it is
# only used for reporting, not for selecting the parameters.
rfClassifier = utils.tuneClassifier(rfClassifier, rfParameters, X_train, X_test, y_train, y_test)

In [None]:
# Show the hyper-parameters of the estimator returned by the tuning step.
rfClassifier.get_params()

## Tuning AdaBoostClassifier

In [None]:

from sklearn.ensemble import AdaBoostClassifier

# Untuned estimator handed to the search; the seed fixes the boosting randomness.
adaClassifier = AdaBoostClassifier(random_state = 20)

# Hyper-parameter candidates passed to the tuning helper.
adaParameters = {
  'learning_rate':[0.1, 0.5, 1],
  'algorithm' :['SAMME', 'SAMME.R']
}

# Store the helper's result in adaClassifier, mirroring the random-forest cell.
# It used to be assigned to adaParameters, which left adaClassifier untuned for
# the stacking cells and replaced the grid dict with an estimator.
adaClassifier = utils.tuneClassifier(adaClassifier, adaParameters, X_train, X_test, y_train, y_test)

# Stacking
## Stacking with the two best classifiers

In [None]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer

# Stack the random forest and AdaBoost estimators. The meta-classifier is a
# random forest (not Gaussian NB), and use_features_in_secondary=True feeds it
# the original features in addition to the base learners' predictions.
sclf = StackingClassifier(classifiers=[rfClassifier, adaClassifier],
                          use_features_in_secondary = True,
                          meta_classifier=RandomForestClassifier(random_state = 20))
sclf = sclf.fit(X_train, y_train)
sclf_predictions = sclf.predict(X_test)

# beta = 2 weights recall more heavily than precision.
# print(...) with one argument works on both Python 2 and 3.
print("F-score on StackingClassifier: {:.4f}".format(fbeta_score(y_test, sclf_predictions, beta = 2)))

## Stacking with all classifiers

In [None]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer

# Stack all three learner types: the random forest and AdaBoost estimators plus
# a fresh GaussianNB. The meta-classifier is a random forest (not Gaussian NB),
# and use_features_in_secondary=True feeds it the original features in addition
# to the base learners' predictions.
sclf2 = StackingClassifier(classifiers=[rfClassifier, adaClassifier, GaussianNB()],
                          use_features_in_secondary = True,
                          meta_classifier=RandomForestClassifier(random_state = 20))
sclf2 = sclf2.fit(X_train, y_train)
sclf_predictions2 = sclf2.predict(X_test)

# beta = 2 weights recall more heavily than precision.
# print(...) with one argument works on both Python 2 and 3.
print("F-score on StackingClassifier: {:.4f}".format(fbeta_score(y_test, sclf_predictions2, beta = 2)))

In [None]:
from imblearn.over_sampling import RandomOverSampler

# Randomly duplicate rows of every class except the majority one. Only the
# training split is resampled; the test split keeps its original distribution.
# The seed makes the resampling repeatable between runs.
ros = RandomOverSampler(sampling_strategy = 'not majority', random_state = 300)
# fit_resample is the supported name; fit_sample was only a deprecated alias in
# the imblearn releases that accept `sampling_strategy`.
X_oversampled_train, y_oversampled_train = ros.fit_resample(X_train, y_train)

# Fit the forest created here. The original called rfClassifier.fit(...), which
# threw this new estimator away and re-fitted the tuned rfClassifier in place,
# silently changing the model used by the other cells.
clfRFC = RandomForestClassifier(random_state = 300)
clfRFC = clfRFC.fit(X_oversampled_train, y_oversampled_train)
predictions_test = clfRFC.predict(X_test)

# Pass beta by keyword, as the stacking cells do; newer scikit-learn releases
# no longer accept it positionally.
f_test = fbeta_score(y_test, predictions_test, beta = 2)
print("Oversampled score: %f"%(f_test))