In [1]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 
from sklearn.metrics import fbeta_score
import model_utils as model_utils
from sklearn.externals import joblib
import warnings
warnings.filterwarnings('ignore')

import json
# Pretty display for notebooks
%matplotlib inline
# Show full cell contents when frames are displayed. `None` is the supported
# "no limit" value on current pandas, where the legacy -1 sentinel is deprecated
# or rejected; older releases only accept an int and raise ValueError for None,
# so fall back to -1 there.
try:
    pd.set_option('display.max_colwidth', None)
except ValueError:
    pd.set_option('display.max_colwidth', -1)


# Run configuration: the first entry of the JSON list in best_config.json.
with open('best_config.json') as json_data_file:
    config = json.load(json_data_file)[0]
print(config)
# The seed is cast to int because it is later passed as `random_state`.
seed = int(config['seed'])
test_size = config['test_size']

dataset = pd.read_csv("datasets/encoded_dataset.csv")
display(dataset.head(n = 5))

# Target column vs. every other column. `labels` is kept as a one-column
# DataFrame (double brackets), not a Series.
labels = dataset[['RIESGO_VIDA']]
features = dataset.drop(['RIESGO_VIDA'], axis = 1)




{'seed': 134.0, 'test_size': 0.1, 'impute': 0.0}


Unnamed: 0,AFEC_DPTO,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_MPIO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,...,AFEC_GETNICO_is_missing,AFEC_TIPOPER_is_missing,AFEC_EDADR_is_missing,PET_COD_DEPTO_is_missing,PATOLOGIA_TIPO_is_missing,AFEC_MPIO_is_missing,ENT_TIPOVIG_SNS_is_missing,AFEC_PARENTESCO_is_missing,PET_MPIO_is_missing,RIESGO_VIDA
0,0.137201,0.200671,0.117783,0.693147,0.485508,0.094359,0.559616,0.405465,0.693147,0.693147,...,0,0,0,0,0,0,0,0,0,0
1,0.648027,0.200671,0.485508,0.405465,0.485508,0.121797,0.060625,0.405465,0.182322,0.693147,...,0,0,0,0,0,0,0,0,0,0
2,0.424883,0.200671,0.693147,0.405465,0.485508,0.434276,0.594707,0.405465,0.182322,0.693147,...,0,0,0,0,0,0,0,0,0,0
3,0.567984,0.200671,0.117783,0.405465,0.485508,0.230358,0.594707,0.485508,0.182322,0.693147,...,0,0,0,0,0,0,0,0,0,1
4,0.137201,0.200671,0.117783,0.405465,0.485508,0.094359,0.559616,0.405465,0.693147,0.693147,...,0,0,0,0,0,0,0,0,0,0


In [2]:
# Hold-out split helper
from sklearn.model_selection import train_test_split

# Stratified train/test partition of `features` and `labels`, driven by the
# configured `test_size` and `seed`.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=test_size,
    random_state=seed,
    stratify=labels,
)

# Report the row count of the full feature set and of each partition
for template, subset in (
    ("features_final set has {} samples.", features),
    ("Training set has {} samples.", X_train),
    ("Testing set has {} samples.", X_test),
):
    print(template.format(subset.shape[0]))


features_final set has 281311 samples.
Training set has 253179 samples.
Testing set has 28132 samples.


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Build the candidate learners from the project helper, passing the configured seed
classifiers = model_utils.init_classifiers(seed)

# Results frame: train_predict receives it and hands back the updated frame
dfResults = pd.DataFrame(columns=['learner', 'train_time', 'pred_time', 'f_test', 'f_train'])

# Fit and score every candidate on the same split.
# NOTE(review): the literal 2 is train_predict's second argument - presumably the
# F-beta `beta`; confirm against model_utils.train_predict.
for clf in classifiers.values():
    clf, dfResults = model_utils.train_predict(clf, 2, X_train, y_train, X_test, y_test, dfResults)


AdaBoostClassifier trained: 0.738396
BaggingClassifier trained: 0.762011
ExtraTreesClassifier trained: 0.687141
GradientBoostingClassifier trained: 0.780502
RandomForestClassifier trained: 0.740881
XGBClassifier trained: 0.782256
LogisticRegression trained: 0.573509
PassiveAggressiveClassifier trained: 0.022600
RidgeClassifier trained: 0.553649
RidgeClassifierCV trained: 0.553649
SGDClassifier trained: 0.728199


In [None]:
# Rank the learners by their `f_test` score, best first, and show only the
# learner name next to that score.
ranked_results = dfResults.sort_values(by=['f_test'], ascending=False)
display(ranked_results[['learner', 'f_test']])


# Tuning Models

## Tuning PassiveAggressiveClassifier

In [None]:
from sklearn.metrics import make_scorer
from sklearn.linear_model import PassiveAggressiveClassifier

# Base estimator for the search, seeded from the notebook configuration.
clf = PassiveAggressiveClassifier(random_state = seed)

# Candidate hyper-parameter values handed to model_utils.tune_classifier.
clfParameters = {    
  'fit_intercept':[True, False],
  'max_iter':[1000, 2000],
  'early_stopping':[True, False],
  'warm_start':[True, False],
  'class_weight': ['balanced', None],
  'average': [True, False] 
}

# NOTE(review): the rf_* names and the 'rf_classifier.joblib' file name are kept
# unchanged for compatibility, although this cell tunes a PassiveAggressiveClassifier;
# consider renaming once every consumer of that file is known.
rf_classifier, default_rf_score, tuned_rf_score, cnf_rf_matrix = model_utils.tune_classifier(clf, clfParameters, X_train, X_test, y_train, y_test)

# Class labels for the confusion-matrix plot (typo fixed: "not as risk" -> "not at risk").
model_utils.plot_confusion_matrix(cnf_rf_matrix, classes=['Life not at risk', 'Life at risk'], normalize = True)
print("Unoptimized model\n------")
print("F-score on testing data: {:.4f}".format(default_rf_score))
print("\nOptimized Model\n------")
print("Final F-score on the testing data: {:.4f}".format(tuned_rf_score))
# Persist the first value returned by tune_classifier.
joblib.dump(rf_classifier, 'rf_classifier.joblib')


In [None]:
# Overwrites the `random_state` attribute of whatever estimator `clf` currently
# refers to with a hard-coded 100 instead of the configured `seed`.
# NOTE(review): no later visible cell reads `clf`, so this looks like leftover
# experimentation - confirm it is still needed or remove the cell.
clf.random_state = 100

## Tuning MLPClassifier

In [None]:
from sklearn.neural_network import MLPClassifier

# Seed the network from the notebook configuration, like the seeded
# PassiveAggressiveClassifier above, so that repeated runs are comparable.
nn_classifier = MLPClassifier(random_state = seed)

# Candidate hyper-parameter values handed to model_utils.tune_classifier.
nnParameters = {
  'hidden_layer_sizes':[50, 100, 200],
  'activation' :['identity', 'logistic', 'tanh', 'relu'],
  'solver': ['lbfgs', 'sgd', 'adam'],
  'batch_size': [700],
  'early_stopping': [True]  
}

# The fourth returned value is bound to `matrix` but is not plotted in this cell,
# and the result in `nn_classifier` is not written to disk here.
nn_classifier, default_nn_score, tuned_nn_score, matrix = model_utils.tune_classifier(nn_classifier, nnParameters,  X_train, X_test, y_train, y_test)

print("Unoptimized model\n------")
print("F-score on testing data: {:.4f}".format(default_nn_score))
print("\nOptimized Model\n------")
print("Final F-score on the testing data: {:.4f}".format(tuned_nn_score))


## Tuning GradientBoostingClassifier


In [None]:
from sklearn.ensemble import GradientBoostingClassifier

from scipy.stats import randint as sp_randint
from sklearn.metrics import make_scorer

# Seeded from the notebook configuration, like the seeded PassiveAggressiveClassifier.
# NOTE(review): the rf_* names are kept because the stacking cells read
# `rf_classifier`, although the estimator here is a GradientBoostingClassifier.
rf_classifier = GradientBoostingClassifier(random_state = seed)

# Candidate hyper-parameter values handed to model_utils.tune_classifier;
# the commented-out entries are options that are currently not searched.
rfParameters = { 
  'loss': ['deviance', 'exponential'],
  #'learning_rate':[0.1, 0.5, 1],
  'n_estimators':[50, 100, 200],
  #'subsample': [0.5, 1], 
  'criterion':['friedman_mse', 'mse', 'mae'],
  #'min_samples_split': sp_randint(2, 4),
  #'min_samples_leaf': sp_randint(2, 4),
  #'max_depth':sp_randint(5, 30),
  #'max_features':['auto', 'sqrt', 'log2', None] 
}

# Unpack four values, as the other tune_classifier call sites in this notebook do;
# unpacking only three would raise ValueError when four are returned.
rf_classifier, default_rf_score, tuned_rf_score, cnf_rf_matrix = model_utils.tune_classifier(rf_classifier, rfParameters, X_train, X_test, y_train, y_test)

#model_utils.plot_confusion_matrix(cnf_rf_matrix, classes=['Life not at risk', 'Life at risk'], normalize = True)
print("Unoptimized model\n------")
print("F-score on testing data: {:.4f}".format(default_rf_score))
print("\nOptimized Model\n------")
print("Final F-score on the testing data: {:.4f}".format(tuned_rf_score))
#joblib.dump(rf_classifier, 'rf_classifier.joblib')


# Stacking
## Stacking with the two best classifiers

In [None]:
# Build a stacked model from two classifiers, save it, then validate the
# saved file on the test split.
# NOTE(review): `ada_classifier` is not assigned in any visible cell (it only
# appears in commented-out code), so this cell presumably raises NameError on a
# fresh kernel - confirm which tuned model is meant here.
stack_two_path = 'sclf_two.joblib'

sclf_two, sclf_score = model_utils.get_stack_two(
    rf_classifier, ada_classifier,
    X_train, X_test, y_train, y_test,
    seed,
)

joblib.dump(sclf_two, stack_two_path)

model_utils.model_validation(stack_two_path, X_test, y_test)

## Stacking with all classifiers

In [None]:
# Build a stacked model from three classifiers, save it, then validate the
# saved file on the test split.
# NOTE(review): `ada_classifier` and `gauss_classifier` are not assigned in any
# visible cell, so this cell presumably raises NameError on a fresh kernel -
# confirm which tuned models are meant here.
stack_all_path = 'sclf_all.joblib'

sclf_all, sclf_all_score = model_utils.get_stack_all(
    rf_classifier, ada_classifier, gauss_classifier,
    X_train, X_test, y_train, y_test,
    seed,
)

joblib.dump(sclf_all, stack_all_path)

model_utils.model_validation(stack_all_path, X_test, y_test)