In [1]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 
from sklearn.metrics import fbeta_score
import model_utils as model_utils
from sklearn.externals import joblib

import json
# Pretty display for notebooks
%matplotlib inline
pd.set_option('display.max_colwidth', -1)


with open('best_config.json') as json_data_file:
    config = json.load(json_data_file)[0]
print(config)
seed = int(config['seed'])
test_size = config['test_size']

dataset = pd.read_csv("datasets/encoded_dataset.csv")
display(dataset.head(n = 5))

labels = dataset[['RIESGO_VIDA']]
features = dataset.drop(['RIESGO_VIDA'], axis = 1)


{'seed': 134.0, 'test_size': 0.1, 'impute': 0.0}


Unnamed: 0,ENT_COD_DEPTO,COD_MOTESP,PET_COD_DEPTO,MES,PET_MPIO,COD_MACROMOT,RIESGO_VIDA,ENT_COD_MPIO,COD_MOTGEN,PET_TIPOPER_juridica,...,PQR_TIPOPETICION_peticion de informacion,PQR_TIPOPETICION_queja (funcionario de la supersalud),PQR_TIPOPETICION_reclamo,PQR_TIPOPETICION_seguimiento a cumplimiento fallos de tutela,TRIM_trim i,TRIM_trim ii,TRIM_trim iii,TRIM_trim iv,PQR_CLASE_SNS_pqr,PQR_CLASE_SNS_sol inf
0,0.056711,0.000152,0.061875,0.0,0.091208,0.0,0,0.477412,0.0,1,...,0,0,1,0,1,0,0,0,1,0
1,0.0462,8.7e-05,0.405465,0.0,0.488847,0.0,0,0.405977,0.0,0,...,0,0,1,0,1,0,0,0,1,0
2,0.068993,0.00231,0.562651,0.0,0.119851,0.0,0,0.55812,0.002169,0,...,0,0,1,0,1,0,0,0,1,0
3,0.035577,0.005544,0.32424,0.0,0.434541,0.0,0,0.321326,0.005414,0,...,0,0,1,0,1,0,0,0,1,0
4,0.059559,0.000195,0.50013,0.0,0.2281,0.0,1,0.495987,0.0,0,...,0,0,1,0,1,0,0,0,1,0


In [2]:
# train_test_split performs the hold-out split
from sklearn.model_selection import train_test_split

# Hold out `test_size` of the rows for testing. Stratifying on the labels keeps
# the class proportions the same in the training and testing partitions.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size=test_size,
    random_state=seed,
    stratify=labels,
)

# Report how many samples ended up in each partition
for message, frame in (
    ("features_final set has {} samples.", features),
    ("Training set has {} samples.", X_train),
    ("Testing set has {} samples.", X_test),
):
    print(message.format(frame.shape[0]))


features_final set has 823249 samples.
Training set has 740924 samples.
Testing set has 82325 samples.


In [None]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB

# Initialize the three models (the seeded ones share the configured seed)
clf_A = GaussianNB()
clf_B = RandomForestClassifier(random_state = seed)
clf_C = AdaBoostClassifier(random_state = seed)


# Collect results on the learners: one table, passed into and returned from
# every train_predict call.
dfResults = pd.DataFrame(columns=['learner', 'train_time', 'pred_time', 'f_test', 'f_train'])

for clf in [clf_A, clf_B, clf_C]:
    # NOTE(review): the literal 2 is presumably the beta of the F-beta score;
    # confirm against model_utils.train_predict.
    clf, dfResults = model_utils.train_predict(clf, 2, X_train, y_train, X_test, y_test, dfResults)


  y = column_or_1d(y, warn=True)


GaussianNB trained.


  learner = learner.fit(X_train, y_train)


In [None]:
# Rank the learners by their F-score on the test set, best first, and show
# only the learner name next to that score.
learner_ranking = dfResults.sort_values(by=['f_test'], ascending=False)
display(learner_ranking[['learner', 'f_test']])


# Tuning Models

## Tuning RandomForestClassifier

In [6]:
from sklearn.metrics import make_scorer

rf_classifier = RandomForestClassifier(random_state = seed)

# Search space for the random forest.
# 'auto' was dropped from max_features: for RandomForestClassifier it is the
# same setting as 'sqrt', so listing both repeated a quarter of the candidates
# without any chance of a different result.
rfParameters = {
  'criterion':['gini', 'entropy'],
  'max_depth':[5, 10, 15, 30],
  'max_features':['sqrt', 'log2', None],
  'class_weight': ['balanced', 'balanced_subsample'],
}

# tune_classifier returns the tuned estimator, the F-score of the default and
# of the tuned model, and a confusion matrix.
rf_classifier, default_rf_score, tuned_rf_score, cnf_rf_matrix = model_utils.tune_classifier(rf_classifier, rfParameters, X_train, X_test, y_train, y_test)

model_utils.plot_confusion_matrix(cnf_rf_matrix, classes=['Life not at risk', 'Life at risk'], normalize = True)
print("Unoptimized model\n------")
print("F-score on testing data: {:.4f}".format(default_rf_score))
print("\nOptimized Model\n------")
print("Final F-score on the testing data: {:.4f}".format(tuned_rf_score))
# Persisting the tuned model is currently disabled:
#joblib.dump(rf_classifier, 'rf_classifier.joblib')




KeyboardInterrupt: 

## Tuning AdaBoostClassifier

In [7]:
from sklearn.ensemble import AdaBoostClassifier
ada_classifier = AdaBoostClassifier(random_state = seed)

# Search space for AdaBoost: three learning rates crossed with both boosting
# algorithms (six candidates).
adaParameters = {
  'learning_rate':[0.1, 0.5, 1],
  'algorithm' :['SAMME', 'SAMME.R']
}

# tune_classifier returns the tuned estimator, the F-score of the default and
# of the tuned model, and a confusion matrix.
ada_classifier, default_ada_score, tuned_ada_score, cnf_ada_matrix = model_utils.tune_classifier(ada_classifier, adaParameters,  X_train, X_test, y_train, y_test)

model_utils.plot_confusion_matrix(cnf_ada_matrix, classes=['Life not at risk', 'Life at risk'], normalize = True)
print("Unoptimized model\n------")
print("F-score on testing data: {:.4f}".format(default_ada_score))
print("\nOptimized Model\n------")
print("Final F-score on the testing data: {:.4f}".format(tuned_ada_score))

# Persisting the tuned model is currently disabled:
#joblib.dump(ada_classifier, 'ada_classifier.joblib')


  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)
  'precision', 'predicted', average, warn_for)


KeyboardInterrupt: 

## Tuning GaussianNB


In [None]:
gauss_classifier = GaussianNB()

# Each entry lists a single value (GaussianNB's defaults), so the search has
# exactly one candidate. The cell exists to run GaussianNB through the same
# tune_classifier pipeline as the other two models.
gaussParameters = {
  'priors':[None],
  'var_smoothing' :[1e-09]
}

# tune_classifier returns the tuned estimator, the F-score of the default and
# of the tuned model, and a confusion matrix.
gauss_classifier, default_gauss_score, tuned_gauss_score, cnf_gauss_matrix = model_utils.tune_classifier(gauss_classifier, gaussParameters,  X_train, X_test, y_train, y_test)

model_utils.plot_confusion_matrix(cnf_gauss_matrix, classes=['Life not at risk', 'Life at risk'], normalize = True)
# print() calls, as in the other tuning cells: the statement form is a
# SyntaxError on Python 3.
print("Unoptimized model\n------")
print("F-score on testing data: {:.4f}".format(default_gauss_score))
print("\nOptimized Model\n------")
print("Final F-score on the testing data: {:.4f}".format(tuned_gauss_score))

joblib.dump(gauss_classifier, 'gauss_classifier.joblib')


# Stacking
## Stacking with the two best classifiers

In [None]:
# Stack the random forest and AdaBoost classifiers coming out of the tuning cells
stack_two_path = 'sclf_two.joblib'
sclf_two, sclf_score = model_utils.get_stack_two(
    rf_classifier, ada_classifier,
    X_train, X_test, y_train, y_test,
    seed,
)

# Write the stacked model to disk first: model_validation is given the file
# path, not the in-memory object.
joblib.dump(sclf_two, stack_two_path)

model_utils.model_validation(stack_two_path, X_test, y_test)

## Stacking with all classifiers

In [None]:
# Stack all three classifiers: random forest, AdaBoost and Gaussian naive Bayes
stack_all_path = 'sclf_all.joblib'
sclf_all, sclf_all_score = model_utils.get_stack_all(
    rf_classifier, ada_classifier, gauss_classifier,
    X_train, X_test, y_train, y_test,
    seed,
)

# Write the stacked model to disk first: model_validation is given the file
# path, not the in-memory object.
joblib.dump(sclf_all, stack_all_path)

model_utils.model_validation(stack_all_path, X_test, y_test)