In [15]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 

import model_utils as utils

# Pretty display for notebooks
%matplotlib inline
pd.set_option('display.max_colwidth', -1)

dataset, features, labels = utils.getDataSet("datasets/dataset_health_features.csv.gz")

Unnamed: 0,COD_MACROMOT,COD_MOTGEN,COD_MOTESP,PATOLOGIA_1,PATOLOGIA_TIPO,CIE_10,RIESGO_VIDA
0,0.741241,0.715215,0.818819,0.626376,0.720721,0.008008,0
1,0.741241,0.715215,0.647147,0.626376,0.720721,0.445445,0
2,0.741241,0.932933,0.777277,0.626376,0.720721,0.92993,0
3,0.741241,0.553554,0.571071,0.626376,0.720721,0.003316,0
4,0.741241,0.711712,0.845345,0.963463,0.95996,0.982259,1


## Shuffle and Split Data

In [16]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split the 'features' and 'labels' data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(features, 
                                                    labels, 
                                                    test_size = 0.20, 
                                                    random_state = 10)

# Show the results of the split
print "features_final set has {} samples.".format(features.shape[0])
print "Training set has {} samples.".format(X_train.shape[0])
print "Testing set has {} samples.".format(X_test.shape[0])

features_final set has 2130783 samples.
Training set has 1704626 samples.
Testing set has 426157 samples.


### Creating a Training and Predicting Pipeline

In [17]:
'''
TP = np.sum(income) # Counting the ones as this is the naive case. Note that 'income' is the 'income_raw' data 
encoded to numerical values done in the data preprocessing step.
FP = income.count() - TP # Specific to the naive case

TN = 0 # No predicted negatives in the naive case
FN = 0 # No predicted negatives in the naive case
'''

tp = float(np.sum(labels['RIESGO_VIDA']))
fp = float(labels['RIESGO_VIDA'].count() - tp)
tn = 0
fn = 0

# TODO: Calculate accuracy, precision and recall
recall = tp / (tp + fn)
precision = tp / (tp + fp)

# TODO: Calculate F-score using the formula above for beta = 0.5 and correct values for precision and recall.
# HINT: The formula above can be written as (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
beta = 2
fscore = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# Print the results 
print "Naive Predictor: [F-score: {:.4f}]".format(fscore)

Naive Predictor: [F-score: 0.4395]


In [18]:
import visuals as vs
from sklearn.metrics import fbeta_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
# Initialize the three models
clf_A = SGDClassifier(random_state = 300)
clf_B = RandomForestClassifier(random_state = 300)
clf_C = AdaBoostClassifier(random_state = 300)


samples_100 = len(y_train)
samples_10 = int(samples_100*0.1)
samples_1 = int(samples_10*0.1)

# Collect results on the learners
dfResults = pd.DataFrame(columns=['learner', 'learner_index', 'size_index', 'train_time', 'pred_time', 'f_test', 'f_train'])

for k, clf in enumerate([clf_A, clf_B, clf_C]):
    clf_name = clf.__class__.__name__    
    dfResults = utils.train_predict(clf, k, 0, samples_100, X_train, y_train, X_test, y_test, dfResults)


SGDClassifier trained on 1704626 samples.
RandomForestClassifier trained on 1704626 samples.
AdaBoostClassifier trained on 1704626 samples.


In [19]:
display( dfResults.sort_values(by=['f_test'], ascending = False)[['learner', 'f_test']])


Unnamed: 0,learner,f_test
1,RandomForestClassifier,0.544893
2,AdaBoostClassifier,0.523238
0,SGDClassifier,0.330444


# Tuning Models

## Tuning RandomForestClassifier

In [20]:
from sklearn.externals import joblib

rfClassifier = RandomForestClassifier(random_state = 20)

rfParameters = {    
  'criterion':['gini', 'entropy'],
  'max_depth':[5, 10],
  'max_features':['auto', 'sqrt', 'log2', None],
  'class_weight': ['balanced', 'balanced_subsample'], 
}

rfClassifier = utils.tuneClassifier(rfClassifier, rfParameters, X_train, X_test, y_train, y_test)


Unoptimized model
------
F-score on testing data: 0.5453

Optimized Model
------
Final F-score on the testing data: 0.7051


## Tuning AdaBoostClassifier


In [24]:
from sklearn.ensemble import AdaBoostClassifier
adaClassifier = AdaBoostClassifier(random_state = 20)

adaParameters = {
  'learning_rate':[0.1, 0.5, 1],
  'algorithm' :['SAMME', 'SAMME.R']
  #'max_features':['auto', 'sqrt', 'log2', None],
}

adaParameters = utils.tuneClassifier(adaClassifier, adaParameters, X_train, X_test, y_train, y_test)

joblib.dump(rfClassifier, 'adaClassifier.joblib') 

  'precision', 'predicted', average, warn_for)


Unoptimized model
------
F-score on testing data: 0.5232

Optimized Model
------
Final F-score on the testing data: 0.5324


['adaClassifier.joblib']

# Stacking

In [26]:
from mlxtend.classifier import StackingClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn import model_selection
from sklearn.metrics import fbeta_score, make_scorer
#meta with Gaussian
sclf = StackingClassifier(classifiers=[rfClassifier, adaClassifier], 
                          use_features_in_secondary = True,
                          meta_classifier=GaussianNB())
sclf = sclf.fit(X_train, y_train)
sclf_predictions = sclf.predict(X_test)

print "F-score on StackingClassifier: {:.4f}".format(fbeta_score(y_test, sclf_predictions, beta = 2))

F-score on StackingClassifier: 0.6940
