In [1]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 

import model_utils as utils

# Pretty display for notebooks
%matplotlib inline
pd.set_option('display.max_colwidth', -1)

dataset, features, labels = utils.getDataSet("datasets/dataset_over_sampled.csv.gz")

Unnamed: 0,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,ALTO_COSTO,CIE_10,...,PQR_CLASE_SNS,PQR_ESTADO,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM,AFEC_LOCATION,ENT_LOCATION,PET_LOCATION,COMPLETE_MOTIVE,RIESGO_VIDA
0,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.445445,...,1e-07,1e-07,1.0,1e-07,1e-07,0.246246,0.185686,0.053053,0.274775,0
1,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.445445,...,1e-07,1e-07,1.0,1e-07,0.6481481,0.076577,0.185686,0.368869,0.142142,0
2,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.445445,...,1e-07,1e-07,1.0,1e-07,1e-07,0.076577,0.185686,0.890891,0.142142,0
3,0.5805806,0.5970971,0.527027,0.7267267,0.9454454,0.7082082,0.9999999,0.9999999,1e-07,0.239239,...,0.9999999,0.3413413,0.168669,0.991992,0.3806306,0.246246,0.958458,0.845846,0.883383,0
4,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.453954,...,1e-07,1e-07,1.0,1e-07,0.9999999,0.076577,0.90991,0.618619,0.274775,0


## Shuffle and Split Data

In [2]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Split 'features' and 'labels' into training and testing sets.
# 20% of the rows are held out for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(features,
                                                    labels,
                                                    test_size = 0.20,
                                                    random_state = 10)

# Show the results of the split.
# print(...) with a single argument behaves identically as a Python 2 print
# statement and as a Python 3 function call, so the cell runs on both.
print("features_final set has {} samples.".format(features.shape[0]))
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

features_final set has 3315592 samples.
Training set has 2652473 samples.
Testing set has 663119 samples.


### Creating a Training and Predicting Pipeline

In [3]:
# Naive baseline: a predictor that labels every row as positive
# (RIESGO_VIDA == 1). Under that assumption:
#   tp = number of rows whose label is 1 (the sum of the label column)
#   fp = every remaining row (total count minus tp)
#   tn = fn = 0, because the naive predictor never predicts a negative
tp = float(np.sum(labels['RIESGO_VIDA']))
fp = float(labels['RIESGO_VIDA'].count() - tp)
tn = 0
fn = 0

# Recall and precision of the naive predictor (tp and fp are floats, so the
# divisions are true divisions even under Python 2).
recall = tp / (tp + fn)
precision = tp / (tp + fp)

# F-beta score with beta = 2:
#   (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)
beta = 2
fscore = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# Print the result; the parenthesised single-argument form works on both
# Python 2 and Python 3.
print("Naive Predictor: [F-score: {:.4f}]".format(fscore))

Naive Predictor: [F-score: 0.8333]


In [4]:
import visuals as vs
from sklearn.metrics import fbeta_score
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import SGDClassifier
# Initialize the three candidate models with a shared, fixed random_state so
# the comparison is reproducible.
clf_A = SGDClassifier(random_state = 300)
clf_B = RandomForestClassifier(random_state = 300)
clf_C = AdaBoostClassifier(random_state = 300)

# Train on the complete training set (100% of the samples).
samples_100 = len(y_train)

# Collect results on the learners; train_predict receives the frame and its
# return value replaces it on every iteration.
dfResults = pd.DataFrame(columns=['learner', 'learner_index', 'size_index', 'train_time', 'pred_time', 'f_test', 'f_train'])

# k is the learner's position in the list; the literal 0 is passed alongside
# it (presumably the 'size_index' of the single sample size used here --
# confirm against utils.train_predict).
for k, clf in enumerate([clf_A, clf_B, clf_C]):
    dfResults = utils.train_predict(clf, k, 0, samples_100, X_train, y_train, X_test, y_test, dfResults)


  y = column_or_1d(y, warn=True)


SGDClassifier trained on 2652473 samples.


  learner = learner.fit(X_train[:sample_size], y_train[:sample_size])


RandomForestClassifier trained on 2652473 samples.
AdaBoostClassifier trained on 2652473 samples.


In [5]:
# Show each learner next to its 'f_test' value, highest value first.
leaderboard = dfResults[['learner', 'f_test']].sort_values(by='f_test', ascending=False)
display(leaderboard)

Unnamed: 0,learner,f_test
1,RandomForestClassifier,0.991604
2,AdaBoostClassifier,0.882436
0,SGDClassifier,0.882184


# Tuning Models


## Tuning SGDClassifier

In [7]:
# Base SGD estimator handed to the tuning helper.
sdgClassifier = SGDClassifier(random_state = 20)

# Candidate hyper-parameter values handed to the tuning helper.
sdgParameters = {
  'eta0':[0.01],
  'loss':['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'],
  'penalty':['none', 'l2', 'l1', 'elasticnet'],
  'learning_rate':['constant', 'optimal', 'invscaling'],
  'class_weight' :['balanced']
}

# The helper's return value replaces the base estimator under the same name.
sdgClassifier = utils.tuneClassifier(sdgClassifier, sdgParameters, X_train, X_test, y_train, y_test)

# Persist the tuned SGD model. The original cell dumped `rfClassifier` into
# this SGD-named file, which saved the wrong estimator (and relied on a name
# that is only assigned in a later cell).
# NOTE(review): `joblib` is not imported in any visible cell -- make sure it is
# imported before this cell runs on a fresh kernel.
joblib.dump(sdgClassifier, 'sdgClassifier_over_sampled.joblib')

Unoptimized model
------
F-score on testing data: 0.8910

Optimized Model
------
Final F-score on the testing data: 0.8983


['sdgClassifier_over_sampled.joblib']

## Testing Model with the validation dataset


In [14]:
# Load the held-out validation dataset through the same helper used for the
# training data.
dataset_excluded, features_excluded, labels_excluded = utils.getDataSet("datasets/dataset_over_sampled_validation.csv.gz")

from sklearn.base import clone
# Fit a fresh copy of the baseline random forest on the full training set.
# np.ravel flattens the single-column label frame to 1-D so that fit() does
# not emit a column-vector conversion warning; the fitted model is unchanged.
# NOTE(review): rfClassifier is fitted here but is not the model evaluated
# below -- confirm whether the random forest should be scored as well.
rfClassifier = (clone(clf_B)).fit(X_train, np.ravel(y_train))

# Score the tuned SGD classifier on the validation data with F-beta (beta = 2).
# beta is passed by keyword: newer scikit-learn releases reject it as a
# positional argument, and the keyword form is accepted by older ones too.
predictions_excluded = sdgClassifier.predict(features_excluded)
f_test_excluded = fbeta_score(labels_excluded, predictions_excluded, beta=2)
print(f_test_excluded)

Unnamed: 0,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,ALTO_COSTO,CIE_10,...,PQR_CLASE_SNS,PQR_ESTADO,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM,AFEC_LOCATION,ENT_LOCATION,PET_LOCATION,COMPLETE_MOTIVE,RIESGO_VIDA
0,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.445445,...,1e-07,1e-07,1.0,1e-07,1e-07,0.076577,0.185686,0.053053,0.372873,0
1,0.5805806,0.5970971,0.527027,0.7267267,0.5580581,0.7082082,0.5730731,0.9999999,1e-07,0.445445,...,0.9999999,0.6316316,0.168669,0.6571572,0.4119119,0.779349,0.917076,0.437668,0.647648,0
2,0.9999999,0.5970971,0.9999999,0.7267267,0.9114114,0.7082082,0.8268268,0.9999999,1e-07,0.377878,...,1e-07,0.6316316,0.168669,1e-07,1e-07,0.865365,0.842843,0.890891,0.372873,0
3,0.7374875,0.9096597,0.527027,0.7267267,0.9454454,0.978979,0.5730731,0.9999999,1e-07,0.445445,...,0.9999999,0.6316316,1.0,0.8298298,1e-07,0.703203,0.710711,0.514014,0.647648,0
4,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,1e-07,0.445445,...,1e-07,1e-07,1.0,1e-07,0.9999999,0.246246,0.036036,0.262262,0.274775,0


  after removing the cwd from sys.path.


0.6921063907933399
