In [1]:
import numpy as np
import pandas as pd
from time import time
import matplotlib.pyplot as plt
from IPython.display import display 


# Pretty display for notebooks
%matplotlib inline
# Do not truncate long cell values when rendering frames.
# NOTE(review): -1 is the "no limit" value for older pandas; newer releases expect None - confirm against the pinned pandas version.
pd.set_option('display.max_colwidth', -1)

# Read the gzip-compressed encoded dataset and discard the 'Unnamed: 0' column
# in one chained expression, then preview the first five rows.
dataset = pd.read_csv(
    "datasets/dataset_encoded.csv.gz",
    compression='gzip',
).drop(['Unnamed: 0'], axis = 1)
display(dataset.head(5))

# Target column; kept as a one-column DataFrame because later cells index it by name.
target_col = 'RIESGO_VIDA'
labels = dataset[[target_col]]

# Everything except the target and 'PQR_ESTADO' is used as a model input.
features = dataset.drop([target_col, 'PQR_ESTADO'], axis = 1)

Unnamed: 0,AFEC_DPTO,AFEC_EDADR,AFEC_EDUC,AFEC_GENERO,AFEC_GETNICO,AFEC_MPIO,AFEC_PARENTESCO,AFEC_POBESPECIAL,AFEC_REGAFILIACION,AFEC_TIPOPER,...,PET_DPTO,PET_MPIO,PET_TIPOPER,PQR_CANAL,PQR_CLASE_SNS,PQR_ESTADO,PQR_TIPOATENCION,PQR_TIPOPETICION,TRIM,RIESGO_VIDA
0,0.159108,0.233195,0.165027,0.171842,0.196613,0.158095,0.234418,0.185798,0.247397,0.182693,...,0.115756,0.161965,0.097005,0.07146,0.215504,0.063637,0.078341,0.213982,0.123776,0
1,0.182208,0.246041,0.165027,0.198391,0.196613,0.151685,0.234418,0.185798,0.247397,0.182693,...,0.136739,0.101469,0.129838,0.080834,0.215504,0.063637,0.153969,0.213982,0.123776,0
2,0.200419,0.233195,0.190694,0.198391,0.196613,0.196657,0.276908,0.185798,0.163945,0.182693,...,0.170398,0.139685,0.129838,0.080834,0.215504,0.063637,0.153969,0.213982,0.123776,0
3,0.261765,0.233195,0.274073,0.198391,0.196613,0.288224,0.268976,0.185798,0.163945,0.182693,...,0.210334,0.232032,0.129838,0.382929,0.215504,0.063637,0.153969,0.213982,0.123776,0
4,0.189494,0.233195,0.165027,0.198391,0.196613,0.196716,0.268976,0.326796,0.163945,0.182693,...,0.141061,0.152191,0.129838,0.382929,0.215504,0.691235,0.153969,0.213982,0.123776,1


## Shuffle and Split Data

In [2]:
# Import train_test_split
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for testing; the fixed random_state makes the
# shuffle (and therefore the split) reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    features,
    labels,
    test_size = 0.20,
    random_state = 10)

# Report the size of the full set and of each partition.
# print(...) with a single argument behaves the same on Python 2 and Python 3,
# and matches the print(...) calls used elsewhere in this notebook.
print("features_final set has {} samples.".format(features.shape[0]))
print("Training set has {} samples.".format(X_train.shape[0]))
print("Testing set has {} samples.".format(X_test.shape[0]))

features_final set has 2130783 samples.
Training set has 1704626 samples.
Testing set has 426157 samples.


### Creating a Training and Predicting Pipeline

In [3]:
# Naive baseline: predict RIESGO_VIDA = 1 for every record.
# Under that rule every actual positive is a true positive, every actual
# negative is a false positive, and no record is ever predicted negative.
n_records = labels['RIESGO_VIDA'].count()
tp = float(np.sum(labels['RIESGO_VIDA']))
fp = float(n_records - tp)
tn = 0  # no predicted negatives in the naive case
fn = 0  # no predicted negatives in the naive case

# Accuracy, recall and precision of the naive predictor.
accuracy = (tp + tn) / n_records
recall = tp / (tp + fn)
precision = tp / (tp + fp)
print(accuracy)

# F-beta score: (1 + beta**2) * P * R / (beta**2 * P + R).
# beta = 2 weights recall more heavily than precision.
# NOTE(review): train_predict() reports F-scores with beta = 0.5, so this
# baseline and the model scores are not on the same scale - confirm which
# beta is intended and align the two.
beta = 2
fscore = (1 + beta**2) * (precision * recall) / ((beta**2 * precision) + recall)

# Print the results
print("Naive Predictor: [Accuracy score: {:.4f}, F-score: {:.4f}]".format(accuracy, fscore))

0.135542192706
Naive Predictor: [Accuracy score: 0.1355, F-score: 0.4395]


In [4]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import fbeta_score

def _as_1d_labels(y):
    '''Flatten a single-column 2-D label container to a 1-D array; return anything else unchanged.'''
    if getattr(y, 'ndim', 1) == 2 and y.shape[1] == 1:
        return np.asarray(y).ravel()
    return y


def train_predict(learner, sample_size, X_train, y_train, X_test, y_test, beta=0.5):
    '''
    Fit a learner on the leading rows of the training data, then time and score it.

    inputs:
       - learner: the learning algorithm to be trained and predicted on
                  (must expose fit() and predict())
       - sample_size: the number of leading training rows to fit on
       - X_train: features training set
       - y_train: labels training set (a single-column DataFrame is flattened to 1-D)
       - X_test: features testing set
       - y_test: labels testing set (a single-column DataFrame is flattened to 1-D)
       - beta: beta forwarded to fbeta_score; defaults to 0.5, the value that
               was previously hard-coded

    returns:
       dict with the keys 'train_time', 'pred_time', 'acc_train', 'acc_test',
       'f_train' and 'f_test'. The *_train metrics are computed on the first
       300 training rows only.
    '''
    # Number of leading training rows used for the training-side metrics.
    n_train_eval = 300

    # Estimators and metrics expect 1-D labels; passing an (n, 1) DataFrame
    # makes scikit-learn emit a DataConversionWarning on every fit.
    y_train_1d = _as_1d_labels(y_train)
    y_test_1d = _as_1d_labels(y_test)

    results = {}

    # Fit on the first 'sample_size' rows and record the wall-clock training time.
    start = time()
    learner = learner.fit(X_train[:sample_size], y_train_1d[:sample_size])
    end = time()
    results['train_time'] = end - start

    # Predict on the whole test set and on the training subset; 'pred_time'
    # covers both predict() calls together.
    start = time()
    predictions_test = learner.predict(X_test)
    predictions_train = learner.predict(X_train[:n_train_eval])
    end = time()
    results['pred_time'] = end - start

    # Accuracy on the training subset and on the test set.
    results['acc_train'] = accuracy_score(y_train_1d[:n_train_eval], predictions_train)
    results['acc_test'] = accuracy_score(y_test_1d, predictions_test)

    # F-beta on the training subset and on the test set. beta is passed by
    # keyword because recent scikit-learn releases make it keyword-only.
    results['f_train'] = fbeta_score(y_train_1d[:n_train_eval], predictions_train, beta=beta)
    results['f_test'] = fbeta_score(y_test_1d, predictions_test, beta=beta)

    # Success
    print("{} trained on {} samples.".format(learner.__class__.__name__, sample_size))

    # Return the results
    return results
  

In [7]:
import visuals as vs
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Candidate learners; a fixed random_state is set on the two that are given one here.
clf_A = MLPClassifier(random_state = 300)
clf_B = RandomForestClassifier(random_state = 300)
clf_C = KNeighborsClassifier()

# Training-subset sizes: all rows, a tenth of them, and a tenth of that.
samples_100 = len(y_train)
samples_10 = samples_100 // 10
samples_1 = samples_10 // 10

learners = (clf_A, clf_B, clf_C)
sample_sizes = (samples_1, samples_10, samples_100)

# results[<learner class name>][<index into sample_sizes>] -> metrics dict.
# Entries are stored as soon as each run finishes, so an interrupted loop
# still leaves the completed runs in 'results'.
results = {}
for clf in learners:
    clf_name = clf.__class__.__name__
    per_size = results[clf_name] = {}
    for i, samples in enumerate(sample_sizes):
        per_size[i] = train_predict(clf, samples, X_train, y_train, X_test, y_test)

# Hand the collected metrics plus the naive-predictor baseline to the
# project's 'visuals' helper (its implementation is not shown in this notebook).
# NOTE(review): 'fscore' was computed with beta = 2 in the naive-predictor
# cell, while train_predict() reports F-scores with beta = 0.5 - confirm the
# intended beta before comparing them on one chart.
vs.evaluate(results, accuracy, fscore)

  y = column_or_1d(y, warn=True)


MLPClassifier trained on 17046 samples.
MLPClassifier trained on 170462 samples.




MLPClassifier trained on 1704626 samples.




RandomForestClassifier trained on 17046 samples.
RandomForestClassifier trained on 170462 samples.


KeyboardInterrupt: 

In [None]:
# Dump the raw metrics dictionary filled in by the model-comparison loop.
print(results)

## Tuning Model