In [1]:
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_curve
import math

%pylab inline

Populating the interactive namespace from numpy and matplotlib


In [2]:
# Load in data set
data = pd.read_csv("breast_cancer.csv")

In [3]:
data.head()

Unnamed: 0.1,Unnamed: 0,id number,clump_thickness,uniformity_of_cell_size,uniformity_of_cell_shape,marginal_adhesion,epithelial_cell_size,bare_nuclei,bland_chromatin,normal_nucleoli,mitoses,malignant
0,0,1000025,5,1,1,1,2,1,3,1,1,0
1,1,1002945,5,4,4,5,7,10,3,2,1,0
2,2,1015425,3,1,1,1,2,2,3,1,1,0
3,3,1016277,6,8,8,1,3,4,3,7,1,0
4,4,1017023,4,1,1,3,2,1,3,1,1,0


In [4]:
# Remove the id columns and separate out the dependent variable, malignant
data = data.drop(['Unnamed: 0', 'id number'], axis=1)

In [6]:
y = data.pop("malignant")

In [7]:
# Setup holdout data set for test/train:
X_train, X_test, y_train, y_test = train_test_split(data, y, test_size=.2, random_state=42)

In [8]:
# Run Grid Search on a Random Forest model to optimize:
n_estimators = [300,400,500]
max_features = ['auto', 'sqrt','log2']
min_samples_split = [3,5,7]

rfc = RandomForestClassifier(n_jobs=1)
estimator = GridSearchCV(rfc,
                         dict(n_estimators=n_estimators,
                              max_features=max_features,
                              min_samples_split=min_samples_split
                              ), cv=None, n_jobs=-1)

In [9]:
# Run the training data through the optimization:
estimator.fit(X_train, y_train)

GridSearchCV(cv=None, error_score='raise',
       estimator=RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False),
       fit_params={}, iid=True, n_jobs=-1,
       param_grid={'min_samples_split': [3, 5, 7], 'max_features': ['auto', 'sqrt', 'log2'], 'n_estimators': [300, 400, 500]},
       pre_dispatch='2*n_jobs', refit=True, scoring=None, verbose=0)

In [10]:
# Output the best parameters for the training data:
estimator.best_estimator_

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=3,
            min_weight_fraction_leaf=0.0, n_estimators=400, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [11]:
# Note: split = 3, estimators = 400
# Store this classifier's parameters:
best_rfc = estimator.best_estimator_

### Measure Model's Performance

In [14]:
# Accuracy (the number of correct predictions divided the total number of records):
accuracy = accuracy_score(y_test, best_rfc.predict(X_test))
print "Accuracy: ", accuracy

Accuracy:  0.964285714286


In [15]:
# Precision and Recall:
print classification_report(y_test, best_rfc.predict(X_test))

             precision    recall  f1-score   support

          0       0.97      0.98      0.97        95
          1       0.95      0.93      0.94        45

avg / total       0.96      0.96      0.96       140



In [16]:
# AUC:
roc = roc_auc_score(y_test, best_rfc.predict_proba(X_test)[:,1])
print "AUC Score: ", roc

AUC Score:  0.996023391813


### K-Fold Cross Validation

In [17]:
from sklearn import cross_validation
scores = cross_validation.cross_val_score(best_rfc, data, y, cv=10)

In [18]:
# Preview the range of scores across all chunks of the model:
scores

array([ 0.91549296,  0.97142857,  0.97142857,  0.91428571,  0.98571429,
        0.98571429,  0.97142857,  0.98571429,  0.98550725,  0.98550725])

In [19]:
# Average these scores:
mean_score = scores.mean()
std_dev = scores.std()
std_error = scores.std() / math.sqrt(scores.shape[0])
ci =  2.262 * std_error
lower_bound = mean_score - ci
upper_bound = mean_score + ci

print "Score is %f +/-  %f" % (mean_score, ci)
print '95 percent probability that if this experiment were repeated over and over the average score would be between %f and %f' % (lower_bound, upper_bound)

Score is 0.967222 +/-  0.019228
95 percent probability that if this experiment were repeated over and over the average score would be between 0.947994 and 0.986450


### Model Review

The K-Fold Cross Validation score's average value came in nearly 3% lower than the AUC score. This is because the AUC score of 99.6% (seems nice!) was using the optimized Random Forest model on only 20% of the entire data set. This training set may or may not best represent the entire model, so the optimization drilled in to be best for this data subset. With K-Fold, the optimized model was run on 10 separate groups of the data test set with training run on the entire remaining of the data set, so the prediction capability is run on more of the entire data set and some minimal statistics provide how consistent is the model's capability. The K-Fold resulting in a (still nice!) result of 96.7% +/ 0.02.

##### The model above creates a Random Forest on a 20% hold out set of the data for testing. This model is run through a range of parameters to identify the optimum parameter set. This optimized set used 400 estimators and sample split of 3 for the Random Forest model generation, which was then run on the test data. The accuracy of this model was a very high 99.6% representing the number of correct predictions over the total number of records. For this data set, since there are 31% more not malignent (0) results than malignent (1), and the model is only tested on a small subset of this data, it is possible that the accuracy measure isn't representing a true performance of this model running on a more general set of data.

##### The precision of the model to predict malignent (1) results is 95% meaning that it will predict this percentage of the actual '1' results. The recall on malignent (1) results is 93% meaning of the '1' results predicted, this percentage of these results will be true predictions.

##### The AUC score came at a very high 99.6%, which is suggesting the model is capable of identifying all positive results and nearly no false positives.