# Imports

In [1]:
# Import pandas and numpy
import pandas as pd
import numpy as np

# Import the classifiers we will be using
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# Import train/test split function
from sklearn.model_selection import train_test_split

# Import cross validation scorer
from sklearn.model_selection import cross_val_score

# Import ROC AUC scoring function
from sklearn.metrics import roc_auc_score

# Read the data
#### This is a breast cancer diagnostic dataset: its features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.
#### "diagnosis" is our target: 0 for benign, 1 for malignant.

In [2]:
# Load the breast cancer CSV; the 'id' column becomes the DataFrame index
df = pd.read_csv(
    '../data/breast_cancer.csv',
    index_col='id',
)

In [3]:
# Preview the first rows of the DataFrame (head() shows 5 rows by default)
df.head()

Unnamed: 0_level_0,diagnosis,radius_mean,texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,...,radius_worst,texture_worst,perimeter_worst,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
842302,1,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
842517,1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
84300903,1,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
84348301,1,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
84358402,1,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Dimensions of the DataFrame as (rows, columns)
df.shape

(569, 31)

In [5]:
# Target: the 'diagnosis' column on its own
target = df['diagnosis']
# Features: every column except the target
features = df.drop(columns=['diagnosis'])

# Train/test split

In [6]:
# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    features,
    target,
    test_size=0.3,
    random_state=0,
)

# Modelling with standard train/test split

In [7]:
# Choose the model (fixed seed so the fitted tree is reproducible)
tree_model = DecisionTreeClassifier(random_state=0)

# Fit the model on the training split only
tree_model.fit(X_train, y_train)

# Predicted class probabilities for the held-out rows, one column per class
y_pred = tree_model.predict_proba(X_test)

# Score the predictions: ROC AUC is computed from the probability in
# column 1, i.e. the class labelled 1 in the 0/1 'diagnosis' target
score = roc_auc_score(y_test, y_pred[:, 1])

print("ROC AUC: " + str(score))

# Turn the class-1 probabilities into hard 0/1 labels and count disagreements
# with the true labels. np.round is used instead of np.round_: the latter was
# a deprecated alias that was removed in NumPy 2.0, while np.round is
# available in every NumPy version and returns the same values.
n_mislabeled = (y_test != np.round(y_pred[:, 1])).sum()

print("Number of mislabeled points out of a total %d points: %d" % (y_test.shape[0], n_mislabeled))



ROC AUC: 0.917328042328
Number of mislabeled points out of a total 171 points: 15


# Modelling with k-fold cross validation

In [8]:
# Choose the classifier
tree_model = DecisionTreeClassifier(random_state=0)

# Fit, predict and score in a single call
score_tree_model = cross_val_score(
    tree_model,          # 1. model
    features,            # 2. features
    target,              # 3. target
    cv=5,                # 4. number of k-folds
    scoring='roc_auc',   # 5. scoring function
    n_jobs=-1,           # 6. number of CPU cores to use (-1 = all of them)
)

# Report the per-fold scores, then their mean
mean_auc = score_tree_model.mean()
print("ROC AUC scores: " + str(score_tree_model))
print("Average ROC AUC: " + str(mean_auc))

ROC AUC scores: [ 0.90956072  0.91408269  0.90526492  0.94315895  0.90794769]
Average ROC AUC: 0.916002994712
