In [1]:
import sys
import os
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import sklearn
import sklearn.preprocessing
import sklearn.model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
import tensorflow as tf
import keras
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from keras.utils import np_utils

Using TensorFlow backend.


In [2]:
os.chdir('Downloads')

In [3]:
#download the data
cancer_data=pd.read_csv('breast-cancer-wisconsin.data', header=-1)

In [4]:
cancer_data.columns=['Sample ID', 'Clump thickness', 'Cell size uniformity', 'Cell shape uniformity', 'Marginal adhesion', \
                    'Single epithelial cell size', 'Bare nuclei', 'Bland Chromatin', 'Normal nucleoli', \
                    'Mitoses', 'Class']

In [5]:
#Redo classification vector with 0 and 1 instead of 2 and 4
for i in range(0, len(cancer_data)):
    if cancer_data['Class'][i]==2:
        cancer_data['Class'][i]=0
    elif cancer_data['Class'][i]==4:
        cancer_data['Class'][i]=1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  app.launch_new_instance()
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


In [12]:
#remove rows where Bare nuclei did not have a value (there were only 16, so we shouldn't lose significance)
clean_cancer_data=cancer_data[cancer_data['Bare nuclei']!='?']

In [19]:
#start with simple logistic regression. Note that I am splitting the data into five cross-validation groups 
#and averaging the error.

model=LogisticRegression()
#model=RandomForestClassifier(n_estimators=50, oob_score=True)
#model=GradientBoostingClassifier(learning_rate=.05)

col_names=['Clump thickness', 'Cell size uniformity', 'Cell shape uniformity', 'Marginal adhesion', \
                    'Single epithelial cell size', 'Bare nuclei', 'Bland Chromatin', 'Normal nucleoli', \
                    'Mitoses']

mean_error=[]
total_error=[]
mean_values=[]

temp=list(range(5))
for i in list(range(5)):
    temp=list(range(5))
    temp.remove(i)
    test=clean_cancer_data[i::5]
    training_setup=[clean_cancer_data[temp[0]::5], clean_cancer_data[temp[1]::5], clean_cancer_data[temp[2]::5], \
                    clean_cancer_data[temp[3]::5]]
    training=pd.concat(training_setup)

    fit=model.fit(training[col_names], training['Class'])
    lr_prediction=fit.predict(test[col_names])
    
    mean_error.append(np.mean(np.abs(lr_prediction-test['Class'])))
    
    total_error.append(np.abs(lr_prediction-test['Class']))
    
    mean_values.append(np.mean(np.abs(test['Class'])))
    
print(mean_error)
print(np.mean(mean_error))
print('Logistic Classification Feature Weights')
for i in range(0, len(test[col_names].keys())): print(fit.coef_[0][i], test[col_names].keys()[i])
#for i in range(0, len(test[col_names].keys())): print(fit.feature_importances_[i], test[col_names].keys()[i])

[0.043795620437956206, 0.058394160583941604, 0.014598540145985401, 0.014705882352941176, 0.051470588235294115]
0.0365929583512
Logistic Classification Feature Weights
0.255687850076 Clump thickness
0.213340276061 Cell size uniformity
0.243841332718 Cell shape uniformity
0.194585582316 Marginal adhesion
-0.0184962784915 Single epithelial cell size
0.359224106889 Bare nuclei
0.092505247025 Bland Chromatin
0.134712817806 Normal nucleoli
0.235918351653 Mitoses


In [20]:
#Not bad, 3.7% error, let's try the random forest next

#model=LogisticRegression()
model=RandomForestClassifier(n_estimators=50, oob_score=True)
#model=GradientBoostingClassifier(learning_rate=.05)

col_names=['Clump thickness', 'Cell size uniformity', 'Cell shape uniformity', 'Marginal adhesion', \
                    'Single epithelial cell size', 'Bare nuclei', 'Bland Chromatin', 'Normal nucleoli', \
                    'Mitoses']

mean_error=[]
total_error=[]
mean_values=[]

temp=list(range(5))
for i in list(range(5)):
    temp=list(range(5))
    temp.remove(i)
    test=clean_cancer_data[i::5]
    training_setup=[clean_cancer_data[temp[0]::5], clean_cancer_data[temp[1]::5], clean_cancer_data[temp[2]::5], \
                    clean_cancer_data[temp[3]::5]]
    training=pd.concat(training_setup)

    fit=model.fit(training[col_names], training['Class'])
    lr_prediction=fit.predict(test[col_names])
    
    mean_error.append(np.mean(np.abs(lr_prediction-test['Class'])))
    
    total_error.append(np.abs(lr_prediction-test['Class']))
    
    mean_values.append(np.mean(np.abs(test['Class'])))
    
print(mean_error)
print(np.mean(mean_error))
print('Logistic Classification Feature Weights')
#for i in range(0, len(test[col_names].keys())): print(fit.coef_[0][i], test[col_names].keys()[i])
for i in range(0, len(test[col_names].keys())): print(fit.feature_importances_[i], test[col_names].keys()[i])

[0.043795620437956206, 0.058394160583941604, 0.014598540145985401, 0.014705882352941176, 0.029411764705882353]
0.0321811936453
Logistic Classification Feature Weights
0.0582120007831 Clump thickness
0.209137736078 Cell size uniformity
0.316109193411 Cell shape uniformity
0.0213042406621 Marginal adhesion
0.0781202003116 Single epithelial cell size
0.16538405183 Bare nuclei
0.094023405367 Bland Chromatin
0.0492686024539 Normal nucleoli
0.00844056910249 Mitoses


In [22]:
#Better, 3.2% error. Finally, let's try a gradient boosted tree

#model=LogisticRegression()
#model=RandomForestClassifier(n_estimators=50, oob_score=True)
model=GradientBoostingClassifier()

col_names=['Clump thickness', 'Cell size uniformity', 'Cell shape uniformity', 'Marginal adhesion', \
                    'Single epithelial cell size', 'Bare nuclei', 'Bland Chromatin', 'Normal nucleoli', \
                    'Mitoses']

mean_error=[]
total_error=[]
mean_values=[]

temp=list(range(5))
for i in list(range(5)):
    temp=list(range(5))
    temp.remove(i)
    test=clean_cancer_data[i::5]
    training_setup=[clean_cancer_data[temp[0]::5], clean_cancer_data[temp[1]::5], clean_cancer_data[temp[2]::5], \
                    clean_cancer_data[temp[3]::5]]
    training=pd.concat(training_setup)

    fit=model.fit(training[col_names], training['Class'])
    lr_prediction=fit.predict(test[col_names])
    
    mean_error.append(np.mean(np.abs(lr_prediction-test['Class'])))
    
    total_error.append(np.abs(lr_prediction-test['Class']))
    
    mean_values.append(np.mean(np.abs(test['Class'])))
    
print(mean_error)
print(np.mean(mean_error))
print('Logistic Classification Feature Weights')
#for i in range(0, len(test[col_names].keys())): print(fit.coef_[0][i], test[col_names].keys()[i])
for i in range(0, len(test[col_names].keys())): print(fit.feature_importances_[i], test[col_names].keys()[i])

[0.0364963503649635, 0.051094890510948905, 0.021897810218978103, 0.014705882352941176, 0.03676470588235294]
0.032191927866
Logistic Classification Feature Weights
0.125751359962 Clump thickness
0.117819994702 Cell size uniformity
0.181630859326 Cell shape uniformity
0.0792172011904 Marginal adhesion
0.134183642204 Single epithelial cell size
0.146946889223 Bare nuclei
0.0519912955783 Bland Chromatin
0.125562648131 Normal nucleoli
0.0368961096834 Mitoses


In [24]:
#Pretty much identical. Let's see what a neural net can do.

mean_error=[]
total_error=[]
mean_values=[]

col_names=['Clump thickness', 'Cell size uniformity', 'Cell shape uniformity', 'Marginal adhesion', \
          'Single epithelial cell size', 'Bland Chromatin', 'Normal nucleoli', \
                    'Mitoses']

batches=30

temp=list(range(5))
for i in list(range(5)):
    temp=list(range(5))
    temp.remove(i)
    test=clean_cancer_data[i::5]
    training_setup=[clean_cancer_data[temp[0]::5], clean_cancer_data[temp[1]::5], clean_cancer_data[temp[2]::5], \
                    clean_cancer_data[temp[3]::5]]
    training=pd.concat(training_setup)

    model=Sequential()
    s='relu'
    model.add(Dense(40, activation=s, input_dim=len(col_names)))
    model.add(Dropout(.1))
    model.add(Dense(40, activation=s))
    model.add(Dropout(.1))
    model.add(Dense(1, activation='sigmoid'))
    #keras.optimizers.RMSprop(lr=.001, rho=0.9, epsilon=1e-08, decay=0.0)
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

    model.fit(training[col_names].values, training['Class'].values, batch_size=batches, epochs=50, verbose=0)
    
    nn_prediction=model.predict(test[col_names].values)
    nn_prediction=np.ndarray.tolist(nn_prediction)
    nn_prediction=[float(nn_prediction[i][0]) for i in range(len(nn_prediction))]
    
    mean_error.append(np.mean(np.abs(nn_prediction-test['Class'])))
    
    total_error.append(np.abs(nn_prediction-test['Class']))
    
    mean_values.append(np.mean(np.abs(test['Class'])))

print(mean_error)
print(np.mean(mean_error))


[0.07003184626674293, 0.07509545638692483, 0.04541440381087281, 0.06067088310031549, 0.07541061020524734]
0.065324639954


In [None]:
#Average error of 6.5%, so the decision tree wins!