# Cancer Wisconsin Dataset
`Introduction and Background`

The Breast Cancer Wisconsin (Diagnostic) Data Set is obtained from the UCI Machine Learning Repository.


-Features are computed from a digitized image of a fine needle aspirate (FNA) of a breast mass.

-They describe characteristics of the cell nuclei present in the image.

Attribute Information:

1) ID number
2) Diagnosis (M = malignant, B = benign)
3-32)

Ten real-valued features are computed for each cell nucleus:

a) radius (mean of distances from center to points on the perimeter)
b) texture (standard deviation of gray-scale values)
c) perimeter
d) area
e) smoothness (local variation in radius lengths)
f) compactness (perimeter^2 / area - 1.0)
g) concavity (severity of concave portions of the contour)
h) concave points (number of concave portions of the contour)
i) symmetry
j) fractal dimension ("coastline approximation" - 1)

In [1]:

#check boxplot, p values etc., bar charts?, line graph 
import numpy as np
%matplotlib inline  
import time
import statsmodels.formula.api as smf
import statsmodels.api as sm
import pandas as pd
import seaborn as sns

import matplotlib.pyplot as plt

from sklearn.metrics import confusion_matrix, accuracy_score,\
precision_score, recall_score
from sklearn import tree, svm, ensemble
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from IPython.display import Image
import pydotplus 


  from pandas.core import datetools


In [2]:
data = pd.read_csv('data.csv')
# # print data.head() #print the first 5 rows of all columns
# #print data.info() # check all columns have the same number of entries. 
# model = smf.ols(formula = 'area_mean~texture_worst', data = data).fit()

# print model.summary() #try a simple model to see if the database works

# yh = model.predict()
# x = data['texture_worst']
# y2 = data['area_mean']

# plt.plot(x, y2, '.')
# plt.plot(x,yh, '-')
# plt.xlabel('Texture Worst')
# plt.ylabel('Area Mean')
# plt.title('$R^2$ = %f'%(model.rsquared))
# plt.show() 

In [3]:
# model2 = smf.glm(formula = 'diagnosis_binary~area_mean+texture_worst+symmetry_worst', data = data, family = sm.families.Binomial()).fit()
# print model2.summary()
# from sklearn.metrics import confusion_matrix

# conf_mat = confusion_matrix(data['diagnosis_binary'], model2.predict()>0.5, labels = [1, 0])
#from sklearn.metrics import accuracy_score, precision_score, recall_score

# print '\nMetrics'
# print 'Accuracy: %.3f' %accuracy_score(data['diagnosis_binary'], model2.predict()>0.5)
# print 'Precision: %.3f' %precision_score(data['diagnosis_binary'], model2.predict()>0.5)
# print 'Recall: %.3f' %recall_score(data['diagnosis_binary'], model2.predict()>0.5)


In [4]:
# df_cm = pd.DataFrame(conf_mat, index = ['B', 'M'], columns = ['B', 'M'])
# plt.figure(figsize = (5,3.5))
# sns.heatmap(df_cm, annot=True, fmt='g', cmap = 'binary')
# plt.title('Confusion Matrix')
# plt.ylabel('Actual')
# plt.xlabel('Predicted')
# plt.show()

In [5]:
cancertree = tree.DecisionTreeClassifier(max_depth=2, min_impurity_split=0.2)
# cancertree.fit(X_train,y_train)

In [6]:
# dot_data = tree.export_graphviz(cancertree, out_file=None) 
# graph = pydotplus.graph_from_dot_data(dot_data) 


# dot_data = tree.export_graphviz(cancertree, out_file=None,
#                                 feature_names = variables,
#                                 class_names = ('Negative', 'Positive'),
#                                 filled = True, rounded = True,
#                                 special_characters = True)
# graph = pydotplus.graph_from_dot_data(dot_data)
# Image(graph.create_png())

In [7]:
# print '\nFor the training set:'
# print 'Accuracy: %0.4f'%(accuracy_score(y_train, cancertree.predict(X_train)))
# print 'Precision: %0.4f'%(precision_score(y_train, cancertree.predict(X_train)))
# print 'Recall: %0.4f'%(recall_score(y_train, cancertree.predict(X_train)))


# print '\nFor the test set:'
# print 'Accuracy: %0.4f'%(accuracy_score(y_test, cancertree.predict(X_test)))
# print 'Precision: %0.4f'%(precision_score(y_test, cancertree.predict(X_test)))
# print 'Recall: %0.4f'%(recall_score(y_test, cancertree.predict(X_test)))

In [8]:
# gerald = tree.DecisionTreeClassifier()

# # For more information on what is happening here see slide 25

# parameters_tree = {'min_impurity_split': np.linspace(0,0.2,20),'max_depth':range(2,8)}

# super_tree = GridSearchCV(gerald, parameters_tree, cv=10, scoring='accuracy')
# super_tree.fit(X_train,y_train)


# # Returns the best fitted model on the data
# gerald = super_tree.best_estimator_

# print '\nFor the training set:'
# print 'Accuracy: %0.4f'%(accuracy_score(y_train, gerald.predict(X_train)))
# print 'Precision: %0.4f'%(precision_score(y_train, gerald.predict(X_train)))
# print 'Recall: %0.4f'%(recall_score(y_train, gerald.predict(X_train)))


# print '\nFor the test set:'
# print 'Accuracy: %0.4f'%(accuracy_score(y_test, gerald.predict(X_test)))
# print 'Precision: %0.4f'%(precision_score(y_test, gerald.predict(X_test)))
# print 'Recall: %0.4f'%(recall_score(y_test, gerald.predict(X_test)))

In [9]:
# from sklearn import tree 
# big_mac = tree.DecisionTreeClassifier(max_depth=2, min_impurity_split=0.2)
# scores = cross_val_score(big_mac, X_train, y_train, cv=10, scoring='accuracy') 
# print 'Mean accuracy from the cross-validation: %.3f \nand the full vector: %s\n' %(np.mean(scores), ', '.join(map("{:.3f}".format, scores)))
# big_mac.fit(X_train,y_train)

In [36]:
from sklearn import ensemble
#import time
data = pd.read_csv('data.csv')
train, test = train_test_split(data, test_size = 0.2, random_state = 9)
target = 'diagnosis_binary'
variables = list(data.columns)
variables.remove(target)
X_train = train[variables]
y_train = train['diagnosis_binary']
X_test = test[variables]

start_time = time.time()
v = ensemble.RandomForestClassifier(n_estimators=100)
y_test = test['diagnosis_binary']
scores = cross_val_score(v, X_train, y_train, cv=10, scoring='accuracy')
print 'Mean accuracy from the cross-validation: %.3f \nand the full vector: %s\n' %(np.mean(scores), ','.join(map("{:.3f}".format, scores)))
v.fit(X_train,y_train)
#print clf.tree_
print 'Accuracy on the train set: %.3f' %accuracy_score(y_train, v.predict(X_train))
print 'Accuracy on the test set: %.3f' %accuracy_score(y_test, v.predict(X_test))
print 'Precision: %0.4f'%(precision_score(y_train, v.predict(X_train)))
print 'Recall: %0.4f'%(recall_score(y_train, v.predict(X_train)))
print 'Precision: %0.4f'%(precision_score(y_test, v.predict(X_test)))
print 'Recall: %0.4f'%(recall_score(y_test, v.predict(X_test)))
print("--- %s seconds ---" % (time.time() - start_time))

Mean accuracy from the cross-validation: 0.956 
and the full vector: 0.957,0.979,0.957,0.956,0.956,0.978,0.956,0.867,0.956,1.000

Accuracy on the train set: 1.000
Accuracy on the test set: 0.956
Precision: 1.0000
Recall: 1.0000
Precision: 0.9730
Recall: 0.9000
--- 12.7949998379 seconds ---


In [11]:
from sklearn import svm
start_time = time.time()
# z = svm.SVC(kernel='linear', C = 1) # c is 1 by default 
z = svm.SVC(kernel='rbf', C=1000, gamma=0.0001)
# train, test = train_test_split(data, test_size = 0.2, random_state = 42)
# # variables = ['area_mean','texture_worst','symmetry_worst']
# target = 'diagnosis_binary'
# variables = list(data.columns)
# variables.remove(target)
# X_train = train[variables]
# y_train = train['diagnosis_binary']
# X_test = test[variables]
# y_test = test['diagnosis_binary']
scores = cross_val_score(z, X_train, y_train, cv=10, scoring='accuracy')
print 'Mean accuracy from the cross-validation: %.3f \nand the full vector: %s\n' %(np.mean(scores), ','.join(map("{:.3f}".format, scores)))
#print clf.tree_
#z.best_params_
z.fit(X_train,y_train)
print 'Accuracy on the train set: %.3f' %accuracy_score(y_train, z.predict(X_train))
print 'Accuracy on the test set: %.3f' %accuracy_score(y_test, z.predict(X_test))
print 'Precision: %0.4f'%(precision_score(y_train, z.predict(X_train)))
print 'Recall: %0.4f'%(recall_score(y_train, z.predict(X_train)))
print 'Precision: %0.4f'%(precision_score(y_test, z.predict(X_test)))
print 'Recall: %0.4f'%(recall_score(y_test, z.predict(X_test)))
print("--- %s seconds ---" % (time.time() - start_time))

Mean accuracy from the cross-validation: 0.934 
and the full vector: 0.935,0.957,0.935,0.913,0.935,0.956,0.956,0.911,0.911,0.933

Accuracy on the train set: 0.998
Accuracy on the test set: 0.912
Precision: 0.9940
Recall: 1.0000
Precision: 0.9111
Recall: 0.8723
--- 0.615000009537 seconds ---


In [69]:
from sklearn.model_selection import GridSearchCV
#parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5], 'C': [1,2,100,1000,10000]}]
parameters = [{'kernel': ['rbf'], 'gamma': np.linspace(1e-2,1e-6,10), 'C': np.linspace(1,10,5)}]
svr = svm.SVC()
x = GridSearchCV(svr, parameters, cv=5, scoring='accuracy')
x.fit(X_train,y_train)
v = x.best_params_
print v
d = svm.SVC(kernel='rbf', C=v['C'], gamma=v['gamma'])
scores = cross_val_score(d, X_train, y_train, cv=10, scoring='accuracy')
print 'Mean accuracy from the cross-validation: %.3f \nand the full vector: %s\n' %(np.mean(scores), ','.join(map("{:.3f}".format, scores)))
d.fit(X_train,y_train)
print 'Accuracy on the train set: %.3f' %accuracy_score(y_train, d.predict(X_train))
print 'Accuracy on the test set: %.3f' %accuracy_score(y_test, d.predict(X_test))
print 'Precision: %0.4f'%(precision_score(y_train, d.predict(X_train)))
print 'Recall: %0.4f'%(recall_score(y_train, d.predict(X_train)))
print 'Precision: %0.4f'%(precision_score(y_test, d.predict(X_test)))
print 'Recall: %0.4f'%(recall_score(y_test, d.predict(X_test)))
print("--- %s seconds ---" % (time.time() - start_time))

# print 'Accuracy on the train set: %.3f' %accuracy_score(y_train, clf.predict(X_train))

# print 'Accuracy on the test set: %.3f' %accuracy_score(y_test, clf.predict(X_test))


{'kernel': 'rbf', 'C': 3.25, 'gamma': 9.9999999999999995e-07}
Mean accuracy from the cross-validation: 0.918 
and the full vector: 0.915,0.936,0.978,0.911,0.889,0.956,0.911,0.889,0.889,0.911

Accuracy on the train set: 0.925
Accuracy on the test set: 0.930
Precision: 0.9539
Recall: 0.8430
Precision: 1.0000
Recall: 0.8000
--- 1603.648 seconds ---


Improvements 
More Info
Different regions
More data