In [1]:
# Libraries

#---Pandas
import pandas as pd
from pandas.tools.plotting import scatter_matrix

#---numpy
import numpy as np

#---matplotlib
from matplotlib import pyplot as plt

#---machine learning
from sklearn import cross_validation
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

In [2]:
data = pd.read_csv("data/data.csv")

print "----------All The Columns---------"
print data.columns
print "\n--------Dimension of data---------"
print data.shape
print "\n----------Types of Tumor----------"
print data.groupby('diagnosis').size()

----------All The Columns---------
Index([u'id', u'diagnosis', u'radius_mean', u'texture_mean', u'perimeter_mean',
       u'area_mean', u'smoothness_mean', u'compactness_mean',
       u'concavity_mean', u'concave points_mean', u'symmetry_mean',
       u'fractal_dimension_mean', u'radius_se', u'texture_se', u'perimeter_se',
       u'area_se', u'smoothness_se', u'compactness_se', u'concavity_se',
       u'concave points_se', u'symmetry_se', u'fractal_dimension_se',
       u'radius_worst', u'texture_worst', u'perimeter_worst', u'area_worst',
       u'smoothness_worst', u'compactness_worst', u'concavity_worst',
       u'concave points_worst', u'symmetry_worst', u'fractal_dimension_worst',
       u'Unnamed: 32'],
      dtype='object')

--------Dimension of data---------
(569, 33)

----------Types of Tumor----------
diagnosis
B    357
M    212
dtype: int64


In [3]:
#dropping the ID because that doesn't help decide whether a tumor is malignant or benign
data.drop('id',axis = 1, inplace = True)

In [4]:
array = data.values

# All the data minus the diagnosis
X = array[:,1:-2]

# Diagnosis
Y = array[:,0]

#making 20 percent our test data
test_size = .2
seed = 7

X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(\
                                    X,Y,test_size = test_size,random_state = seed)

In [5]:
# 20 fold cross validation for accuracy
n_folds = 20
n_instances = len(X_train)
scoring = 'accuracy'

In [6]:
models = []

#append tuples so we can distinguish by name later
models.append(('LR', LogisticRegression()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))

In [7]:
results = []
names = []

for name, model in models:
    kfold = cross_validation.KFold(n = n_instances, n_folds = n_folds, random_state = seed)
    cv_results = cross_validation.cross_val_score(model, X_train, Y_train, cv = kfold, scoring = scoring)
    results.append(cv_results)
    names.append(name)
    model_accuracy = "%s: %f +- %f" % (name,cv_results.mean(),cv_results.std())
    print model_accuracy

LR: 0.947134 +- 0.059090
KNN: 0.934190 +- 0.055028
CART: 0.931917 +- 0.054759
NB: 0.936067 +- 0.051857


In [None]:
"""

Logistic Regression is the most accurate.


"""

In [9]:
LR = LogisticRegression()
LR.fit(X_train,Y_train)
predictions = LR.predict(X_test)

print "Accuracy: ", accuracy_score(Y_test, predictions)
print "\n--------------------------------------------------------"
print "Confusion Matrix"
print confusion_matrix(Y_test, predictions)
print "\n--------------------------------------------------------"
print "Classification Report"
print classification_report(Y_test, predictions, digits = 4)

"""
"""
print

Accuracy:  0.947368421053

--------------------------------------------------------
Confusion Matrix
[[74  0]
 [ 6 34]]

--------------------------------------------------------
Classification Report
             precision    recall  f1-score   support

          B     0.9250    1.0000    0.9610        74
          M     1.0000    0.8500    0.9189        40

avg / total     0.9513    0.9474    0.9463       114




In [None]:
"""
Telling us that our test is 94.63% accurate.

The confusion matrix tells us that our model had 6 errors in saying it was benign when it was malignant.
It made no error in saying there was a benign tumor when it was malignant. (Which is great)
"""