# Cancer Diagnosis Using Machine Learning


In [5]:
# import libraries
import numpy as np
import pandas as pd

from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.utils import resample
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import RandomForestClassifier

# Read Data:
Read in the UCI cancer dataset. 

In [6]:
# Load the cancer dataset: 9 numerical feature columns, with the binary label
# in the last column (1 = malignant cancer, 0 = benign tumor).
cancerData = pd.read_csv(filepath_or_buffer='data/Cancer_small.csv')

# Preview the first rows as the cell's rich output.
cancerData.head(n=5)

Unnamed: 0,Clump_Thickness,Uniformity_of_Cell_Size,Uniformity_of_Cell_Shape,Marginal_Adhesion,Single_Epithelial_Cell_Size,Bare_Nuclei,Bland_Chromatin,Normal_Nucleoli,Mitoses,Malignant_Cancer
0,5,1,1,1,2,1,3,1,1,0
1,5,4,4,5,7,10,3,2,1,0
2,3,1,1,1,2,2,3,1,1,0
3,6,8,8,1,3,4,3,7,1,0
4,4,1,1,3,2,1,3,1,1,0


# Split data:
Split the dataset into testing and training sets with the following parameters: test_size=0.3, random_state=2.

In [3]:
# Name of the target column; every remaining column is treated as a feature.
label = 'Malignant_Cancer'

# Separate the target vector from the feature matrix.
y = cancerData[label]
X = cancerData.drop(columns=[label])

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=2,
)

# Train Decision Tree Classifier

In [4]:
# Baseline model: a single decision tree fitted on the whole training split
# (fit() returns the estimator itself, so construction and fitting can be chained).
my_decisiontree = DecisionTreeClassifier(random_state=2).fit(X_train, y_train)

# Score the tree on the held-out test split.
y_predict = my_decisiontree.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_predict)

print('DTC Score:',score)


DTC Score: 0.866666666667


# Bagging:
Perform bagging with 19 Decision Trees as the "base" tree classifiers: fit each tree on its own bootstrap sample of the training set, then aggregate the trees' test-set predictions by majority vote.
#### 1) Bootstrapping with replacement
#### 2) Voting


In [5]:
# Hand-rolled bagging: fit several decision trees, each on its own bootstrap
# sample of the training split, then combine their test-set predictions by
# majority vote.

# Number of base trees. Kept odd so a two-class majority vote cannot tie.
n_estimators = 19

# Each bootstrap sample holds 80% as many rows as the training split.
bootstrap_size = int(.8*X_train.shape[0])

print('X', X.shape)
print('X_train', X_train.shape)
print('Bootstrap Size:', bootstrap_size)

# dictionary to hold predictions of all classifiers, keyed by tree index
predictionsForX_Test = {}

# fit each base tree on a bootstrap sample and record its predictions on X_test
for i in range(n_estimators):
    # Resample features and labels in ONE call so the same row indices are
    # drawn for both; the per-tree seed makes every sample different but
    # reproducible.
    sampleX, sampleY = resample(
        X_train,
        y_train,
        n_samples=bootstrap_size,
        random_state=i,
        replace=True,
    )
    Base_DecisionTree = DecisionTreeClassifier(random_state=2)
    Base_DecisionTree.fit(sampleX, sampleY)

    # add np array to dict
    predictionsForX_Test[i] = Base_DecisionTree.predict(X_test)

# Stack the per-tree predictions into one (n_estimators, n_test_samples) array
# so the votes can be counted column-wise instead of with nested Python loops.
votes = np.stack([predictionsForX_Test[i] for i in range(n_estimators)])
zero_votes = (votes == 0).sum(axis=0)
one_votes = n_estimators - zero_votes

# Majority vote per test sample; as before, anything other than a strict
# majority for class 0 is labelled 1. Float dtype matches the previous array.
y_voted_pred = np.where(zero_votes > one_votes, 0.0, 1.0)

score = accuracy_score(y_test, y_voted_pred)

print('\nBagging DTC Score:',score)



X (150, 9)
X_train (105, 9)
Bootstrap Size: 84

Bagging DTC Score: 0.911111111111


# AdaBoost:
Train an AdaBoost classifier with 19 estimators to compare with the other methods.

In [6]:
# Boosted ensemble of 19 estimators, seeded for reproducibility.
my_AdaBoost = AdaBoostClassifier(
    n_estimators=19,
    random_state=2,
)
my_AdaBoost.fit(X_train, y_train)

# Score the ensemble on the held-out test split.
y_predict = my_AdaBoost.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_predict)

print('AdaBoost Score:',score)


AdaBoost Score: 0.933333333333


# Random Forest:
Train a Random Forest classifier with 19 trees to compare with the other methods.

In [7]:
# Random forest of 19 bootstrapped trees, seeded for reproducibility
# (fit() returns the estimator itself, so construction and fitting are chained).
my_RandomForest = RandomForestClassifier(
    n_estimators=19,
    bootstrap=True,
    random_state=2,
).fit(X_train, y_train)

# Score the forest on the held-out test split.
y_predict = my_RandomForest.predict(X_test)
score = accuracy_score(y_true=y_test, y_pred=y_predict)

print('Random Forest Classifier Score:',score)



Random Forest Classifier Score: 0.955555555556
