# Breast Tumor Classification using different Machine Learning Classifiers

In [95]:
import numpy as np
import pandas as pd
import seaborn as sn
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder

In [96]:
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier

In [97]:
data = pd.read_csv('data 2.csv')

In [98]:
X = data[['radius_mean','texture_mean','perimeter_mean','area_mean','smoothness_mean','compactness_mean','concavity_mean','concave points_mean','symmetry_mean','fractal_dimension_mean','radius_se','texture_se','perimeter_se','area_se','smoothness_se','compactness_se','concavity_se','concave points_se','symmetry_se','fractal_dimension_se','radius_worst','texture_worst','perimeter_worst','area_worst','smoothness_worst','compactness_worst','concavity_worst','concave points_worst','symmetry_worst','fractal_dimension_worst']]
Y = data['diagnosis']

# KNN-Classifier

### Average Accuracy after 100 Iterations: 0.94

In [99]:
# Convert character labels to numerical labels
le = LabelEncoder()
Y = le.fit_transform(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,random_state = 0)

In [100]:
knn = KNeighborsClassifier(n_neighbors = 5)

### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [101]:
knn.fit(X_train,Y_train)
Y_pred = knn.predict(X_test)
print('Accuracy of K-NN Classifier on Test Set: ', knn.score(X_test,Y_test))

Accuracy of K-NN Classifier on Test Set:  0.9370629370629371


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


In [102]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[85  5]
 [ 4 49]]
              precision    recall  f1-score   support

           0       0.96      0.94      0.95        90
           1       0.91      0.92      0.92        53

    accuracy                           0.94       143
   macro avg       0.93      0.93      0.93       143
weighted avg       0.94      0.94      0.94       143



In [103]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9344863731656184


### Cross Validation of KNN Classifier

In [104]:
scores = cross_val_score(knn, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.9122807  0.87719298 0.89473684 0.96491228 0.94736842 0.92982456
 0.96491228 0.92982456 0.9122807  0.96428571]
Mean Accuracy: 0.93 (+/- 0.06)


  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)
  mode, _ = stats.mode(_y[neigh_ind, k], axis=1)


# Logistic Regression

### Average Accuracy after 100 Iterations: 0.95

In [105]:
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

In [106]:
# Scaling data in preprocessing 
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [107]:
pipe = make_pipeline(StandardScaler(), LogisticRegression())

In [108]:
pipe.fit(X_train, Y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression())])

### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [109]:
Y_pred = pipe.predict(X_test)
print('Accuracy of Logistic Regression on Test Set: ', pipe.score(X_test, Y_test))

Accuracy of Logistic Regression on Test Set:  0.958041958041958


In [110]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[87  3]
 [ 3 50]]
              precision    recall  f1-score   support

           0       0.97      0.97      0.97        90
           1       0.94      0.94      0.94        53

    accuracy                           0.96       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



In [111]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9550314465408805


### Cross Validation of Logistic Regression Model

In [112]:
scores = cross_val_score(pipe, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.98245614 0.98245614 0.98245614 0.96491228 0.98245614 0.98245614
 0.94736842 1.         1.         0.98214286]
Mean Accuracy: 0.98 (+/- 0.03)


# Support Vector Machine

### Average Accuracy after 100 Iterations: 0.96

In [113]:
from sklearn.svm import SVC
#from adspy_shared_utilities import (plot_class_regions_for_classifier_subplot)

In [114]:
svm_model = SVC(kernel='linear')
svm_model.fit(X_train, Y_train)

SVC(kernel='linear')

### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [115]:
Y_pred = svm_model.predict(X_test)
print('Accuracy of SVM on Test Set: ', svm_model.score(X_test,Y_test))

Accuracy of SVM on Test Set:  0.958041958041958


In [116]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[85  5]
 [ 1 52]]
              precision    recall  f1-score   support

           0       0.99      0.94      0.97        90
           1       0.91      0.98      0.95        53

    accuracy                           0.96       143
   macro avg       0.95      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



In [117]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9627882599580713


### Cross Validation of Support Vector Machine

In [118]:
scores = cross_val_score(svm_model, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.98245614 0.92982456 0.92982456 0.94736842 0.96491228 0.98245614
 0.92982456 0.94736842 0.96491228 0.96428571]
Mean Accuracy: 0.95 (+/- 0.04)


# Decision Tree

### Average Accuracy after 100 Iterations: 0.91

In [119]:
from sklearn.tree import DecisionTreeClassifier

In [120]:
dt_model = DecisionTreeClassifier()
dt_model.fit(X_train, Y_train)
Y_pred = dt_model.predict(X_test)
print('Accuracy of Decision Tree on Test Set: ', dt_model.score(X_test,Y_test))

Accuracy of Decision Tree on Test Set:  0.8951048951048951


### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [121]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[76 14]
 [ 1 52]]
              precision    recall  f1-score   support

           0       0.99      0.84      0.91        90
           1       0.79      0.98      0.87        53

    accuracy                           0.90       143
   macro avg       0.89      0.91      0.89       143
weighted avg       0.91      0.90      0.90       143



In [122]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9127882599580713


### Cross Validation of Decision Tree

In [123]:
scores = cross_val_score(dt_model, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.96491228 0.89473684 0.92982456 0.87719298 0.94736842 0.87719298
 0.87719298 0.94736842 0.92982456 0.94642857]
Mean Accuracy: 0.92 (+/- 0.06)


# Gradient Boosted Decision Tree

### Average Accuracy after 100 Iterations: 0.96

In [124]:
from sklearn.ensemble import GradientBoostingClassifier

In [125]:
clf_gb = GradientBoostingClassifier(n_estimators=100, learning_rate=0.1, max_depth=3, random_state=42)
clf_gb.fit(X_train, Y_train)

GradientBoostingClassifier(random_state=42)

### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [126]:
Y_pred = clf_gb.predict(X_test)
print('Accuracy of Gradient Boosted Decision Tree: ', clf_gb.score(X_test,Y_test))

Accuracy of Gradient Boosted Decision Tree:  0.965034965034965


In [127]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[87  3]
 [ 2 51]]
              precision    recall  f1-score   support

           0       0.98      0.97      0.97        90
           1       0.94      0.96      0.95        53

    accuracy                           0.97       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.97      0.97      0.97       143



In [128]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9644654088050315


### Cross Validation of Gradient Boosted Decision Tree

In [129]:
scores = cross_val_score(clf_gb, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.98245614 0.89473684 0.94736842 0.94736842 0.98245614 0.96491228
 0.96491228 0.98245614 0.96491228 1.        ]
Mean Accuracy: 0.96 (+/- 0.06)


# Naive Bayes Classifier

### Average Accuracy after 100 Iterations: 0.91

In [130]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()

In [131]:
# Fit the model on the training data
clf.fit(X_train, Y_train)
# Make predictions on the test data
Y_pred = clf.predict(X_test)

### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [132]:
print('Accuracy of Naive Bayes Classifier on Test Set: ', clf.score(X_test,Y_test))
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

Accuracy of Naive Bayes Classifier on Test Set:  0.9020979020979021
[[88  2]
 [12 41]]
              precision    recall  f1-score   support

           0       0.88      0.98      0.93        90
           1       0.95      0.77      0.85        53

    accuracy                           0.90       143
   macro avg       0.92      0.88      0.89       143
weighted avg       0.91      0.90      0.90       143



In [133]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.8756813417190775


### Cross Validation of Naive Bayes Classifier

In [134]:
scores = cross_val_score(clf, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.85964912 0.87719298 0.87719298 0.92982456 0.87719298 0.87719298
 0.9122807  0.92982456 0.89473684 0.89285714]
Mean Accuracy: 0.89 (+/- 0.05)


# Random Forest Algorithm

### Average Accuracy after 100 Iterations: 0.97

In [135]:
from sklearn.ensemble import RandomForestClassifier
# Create an instance of the RandomForestClassifier class
rfc = RandomForestClassifier(n_estimators=100, random_state=42)
# Fit the model on the training data
rfc.fit(X_train, Y_train)
Y_pred = rfc.predict(X_test)

### Accuracy, Recall, Precision, F1 Score & ROC-AUC Metrics

In [136]:
print('Accuracy of Random Forest Algorithm on Test Set: ', rfc.score(X_test,Y_test))

Accuracy of Random Forest Algorithm on Test Set:  0.972027972027972


In [137]:
print(confusion_matrix(Y_test, Y_pred))
print(classification_report(Y_test, Y_pred))

[[87  3]
 [ 1 52]]
              precision    recall  f1-score   support

           0       0.99      0.97      0.98        90
           1       0.95      0.98      0.96        53

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143



In [138]:
# calculate the ROC AUC score
roc_auc = roc_auc_score(Y_test, Y_pred, multi_class='ovr')
print("ROC AUC Score:", roc_auc)

ROC AUC Score: 0.9738993710691823


### Cross Validation of Random Forest Algorithm

In [139]:
scores = cross_val_score(rfc, X, Y, cv=10)
# Print the accuracy scores for each fold
print("Accuracy scores:", scores)
# Print the mean and standard deviation of the accuracy scores
print("Mean Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))

Accuracy scores: [0.98245614 0.89473684 0.92982456 0.94736842 1.         0.98245614
 0.94736842 0.98245614 0.96491228 1.        ]
Mean Accuracy: 0.96 (+/- 0.06)


# Neural Network

### Model Testing reaches 0.9-1.0 Accuracy Levels after 100 Epochs and more

In [140]:
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder, OneHotEncoder

In [141]:
# Convert labels to integers
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(Y)

In [142]:
# Convert integers to one-hot encoded vectors
onehot_encoder = OneHotEncoder(sparse=False, categories='auto')
y_onehot = onehot_encoder.fit_transform(y_encoded.reshape(-1, 1))

In [143]:
# Split data into training and testing sets
nX_train, nX_test, nY_train, nY_test = train_test_split(X, y_onehot, test_size=0.2)

In [144]:
# Define neural network architecture
model = keras.Sequential([
    keras.layers.Dense(10, activation='relu', input_shape=(nX_train.shape[1],)),
    keras.layers.Dense(nY_train.shape[1], activation='softmax')
])

In [145]:
# Compile the model
model.compile(optimizer='adam', loss='categorical_crossentropy', metrics=['accuracy'])

### Model Training & Evaluation Metrics

In [146]:
# Train the model
model.fit(nX_train, nY_train, epochs=100, batch_size=32, validation_data=(nX_test, nY_test))

Epoch 1/100
Epoch 2/100
Epoch 3/100
Epoch 4/100
Epoch 5/100
Epoch 6/100
Epoch 7/100
Epoch 8/100
Epoch 9/100
Epoch 10/100
Epoch 11/100
Epoch 12/100
Epoch 13/100
Epoch 14/100
Epoch 15/100
Epoch 16/100
Epoch 17/100
Epoch 18/100
Epoch 19/100
Epoch 20/100
Epoch 21/100
Epoch 22/100
Epoch 23/100
Epoch 24/100
Epoch 25/100
Epoch 26/100
Epoch 27/100
Epoch 28/100
Epoch 29/100
Epoch 30/100
Epoch 31/100
Epoch 32/100
Epoch 33/100
Epoch 34/100
Epoch 35/100
Epoch 36/100
Epoch 37/100
Epoch 38/100
Epoch 39/100
Epoch 40/100
Epoch 41/100
Epoch 42/100
Epoch 43/100
Epoch 44/100
Epoch 45/100
Epoch 46/100
Epoch 47/100
Epoch 48/100
Epoch 49/100
Epoch 50/100
Epoch 51/100
Epoch 52/100
Epoch 53/100
Epoch 54/100
Epoch 55/100
Epoch 56/100
Epoch 57/100
Epoch 58/100
Epoch 59/100
Epoch 60/100
Epoch 61/100
Epoch 62/100
Epoch 63/100
Epoch 64/100
Epoch 65/100
Epoch 66/100
Epoch 67/100
Epoch 68/100
Epoch 69/100
Epoch 70/100
Epoch 71/100
Epoch 72/100
Epoch 73/100
Epoch 74/100
Epoch 75/100
Epoch 76/100
Epoch 77/100
Epoch 78

<keras.callbacks.History at 0x1e040b26790>

In [147]:
Y_pred = model.predict(X_test)



In [149]:
# Evaluate the model on the test data
test_loss, test_acc = model.evaluate(X_test, Y_pred)
# Print the test accuracy
print('Test Accuracy: ', test_acc)
print('Test Loss: ', test_loss)

Test Accuracy:  1.0
Test Loss:  0.11529054492712021
