In [79]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

print('Setup completed!')

Setup completed!


In [80]:
DATASET = '../data/'

In [81]:
df = pd.read_csv(DATASET + 'breast-cancer-encoded.csv', index_col=[0])

In [82]:
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,def-malig,breast,breast-quad,irradiat
0,no-recurrence-events,1,2,5,0,0,2,0,1,0
1,no-recurrence-events,2,2,3,0,0,1,1,4,0
2,no-recurrence-events,2,2,3,0,0,1,0,1,0
3,no-recurrence-events,4,0,2,0,0,1,1,2,0
4,no-recurrence-events,2,2,0,0,0,1,1,3,0


In [83]:
x = df.iloc[:, 1:]
y = df['class']

In [84]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)

In [85]:
x_train.shape, x_test.shape

((220, 9), (56, 9))

## Build Classification models

In [86]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

### K-Nearest Neighbors

In [87]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(3)
knn.fit(x_train, y_train)

# make predictions
y_train_pred = knn.predict(x_train)
y_test_pred = knn.predict(x_test)

# training set performance
knn_train_accuracy = accuracy_score(y_train, y_train_pred)
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred)
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
knn_test_accuracy = accuracy_score(y_test, y_test_pred)
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred)
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted')


In [88]:
print('Model performance for training set')
print(f'Accuracy: {knn_train_accuracy}')
print(f'MCC: {knn_train_mcc}')
print(f'F1 score: {knn_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {knn_test_accuracy}')
print(f'MCC: {knn_test_mcc}')
print(f'F1 score: {knn_test_f1}')

Model performance for training set
Accuracy: 0.8090909090909091
MCC: 0.513205391695839
F1 score: 0.7989883592017737
--------------------------------------
Model performance for test set
Accuracy: 0.7321428571428571
MCC: 0.2613838422748508
F1 score: 0.7054187192118225


#### KNN did not overfit the data

## Support Vector Machine (SVM)

In [89]:
from sklearn.svm import SVC

svm = SVC(gamma=2, C=1)
svm.fit(x_train, y_train)

SVC(C=1, gamma=2)

In [90]:
# make predictions
y_train_pred = svm.predict(x_train)
y_test_pred = svm.predict(x_test)

# training set performance
svm_train_accuracy = accuracy_score(y_train, y_train_pred)
svm_train_mcc = matthews_corrcoef(y_train, y_train_pred)
svm_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
svm_test_accuracy = accuracy_score(y_test, y_test_pred)
svm_test_mcc = matthews_corrcoef(y_test, y_test_pred)
svm_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

In [91]:
print('Model performance for training set')
print(f'Accuracy: {svm_train_accuracy}')
print(f'MCC: {svm_train_mcc}')
print(f'F1 score: {svm_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {svm_test_accuracy}')
print(f'MCC: {svm_test_mcc}')
print(f'F1 score: {svm_test_f1}')

Model performance for training set
Accuracy: 0.9772727272727273
MCC: 0.9451890389839106
F1 score: 0.9771132115601437
--------------------------------------
Model performance for test set
Accuracy: 0.7321428571428571
MCC: 0.21320071635561041
F1 score: 0.6351172047766476


## Neural Network

In [92]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha=1, max_iter=1000)
mlp.fit(x_train, y_train)

MLPClassifier(alpha=1, max_iter=1000)

In [93]:
# make predictions
y_train_pred = mlp.predict(x_train)
y_test_pred = mlp.predict(x_test)

# training set performance
mlp_train_accuracy = accuracy_score(y_train, y_train_pred)
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred)
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
mlp_test_accuracy = accuracy_score(y_test, y_test_pred)
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred)
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

In [94]:
print('Model performance for training set')
print(f'Accuracy: {mlp_train_accuracy}')
print(f'MCC: {mlp_train_mcc}')
print(f'F1 score: {mlp_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {mlp_test_accuracy}')
print(f'MCC: {mlp_test_mcc}')
print(f'F1 score: {mlp_test_f1}')

Model performance for training set
Accuracy: 0.7727272727272727
MCC: 0.4125812950947179
F1 score: 0.7589531680440773
--------------------------------------
Model performance for test set
Accuracy: 0.7142857142857143
MCC: 0.27386127875258304
F1 score: 0.708246225319396


# Build stacked model

In [95]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('knn', knn),
    ('svm', svm),
    ('mlp', mlp),
]

In [96]:
# build stack model
stack_model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

# train stack model
stack_model.fit(x_train, y_train)

# make prediction
y_train_pred = stack_model.predict(x_train)
y_test_pred = stack_model.predict(x_test)

# training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred)
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred)
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred)
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred)
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

In [97]:
print('Model performance for training set')
print(f'Accuracy: {stack_model_train_accuracy}')
print(f'MCC: {stack_model_train_mcc}')
print(f'F1 score: {stack_model_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {stack_model_test_accuracy}')
print(f'MCC: {stack_model_test_mcc}')
print(f'F1 score: {stack_model_test_f1}')

Model performance for training set
Accuracy: 0.8772727272727273
MCC: 0.705610890403457
F1 score: 0.8661034882837769
--------------------------------------
Model performance for test set
Accuracy: 0.7321428571428571
MCC: 0.21320071635561041
F1 score: 0.6351172047766476


## Results

In [98]:
acc_train_list = {'knn':knn_train_accuracy,
                  'svm':svm_train_accuracy,
                  'mlp':mlp_train_accuracy,
                  'stack':stack_model_train_accuracy}

mcc_train_list = {'knn':knn_train_mcc,
                  'svm':svm_train_mcc,
                  'mlp':mlp_train_mcc,
                  'stack':stack_model_train_mcc}

f1_train_list = {'knn':knn_train_f1,
                  'svm':svm_train_f1,
                  'mlp':mlp_train_f1,
                  'stack':stack_model_train_f1}

In [99]:
mcc_train_list

{'knn': 0.513205391695839,
 'svm': 0.9451890389839106,
 'mlp': 0.4125812950947179,
 'stack': 0.705610890403457}

In [100]:
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])

model_df = pd.concat([acc_df, mcc_df, f1_df], axis=1)

In [101]:
model_df

Unnamed: 0,Accuracy,MCC,F1
knn,0.809091,0.513205,0.798988
svm,0.977273,0.945189,0.977113
mlp,0.772727,0.412581,0.758953
stack,0.877273,0.705611,0.866103


## Conclusion: SVM is the best performer among the models for this dataset

In [102]:
model_df.to_csv(DATASET + 'model-performance.csv')

## Saving SVM model to a file

In [103]:
import pickle

In [111]:
with open('../svm_model', 'wb') as f:
    mp = pickle.dump(svm, f)