In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import seaborn as sns

print('Setup completed!')

Setup completed!


In [2]:
DATASET = '../data/'

In [5]:
df = pd.read_csv(DATASET + 'breast-cancer-encoded.csv', index_col=[0])

In [6]:
df.head()

Unnamed: 0,class,age,menopause,tumor-size,inv-nodes,node-caps,def-malig,breast,breast-quad,irradiat
0,no-recurrence-events,1.0,2.0,5.0,0.0,0.0,2.0,0.0,1.0,0.0
1,no-recurrence-events,2.0,2.0,3.0,0.0,0.0,1.0,1.0,4.0,0.0
2,no-recurrence-events,2.0,2.0,3.0,0.0,0.0,1.0,0.0,1.0,0.0
3,no-recurrence-events,4.0,0.0,2.0,0.0,0.0,1.0,1.0,2.0,0.0
4,no-recurrence-events,2.0,2.0,0.0,0.0,0.0,1.0,1.0,3.0,0.0


In [9]:
x = df.iloc[:, 1:]
y = df['class']

In [10]:
from sklearn.model_selection import train_test_split

x_train, x_test, y_train, y_test = train_test_split(
    x, y, stratify=y, test_size=0.2, random_state=42
)

In [11]:
x_train.shape, x_test.shape

((220, 9), (56, 9))

## Build Classification models

In [12]:
from sklearn.metrics import accuracy_score
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import f1_score

### K-Nearest Neighbors

In [13]:
from sklearn.neighbors import KNeighborsClassifier

knn = KNeighborsClassifier(3)
knn.fit(x_train, y_train)

# make predictions
y_train_pred = knn.predict(x_train)
y_test_pred = knn.predict(x_test)

# training set performance
knn_train_accuracy = accuracy_score(y_train, y_train_pred)
knn_train_mcc = matthews_corrcoef(y_train, y_train_pred)
knn_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
knn_test_accuracy = accuracy_score(y_test, y_test_pred)
knn_test_mcc = matthews_corrcoef(y_test, y_test_pred)
knn_test_f1 = f1_score(y_test, y_test_pred, average='weighted')


In [14]:
print('Model performance for training set')
print(f'Accuracy: {knn_train_accuracy}')
print(f'MCC: {knn_train_mcc}')
print(f'F1 score: {knn_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {knn_test_accuracy}')
print(f'MCC: {knn_test_mcc}')
print(f'F1 score: {knn_test_f1}')

Model performance for training set
Accuracy: 0.8090909090909091
MCC: 0.513205391695839
F1 score: 0.7989883592017737
--------------------------------------
Model performance for test set
Accuracy: 0.7321428571428571
MCC: 0.2613838422748508
F1 score: 0.7054187192118225


#### KNN did not overfit the data

## Support Vector Machine (SVM)

In [32]:
from sklearn.svm import SVC

svm = SVC(gamma=2, C=1)
svm.fit(x_train, y_train)

SVC(C=1, gamma=2)

In [33]:
# make predictions
y_train_pred = svm.predict(x_train)
y_test_pred = svm.predict(x_test)

# training set performance
svm_train_accuracy = accuracy_score(y_train, y_train_pred)
svm_train_mcc = matthews_corrcoef(y_train, y_train_pred)
svm_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
svm_test_accuracy = accuracy_score(y_test, y_test_pred)
svm_test_mcc = matthews_corrcoef(y_test, y_test_pred)
svm_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

In [34]:
print('Model performance for training set')
print(f'Accuracy: {svm_train_accuracy}')
print(f'MCC: {svm_train_mcc}')
print(f'F1 score: {svm_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {svm_test_accuracy}')
print(f'MCC: {svm_test_mcc}')
print(f'F1 score: {svm_test_f1}')

Model performance for training set
Accuracy: 0.9772727272727273
MCC: 0.9451890389839106
F1 score: 0.9771132115601437
--------------------------------------
Model performance for test set
Accuracy: 0.7321428571428571
MCC: 0.21320071635561041
F1 score: 0.6351172047766476


## Neural Network

In [35]:
from sklearn.neural_network import MLPClassifier

mlp = MLPClassifier(alpha=1, max_iter=1000)
mlp.fit(x_train, y_train)

MLPClassifier(alpha=1, max_iter=1000)

In [36]:
# make predictions
y_train_pred = mlp.predict(x_train)
y_test_pred = mlp.predict(x_test)

# training set performance
mlp_train_accuracy = accuracy_score(y_train, y_train_pred)
mlp_train_mcc = matthews_corrcoef(y_train, y_train_pred)
mlp_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
mlp_test_accuracy = accuracy_score(y_test, y_test_pred)
mlp_test_mcc = matthews_corrcoef(y_test, y_test_pred)
mlp_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

In [37]:
print('Model performance for training set')
print(f'Accuracy: {mlp_train_accuracy}')
print(f'MCC: {mlp_train_mcc}')
print(f'F1 score: {mlp_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {mlp_test_accuracy}')
print(f'MCC: {mlp_test_mcc}')
print(f'F1 score: {mlp_test_f1}')

Model performance for training set
Accuracy: 0.7681818181818182
MCC: 0.4020047036006941
F1 score: 0.7550353614183399
--------------------------------------
Model performance for test set
Accuracy: 0.75
MCC: 0.3440511766019877
F1 score: 0.7380952380952381


# Build stacked model

In [38]:
from sklearn.ensemble import StackingClassifier
from sklearn.linear_model import LogisticRegression

estimators = [
    ('knn', knn),
    ('svm', svm),
    ('mlp', mlp),
]

In [39]:
# build stack model
stack_model = StackingClassifier(
    estimators=estimators, final_estimator=LogisticRegression()
)

# train stack model
stack_model.fit(x_train, y_train)

# make prediction
y_train_pred = stack_model.predict(x_train)
y_test_pred = stack_model.predict(x_test)

# training set model performance
stack_model_train_accuracy = accuracy_score(y_train, y_train_pred)
stack_model_train_mcc = matthews_corrcoef(y_train, y_train_pred)
stack_model_train_f1 = f1_score(y_train, y_train_pred, average='weighted')

# test set performance
stack_model_test_accuracy = accuracy_score(y_test, y_test_pred)
stack_model_test_mcc = matthews_corrcoef(y_test, y_test_pred)
stack_model_test_f1 = f1_score(y_test, y_test_pred, average='weighted')

In [40]:
print('Model performance for training set')
print(f'Accuracy: {stack_model_train_accuracy}')
print(f'MCC: {stack_model_train_mcc}')
print(f'F1 score: {stack_model_train_f1}')
print('--------------------------------------')
print('Model performance for test set')
print(f'Accuracy: {stack_model_test_accuracy}')
print(f'MCC: {stack_model_test_mcc}')
print(f'F1 score: {stack_model_test_f1}')

Model performance for training set
Accuracy: 0.8818181818181818
MCC: 0.7168069159098094
F1 score: 0.8716179653679653
--------------------------------------
Model performance for test set
Accuracy: 0.7321428571428571
MCC: 0.21320071635561041
F1 score: 0.6351172047766476


## Results

In [45]:
acc_train_list = {'knn':knn_train_accuracy,
                  'svm':svm_train_accuracy,
                  'mlp':mlp_train_accuracy,
                  'stack':stack_model_train_accuracy}

mcc_train_list = {'knn':knn_train_mcc,
                  'svm':svm_train_mcc,
                  'mlp':mlp_train_mcc,
                  'stack':stack_model_train_mcc}

f1_train_list = {'knn':knn_train_f1,
                  'svm':svm_train_f1,
                  'mlp':mlp_train_f1,
                  'stack':stack_model_train_f1}

In [46]:
mcc_train_list

{'knn': 0.513205391695839,
 'svm': 0.9451890389839106,
 'mlp': 0.4020047036006941,
 'stack': 0.7168069159098094}

In [47]:
acc_df = pd.DataFrame.from_dict(acc_train_list, orient='index', columns=['Accuracy'])
mcc_df = pd.DataFrame.from_dict(mcc_train_list, orient='index', columns=['MCC'])
f1_df = pd.DataFrame.from_dict(f1_train_list, orient='index', columns=['F1'])

model_df = pd.concat([acc_df, mcc_df, f1_df], axis=1)

In [48]:
model_df

Unnamed: 0,Accuracy,MCC,F1
knn,0.809091,0.513205,0.798988
svm,0.977273,0.945189,0.977113
mlp,0.768182,0.402005,0.755035
stack,0.881818,0.716807,0.871618


## Conclusion: SVM is the best performer among the models for this dataset

In [49]:
model_df.to_csv(DATASET + 'model-performance.csv')