# Initialisation

In [1]:
# Importing necessary modules
from sklearn.model_selection import train_test_split
from sklearn.metrics         import accuracy_score
from sklearn.neighbors       import KNeighborsClassifier
from sklearn.svm             import SVC
from sklearn.tree            import DecisionTreeClassifier
from matplotlib import pyplot as plt
import pandas as pd
import numpy  as np
plt.style.use('cyberpunk.mplstyle')

# Defining constants
DATASET_FP = 'breast-cancer-wisconsin.csv'
ATTRS      = ['Clump Thickness', 'Uniformity of Cell Size', 'Uniformity of Cell Shape', 'Marginal Adhesion',
              'Single Epithelial Cell Size', 'Bare Nuclei', 'Bland Chromatin', 'Normal Nucleoli', 'Mitoses']
LABEL      = 'Class'
TEST_SIZE  = 0.2
KS         = [1,101] # First element is included, second is not

# Loading dataset
data = pd.read_csv(DATASET_FP)
# Print info about the dataset
print(data.info())
display(data.head())

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 699 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype 
---  ------                       --------------  ----- 
 0   id                           699 non-null    int64 
 1   Clump Thickness              699 non-null    int64 
 2   Uniformity of Cell Size      699 non-null    int64 
 3   Uniformity of Cell Shape     699 non-null    int64 
 4   Marginal Adhesion            699 non-null    int64 
 5   Single Epithelial Cell Size  699 non-null    int64 
 6   Bare Nuclei                  699 non-null    object
 7   Bland Chromatin              699 non-null    int64 
 8   Normal Nucleoli              699 non-null    int64 
 9   Mitoses                      699 non-null    int64 
 10  Class                        699 non-null    object
dtypes: int64(9), object(2)
memory usage: 60.2+ KB
None


Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,benign
1,1002945,5,4,4,5,7,10,3,2,1,benign
2,1015425,3,1,1,1,2,2,3,1,1,benign
3,1016277,6,8,8,1,3,4,3,7,1,benign
4,1017023,4,1,1,3,2,1,3,1,1,benign


<br>

# Preprocessing

In [2]:
# The missing values are represented with '?', replacing them with NA
data.replace('?', pd.NA, inplace=True)
# Since there are only 16 rows in the dataset with missing values, we can drop them
data.dropna(inplace=True)

# In the class column, replacing 'benign' with 0 and 'malignant' with 1
data['Class'].replace('benign',    0, inplace=True)
data['Class'].replace('malignant', 1, inplace=True)
# Since all values in the dataframe can be represented as integers, converting them
data = data.astype(int)

print(data.info())
display(data.head())

<class 'pandas.core.frame.DataFrame'>
Index: 683 entries, 0 to 698
Data columns (total 11 columns):
 #   Column                       Non-Null Count  Dtype
---  ------                       --------------  -----
 0   id                           683 non-null    int64
 1   Clump Thickness              683 non-null    int64
 2   Uniformity of Cell Size      683 non-null    int64
 3   Uniformity of Cell Shape     683 non-null    int64
 4   Marginal Adhesion            683 non-null    int64
 5   Single Epithelial Cell Size  683 non-null    int64
 6   Bare Nuclei                  683 non-null    int64
 7   Bland Chromatin              683 non-null    int64
 8   Normal Nucleoli              683 non-null    int64
 9   Mitoses                      683 non-null    int64
 10  Class                        683 non-null    int64
dtypes: int64(11)
memory usage: 64.0 KB
None


Unnamed: 0,id,Clump Thickness,Uniformity of Cell Size,Uniformity of Cell Shape,Marginal Adhesion,Single Epithelial Cell Size,Bare Nuclei,Bland Chromatin,Normal Nucleoli,Mitoses,Class
0,1000025,5,1,1,1,2,1,3,1,1,0
1,1002945,5,4,4,5,7,10,3,2,1,0
2,1015425,3,1,1,1,2,2,3,1,1,0
3,1016277,6,8,8,1,3,4,3,7,1,0
4,1017023,4,1,1,3,2,1,3,1,1,0


<br>

# Splitting data

In [3]:
# Splitting data into x and y
x = data[ATTRS]
y = data[LABEL]
# Splitting data into train and test sets
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=TEST_SIZE)

<br>

# KNN

In [4]:
accuracies = {}

for k in range(*KS):
    # Creating a KNN classifier
    knn_model = KNeighborsClassifier(n_neighbors=k)
    # Training the classifier
    knn_model.fit(x_train, y_train)
    # Making predictions
    y_pred = knn_model.predict(x_test)
    # Calculating accuracy
    knn_accuracy = accuracy_score(y_test, y_pred)
    print(f'{k:2}: {knn_accuracy}')

 1: 0.9562043795620438
 2: 0.948905109489051
 3: 0.9708029197080292
 4: 0.9708029197080292
 5: 0.9708029197080292
 6: 0.9708029197080292
 7: 0.9708029197080292
 8: 0.9635036496350365
 9: 0.9635036496350365
10: 0.9635036496350365
11: 0.9708029197080292
12: 0.9635036496350365
13: 0.9708029197080292
14: 0.9635036496350365
15: 0.9635036496350365
16: 0.9635036496350365
17: 0.9708029197080292
18: 0.9635036496350365
19: 0.9635036496350365
20: 0.9635036496350365
21: 0.9635036496350365
22: 0.9635036496350365
23: 0.9635036496350365
24: 0.9635036496350365
25: 0.9635036496350365
26: 0.9635036496350365
27: 0.9635036496350365
28: 0.9635036496350365
29: 0.9635036496350365
30: 0.9635036496350365
31: 0.9635036496350365
32: 0.9635036496350365
33: 0.9635036496350365
34: 0.9635036496350365
35: 0.9635036496350365
36: 0.9635036496350365
37: 0.9635036496350365
38: 0.9635036496350365
39: 0.9635036496350365
40: 0.9635036496350365
41: 0.9635036496350365
42: 0.9635036496350365
43: 0.9635036496350365
44: 0.963503