In [1]:
import pandas as pd

In [2]:
data = pd.read_csv('data.csv')

In [3]:
data.drop('Unnamed: 32', axis = 1, inplace = True)
data.drop('id', axis = 1, inplace = True)

In [4]:
X = data.drop('diagnosis', axis = 1)
y = data['diagnosis']
feature_names = X.columns

In [5]:
y.replace('B', 0, inplace = True)
y.replace('M', 1, inplace = True)

  y.replace('M', 1, inplace = True)


In [6]:
from sklearn.model_selection import train_test_split

In [7]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, stratify = y, random_state = 42)

In [8]:
from matplotlib import pyplot as plt

In [9]:
# detekcija autlajera
def IQR(data, feature_names):
    iqr = pd.DataFrame(0.0, index = feature_names, columns = ['lower', 'min', 'num_lower', 'upper', 'max', 'num_upper', 'percentage'])
    for name in feature_names:
        Q1, Q3 = X[name].quantile([0.25, 0.75])
        IQR = Q3 - Q1
        lower = Q1 - 1.5*IQR
        upper = Q3 +  1.5*IQR

        iqr.loc[name, 'lower'] = lower
        iqr.loc[name, 'upper'] = upper

        num_lower = (X[name] < lower).sum()
        num_upper = (X[name] > upper).sum()

        iqr.loc[name, 'num_lower'] = num_lower
        iqr.loc[name, 'num_upper'] = num_upper

        iqr.loc[name, 'min'] = min(X[name])
        iqr.loc[name, 'max'] = max(X[name])

        percentage = (num_lower + num_upper) / X.shape[0] * 100
        iqr.loc[name, 'percentage'] = percentage

    return iqr

In [10]:
IQR(X_train, feature_names)

Unnamed: 0,lower,min,num_lower,upper,max,num_upper,percentage
radius_mean,5.58,6.981,0.0,21.9,28.11,14.0,2.460457
texture_mean,7.725,9.71,0.0,30.245,39.28,7.0,1.230228
perimeter_mean,31.775,43.79,0.0,147.495,188.5,13.0,2.28471
area_mean,-123.3,143.5,0.0,1326.3,2501.0,25.0,4.393673
smoothness_mean,0.057975,0.05263,1.0,0.133695,0.1634,5.0,1.054482
compactness_mean,-0.0333,0.01938,0.0,0.22862,0.3454,16.0,2.811951
concavity_mean,-0.12215,0.0,0.0,0.28241,0.4268,18.0,3.163445
concave points_mean,-0.060225,0.0,0.0,0.154535,0.2012,10.0,1.757469
symmetry_mean,0.1112,0.106,1.0,0.2464,0.304,14.0,2.636204
fractal_dimension_mean,0.04507,0.04996,0.0,0.07875,0.09744,15.0,2.636204


In [11]:
# normalizacija
from sklearn.preprocessing import MinMaxScaler

In [12]:
scaler = MinMaxScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

In [18]:
from sklearn.metrics import classification_report, confusion_matrix
from termcolor import colored

def report(model, X, y, text = "training"):
    y_pred = model.predict(X)
    
    print(colored("Classification report for model {} on {} data".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(classification_report(y, y_pred, zero_division = True))
    print("---------------------------------------------------------------------------------")
    
    print(colored("Confusion matrix for model {} on {} data ".format(type(model).__name__, text), "green"))
    print("---------------------------------------------------------------------------------")
    print(pd.DataFrame(confusion_matrix(y, y_pred), columns=['B', 'M'], index=['B', 'M']))
    print("---------------------------------------------------------------------------------")

In [19]:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

0,1,2
,n_neighbors,5
,weights,'uniform'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [20]:
report(knn, X_train, y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      1.00      0.98       267
           1       0.99      0.95      0.97       159

    accuracy                           0.98       426
   macro avg       0.98      0.97      0.98       426
weighted avg       0.98      0.98      0.98       426

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
     B    M
B  266    1
M    8  151
---------------------------------------------------------------------------------


In [21]:
report(knn, X_test, y_test, "test")

[32mClassification report for model KNeighborsClassifier on test data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.96      1.00      0.98        90
           1       1.00      0.92      0.96        53

    accuracy                           0.97       143
   macro avg       0.98      0.96      0.97       143
weighted avg       0.97      0.97      0.97       143

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on test data [0m
---------------------------------------------------------------------------------
    B   M
B  90   0
M   4  49
---------------------------------------------------------------------------------


In [22]:
from sklearn.model_selection import GridSearchCV

In [32]:
params = {
    'n_neighbors' : range(10, 50, 5),
    'weights' : ['uniform', 'distance'],
    'p' : [1, 2]
}
estimator = GridSearchCV(KNeighborsClassifier(), param_grid = params, cv = 6, verbose = 4)

In [33]:
estimator.fit(X_train, y_train)

Fitting 6 folds for each of 32 candidates, totalling 192 fits
[CV 1/6] END n_neighbors=10, p=1, weights=uniform;, score=0.972 total time=   0.0s
[CV 2/6] END n_neighbors=10, p=1, weights=uniform;, score=0.972 total time=   0.0s
[CV 3/6] END n_neighbors=10, p=1, weights=uniform;, score=0.986 total time=   0.0s
[CV 4/6] END n_neighbors=10, p=1, weights=uniform;, score=0.958 total time=   0.0s
[CV 5/6] END n_neighbors=10, p=1, weights=uniform;, score=0.958 total time=   0.0s
[CV 6/6] END n_neighbors=10, p=1, weights=uniform;, score=0.930 total time=   0.0s
[CV 1/6] END n_neighbors=10, p=1, weights=distance;, score=0.972 total time=   0.0s
[CV 2/6] END n_neighbors=10, p=1, weights=distance;, score=0.972 total time=   0.0s
[CV 3/6] END n_neighbors=10, p=1, weights=distance;, score=0.986 total time=   0.0s
[CV 4/6] END n_neighbors=10, p=1, weights=distance;, score=0.972 total time=   0.0s
[CV 5/6] END n_neighbors=10, p=1, weights=distance;, score=0.958 total time=   0.0s
[CV 6/6] END n_neigh

0,1,2
,estimator,KNeighborsClassifier()
,param_grid,"{'n_neighbors': range(10, 50, 5), 'p': [1, 2], 'weights': ['uniform', 'distance']}"
,scoring,
,n_jobs,
,refit,True
,cv,6
,verbose,4
,pre_dispatch,'2*n_jobs'
,error_score,
,return_train_score,False

0,1,2
,n_neighbors,10
,weights,'distance'
,algorithm,'auto'
,leaf_size,30
,p,2
,metric,'minkowski'
,metric_params,
,n_jobs,


In [34]:
estimator.best_score_

np.float64(0.9694835680751174)

In [35]:
report(estimator.best_estimator_, X_train, y_train)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       267
           1       1.00      1.00      1.00       159

    accuracy                           1.00       426
   macro avg       1.00      1.00      1.00       426
weighted avg       1.00      1.00      1.00       426

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
     B    M
B  267    0
M    0  159
---------------------------------------------------------------------------------


In [36]:
report(estimator.best_estimator_, X_test, y_test)

[32mClassification report for model KNeighborsClassifier on training data[0m
---------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.97      1.00      0.98        90
           1       1.00      0.94      0.97        53

    accuracy                           0.98       143
   macro avg       0.98      0.97      0.98       143
weighted avg       0.98      0.98      0.98       143

---------------------------------------------------------------------------------
[32mConfusion matrix for model KNeighborsClassifier on training data [0m
---------------------------------------------------------------------------------
    B   M
B  90   0
M   3  50
---------------------------------------------------------------------------------
