In [1]:
# Making sure the code is compatible with all Python versions and on different devices 
from __future__ import division, print_function, unicode_literals

# Loading necessary packages
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
from sklearn import neighbors, datasets 

# Loading the dataset
# Source - https://github.com/upalr/Python-camp/wiki/1.-Importing-data-from-the-Internet
from urllib.request import urlretrieve
url = "http://archive.ics.uci.edu/ml/machine-learning-databases/breast-cancer-wisconsin/wdbc.data"
urlretrieve(url, 'wdbc.csv')
# Adding column names using the data names appendix
wdbc_names = ['ID', 'Diagnosis', 'Mean Radius', 'Mean Texture', 'Mean Perimeter', 'Mean Area', 'Mean Smoothness', 
              'Mean Compactness', 'Mean Concavity', 'Mean Concave Points', 'Mean Symmetry', 'Mean Fractal Dimension', 
              'Radius SE', 'Texture SE', 'Perimeter SE', 'Area SE', 'Smoothness SE', 'Compactness SE', 'Concavity SE', 
              'Concave Points SE', 'Symmetry SE', 'Fractal Dimension SE', 'Worst Radius', 'Worst Texture', 'Worst Perimeter', 
              'Worst Area', 'Worst Smoothness', 'Worst Compactness', 'Worst Concavity', 'Worst Concave Points', 
              'Worst Symmetry', 'Worst Fractal Dimension']
wdbc = pd.read_csv('wdbc.csv', header=None, names=wdbc_names)

In [2]:
# Exploring the dataset for all the variables and to see if the column names align properly
wdbc.head()

Unnamed: 0,ID,Diagnosis,Mean Radius,Mean Texture,Mean Perimeter,Mean Area,Mean Smoothness,Mean Compactness,Mean Concavity,Mean Concave Points,...,Worst Radius,Worst Texture,Worst Perimeter,Worst Area,Worst Smoothness,Worst Compactness,Worst Concavity,Worst Concave Points,Worst Symmetry,Worst Fractal Dimension
0,842302,M,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,842517,M,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,84300903,M,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,84348301,M,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,84358402,M,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [3]:
# Packages for EDA
import seaborn as sns
sns.set(color_codes=True)
from scipy import stats

# Exploring the different variables using summary statistics
print(wdbc.describe()) 

                 ID  Mean Radius  Mean Texture  Mean Perimeter    Mean Area  \
count  5.690000e+02   569.000000    569.000000      569.000000   569.000000   
mean   3.037183e+07    14.127292     19.289649       91.969033   654.889104   
std    1.250206e+08     3.524049      4.301036       24.298981   351.914129   
min    8.670000e+03     6.981000      9.710000       43.790000   143.500000   
25%    8.692180e+05    11.700000     16.170000       75.170000   420.300000   
50%    9.060240e+05    13.370000     18.840000       86.240000   551.100000   
75%    8.813129e+06    15.780000     21.800000      104.100000   782.700000   
max    9.113205e+08    28.110000     39.280000      188.500000  2501.000000   

       Mean Smoothness  Mean Compactness  Mean Concavity  Mean Concave Points  \
count       569.000000        569.000000      569.000000           569.000000   
mean          0.096360          0.104341        0.088799             0.048919   
std           0.014064          0.052813     

In [4]:
# Running a correlation matrix to study the interactions between the variables
wdbc.corr()

Unnamed: 0,ID,Mean Radius,Mean Texture,Mean Perimeter,Mean Area,Mean Smoothness,Mean Compactness,Mean Concavity,Mean Concave Points,Mean Symmetry,...,Worst Radius,Worst Texture,Worst Perimeter,Worst Area,Worst Smoothness,Worst Compactness,Worst Concavity,Worst Concave Points,Worst Symmetry,Worst Fractal Dimension
ID,1.0,0.074626,0.09977,0.073159,0.096893,-0.012968,9.6e-05,0.05008,0.044158,-0.022114,...,0.082405,0.06472,0.079986,0.107187,0.010338,-0.002968,0.023203,0.035174,-0.044224,-0.029866
Mean Radius,0.074626,1.0,0.323782,0.997855,0.987357,0.170581,0.506124,0.676764,0.822529,0.147741,...,0.969539,0.297008,0.965137,0.941082,0.119616,0.413463,0.526911,0.744214,0.163953,0.007066
Mean Texture,0.09977,0.323782,1.0,0.329533,0.321086,-0.023389,0.236702,0.302418,0.293464,0.071401,...,0.352573,0.912045,0.35804,0.343546,0.077503,0.27783,0.301025,0.295316,0.105008,0.119205
Mean Perimeter,0.073159,0.997855,0.329533,1.0,0.986507,0.207278,0.556936,0.716136,0.850977,0.183027,...,0.969476,0.303038,0.970387,0.94155,0.150549,0.455774,0.563879,0.771241,0.189115,0.051019
Mean Area,0.096893,0.987357,0.321086,0.986507,1.0,0.177028,0.498502,0.685983,0.823269,0.151293,...,0.962746,0.287489,0.95912,0.959213,0.123523,0.39041,0.512606,0.722017,0.14357,0.003738
Mean Smoothness,-0.012968,0.170581,-0.023389,0.207278,0.177028,1.0,0.659123,0.521984,0.553695,0.557775,...,0.21312,0.036072,0.238853,0.206718,0.805324,0.472468,0.434926,0.503053,0.394309,0.499316
Mean Compactness,9.6e-05,0.506124,0.236702,0.556936,0.498502,0.659123,1.0,0.883121,0.831135,0.602641,...,0.535315,0.248133,0.59021,0.509604,0.565541,0.865809,0.816275,0.815573,0.510223,0.687382
Mean Concavity,0.05008,0.676764,0.302418,0.716136,0.685983,0.521984,0.883121,1.0,0.921391,0.500667,...,0.688236,0.299879,0.729565,0.675987,0.448822,0.754968,0.884103,0.861323,0.409464,0.51493
Mean Concave Points,0.044158,0.822529,0.293464,0.850977,0.823269,0.553695,0.831135,0.921391,1.0,0.462497,...,0.830318,0.292752,0.855923,0.80963,0.452753,0.667454,0.752399,0.910155,0.375744,0.368661
Mean Symmetry,-0.022114,0.147741,0.071401,0.183027,0.151293,0.557775,0.602641,0.500667,0.462497,1.0,...,0.185728,0.090651,0.219169,0.177193,0.426675,0.4732,0.433721,0.430297,0.699826,0.438413


In [5]:
# Setting the features and target attributes for the dataset
features = ['Mean Radius', 'Mean Texture', 'Mean Perimeter', 'Mean Area', 'Mean Smoothness', 
              'Mean Compactness', 'Mean Concavity', 'Mean Concave Points', 'Mean Symmetry', 'Mean Fractal Dimension', 
              'Radius SE', 'Texture SE', 'Perimeter SE', 'Area SE', 'Smoothness SE', 'Compactness SE', 'Concavity SE', 
              'Concave Points SE', 'Symmetry SE', 'Fractal Dimension SE', 'Worst Radius', 'Worst Texture', 'Worst Perimeter', 
              'Worst Area', 'Worst Smoothness', 'Worst Compactness', 'Worst Concavity', 'Worst Concave Points', 
              'Worst Symmetry', 'Worst Fractal Dimension']

X = wdbc[features] # we use all the variables except for ID and Diagnosis as the features 
y = wdbc.Diagnosis # Setting the Diagnosis column as the target variable 

In [14]:
from sklearn.model_selection import train_test_split

#Split Validation
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)

In [7]:
# Normalizing the variables
from sklearn.preprocessing import StandardScaler # Standardize features by removing the mean and scaling to unit variance

sc = StandardScaler()
sc.fit(X_train) # Compute mean and standard deviation to use for scaling

X_train_std = sc.transform(X_train) # Standardize training set 
X_test_std = sc.transform(X_test) # Standardize test set using the same mean and standard deviation from training set

In [12]:
#################################### kNN ####################################################

from sklearn import neighbors, datasets

# Setting the parameters for kNN classifier
knn = neighbors.KNeighborsClassifier(n_neighbors=3, #number of neighbors is 3
                                     p=2, 
                                     metric='minkowski')

# Train the model      
knn = knn.fit(X_train_std, y_train)  

In [30]:
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix

# Estimate the predicted values by applying the kNN algorithm
y_pred = knn.predict(X_test_std)

# Accuracy
print('Accuracy (out-of-sample): %.2f' % accuracy_score(y_test, y_pred))

# F1 score
print('F1 score (out-of-sample): ', f1_score(y_test, y_pred, average='macro'))

# Precision
print('Precision (out-of-sample): ', precision_score(y_test, y_pred, pos_label='B', average='binary'))

# Recall
print('Recall (out-of-sample): ', recall_score(y_test, y_pred, pos_label='B', average='binary'))

# Build a text report showing the main classification metrics (out-of-sample performance)
print(classification_report(y_test, y_pred))

# Confusion matrix
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

Accuracy (out-of-sample): 0.96
F1 score (out-of-sample):  0.9555629802873371
Precision (out-of-sample):  0.9464285714285714
Recall (out-of-sample):  0.9906542056074766
              precision    recall  f1-score   support

           B       0.95      0.99      0.97       107
           M       0.98      0.91      0.94        64

    accuracy                           0.96       171
   macro avg       0.96      0.95      0.96       171
weighted avg       0.96      0.96      0.96       171

Confusion Matrix: [[106   1]
 [  6  58]]


In [17]:
# Finds the K-neighbors of a point. 
print('The k nearest neighbors (and the corresponding distances) to user are:', knn.kneighbors())

# Finds the K-neighbors of all points in the training set.
print('The k nearest neighbors to each user are:', knn.kneighbors(X, return_distance=False)) 

# Computes the (weighted) graph of k-Neighbors for points in X (complete training set)
A = knn.kneighbors_graph(X) # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.kneighbors_graph.html
A.toarray()

The k nearest neighbors (and the corresponding distances) to user are: (array([[2.07318915, 2.28964797, 2.44944933],
       [1.69169621, 1.70561777, 1.78090911],
       [2.54425386, 2.83051665, 2.88090876],
       ...,
       [1.26218631, 1.91654054, 1.93774672],
       [2.4738232 , 2.48465807, 2.53266456],
       [1.80723367, 2.19210622, 2.29270303]]), array([[ 47, 331,  94],
       [310, 292, 344],
       [390, 396, 273],
       ...,
       [380, 335, 348],
       [198, 282, 229],
       [381, 189, 225]], dtype=int64))
The k nearest neighbors to each user are: [[ 25 306 327]
 [ 25 306 327]
 [ 25 306 327]
 ...
 [ 25 306 327]
 [ 25 306 327]
 [ 25 306 327]]


array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

In [32]:
################################## Logistic Regression ################################
from sklearn import linear_model

# Creating the classification model using Logistic Regression
clf = linear_model.LogisticRegression(multi_class='ovr', #used for binary categorical target variable
                                      C=1e5, tol=1e-5, max_iter=100, solver='liblinear', penalty='l2') 

# Fitting the classification model on our training data 
clf = clf.fit(X_train, y_train)
print('The weights of the attributes are:', clf.coef_)

#######

# Making predictions using the logistic regression classification model

y_pred = clf.predict(X_test)             # Classification prediction
y_pred_prob = clf.predict_proba(X_test)  # Class probabilities
print(y_pred[0], y_pred_prob[0], np.sum(y_pred_prob[0]))

# Build a text report showing the main classification metrics (out-of-sample performance)
print('Accuracy (out-of-sample): %.2f' % accuracy_score(y_test, y_pred))
print('F1 score (out-of-sample): ', f1_score(y_test, y_pred, average='macro'))
print('Precision (out-of-sample): ', precision_score(y_test, y_pred, pos_label='B', average='binary'))
print('Recall (out-of-sample): ', recall_score(y_test, y_pred, pos_label='B', average='binary'))
print(classification_report(y_test, y_pred))

# Confusion matrix
print('Confusion Matrix:', confusion_matrix(y_test, y_pred))

The weights of the attributes are: [[-1.83771711e+01 -5.83375350e-01  3.00825163e+00  2.18353879e-02
   1.02940319e+01 -4.73880188e+01  2.25453861e+01  4.55013455e+01
  -2.68425796e+01 -1.35661453e+00 -6.56492932e+01 -8.32051612e+00
   3.27320487e+00  9.72298205e-01  1.86848557e+01 -4.19926462e+01
  -9.91702658e+01  2.17753874e+00 -1.80158817e+01 -5.11076832e+00
   2.78962798e+00  1.63458582e+00 -5.10339809e-01 -1.04096754e-02
   1.47697307e+02 -5.84077246e+01  4.55604199e+01  4.41893986e+01
   7.20866335e+01  1.77676660e+01]]
B [9.99999999e-01 5.03102042e-10] 1.0
Accuracy (out-of-sample): 0.97
F1 score (out-of-sample):  0.968479262672811
Precision (out-of-sample):  0.9636363636363636
Recall (out-of-sample):  0.9906542056074766
              precision    recall  f1-score   support

           B       0.96      0.99      0.98       107
           M       0.98      0.94      0.96        64

    accuracy                           0.97       171
   macro avg       0.97      0.96      0.97 

In [145]:
from sklearn import metrics 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import cross_val_score

# Fit model to all the data
clf_lr = linear_model.LogisticRegression(multi_class='ovr', #used for binary categorical target variable
                                      C=1e5, tol=1e-5, max_iter=100, solver='liblinear', penalty='l2') 

# Accuracy
scores=cross_val_score(clf_lr, wdbc[features], wdbc.Diagnosis, cv=5)
print("Accuracy: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2))
print(scores)

# F-1 scores
scores_f1=cross_val_score(clf_lr, wdbc[features], wdbc.Diagnosis, cv=5, scoring='f1_macro')
print("F1-score: %0.2f (+/- %0.2f)" % (scores_f1.mean(), scores_f1.std() * 2))# returns an array of scores of the estimator for each run of the cross validation.
print(scores_f1)

Accuracy: 0.97 (+/- 0.02)
[0.94736842 0.97368421 0.96491228 0.97368421 0.98230088]
F1-score: 0.97 (+/- 0.03)
[0.94398952 0.97186343 0.96148649 0.97186343 0.98122924]
