In [4]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_breast_cancer 
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import Pipeline

In [5]:
data = load_breast_cancer()

In [6]:
print(data.DESCR)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [7]:
list(data.target_names)

['malignant', 'benign']

* The outcomes are either 0 - malignant, or 1 - benign 

In [9]:
df = pd.DataFrame(data.data, columns=data.feature_names)
df['target'] = data.target

In [10]:
df['target'].value_counts()

1    357
0    212
Name: target, dtype: int64

In [11]:
X = df.iloc[:,:-1]
y = df.iloc[:,-1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

## K Nearest Neighbors

In [12]:
from sklearn.neighbors import KNeighborsClassifier

In [13]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', KNeighborsClassifier())
])
param_grid = [
    {
        'clf__n_neighbors': np.arange(1, 25),
    },
]
knn_gscv = GridSearchCV(pipe, param_grid, cv=5)
#fit model to data
knn_gscv.fit(X, y)
knn_gscv.best_params_

{'clf__n_neighbors': 7}

In [14]:
pipe = make_pipeline(StandardScaler(), KNeighborsClassifier(n_neighbors=7))
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=7))])

In [15]:
y_pred = pipe.predict(X_test)

In [16]:
confusion_matrix(y_test, y_pred)

array([[47,  6],
       [ 1, 89]], dtype=int64)

In [17]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn:",tn , "fp:",fp , "fn:",fn , "tp:",tp)  

tn: 47 fp: 6 fn: 1 tp: 89


In [18]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.98      0.89      0.93        53
           1       0.94      0.99      0.96        90

    accuracy                           0.95       143
   macro avg       0.96      0.94      0.95       143
weighted avg       0.95      0.95      0.95       143



In [15]:
cv_scores = cross_val_score(pipe, X, y, cv=5)

print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.97368421 0.95614035 0.98245614 0.96491228 0.97345133]
cv_scores mean:0.9701288619779538


## Logistic Regression

In [16]:
from sklearn.linear_model import LogisticRegression

In [17]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', LogisticRegression())
])
param_grid = [
    {
        'clf__max_iter':[1000,10000],
        'clf__C': [1, 10, 100, 1000],
    },
]
lr_gscv = GridSearchCV(pipe, param_grid, cv=5)
#fit model to data
lr_gscv.fit(X, y)
lr_gscv.best_params_

{'clf__C': 1, 'clf__max_iter': 1000}

In [18]:
pipe = make_pipeline(StandardScaler(), LogisticRegression(max_iter=1000, C=1))
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=1, max_iter=1000))])

In [19]:
y_pred = pipe.predict(X_test)

In [20]:
confusion_matrix(y_test, y_pred)

array([[50,  3],
       [ 3, 87]], dtype=int64)

In [21]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn:",tn , "fp:",fp , "fn:",fn , "tp:",tp)

tn: 50 fp: 3 fn: 3 tp: 87


In [22]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.94      0.94      0.94        53
           1       0.97      0.97      0.97        90

    accuracy                           0.96       143
   macro avg       0.96      0.96      0.96       143
weighted avg       0.96      0.96      0.96       143



In [23]:
cv_scores = cross_val_score(pipe, X, y, cv=5)

print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.98245614 0.98245614 0.97368421 0.97368421 0.99115044]
cv_scores mean:0.9806862288464524


## Support Vector Classifier

In [24]:
from sklearn.svm import SVC

In [25]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', SVC())
])
param_grid = [
    {
        'clf__kernel': ["linear", "rbf"],
        'clf__C': [1, 10, 100, 1000],
    },
]
svc_gscv = GridSearchCV(pipe, param_grid, cv=5)
#fit model to data
svc_gscv.fit(X, y)
svc_gscv.best_params_

{'clf__C': 10, 'clf__kernel': 'rbf'}

In [26]:
pipe = make_pipeline(StandardScaler(), SVC(gamma='auto', C=10, kernel = 'rbf'))
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=10, gamma='auto'))])

In [27]:
y_pred = pipe.predict(X_test)

In [28]:
confusion_matrix(y_test, y_pred)

array([[52,  1],
       [ 0, 90]], dtype=int64)

In [29]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn:",tn , "fp:",fp , "fn:",fn , "tp:",tp)

tn: 52 fp: 1 fn: 0 tp: 90


In [30]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       1.00      0.98      0.99        53
           1       0.99      1.00      0.99        90

    accuracy                           0.99       143
   macro avg       0.99      0.99      0.99       143
weighted avg       0.99      0.99      0.99       143



In [31]:
cv_scores = cross_val_score(pipe, X, y, cv=5)

print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.96491228 0.97368421 0.97368421 0.98245614 0.99115044]
cv_scores mean:0.9771774569166279


## Multilayer Perceptron

In [32]:
from sklearn.neural_network import MLPClassifier

In [33]:
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('clf', MLPClassifier())
])
param_grid = [
    {
        'clf__hidden_layer_sizes': [(100,),(10,30,10),(20,)],
        'clf__activation': ['tanh', 'relu'],
        'clf__solver': ['sgd', 'adam'],
        'clf__alpha': [0.01, 0.0001, 0.05],
        'clf__max_iter': [1000, 10000],
        'clf__learning_rate': ['constant','adaptive'],
    },
]
mlp_gscv = GridSearchCV(pipe, param_grid, cv=5)
mlp_gscv.fit(X, y)
mlp_gscv.best_params_

{'clf__activation': 'relu',
 'clf__alpha': 0.01,
 'clf__hidden_layer_sizes': (20,),
 'clf__learning_rate': 'adaptive',
 'clf__max_iter': 1000,
 'clf__solver': 'adam'}

In [127]:
pipe = make_pipeline(StandardScaler(), MLPClassifier(max_iter=1000, activation='tanh', alpha= 0.001, 
                                                    hidden_layer_sizes=(100,50),learning_rate='adaptive',
                                                    solver='sgd'))
pipe.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(activation='tanh', alpha=0.001,
                               hidden_layer_sizes=(100, 50),
                               learning_rate='adaptive', max_iter=1000,
                               solver='sgd'))])

In [128]:
y_pred = pipe.predict(X_test)

In [129]:
confusion_matrix(y_test, y_pred)

array([[51,  2],
       [ 2, 88]], dtype=int64)

In [130]:
tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
print("tn:",tn , "fp:",fp , "fn:",fn , "tp:",tp)

tn: 51 fp: 2 fn: 2 tp: 88


In [131]:
print(classification_report(y_test, y_pred))

              precision    recall  f1-score   support

           0       0.96      0.96      0.96        53
           1       0.98      0.98      0.98        90

    accuracy                           0.97       143
   macro avg       0.97      0.97      0.97       143
weighted avg       0.97      0.97      0.97       143



In [132]:
cv_scores = cross_val_score(pipe, X, y, cv=5)

print(cv_scores)
print('cv_scores mean:{}'.format(np.mean(cv_scores)))

[0.98245614 0.97368421 0.99122807 0.97368421 0.99115044]
cv_scores mean:0.9824406148113647
