In [1]:
# IMPORTING ALL THE NECESSARY TOOLS AND THE DATA
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegression, Perceptron
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler, QuantileTransformer, PolynomialFeatures
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.decomposition import PCA
from sklearn.svm import SVC

In [2]:
# In this part we are loading the breast_cancer data into two objects X the feature matrix (row represents a sample and columns 
#represent the features) and y holds the target value wether the tumor is benign or malignant.
print(load_breast_cancer()['DESCR'])
X , y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state= 1)

.. _breast_cancer_dataset:

Breast cancer wisconsin (diagnostic) dataset
--------------------------------------------

**Data Set Characteristics:**

    :Number of Instances: 569

    :Number of Attributes: 30 numeric, predictive attributes and the class

    :Attribute Information:
        - radius (mean of distances from center to points on the perimeter)
        - texture (standard deviation of gray-scale values)
        - perimeter
        - area
        - smoothness (local variation in radius lengths)
        - compactness (perimeter^2 / area - 1.0)
        - concavity (severity of concave portions of the contour)
        - concave points (number of concave portions of the contour)
        - symmetry
        - fractal dimension ("coastline approximation" - 1)

        The mean, standard error, and "worst" or largest (mean of the three
        worst/largest values) of these features were computed for each image,
        resulting in 30 features.  For instance, field 0 is Mean Radi

In [3]:
# made pipeline for LogisticRegression and entered the tuned hyperparameter from GridsearchCV
pipe = make_pipeline(StandardScaler(), PCA(n_components=9), LogisticRegression(solver= 'lbfgs', max_iter=1000))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))

0.972027972027972


In [4]:
# used GridsearchCV to tune hyper Parameters and cross validation
#pipe.get_params() gives list of all hyper paramaters in my pipeline
mod = GridSearchCV(estimator=pipe,
                param_grid={'pca__n_components': [1,2,3,4,5,6,7,8,9,10], 'logisticregression__solver': ['lbfgs','liblinear','newton-cg','sag','saga']},
                cv=10)
mod.fit(X, y);
pd.DataFrame(mod.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_logisticregression__solver,param_pca__n_components,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.009539,0.010956,0.000825,0.002475,lbfgs,1,"{'logisticregression__solver': 'lbfgs', 'pca__...",0.894737,0.877193,0.894737,0.912281,0.912281,0.947368,0.877193,0.947368,0.929825,0.928571,0.912155,0.024725,47
1,0.007812,0.007812,0.0,0.0,lbfgs,2,"{'logisticregression__solver': 'lbfgs', 'pca__...",0.964912,0.894737,0.964912,0.947368,0.947368,0.947368,0.929825,0.982456,0.982456,0.982143,0.954355,0.026224,36
2,0.009375,0.007654,0.001562,0.004687,lbfgs,3,"{'logisticregression__solver': 'lbfgs', 'pca__...",0.982456,0.894737,0.929825,0.947368,0.982456,0.929825,0.929825,0.964912,0.947368,0.982143,0.949091,0.027646,41
3,0.00625,0.007654,0.003125,0.00625,lbfgs,4,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.929825,0.964912,0.947368,0.982456,0.982456,0.929825,0.982456,1.0,0.982143,0.970144,0.024857,26
4,0.007812,0.007812,0.001562,0.004687,lbfgs,5,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.964912,0.964912,0.964912,1.0,0.947368,0.947368,0.982456,1.0,0.964286,0.973622,0.019644,16
5,0.009374,0.007654,0.001562,0.004687,lbfgs,6,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.964912,0.964912,0.947368,1.0,0.947368,0.947368,0.982456,1.0,0.964286,0.971867,0.021074,21
6,0.010937,0.00716,0.0,0.0,lbfgs,7,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.947368,0.964912,0.947368,1.0,0.947368,0.947368,0.982456,1.0,0.964286,0.970113,0.022276,31
7,0.00625,0.007655,0.004687,0.00716,lbfgs,8,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.964912,0.982456,0.947368,1.0,0.982456,0.947368,0.982456,1.0,0.946429,0.975345,0.021179,11
8,0.007812,0.007812,0.001562,0.004686,lbfgs,9,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.982456,0.982456,0.964912,1.0,0.982456,0.947368,1.0,1.0,0.964286,0.982393,0.017607,1
9,0.009509,0.005873,0.000814,0.002442,lbfgs,10,"{'logisticregression__solver': 'lbfgs', 'pca__...",1.0,0.982456,0.982456,0.964912,1.0,0.964912,0.947368,1.0,1.0,0.964286,0.980639,0.018371,6


In [10]:
# made pipeline for Perceptron and entered the tuned hyperparameter from GridsearchCV
pipe = make_pipeline(StandardScaler(),PCA(n_components= 5), Perceptron(eta0=0.01))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))

0.951048951048951


In [6]:
# used GridsearchCV to tune hyper Parameters and cross validation
mod = GridSearchCV(estimator=pipe,
                param_grid={'pca__n_components': [1,2,3,4,5,6,7,8,9,10],'perceptron__eta0': [0.1,0.001,0.01]},
                cv=3)
mod.fit(X, y);
pd.DataFrame(mod.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_pca__n_components,param_perceptron__eta0,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
0,0.005333,0.003771,0.005391,0.007624,1,0.1,"{'pca__n_components': 1, 'perceptron__eta0': 0.1}",0.905263,0.915789,0.915344,0.912132,0.004861,28
1,0.005388,0.00381,0.002664,0.003768,1,0.001,"{'pca__n_components': 1, 'perceptron__eta0': 0...",0.905263,0.910526,0.915344,0.910378,0.004117,29
2,0.002845,0.004023,0.007286,0.005642,1,0.01,"{'pca__n_components': 1, 'perceptron__eta0': 0...",0.905263,0.910526,0.915344,0.910378,0.004117,29
3,0.00808,0.000114,0.0,0.0,2,0.1,"{'pca__n_components': 2, 'perceptron__eta0': 0.1}",0.915789,0.963158,0.94709,0.942012,0.019669,20
4,0.002732,0.003864,0.005334,0.003772,2,0.001,"{'pca__n_components': 2, 'perceptron__eta0': 0...",0.889474,0.952632,0.925926,0.922677,0.025886,27
5,0.007609,0.006059,0.002552,0.00361,2,0.01,"{'pca__n_components': 2, 'perceptron__eta0': 0...",0.915789,0.931579,0.94709,0.931486,0.012779,25
6,0.004697,0.006643,0.0,0.0,3,0.1,"{'pca__n_components': 3, 'perceptron__eta0': 0.1}",0.931579,0.931579,0.957672,0.940277,0.0123,21
7,0.005549,0.003933,0.0,0.0,3,0.001,"{'pca__n_components': 3, 'perceptron__eta0': 0...",0.910526,0.926316,0.952381,0.929741,0.017258,26
8,0.005372,0.003799,0.0,0.0,3,0.01,"{'pca__n_components': 3, 'perceptron__eta0': 0...",0.910526,0.931579,0.957672,0.933259,0.019284,23
9,0.005209,0.007367,0.0,0.0,4,0.1,"{'pca__n_components': 4, 'perceptron__eta0': 0.1}",0.905263,0.952632,0.952381,0.936759,0.022271,22


In [7]:
# made pipeline for SVC and entered the tuned hyperparameter from GridsearchCV
pipe = make_pipeline(StandardScaler(),PCA(n_components=9), SVC(kernel='linear'))
pipe.fit(X_train, y_train)
print(pipe.score(X_test, y_test))

0.958041958041958


In [9]:
# used GridsearchCV to tune hyper Parameters and cross validation
#pipe.get_params()
mod = GridSearchCV(estimator=pipe,
               param_grid={'svc__kernel': ['linear','poly','rbf','sigmoid']}
               ,cv=10)
mod.fit(X, y);
pd.DataFrame(mod.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_svc__kernel,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.01041,0.003592,0.00238,0.003637,linear,{'svc__kernel': 'linear'},0.982456,0.982456,0.964912,0.964912,1.0,1.0,0.964912,1.0,1.0,0.964286,0.982393,0.015763,1
1,0.007051,0.007374,0.0,0.0,poly,{'svc__kernel': 'poly'},0.947368,0.859649,0.877193,0.877193,0.894737,0.929825,0.894737,0.929825,0.964912,0.946429,0.912187,0.034104,4
2,0.00625,0.007655,0.003125,0.006249,rbf,{'svc__kernel': 'rbf'},1.0,0.964912,0.947368,0.982456,1.0,1.0,0.947368,1.0,1.0,0.946429,0.978853,0.023403,2
3,0.007812,0.007812,0.0,0.0,sigmoid,{'svc__kernel': 'sigmoid'},0.912281,0.929825,0.929825,0.947368,0.929825,0.964912,0.947368,0.964912,0.964912,0.928571,0.94198,0.017722,3
