## Classification models using Breast Cancer Data
1) Importing Libraries
2) Importing Data
3) ML Pipeline Experimentations
    + Splitting Data and Label Encoding
    + Baseline model with DummyClassifier
    + Logistic Regression
    + Decision Trees
    + GBM
    + RF
    + SVC
    + LinearSVC
    + Comparing models
    + Randomized Search
4) E2E ML Pipeline
5) Model Inference and Evaluation

### Importing Libraries

In [175]:
import pandas as pd
import numpy as np
import shap
import joblib
from matplotlib import pyplot
import seaborn as sns
from scipy.stats import uniform, randint
from sklearn.datasets import load_breast_cancer
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
from sklearn.pipeline import make_union
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.metrics import precision_score
from sklearn.metrics import recall_score
from sklearn.metrics import f1_score

### Importing Data

In [95]:
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target

In [96]:
X.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 30 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

### ML Pipeline Experimentations

#### Splitting Data and Label Encoding

In [97]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [135]:
le = LabelEncoder()
y_train_tr = le.fit_transform(y_train)
y_test_tr = le.transform(y_test)

#### Baseline model with DummyClassifier

In [99]:
transformer = make_union(Normalizer())
base_pl = make_pipeline(transformer, DummyClassifier(strategy='most_frequent'))
cross_val_score(base_pl, X_train, y_train_tr, cv=5)

array([0.63736264, 0.62637363, 0.62637363, 0.62637363, 0.62637363])

In [100]:
base_pl.fit(X_train, y_train)
print(f'Train score: {base_pl.score(X_train, y_train_tr)}')
print(f'Test score: {base_pl.score(X_test, y_test_tr)}')

Train score: 0.6285714285714286
Test score: 0.6228070175438597


#### Logistic Regression

In [101]:
transformer = make_union(Normalizer())
logreg_pl = make_pipeline(transformer, LogisticRegression())
cross_val_score(logreg_pl, X_train, y_train_tr, cv=5)

array([0.79120879, 0.7032967 , 0.69230769, 0.72527473, 0.72527473])

In [102]:
logreg_pl.fit(X_train, y_train)
print(f'Train score: {logreg_pl.score(X_train, y_train_tr)}')
print(f'Test score: {logreg_pl.score(X_test, y_test_tr)}')

Train score: 0.7714285714285715
Test score: 0.7807017543859649


#### Decision Trees

In [103]:
transformer = make_union(Normalizer())
dt_pl = make_pipeline(transformer, DecisionTreeClassifier())
cross_val_score(dt_pl, X_train, y_train_tr, cv=5)

array([0.92307692, 0.93406593, 0.9010989 , 0.89010989, 0.93406593])

In [104]:
dt_pl.fit(X_train, y_train)
print(f'Train score: {dt_pl.score(X_train, y_train_tr)}')
print(f'Test score: {dt_pl.score(X_test, y_test_tr)}')

Train score: 1.0
Test score: 0.9122807017543859


#### GBM

In [105]:
transformer = make_union(Normalizer())
gbm_pl = make_pipeline(transformer, GradientBoostingClassifier())
cross_val_score(gbm_pl, X_train, y_train_tr, cv=5)

array([0.96703297, 0.89010989, 0.96703297, 0.93406593, 0.93406593])

In [107]:
gbm_pl.fit(X_train, y_train)
print(f'Train score: {gbm_pl.score(X_train, y_train_tr)}')
print(f'Test score: {gbm_pl.score(X_test, y_test_tr)}')

Train score: 1.0
Test score: 0.9912280701754386


#### RF

In [108]:
transformer = make_union(Normalizer())
rf_pl = make_pipeline(transformer, RandomForestClassifier())
cross_val_score(rf_pl, X_train, y_train_tr, cv=5)

array([0.98901099, 0.91208791, 0.97802198, 0.95604396, 0.93406593])

In [109]:
rf_pl.fit(X_train, y_train)
print(f'Train score: {rf_pl.score(X_train, y_train_tr)}')
print(f'Test score: {rf_pl.score(X_test, y_test_tr)}')

Train score: 1.0
Test score: 0.9824561403508771


#### SVC

In [110]:
transformer = make_union(Normalizer())
svc_pl = make_pipeline(transformer, SVC())
cross_val_score(svc_pl, X_train, y_train_tr, cv=5)

array([0.92307692, 0.84615385, 0.85714286, 0.83516484, 0.82417582])

In [111]:
svc_pl.fit(X_train, y_train)
print(f'Train score: {svc_pl.score(X_train, y_train_tr)}')
print(f'Test score: {svc_pl.score(X_test, y_test_tr)}')

Train score: 0.8791208791208791
Test score: 0.9122807017543859


#### LinearSVC

In [112]:
transformer = make_union(Normalizer())
lsvc_pl = make_pipeline(transformer, LinearSVC())
cross_val_score(lsvc_pl, X_train, y_train_tr, cv=5)

array([0.94505495, 0.87912088, 0.9010989 , 0.86813187, 0.89010989])

In [113]:
lsvc_pl.fit(X_train, y_train)
print(f'Train score: {lsvc_pl.score(X_train, y_train_tr)}')
print(f'Test score: {lsvc_pl.score(X_test, y_test_tr)}')

Train score: 0.8989010989010989
Test score: 0.9298245614035088


#### Comparing models

In [114]:
transformer = gbm_pl[0]
model = gbm_pl[-1]
feature_names = transformer.get_feature_names_out()
model_params = model.get_params()

In [115]:
model_params

{'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'log_loss',
 'max_depth': 3,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_samples_leaf': 1,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 100,
 'n_iter_no_change': None,
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [117]:
(
    pd.DataFrame(zip(feature_names, model.feature_importances_), columns=['feature', 'importance'])
    .sort_values('importance', ascending=False)
)

Unnamed: 0,feature,importance
0,normalizer__mean radius,0.730222
3,normalizer__mean area,0.045899
27,normalizer__worst concave points,0.039042
23,normalizer__worst area,0.038529
7,normalizer__mean concave points,0.023605
20,normalizer__worst radius,0.019906
1,normalizer__mean texture,0.015896
6,normalizer__mean concavity,0.012546
26,normalizer__worst concavity,0.012121
2,normalizer__mean perimeter,0.009046


In [118]:
transformer = rf_pl[0]
model = rf_pl[-1]
feature_names = transformer.get_feature_names_out()
model_params = model.get_params()

In [120]:
(
    pd.DataFrame(zip(feature_names, model.feature_importances_), columns=['feature', 'importance'])
    .sort_values('importance', ascending=False)
)

Unnamed: 0,feature,importance
0,normalizer__mean radius,0.197191
2,normalizer__mean perimeter,0.140959
22,normalizer__worst perimeter,0.093358
23,normalizer__worst area,0.086753
20,normalizer__worst radius,0.068899
9,normalizer__mean fractal dimension,0.044963
3,normalizer__mean area,0.041591
24,normalizer__worst smoothness,0.030441
7,normalizer__mean concave points,0.027553
8,normalizer__mean symmetry,0.026831


In [121]:
transformer = lsvc_pl[0]
model = lsvc_pl[-1]
feature_names = transformer.get_feature_names_out()
model_params = model.get_params()

In [122]:
model.intercept_

array([-0.0198768])

In [124]:
(
    pd.DataFrame(zip(feature_names, model.coef_[0]), columns=['feature', 'importance'])
    .sort_values('importance', ascending=False)
)

Unnamed: 0,feature,importance
2,normalizer__mean perimeter,5.207121
22,normalizer__worst perimeter,5.078631
3,normalizer__mean area,4.526971
21,normalizer__worst texture,1.753862
1,normalizer__mean texture,1.409454
20,normalizer__worst radius,0.883273
0,normalizer__mean radius,0.881847
11,normalizer__texture error,0.113241
12,normalizer__perimeter error,0.030889
28,normalizer__worst symmetry,0.020602


#### Randomized Search

In [125]:
normalizer = Normalizer()
X_train_tr = normalizer.fit_transform(X_train)
X_test_tr = normalizer.transform(X_test)

kfold = StratifiedKFold(n_splits=5)

param_grid = {
    'C': uniform(0.1, 10),
    'kernel': ['linear', 'rbf', 'poly'],
    'gamma': uniform(0.1, 1),
    'degree': randint(1, 5)
}

clf = SVC()
random_search = RandomizedSearchCV(clf, cv=kfold, param_distributions=param_grid, n_iter=20)
random_search.fit(X_train_tr, y_train_tr)

In [130]:
print("Best Hyperparameters: ", random_search.best_params_)
print("Best Score: ", random_search.best_score_)
print('Test score: ', random_search.score(X_test_tr, y_test_tr))

Best Hyperparameters:  {'C': 5.355315840550353, 'degree': 4, 'gamma': 1.0912781894406567, 'kernel': 'poly'}
Best Score:  0.9120879120879121
Test score:  0.9473684210526315


### E2E ML Pipeline

In [154]:
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

le = LabelEncoder()
y_train_tr = le.fit_transform(y_train)
y_test_tr = le.transform(y_test)

model_params = dict(
    C=5.355315840550353,
    degree=4,
    gamma=1.0912781894406567,
    kernel='poly'
)
transformer = make_union(Normalizer())
pl = make_pipeline(transformer, SVC(**model_params))
pl.fit(X_train, y_train)

joblib.dump(le, 'label_encoder.joblib')
joblib.dump(pl, 'pipeline.joblib')



['pipeline.joblib']

### Model Inference and Evaluation

In [166]:
cancer = load_breast_cancer()
X = pd.DataFrame(cancer.data, columns=cancer.feature_names)
y = cancer.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

le = joblib.load('label_encoder.joblib')
pl = joblib.load('pipeline.joblib')

y_test_tr = le.transform(y_test)
y_test_pred = pl.predict(X_test)

In [167]:
print(classification_report(y_test_tr, y_test_hat))

              precision    recall  f1-score   support

           0       0.97      0.88      0.93        43
           1       0.93      0.99      0.96        71

    accuracy                           0.95       114
   macro avg       0.95      0.93      0.94       114
weighted avg       0.95      0.95      0.95       114



In [171]:
print('Accuracy Score: ', accuracy_score(y_test_tr, y_test_pred))
print('Precision Score: ', precision_score(y_test_tr, y_test_pred))
print('Recall Score: ', recall_score(y_test_tr, y_test_pred))
print('F1 Score: ', f1_score(y_test_tr, y_test_pred))

Accuracy Score:  0.9473684210526315
Precision Score:  0.9333333333333333
Recall Score:  0.9859154929577465
F1 Score:  0.9589041095890412


In [174]:
print(confusion_matrix(y_test_tr, y_test_pred))

[[38  5]
 [ 1 70]]
