# Project: Breast Cancer Classification Model


### by Redeemer Salami Okekale

## Table of Contents
- [Introduction](#intro)
- [Exploratory Data Analysis & Cleaning - EDA](#wrangling)
- [Model Selection & Model Training](#model_selection_training)
- [Saving the model](#model_saving)
- [Summary](#summary)

<a id='intro'></a>
## Introduction
> This document explains the fundamental steps in developing a classification machine learning model.



<a id='wrangling'></a>
## Exploratory Data Analysis and Cleaning
> In this stage we import the libraries needed to explore and analyze the data, and then determine which cleaning operations are required to make it clean and organized.

In [None]:
# Install PyCaret and SHAP into the notebook environment (IPython shell escape).
# NOTE(review): the version is unpinned; the prediction output further down uses
# `Label`/`Score` columns, which presumably corresponds to a PyCaret 2.x release —
# consider pinning the version once confirmed.
!pip install pycaret shap

In [2]:
#import all relevant packages
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt 
#import shap
%matplotlib inline 

In [3]:
# read data_set and make copy to save it 
df_r = pd.read_csv("/content/drive/MyDrive/PyCon2022 (1)/breast_cancer_data.csv")
df = df_r.copy()

In [None]:
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Show 10 randomly sampled rows to get a first look at the data.
df.sample(10)

In [None]:
# Print a summary of the frame: column names, dtypes and non-null counts.
df.info()

In [None]:
# Descriptive statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

In [None]:
# Count fully duplicated rows.
df.duplicated().sum()
# Author's note: no duplicates were found in this dataset.

In [None]:
# Count missing (NaN) values per column.
df.isnull().sum() 

In [8]:
# Remove the 'id' column from the working frame in place.
df.drop(columns=['id'], inplace=True)

In [9]:
# Count the rows per diagnosis class to check the class balance.
df['diagnosis'].value_counts()

B    357
M    212
Name: diagnosis, dtype: int64

In [10]:
# Balance the classes by undersampling: the counts above show B=357 vs M=212,
# so the same number of rows is drawn from each diagnosis group.
# The per-class sample size is derived from the data (size of the smallest
# class) instead of being hard-coded, and random_state is fixed so the
# balanced subset is the same on every run.
n_per_class = int(df['diagnosis'].value_counts().min())
df = (df.groupby('diagnosis')
        .sample(n=n_per_class, random_state=42)
        .reset_index(drop=True))

<a id='model_selection_training'></a>
## Model Selection & Training
> Now that the data is well prepared, we are ready to select and train the right model for the task.


In [12]:
#import package that matches the problem
from pycaret.classification import *

In [13]:
# Initialise the PyCaret experiment with 'diagnosis' as the target column.
# session_id pins the random seed so the experiment is repeatable between
# runs; 241 is the seed recorded in the setup output below.
s = setup(df, target='diagnosis', session_id=241)


Unnamed: 0,Description,Value
0,session_id,241
1,Target,diagnosis
2,Target Type,Binary
3,Label Encoded,"B: 0, M: 1"
4,Original Data,"(424, 31)"
5,Missing Values,False
6,Numeric Features,30
7,Categorical Features,0
8,Ordinal Features,False
9,High Cardinality Features,False


INFO:logs:create_model_container: 0
INFO:logs:master_model_container: 0
INFO:logs:display_container: 1
INFO:logs:Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_features=[], target='diagnosis',
                                      time_features=[])),
                ('imputer',
                 Simple_Imputer(categorical_strategy='not_available',
                                fill_value_categorical=None,
                                fill_value_numerical=None,
                                numeric_st...
                ('scaling', 'passthrough'), ('P_transform', 'passthrough'),
                ('binn', 'passthrough'), ('rem_outliers', 'passthrough'),
                ('cluste

In [14]:
# Train and score the candidate classifiers (this takes a moment) and keep
# the top-ranked one; the comparison table is shown below.
best_model = compare_models()

Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC,TT (Sec)
et,Extra Trees Classifier,0.9829,0.9924,0.9786,0.9866,0.9822,0.9657,0.9663,0.408
rf,Random Forest Classifier,0.9729,0.9922,0.9786,0.9678,0.972,0.9456,0.9477,0.448
lightgbm,Light Gradient Boosting Machine,0.9728,0.9934,0.9643,0.9804,0.9711,0.9454,0.9475,0.123
qda,Quadratic Discriminant Analysis,0.9661,0.9907,0.9857,0.9474,0.9657,0.9322,0.9338,0.012
knn,K Neighbors Classifier,0.9526,0.9712,0.9357,0.968,0.9489,0.9048,0.9096,0.113
ridge,Ridge Classifier,0.9525,0.0,0.9357,0.9665,0.9478,0.9045,0.9091,0.01
lda,Linear Discriminant Analysis,0.9524,0.9917,0.929,0.9739,0.9489,0.9044,0.9084,0.013
gbc,Gradient Boosting Classifier,0.9491,0.9847,0.9362,0.9584,0.9464,0.8978,0.8993,0.17
lr,Logistic Regression,0.9461,0.9959,0.9429,0.9502,0.944,0.8918,0.8966,0.464
nb,Naive Bayes,0.9457,0.9887,0.9152,0.9714,0.9417,0.891,0.8935,0.012


INFO:logs:create_model_container: 14
INFO:logs:master_model_container: 14
INFO:logs:display_container: 2
INFO:logs:ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=241, verbose=0,
                     warm_start=False)
INFO:logs:compare_models() succesfully completed......................................


In [15]:
# Open PyCaret's interactive evaluation widget (plot-type selector) for the selected model.
evaluate_model(best_model)

INFO:logs:Initializing evaluate_model()
INFO:logs:evaluate_model(estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=241, verbose=0,
                     warm_start=False), fold=None, fit_kwargs=None, plot_kwargs=None, feature_name=None, groups=None, use_train_data=False)


interactive(children=(ToggleButtons(description='Plot Type:', icons=('',), options=(('Hyperparameters', 'param…

In [16]:
# Score the selected model and keep the resulting frame, which carries the
# per-row `Label` and `Score` columns alongside the features.
# NOTE(review): with no `data` argument PyCaret presumably scores the hold-out
# split created by setup() — confirm against the PyCaret documentation.
predictions = predict_model(best_model)

INFO:logs:Initializing predict_model()
INFO:logs:predict_model(estimator=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=241, verbose=0,
                     warm_start=False), probability_threshold=None, encoded_labels=False, drift_report=False, raw_score=False, round=4, verbose=True, ml_usecase=MLUsecase.CLASSIFICATION, display=None, drift_kwargs=None)
INFO:logs:Checking exceptions
INFO:logs:Preloading libraries
INFO:logs:Preparing display monitor


Unnamed: 0,Model,Accuracy,AUC,Recall,Prec.,F1,Kappa,MCC
0,Extra Trees Classifier,0.9453,0.9933,0.9571,0.9437,0.9504,0.8895,0.8896


In [17]:
# Display the prediction frame (true diagnosis, predicted Label and Score per row).
predictions

Unnamed: 0,Texture_mean,perimeter_mean,area_mean,smoothness_mean,compactness_mean,concavity_mean,concave points_mean,symmetry_mean,fractal_dimension_mean,radius_se,...,area_worst,smoothness_worst,compactness_worst,concavity_worst,concave points_worst,symmetry_worst,fractal_dimension_worst,diagnosis,Label,Score
0,21.510000,135.899994,1264.000000,0.11700,0.18750,0.25650,0.15040,0.2569,0.06670,0.5702,...,1938.000000,0.1592,0.4492,0.5344,0.2685,0.5558,0.10240,M,M,1.00
1,21.799999,79.779999,466.100006,0.08772,0.09445,0.06015,0.03745,0.1930,0.06404,0.2978,...,574.700012,0.1304,0.2463,0.2434,0.1205,0.2972,0.09261,B,B,0.93
2,21.459999,94.250000,648.200012,0.09444,0.09947,0.12040,0.04938,0.2075,0.05636,0.4204,...,808.900024,0.1306,0.1976,0.3349,0.1225,0.3020,0.06846,M,M,0.78
3,15.240000,95.769997,651.900024,0.11320,0.13390,0.09966,0.07064,0.2116,0.06346,0.5115,...,803.599976,0.1277,0.3089,0.2604,0.1397,0.3151,0.08473,B,M,0.63
4,19.070000,97.260002,701.900024,0.09215,0.08597,0.07486,0.04335,0.1561,0.05915,0.3860,...,967.000000,0.1246,0.2101,0.2866,0.1120,0.2282,0.06954,M,M,0.76
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
123,18.750000,87.440002,551.099976,0.10750,0.11380,0.04201,0.03152,0.1723,0.06317,0.1998,...,719.799988,0.1624,0.3124,0.2654,0.1427,0.3518,0.08665,B,B,0.64
124,15.050000,115.000000,955.099976,0.09847,0.11570,0.09875,0.07953,0.1739,0.06149,0.6003,...,1227.000000,0.1255,0.2812,0.2489,0.1456,0.2756,0.07919,M,M,0.89
125,17.250000,140.899994,1546.000000,0.09384,0.08562,0.11680,0.08465,0.1717,0.05054,1.2070,...,3143.000000,0.1363,0.1628,0.2861,0.1820,0.2510,0.06494,M,M,1.00
126,22.070000,111.599998,928.299988,0.09726,0.08995,0.09061,0.06527,0.1867,0.05580,0.4203,...,1436.000000,0.1558,0.2567,0.3889,0.1984,0.3216,0.07570,M,M,0.99


In [18]:
# Persist the transformation pipeline together with the model under the name 'breast_cancer_model'.
# NOTE(review): best_model has presumably been fit on the training split only;
# consider calling finalize_model(best_model) before saving — confirm against the PyCaret docs.
save_model(best_model, 'breast_cancer_model')

INFO:logs:Initializing save_model()
INFO:logs:save_model(model=ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=100, n_jobs=-1,
                     oob_score=False, random_state=241, verbose=0,
                     warm_start=False), model_name=breast_cancer_model, prep_pipe_=Pipeline(memory=None,
         steps=[('dtypes',
                 DataTypes_Auto_infer(categorical_features=[],
                                      display_types=True, features_todrop=[],
                                      id_columns=[],
                                      ml_usecase='classification',
                                      numerical_featur

Transformation Pipeline and Model Successfully Saved


(Pipeline(memory=None,
          steps=[('dtypes',
                  DataTypes_Auto_infer(categorical_features=[],
                                       display_types=True, features_todrop=[],
                                       id_columns=[],
                                       ml_usecase='classification',
                                       numerical_features=[], target='diagnosis',
                                       time_features=[])),
                 ('imputer',
                  Simple_Imputer(categorical_strategy='not_available',
                                 fill_value_categorical=None,
                                 fill_value_numerical=None,
                                 numeric_st...
                  ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0,
                                       class_weight=None, criterion='gini',
                                       max_depth=None, max_features='auto',
                                       max_leaf_nod