# XGBoost: Cancer Detection

BUSMGT 7247

In [1]:
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt

from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import GridSearchCV, train_test_split
from sklearn.metrics import (accuracy_score, classification_report,
                             precision_score, recall_score, f1_score)

from xgboost import XGBClassifier, XGBRegressor

# Plot options: larger axis labels and tick labels for readability.
# Inline rendering is already enabled by the `%matplotlib inline` magic at the
# top of this cell, so the magic is not repeated here.
mpl.rc('axes', labelsize=14)
mpl.rc('xtick', labelsize=12)
mpl.rc('ytick', labelsize=12)

# Seed NumPy's global random number generator for reproducible results.
# The same value is also passed explicitly as `random_state` when the data
# is split into training and test sets.
random_state = 1000
np.random.seed(random_state)

## Load Data
For details, see the [Breast Cancer Wisconsin (Diagnostic)](https://archive.ics.uci.edu/ml/datasets/Breast+Cancer+Wisconsin+%28Diagnostic%29) dataset source page.

In [2]:
# Load the features and the target as pandas objects (as_frame=True) and
# return them as a (features, target) pair (return_X_y=True).
# NOTE(review): scikit-learn documents this dataset's labels as
# 0 = malignant, 1 = benign -- confirm via load_breast_cancer().target_names.
df_feat, df_target = load_breast_cancer(
    return_X_y=True,
    as_frame=True,
)

In [3]:
# Preview the first five rows of the feature table
df_feat.head(n=5)

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [4]:
# Class balance: number of samples for each target label
df_target.value_counts()

1    357
0    212
Name: target, dtype: int64

## Model Fitting

In [5]:
# Hold out 20% of the rows for the final evaluation. Passing the shared
# `random_state` makes the split reproducible from run to run.
X_train, X_test, y_train, y_test = train_test_split(
    df_feat,
    df_target,
    test_size=0.2,
    random_state=random_state,
)

In [6]:
# Build an (unfitted) XGBoost classifier that reports log loss as its
# evaluation metric.
# NOTE(review): use_label_encoder=False presumably opts out of XGBoost's
# legacy label encoding (the labels here are already 0/1) -- confirm that the
# installed XGBoost version still accepts this argument.
xgbclf = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
)

# Show the booster parameters; entries that were not set explicitly are
# reported as None.
xgbclf.get_xgb_params()

{'objective': 'binary:logistic',
 'base_score': None,
 'booster': None,
 'colsample_bylevel': None,
 'colsample_bynode': None,
 'colsample_bytree': None,
 'gamma': None,
 'gpu_id': None,
 'interaction_constraints': None,
 'learning_rate': None,
 'max_delta_step': None,
 'max_depth': None,
 'min_child_weight': None,
 'monotone_constraints': None,
 'n_jobs': None,
 'num_parallel_tree': None,
 'random_state': None,
 'reg_alpha': None,
 'reg_lambda': None,
 'scale_pos_weight': None,
 'subsample': None,
 'tree_method': None,
 'validate_parameters': None,
 'verbosity': None,
 'eval_metric': 'logloss'}

In [7]:
# Use cross-validation to tune three hyperparameters jointly: the learning
# rate, the maximum tree depth, and the per-tree column subsampling ratio.
#
# The float-valued grids are built with np.linspace (explicit endpoints and
# count) instead of np.arange, whose length and values are sensitive to
# floating-point rounding when the step is not an integer. Rounding keeps the
# reported best parameters free of representation noise.
search_parameters = {
    'learning_rate': np.linspace(0.05, 0.25, num=5).round(2),   # 0.05, 0.10, ..., 0.25
    'max_depth': np.arange(1, 15, 3),                            # 1, 4, 7, 10, 13
    'colsample_bytree': np.linspace(0.6, 0.9, num=4).round(1),   # 0.6, 0.7, 0.8, 0.9
}

# Exhaustive search over all 5 * 5 * 4 = 100 combinations, using
# GridSearchCV's default scoring and cross-validation settings.
gridclf = GridSearchCV(xgbclf, search_parameters)
gridclf.fit(X_train, y_train)

GridSearchCV(estimator=XGBClassifier(base_score=None, booster=None,
                                     colsample_bylevel=None,
                                     colsample_bynode=None,
                                     colsample_bytree=None,
                                     eval_metric='logloss', gamma=None,
                                     gpu_id=None, importance_type='gain',
                                     interaction_constraints=None,
                                     learning_rate=None, max_delta_step=None,
                                     max_depth=None, min_child_weight=None,
                                     missing=nan, monotone_constraints=None,
                                     n_estimators=100, n_jobs=None,
                                     num_parallel_tree=None, random_state=None,
                                     reg_alpha=None, reg_lambda=None,
                                     scale_pos_weight=None, subsample=None,
             

In [8]:
# Hyperparameter combination selected by the grid search
gridclf.best_params_

{'colsample_bytree': 0.7, 'learning_rate': 0.2, 'max_depth': 4}

In [9]:
# Score the held-out test set: predict with the fitted search object, then
# print per-class precision, recall, and F1 alongside overall accuracy.
y_pred = gridclf.predict(X_test)
print(classification_report(y_true=y_test, y_pred=y_pred))

              precision    recall  f1-score   support

           0       0.93      0.93      0.93        44
           1       0.96      0.96      0.96        70

    accuracy                           0.95       114
   macro avg       0.94      0.94      0.94       114
weighted avg       0.95      0.95      0.95       114



## Feature Importances

In [10]:
# Pair every feature name with the importance reported by the best estimator
# found during the grid search, ordered from most to least important.
clf = gridclf.best_estimator_
tab = sorted(
    zip(df_feat.columns, clf.feature_importances_),
    key=lambda pair: pair[1],
    reverse=True,
)

# Print one row per feature: left-aligned name, importance to 3 decimals.
for name, imp in tab:
    print(f'{name:<24} {imp:.3f}')

worst perimeter          0.439
worst radius             0.212
worst concave points     0.075
mean concave points      0.033
worst area               0.025
mean concavity           0.024
mean radius              0.020
worst compactness        0.019
mean area                0.017
worst smoothness         0.016
worst texture            0.014
mean texture             0.014
worst concavity          0.014
area error               0.012
mean smoothness          0.011
texture error            0.010
mean compactness         0.008
perimeter error          0.006
radius error             0.005
smoothness error         0.005
fractal dimension error  0.004
mean fractal dimension   0.004
worst symmetry           0.004
symmetry error           0.004
mean symmetry            0.002
compactness error        0.002
worst fractal dimension  0.002
concave points error     0.001
mean perimeter           0.000
concavity error          0.000
