In [7]:
# Dataset
from sklearn import datasets

# Data processing
import pandas as pd
import numpy as np

# Standardize the data
from sklearn.preprocessing import StandardScaler

# Model and performance evaluation
from sklearn.model_selection import train_test_split
from xgboost import XGBClassifier
from sklearn.metrics import precision_recall_fscore_support as score

# Hyperparameter tuning
from sklearn.model_selection import StratifiedKFold, cross_val_score, GridSearchCV, RandomizedSearchCV
from hyperopt import tpe, STATUS_OK, Trials, hp, fmin, space_eval

# Load (or reload) the watermark extension, then print the Python/IPython
# versions (-v) and the versions of the imported packages (-iv) so the
# environment that produced the outputs below is recorded.
%reload_ext watermark
%watermark -iv -v

Python implementation: CPython
Python version       : 3.12.2
IPython version      : 8.21.0

sklearn : 1.4.1.post1
pandas  : 2.2.0
numpy   : 1.26.4
hyperopt: 0.2.7



In [10]:
# Fetch the breast cancer dataset that ships with scikit-learn
data = datasets.load_breast_cancer()

# Build a DataFrame from the feature matrix (one column per feature name)
# and append the class label as the last column, 'target'
df = pd.DataFrame(data.data, columns=data.feature_names).assign(target=data.target)

# Print column names, non-null counts and dtypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 569 entries, 0 to 568
Data columns (total 31 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   mean radius              569 non-null    float64
 1   mean texture             569 non-null    float64
 2   mean perimeter           569 non-null    float64
 3   mean area                569 non-null    float64
 4   mean smoothness          569 non-null    float64
 5   mean compactness         569 non-null    float64
 6   mean concavity           569 non-null    float64
 7   mean concave points      569 non-null    float64
 8   mean symmetry            569 non-null    float64
 9   mean fractal dimension   569 non-null    float64
 10  radius error             569 non-null    float64
 11  texture error            569 non-null    float64
 12  perimeter error          569 non-null    float64
 13  area error               569 non-null    float64
 14  smoothness error         5

In [11]:
# Relative frequency of each class label (proportions rather than raw counts)
df.loc[:, 'target'].value_counts(normalize=True)

target
1    0.627417
0    0.372583
Name: proportion, dtype: float64

In [12]:
# Train test split: hold out 20% of the rows for testing, with a fixed seed so
# the split is reproducible.
# NOTE: df.columns.difference(['target']) selects every feature column but
# returns the names in sorted order, so the feature columns of X_train/X_test
# are ordered alphabetically rather than in the dataset's original order.
X_train, X_test, y_train, y_test = train_test_split(df[df.columns.difference(['target'])],
                                                    df['target'],
                                                    test_size=.2,
                                                    random_state=42)

# Check the number of records in training and testing dataset
print(f'The training dataset has {len(X_train)} records.')
# Fixed: this line previously labelled the test set as "training dataset"
print(f'The testing dataset has {len(X_test)} records.')

The training dataset has 455 records.
The training dataset has 114 records.


In [13]:
# Initiate scaler
sc = StandardScaler()

# Fit the scaler on the training dataset only and standardize it.
# Index and column labels are re-attached because the scaler returns a bare array.
X_train_transformed = pd.DataFrame(sc.fit_transform(X_train), index=X_train.index, columns=X_train.columns)

# Standardize the testing dataset with the statistics learned from the training
# data. Use transform(), not fit_transform(): refitting on the test set would
# scale train and test with different means/variances and leak test-set
# information into preprocessing.
X_test_transformed = pd.DataFrame(sc.transform(X_test), index=X_test.index, columns=X_test.columns)

# Summary statistics after standardization (training features: mean ~0, unit variance)
X_train_transformed.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
area error,455.0,6.24653e-17,1.001101,-0.705091,-0.464164,-0.325347,0.077435,10.641841
compactness error,455.0,-2.395154e-15,1.001101,-1.258102,-0.694353,-0.280607,0.358304,5.905671
concave points error,455.0,3.455112e-16,1.001101,-1.891775,-0.668493,-0.126279,0.437566,6.504667
concavity error,455.0,2.479091e-16,1.001101,-1.022218,-0.55134,-0.207836,0.303371,11.310294
fractal dimension error,455.0,5.085065e-16,1.001101,-1.050856,-0.573964,-0.218908,0.24534,9.34587
mean area,455.0,-2.537653e-16,1.001101,-1.365036,-0.660205,-0.289597,0.319339,5.208312
mean compactness,455.0,1.011157e-15,1.001101,-1.607228,-0.777087,-0.24134,0.528128,3.964311
mean concave points,455.0,5.817081e-16,1.001101,-1.26991,-0.734905,-0.391123,0.673757,4.022271
mean concavity,455.0,9.857804e-16,1.001101,-1.119899,-0.750539,-0.344646,0.547387,4.256736
mean fractal dimension,455.0,-3.36727e-15,1.001101,-1.776889,-0.709792,-0.177285,0.464223,4.815921
