In [1]:
# importing all the required libraries

import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.metrics import roc_auc_score
from sklearn.metrics import roc_curve
from sklearn.metrics import roc_curve, auc


In [2]:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
cancer = load_breast_cancer()

In [3]:
print(cancer.keys())

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename'])


In [4]:
data = pd.DataFrame(cancer.data, columns=[cancer.feature_names])
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst radius,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,25.38,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,24.99,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,23.57,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,14.91,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,22.54,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678


In [5]:
data = data.assign(target=pd.Series(cancer.target))
data.head()

Unnamed: 0,mean radius,mean texture,mean perimeter,mean area,mean smoothness,mean compactness,mean concavity,mean concave points,mean symmetry,mean fractal dimension,...,worst texture,worst perimeter,worst area,worst smoothness,worst compactness,worst concavity,worst concave points,worst symmetry,worst fractal dimension,target
0,17.99,10.38,122.8,1001.0,0.1184,0.2776,0.3001,0.1471,0.2419,0.07871,...,17.33,184.6,2019.0,0.1622,0.6656,0.7119,0.2654,0.4601,0.1189,0
1,20.57,17.77,132.9,1326.0,0.08474,0.07864,0.0869,0.07017,0.1812,0.05667,...,23.41,158.8,1956.0,0.1238,0.1866,0.2416,0.186,0.275,0.08902,0
2,19.69,21.25,130.0,1203.0,0.1096,0.1599,0.1974,0.1279,0.2069,0.05999,...,25.53,152.5,1709.0,0.1444,0.4245,0.4504,0.243,0.3613,0.08758,0
3,11.42,20.38,77.58,386.1,0.1425,0.2839,0.2414,0.1052,0.2597,0.09744,...,26.5,98.87,567.7,0.2098,0.8663,0.6869,0.2575,0.6638,0.173,0
4,20.29,14.34,135.1,1297.0,0.1003,0.1328,0.198,0.1043,0.1809,0.05883,...,16.67,152.2,1575.0,0.1374,0.205,0.4,0.1625,0.2364,0.07678,0


In [6]:
data.shape

(569, 31)

In [7]:
## Independent features and dependent features
X=data.iloc[:,:-1]  
y=data.iloc[:,-1]

In [8]:
## train test split 
from sklearn.model_selection import train_test_split

# Split the data into training and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

In [9]:
X_train.shape

(426, 30)

In [10]:
from sklearn.ensemble import AdaBoostClassifier
classifier=AdaBoostClassifier()

In [11]:
param_grid = {
    'n_estimators': [50, 100],
    'learning_rate': [0.1, 0.5, 1.0],
    'algorithm': ['SAMME', 'SAMME.R']
}

Explanation of hyperparameters:

`n_estimators`: The number of base estimators (weak learners) to be used in the ensemble. It represents the maximum number of boosting stages.
`learning_rate`: The learning rate shrinks the contribution of each weak learner. A smaller learning rate requires more weak learners to achieve the same level of performance.
`algorithm`: The algorithm to use for updating the weights of each classifier in the ensemble. 'SAMME.R' is the default and generally performs better.

* You can adjust the values in each list to include more options or refine the ranges based on your specific problem and dataset.

In [12]:
# Create the GridSearchCV object
from sklearn.model_selection import GridSearchCV

# make a GridSearchCV object
grid_search = GridSearchCV(estimator = classifier,
                  param_grid = param_grid,
                  scoring = "accuracy", #sklearn.metrics.SCORERS.keys()
                  cv = 5,
                  verbose = 1)

# Perform hyperparameter tuning on the classifier
grid_search.fit(X_train, y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[Parallel(n_jobs=1)]: Done  60 out of  60 | elapsed:    9.2s finished


GridSearchCV(cv=5, estimator=AdaBoostClassifier(),
             param_grid={'algorithm': ['SAMME', 'SAMME.R'],
                         'learning_rate': [0.1, 0.5, 1.0],
                         'n_estimators': [50, 100]},
             scoring='accuracy', verbose=1)

In [13]:
print(grid_search.best_params_)

{'algorithm': 'SAMME', 'learning_rate': 0.5, 'n_estimators': 50}


In [14]:
print(grid_search.best_score_)

0.9623803009575923


In [15]:
df = pd.DataFrame(grid_search.cv_results_)
df = df.sort_values("rank_test_score")
df

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algorithm,param_learning_rate,param_n_estimators,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
2,0.134589,0.029243,0.007972,0.003403,SAMME,0.5,50,"{'algorithm': 'SAMME', 'learning_rate': 0.5, '...",0.988372,0.941176,0.964706,0.988235,0.929412,0.96238,0.024025,1
3,0.207281,0.029451,0.009624,0.001565,SAMME,0.5,100,"{'algorithm': 'SAMME', 'learning_rate': 0.5, '...",0.988372,0.929412,0.964706,0.988235,0.941176,0.96238,0.024025,1
5,0.189769,0.029201,0.007675,0.002614,SAMME,1.0,100,"{'algorithm': 'SAMME', 'learning_rate': 1.0, '...",0.988372,0.917647,0.964706,1.0,0.929412,0.960027,0.032114,3
1,0.230092,0.01332,0.00986,0.003284,SAMME,0.1,100,"{'algorithm': 'SAMME', 'learning_rate': 0.1, '...",0.988372,0.941176,0.941176,0.988235,0.929412,0.957674,0.025375,4
4,0.103172,0.013393,0.005266,0.001252,SAMME,1.0,50,"{'algorithm': 'SAMME', 'learning_rate': 1.0, '...",0.988372,0.917647,0.941176,1.0,0.941176,0.957674,0.031242,4
7,0.180161,0.010384,0.011396,0.003926,SAMME.R,0.1,100,"{'algorithm': 'SAMME.R', 'learning_rate': 0.1,...",0.988372,0.917647,0.964706,0.988235,0.929412,0.957674,0.029417,4
11,0.180338,0.01558,0.009958,0.00218,SAMME.R,1.0,100,"{'algorithm': 'SAMME.R', 'learning_rate': 1.0,...",0.988372,0.917647,0.952941,0.988235,0.941176,0.957674,0.02747,4
9,0.174293,0.010894,0.010674,0.002244,SAMME.R,0.5,100,"{'algorithm': 'SAMME.R', 'learning_rate': 0.5,...",0.988372,0.929412,0.952941,0.988235,0.917647,0.955321,0.02923,8
6,0.09763,0.006226,0.006174,0.002571,SAMME.R,0.1,50,"{'algorithm': 'SAMME.R', 'learning_rate': 0.1,...",0.976744,0.941176,0.952941,0.976471,0.917647,0.952996,0.02238,9
8,0.09297,0.007104,0.005868,0.000912,SAMME.R,0.5,50,"{'algorithm': 'SAMME.R', 'learning_rate': 0.5,...",0.988372,0.929412,0.941176,0.988235,0.917647,0.952969,0.029795,10


In [16]:
##prediction
y_pred=grid_search.predict(X_test)

In [17]:
## accuracy score
from sklearn.metrics import accuracy_score,classification_report

In [18]:
score=accuracy_score(y_pred,y_test)
print(score)

0.9790209790209791


In [19]:
print(classification_report(y_pred,y_test))

              precision    recall  f1-score   support

           0       0.95      1.00      0.97        52
           1       1.00      0.97      0.98        91

    accuracy                           0.98       143
   macro avg       0.97      0.98      0.98       143
weighted avg       0.98      0.98      0.98       143



`AdaBoost` is sensitive to `outliers`, as it tries to iteratively fit the subsequent weak learners to the samples that were misclassified by previous weak learners. Outliers can have a significant impact on the training process, as they may be repeatedly misclassified and therefore receive more weight in subsequent iterations.