In [1]:
from sklearn.datasets import load_breast_cancer

In [2]:
data = load_breast_cancer()
X = data.data
y = data.target

In [3]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

In [4]:
from sklearn.tree import DecisionTreeClassifier
tree_clf = DecisionTreeClassifier(random_state=42)
tree_clf.fit(X_train, y_train)

DecisionTreeClassifier(random_state=42)

In [5]:
y_pred = tree_clf.predict(X_test)

In [6]:
from sklearn.metrics import accuracy_score
accuracy_score(y_test, y_pred)

0.9122807017543859

In [8]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier

bag_clf = BaggingClassifier(
    DecisionTreeClassifier(random_state=42),
    n_estimators=500,
    max_samples=200,
    bootstrap=True
)
bag_clf.fit(X_train, y_train)

BaggingClassifier(base_estimator=DecisionTreeClassifier(random_state=42),
                  max_samples=200, n_estimators=500)

In [9]:
y_preds = bag_clf.predict(X_test)
accuracy_score(y_test, y_preds)

0.9473684210526315

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_clf = RandomForestClassifier(
    n_estimators=500,
    bootstrap=True,
    oob_score=True,
    warm_start=True,
    max_features=10
)
rf_clf.fit(X_train, y_train)

RandomForestClassifier(max_features=10, n_estimators=500, oob_score=True,
                       warm_start=True)

In [11]:
y_preds = rf_clf.predict(X_test)

In [12]:
accuracy_score(y_test, y_preds)

0.956140350877193

In [13]:
rf_clf.oob_score_

0.9626373626373627

In [14]:
# Ada Boosting
from sklearn.ensemble import AdaBoostClassifier
ada_clf = AdaBoostClassifier(
    DecisionTreeClassifier(max_depth=1),
    n_estimators=200,
    algorithm="SAMME.R",
    learning_rate=0.5
)
ada_clf.fit(X_train, y_train)

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=1),
                   learning_rate=0.5, n_estimators=200)

In [15]:
y_preds = ada_clf.predict(X_test)
accuracy_score(y_test, y_preds)

0.9649122807017544

In [16]:
# Gradient Boosting
from sklearn.ensemble import GradientBoostingClassifier
gb_clf = GradientBoostingClassifier(
    n_estimators=200,
    max_depth=3,
    learning_rate=0.1,
)
gb_clf.fit(X_train, y_train)

GradientBoostingClassifier(n_estimators=200)

In [17]:
y_pred = gb_clf.predict(X_test)
accuracy_score(y_test, y_pred)

0.956140350877193

In [18]:
import xgboost as xgb # pip install xgboost 
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=10_000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

xgb_clf = xgb.XGBClassifier(
    objective="binary:logistic",
    n_estimators=100,
    seed=42
)
xgb_clf.fit(X_train, y_train)

  from pandas import MultiIndex, Int64Index




XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, enable_categorical=False,
              gamma=0, gpu_id=-1, importance_type=None,
              interaction_constraints='', learning_rate=0.300000012,
              max_delta_step=0, max_depth=6, min_child_weight=1, missing=nan,
              monotone_constraints='()', n_estimators=100, n_jobs=8,
              num_parallel_tree=1, predictor='auto', random_state=42,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=42,
              subsample=1, tree_method='exact', validate_parameters=1,
              verbosity=None)

In [19]:
y_preds = xgb_clf.predict(X_test)
print(roc_auc_score(y_test, y_preds))

0.9356266825605721


In [20]:
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)

params = {
    "booster": "gbtree",
    "objective": "binary:logistic"
}
xgb_clf = xgb.train(params, DM_train, num_boost_round=100)

y_preds = xgb_clf.predict(DM_test)
print(roc_auc_score(y_test, y_preds))
print(y_preds)

0.9730230288290378
[0.9971207  0.99966013 0.9947634  ... 0.9352906  0.00544361 0.01549272]


In [21]:
params = {
    "booster": "gbtree",
    "objective": "binary:logistic",
    "colsample_bytree": 0.3,
    "learning_rate": 0.1,
    "max_depth": 5
}

tuned_xgb = xgb.cv(
    dtrain=DM_train,
    params=params,
    nfold=4,
    num_boost_round=100,
    metrics="auc",
    as_pandas = True,
    seed=42
)

In [22]:
tuned_xgb

Unnamed: 0,train-auc-mean,train-auc-std,test-auc-mean,test-auc-std
0,0.902267,0.002596,0.892293,0.007576
1,0.933166,0.027712,0.922706,0.030534
2,0.942090,0.021901,0.929510,0.028449
3,0.943636,0.022171,0.929381,0.029189
4,0.952096,0.022050,0.935694,0.030761
...,...,...,...,...
95,0.994662,0.000388,0.970763,0.001341
96,0.994750,0.000395,0.970720,0.001322
97,0.994837,0.000396,0.970669,0.001345
98,0.994930,0.000377,0.970718,0.001318
