In [1]:
import pandas as pd
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# import models
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import BaggingClassifier

# Single seed shared by every stochastic step in this cell.
SEED = 1

# Read the raw table straight from the course asset URL (needs network access).
wbc = pd.read_csv("https://assets.datacamp.com/production/repositories/1796/datasets/0eb6987cb9633e4d6aa6cfd11e00993d2387caa4/wbc.csv")

# Drop the two columns that are not used as features.
# NOTE(review): 'id' is presumably a row identifier and 'Unnamed: 32' an empty
# artifact column of the CSV -- confirm against the raw file.
wbc_df = wbc.drop(columns=['id', 'Unnamed: 32'])

# One-hot encode the non-numeric columns; with drop_first=True the diagnosis
# label ends up as the single indicator column 'diagnosis_M' used below.
wbc_df = pd.get_dummies(wbc_df, drop_first=True)

# Feature matrix = everything except the label indicator; target = the indicator.
X = wbc_df.drop(columns='diagnosis_M').values
y = wbc_df['diagnosis_M'].values


# Hold out 30% of the rows as a test set. Stratifying on y keeps the class
# proportions the same in both partitions, and the fixed seed makes the
# split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=SEED,
)

# Base learner: a shallow, heavily regularised tree. A float min_samples_leaf
# is interpreted by scikit-learn as a fraction of the training samples.
dt = DecisionTreeClassifier(max_depth=4,
                            min_samples_leaf=0.16,
                            random_state=SEED)

# Bagging ensemble of 300 such trees.
# * The base learner is passed positionally: its keyword was renamed from
#   `base_estimator` to `estimator` in scikit-learn 1.2 (old name removed in
#   1.4), while the first positional slot works on both old and new releases.
# * oob_score=True makes the fitted model expose `oob_score_`, the accuracy
#   estimated on the samples each tree did not see during bootstrapping.
# * n_jobs=-1 uses all available cores.
# * random_state seeds the bootstrap sampling so the reported scores are
#   reproducible from run to run.
bc = BaggingClassifier(dt,
                       n_estimators=300,
                       oob_score=True,
                       n_jobs=-1,
                       random_state=SEED)

bc.fit(X_train, y_train)

# Accuracy on the held-out test split.
y_pred = bc.predict(X_test)
test_accuracy = accuracy_score(y_test, y_pred)

# Out-of-bag accuracy recorded by the ensemble during fitting.
oob_accuracy = bc.oob_score_

# Report both figures, one per line, rounded to three decimals.
print('Test set accuracy: {:.3f}'.format(test_accuracy),
      'OOB accuracy: {:.3f}'.format(oob_accuracy),
      sep='\n')




Test set accuracy: 0.936
OOB accuracy: 0.922
