In [28]:
#imports
import numpy as np
import pandas as pd

from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix
from sklearn.ensemble import AdaBoostClassifier

In [29]:
# Tunable settings shared by every cell below.
random_seed = 25            # fixed seed so the split and the models are reproducible
test_data_fraction = 0.10   # share of the rows held out as the test set

In [30]:
# Load the breast-cancer dataset into a single DataFrame (feature columns followed by
# a trailing "target" column), then hold out a fraction of the rows as the test set.
bc_sk = datasets.load_breast_cancer()
bc_data = pd.DataFrame(
    data=np.column_stack((bc_sk["data"], bc_sk["target"])),
    columns=[*bc_sk["feature_names"], "target"],
)
bc_features = bc_data.iloc[:, :-1]   # every column except the last ("target")
bc_labels = bc_data["target"]
X_train, X_test, Y_train, Y_test = train_test_split(
    bc_features,
    bc_labels,
    test_size=test_data_fraction,
    random_state=random_seed,
)


In [31]:
# Fit a decision tree (gini impurity, max_depth left unset) on the training split,
# then predict labels for the held-out test rows.
gini_tree = DecisionTreeClassifier(criterion="gini", random_state=random_seed)
gini_tree.fit(X_train, Y_train)
predicted_y = gini_tree.predict(X_test)

In [32]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
confusion_matrix(y_true=Y_test, y_pred=predicted_y)

array([[17,  3],
       [ 3, 34]], dtype=int64)

In [33]:
# classification_report expects (y_true, y_pred); swapping them exchanges precision and recall
# and reports the predicted-class counts as "support".
print(classification_report(y_true=Y_test, y_pred=predicted_y))

              precision    recall  f1-score   support

         0.0       0.85      0.85      0.85        20
         1.0       0.92      0.92      0.92        37

    accuracy                           0.89        57
   macro avg       0.88      0.88      0.88        57
weighted avg       0.89      0.89      0.89        57



In [35]:
# Boost 20 weak learners with AdaBoost and predict the test rows.
# No base learner is supplied, so AdaBoost falls back to its default: a gini tree with max depth = 1.
bdt = AdaBoostClassifier(n_estimators=20, random_state=random_seed).fit(X_train, Y_train)
predicted_y = bdt.predict(X_test)

In [36]:
# confusion_matrix expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
confusion_matrix(y_true=Y_test, y_pred=predicted_y)

array([[18,  2],
       [ 1, 36]], dtype=int64)

In [37]:
print(classification_report(predicted_y,Y_test))

              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92        19
         1.0       0.97      0.95      0.96        38

    accuracy                           0.95        57
   macro avg       0.94      0.95      0.94        57
weighted avg       0.95      0.95      0.95        57



In [39]:
# Build a gini tree with max depth = 1 and hand it to AdaBoost explicitly as the weak learner,
# then fit on the training split and predict the test rows.
gini_tree = DecisionTreeClassifier(criterion="gini", max_depth=1, random_state=random_seed)
# The weak learner is passed positionally on purpose: its keyword was renamed from
# `base_estimator` to `estimator` in scikit-learn 1.2 (the old name was removed in 1.4),
# but it is the first positional parameter under either name.
bdt = AdaBoostClassifier(gini_tree, n_estimators=20, random_state=random_seed)
bdt.fit(X_train, Y_train)
predicted_y = bdt.predict(X_test)

In [42]:
# The explicitly supplied depth-1 gini tree yields the same counts as AdaBoost's default weak learner.
# confusion_matrix expects (y_true, y_pred): rows are the actual classes, columns the predicted ones.
confusion_matrix(y_true=Y_test, y_pred=predicted_y)

array([[18,  2],
       [ 1, 36]], dtype=int64)

In [43]:
print(classification_report(predicted_y,Y_test))

              precision    recall  f1-score   support

         0.0       0.90      0.95      0.92        19
         1.0       0.97      0.95      0.96        38

    accuracy                           0.95        57
   macro avg       0.94      0.95      0.94        57
weighted avg       0.95      0.95      0.95        57

