In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

## 2. Data preparation

In [3]:
from sklearn.model_selection import train_test_split
X = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y = data_breast_cancer['target'].astype(np.uint8)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 3.2 - 3.3 Hard/soft ensemble

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

tree_clf = DecisionTreeClassifier()
log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()

voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('knn', knn_clf)],
    voting='hard')

voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('knn', knn_clf)],
    voting='soft')

## 3.4 Saving results to pickle

In [5]:
accuracy_list = []

for clf in (tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft):
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    accuracy_list.append((accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)))

print(accuracy_list)
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(accuracy_list, f)

list_of_clf = [tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft]
with open('vote.pkl', 'wb') as f:
    pickle.dump(list_of_clf, f)

[(1.0, 0.6666666666666666), (0.6923076923076923, 0.7456140350877193), (0.7516483516483516, 0.7192982456140351), (0.8373626373626374, 0.7543859649122807), (0.967032967032967, 0.7192982456140351)]


## 3.5 - 3.6 Bagging and Pasting

In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30)
bag_50_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5)
past_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False)
past_50_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, max_samples=0.5)
rnd_clf = RandomForestClassifier(n_estimators=30)
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=30)
gbrt_clf = GradientBoostingClassifier(n_estimators=30)

list_of_bagging = []

for clf in (bag_clf, bag_50_clf, past_clf, past_50_clf, rnd_clf, ada_clf, gbrt_clf):
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    list_of_bagging.append((accuracy_score(y_train, y_pred_train), 
                            accuracy_score(y_test, y_pred_test)))

print(list_of_bagging)

with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(list_of_bagging, f)

list_of_bag_clf = [bag_clf, bag_50_clf, past_clf, past_50_clf, rnd_clf, ada_clf, gbrt_clf]
with open('bag.pkl', 'wb') as f:
    pickle.dump(list_of_bag_clf, f)

[(0.9912087912087912, 0.7368421052631579), (0.9032967032967033, 0.7368421052631579), (1.0, 0.6578947368421053), (0.9582417582417583, 0.7719298245614035), (1.0, 0.7456140350877193), (1.0, 0.6842105263157895), (0.8087912087912088, 0.7456140350877193)]


## 3.7 - 3.8 Sampling of 2 features

In [10]:
tree = DecisionTreeClassifier()
bagging_clf = BaggingClassifier(n_estimators=30, 
                            max_samples=0.5, 
                            max_features=2, 
                            bootstrap=True)
X_train_1, X_test_1, y_train_1, y_test_1 = train_test_split(data_breast_cancer.data, data_breast_cancer.target, test_size=0.2)
bagging_clf.fit(X_train_1, y_train_1)

bag_acc = [accuracy_score(y_train_1, bagging_clf.predict(X_train_1)), accuracy_score(y_test_1, bagging_clf.predict(X_test_1))]

with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(bag_acc, f)
    
with open('fea.pkl', 'wb') as f:
    pickle.dump([bagging_clf], f)
print(bagging_clf)
bag_acc

BaggingClassifier(max_features=2, max_samples=0.5, n_estimators=30)


[0.9978021978021978, 0.9210526315789473]

## 3.9 Ranking features accuracy

In [9]:
estimators = bagging_clf.estimators_
estimator_features = bagging_clf.estimators_features_

acc_train_scores = []
acc_test_scores = []
features_names = []

for estimator, features in zip(estimators, estimator_features):
    acc_train_scores.append(accuracy_score(y_train_1, estimator.predict(X_train_1.iloc[:, features])))
    acc_test_scores.append(accuracy_score(y_test_1, estimator.predict(X_test_1.iloc[:, features])))
    features_names.append(features)
    
df = pd.DataFrame({'train': acc_train_scores, 'test': acc_test_scores, 'features': features_names})
df.sort_values(by=['test', 'train'], ascending=False, inplace=True)
df.to_pickle('acc_fea_rank.pkl')
df



Unnamed: 0,train,test,features
1,0.945055,0.929825,"[28, 20]"
4,0.945055,0.921053,"[0, 20]"
14,0.936264,0.921053,"[28, 3]"
7,0.92967,0.912281,"[20, 25]"
13,0.927473,0.903509,"[18, 20]"
12,0.898901,0.885965,"[2, 14]"
25,0.901099,0.877193,"[10, 13]"
17,0.940659,0.868421,"[29, 27]"
22,0.925275,0.868421,"[6, 16]"
24,0.936264,0.842105,"[2, 21]"
