In [1]:
import pandas as pd
import numpy as np
import pickle

In [2]:
from sklearn import datasets
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

## 2. Data preparation

In [14]:
from sklearn.model_selection import train_test_split
X = data_breast_cancer['data'][['mean texture', 'mean symmetry']]
y = data_breast_cancer['target'].astype(np.uint8)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

## 3.2 - 3.3 Hard/soft ensemble

In [4]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.metrics import accuracy_score

tree_clf = DecisionTreeClassifier()
log_clf = LogisticRegression()
knn_clf = KNeighborsClassifier()

voting_clf_hard = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('knn', knn_clf)],
    voting='hard')

voting_clf_soft = VotingClassifier(
    estimators=[('lr', log_clf), ('dt', tree_clf), ('knn', knn_clf)],
    voting='soft')

## 3.4 Saving results to pickle

In [5]:
accuracy_list = []

for clf in (tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft):
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    accuracy_list.append((accuracy_score(y_train, y_pred_train), accuracy_score(y_test, y_pred_test)))

print(accuracy_list)
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(accuracy_list, f)

list_of_clf = [tree_clf, log_clf, knn_clf, voting_clf_hard, voting_clf_soft]
with open('vote.pkl', 'wb') as f:
    pickle.dump(list_of_clf, f)

[(1.0, 0.6666666666666666), (0.7186813186813187, 0.7192982456140351), (0.7802197802197802, 0.631578947368421), (0.8571428571428571, 0.7105263157894737), (0.9714285714285714, 0.6842105263157895)]


## 3.5 - 3.6 Bagging and Pasting

In [6]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30)
bag_50_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5)
past_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False)
past_50_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=False, max_samples=0.5)
rnd_clf = RandomForestClassifier(n_estimators=30)
ada_clf = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=30)
gbrt_clf = GradientBoostingClassifier(n_estimators=30)

list_of_bagging = []

for clf in (bag_clf, bag_50_clf, past_clf, past_50_clf, rnd_clf, ada_clf, gbrt_clf):
    clf.fit(X_train, y_train)
    y_pred_train = clf.predict(X_train)
    y_pred_test = clf.predict(X_test)
    list_of_bagging.append((accuracy_score(y_train, y_pred_train), 
                            accuracy_score(y_test, y_pred_test)))

print(list_of_bagging)

with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(list_of_bagging, f)

list_of_bag_clf = [bag_clf, bag_50_clf, past_clf, past_50_clf, rnd_clf, ada_clf, gbrt_clf]
with open('bag.pkl', 'wb') as f:
    pickle.dump(list_of_bag_clf, f)

[(1.0, 0.6842105263157895), (0.9296703296703297, 0.7105263157894737), (1.0, 0.6754385964912281), (0.9626373626373627, 0.7017543859649122), (0.9978021978021978, 0.6842105263157895), (1.0, 0.6842105263157895), (0.8197802197802198, 0.7280701754385965)]


## 3.7 - 3.8 Sampling of 2 features

In [15]:
result = []
bagging_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, max_samples=0.5, max_features=2, bootstrap=True, bootstrap_features=True)
bagging_clf.fit(X_train, y_train)

y_pred_train = bagging_clf.predict(X_train)
y_pred_test = bagging_clf.predict(X_test)

result.append(accuracy_score(y_train, y_pred_train))
result.append(accuracy_score(y_test, y_pred_test))

with open("acc_fea.pkl", 'wb') as f:
    pickle.dump(result, f)

with open("fea.pkl", 'wb') as f:
    pickle.dump([bagging_clf], f)

## 3.9 Ranking features accuracy

In [13]:
acc_train_score = []
acc_test_score = []
feature_names = []

for estimator, features in zip(bagging_clf.estimators_, bagging_clf.estimators_features_):
    y_pred_train = estimator.predict(X_train.iloc[:, features])
    y_pred_test = estimator.predict(X_test.iloc[:, features])

    acc_train_score.append(accuracy_score(y_train, y_pred_train))
    acc_test_score.append(accuracy_score(y_test, y_pred_test))
    feature_names.append(features)

df = pd.DataFrame({
    'train_acc_score': acc_train_score, 'test_acc_score': acc_test_score, 'feature_names': feature_names})

df_sorted = df.sort_values(by=['test_acc_score', 'train_acc_score'], ascending=False)
df_sorted.to_pickle('acc_fea_rank.pkl')
df_sorted



Unnamed: 0,train_acc_score,test_acc_score,feature_names
5,0.815385,0.745614,"[1, 0]"
10,0.789011,0.72807,"[0, 1]"
14,0.813187,0.719298,"[0, 1]"
2,0.802198,0.701754,"[1, 0]"
9,0.817582,0.692982,"[0, 1]"
19,0.782418,0.692982,"[0, 1]"
23,0.813187,0.684211,"[1, 0]"
4,0.789011,0.675439,"[1, 0]"
26,0.756044,0.675439,"[0, 0]"
28,0.793407,0.666667,"[1, 0]"
