In [1]:
from sklearn import datasets
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)

In [2]:
X_train, X_test, y_train, y_test = train_test_split(data_breast_cancer['data'][['mean texture', 'mean symmetry']],
                                                    data_breast_cancer['target'],
                                                    test_size=0.2, 
                                                    random_state=42)

In [3]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)
kn = KNeighborsClassifier()
kn.fit(X_train, y_train)

In [4]:
from sklearn.ensemble import VotingClassifier
voting_hard_clf = VotingClassifier(estimators = [('lr', log_reg),('dt', tree_clf),('kn', kn)], voting='hard')
voting_soft_clf = VotingClassifier(estimators = [('lr', log_reg),('dt', tree_clf),('kn', kn)], voting='soft')
voting_hard_clf.fit(X_train, y_train)
voting_soft_clf.fit(X_train, y_train)

In [5]:
import pickle
from sklearn.metrics import accuracy_score
list = []
list2 = []
for clf in (tree_clf, log_reg, kn, voting_hard_clf, voting_soft_clf):
    y_train_score = accuracy_score(y_train, clf.predict(X_train))
    y_test_score = accuracy_score(y_test, clf.predict(X_test))
    list.append([y_train_score, y_test_score])
    list2.append(clf)

print(list)

with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(list, f)

with open('vote.pkl', 'wb') as f:
    pickle.dump(list2, f)


[[1.0, 0.6140350877192983], [0.7230769230769231, 0.7017543859649122], [0.7714285714285715, 0.6403508771929824], [0.8351648351648352, 0.7017543859649122], [0.9648351648351648, 0.6666666666666666]]


In [6]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
bag_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, bootstrap=True, random_state = 42).fit(X_train, y_train)
bag_clf05 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, bootstrap=True, max_samples = 0.5, random_state = 42).fit(X_train, y_train)
paste_clf = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, bootstrap=False, random_state = 42).fit(X_train, y_train)
paste_clf05 = BaggingClassifier(estimator=DecisionTreeClassifier(), n_estimators=30, bootstrap=False, max_samples = 0.5, random_state = 42).fit(X_train, y_train)
rforest_clf = RandomForestClassifier(n_estimators=30, random_state=42).fit(X_train, y_train)
ab_clf = AdaBoostClassifier(n_estimators=30, random_state=42).fit(X_train, y_train)
gb_clf = GradientBoostingClassifier(n_estimators=30).fit(X_train, y_train)



In [7]:
list3 = []
list4 = []
for clf in (bag_clf, bag_clf05, paste_clf, paste_clf05, rforest_clf, ab_clf, gb_clf):
    y_train_score = accuracy_score(y_train, clf.predict(X_train))
    y_test_score = accuracy_score(y_test, clf.predict(X_test))
    list3.append([y_train_score, y_test_score])
    list4.append(clf)

print(list3)

with open('acc_bag.pkl', 'wb') as f:
    pickle.dump(list3, f)

with open('bag.pkl', 'wb') as f:
    pickle.dump(list4, f)

[[0.9956043956043956, 0.6754385964912281], [0.9296703296703297, 0.6842105263157895], [1.0, 0.6228070175438597], [0.9736263736263736, 0.6491228070175439], [0.9956043956043956, 0.6754385964912281], [0.8, 0.7368421052631579], [0.8373626373626374, 0.7105263157894737]]


In [8]:
X_train2, X_test2, y_train2, y_test2 = train_test_split(data_breast_cancer.data, data_breast_cancer.target, test_size=0.2)
bag_clf2 = BaggingClassifier(n_estimators=30, bootstrap_features=False, bootstrap = True, max_features = 2, max_samples = 0.5).fit(X_train2, y_train2)

In [9]:
y_train_score2 = accuracy_score(y_train2, bag_clf2.predict(X_train2))
y_test_score2 = accuracy_score(y_test2, bag_clf2.predict(X_test2))
print(y_train_score2, y_test_score2)

0.9912087912087912 0.9298245614035088


In [10]:
list6 = [y_train_score2, y_test_score2]
list5 = [bag_clf2]
print(list5)
print(list6)

[BaggingClassifier(max_features=2, max_samples=0.5, n_estimators=30)]
[0.9912087912087912, 0.9298245614035088]


In [11]:
with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(list6, f)

with open('fea.pkl', 'wb') as f:
    pickle.dump(list5, f)

In [12]:
bagr = BaggingClassifier(n_estimators=30, bootstrap_features=False, bootstrap = True, max_features = 2, max_samples = 0.5).fit(X_train2, y_train2)
col1 = []
col2 = []
col3 = []
for i in range(len(bagr.estimators_)):
    train_score2 = accuracy_score(y_train2, bagr.estimators_[i].predict(X_train2.iloc[:, bagr.estimators_features_[i]]))
    test_score2 = accuracy_score(y_test2, bagr.estimators_[i].predict(X_test2.iloc[:, bagr.estimators_features_[i]]))
    col1.append(train_score2)
    col2.append(test_score2)
    col3.append(bagr.estimators_features_[i])

data = {'Train accuracy': col1,
        'Test accuracy': col2,
       'Feature names': col3}


df = pd.DataFrame(data)
df = df.sort_values(by=['Test accuracy', 'Train accuracy'], ascending=False)
df.head()



Unnamed: 0,Train accuracy,Test accuracy,Feature names
12,0.951648,0.938596,"[22, 28]"
23,0.947253,0.929825,"[20, 3]"
0,0.940659,0.929825,"[22, 7]"
10,0.927473,0.929825,"[22, 8]"
13,0.936264,0.921053,"[7, 20]"


In [13]:
df.to_pickle('acc_fea_rank.pkl')