In [234]:
import pandas as pd
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import  KNeighborsClassifier
from sklearn.ensemble import VotingClassifier, BaggingClassifier, RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier, GradientBoostingClassifier
import pickle

In [235]:
data_breast_cancer = datasets.load_breast_cancer(as_frame=True).frame

In [236]:
X = data_breast_cancer.drop('target', axis=1)
y = data_breast_cancer['target']
X = X[['mean texture', 'mean symmetry']]

In [237]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.8, random_state=42)

In [238]:
# tree classifier
tree_clf = DecisionTreeClassifier()
tree_clf.fit(X_train, y_train)
tree_train_predict = tree_clf.predict(X_train)
tree_test_predict = tree_clf.predict(X_test)

tree_train_accuracy = accuracy_score(y_train, tree_train_predict)
tree_test_accuracy = accuracy_score(y_test, tree_test_predict)

print(tree_train_accuracy, tree_test_accuracy)


1.0 0.6228070175438597


In [239]:
# logistic regression
log_clf = LogisticRegression()
log_clf.fit(X_train, y_train)
log_train_predict = log_clf.predict(X_train)
log_test_predict = log_clf.predict(X_test)

log_train_accuracy = accuracy_score(y_train, log_train_predict)
log_test_accuracy = accuracy_score(y_test, log_test_predict)

print(log_train_accuracy, log_test_accuracy)

0.7230769230769231 0.7017543859649122


In [240]:
# k nearest neighbors
knn_clf = KNeighborsClassifier()
knn_clf.fit(X_train, y_train)
knn_train_predict = knn_clf.predict(X_train)
knn_test_predict = knn_clf.predict(X_test)

knn_train_accuracy = accuracy_score(y_train, knn_train_predict)
knn_test_accuracy = accuracy_score(y_test, knn_test_predict)

print(knn_train_accuracy, knn_test_accuracy)

0.7714285714285715 0.6403508771929824


In [241]:
# hard voting
hard_clf = VotingClassifier(estimators=[('lr', log_clf), ('tr', tree_clf), ('knn', knn_clf)], voting='hard')
hard_clf.fit(X_train, y_train)
hard_train_predict = hard_clf.predict(X_train)
hard_test_predict = hard_clf.predict(X_test)

hard_train_accuracy = accuracy_score(y_train, hard_train_predict)
hard_test_accuracy = accuracy_score(y_test, hard_test_predict)

print(hard_train_accuracy, hard_test_accuracy)

0.8351648351648352 0.7017543859649122


In [242]:
# soft voting
soft_clf = VotingClassifier(estimators=[('lr', log_clf), ('tr', tree_clf), ('knn', knn_clf)], voting='soft')
soft_clf.fit(X_train, y_train)
soft_train_predict = soft_clf.predict(X_train)
soft_test_predict = soft_clf.predict(X_test)

soft_train_accuracy = accuracy_score(y_train, soft_train_predict)
soft_test_accuracy = accuracy_score(y_test, soft_test_predict)

print(soft_train_accuracy, soft_test_accuracy)

0.9648351648351648 0.6666666666666666


In [243]:
print(log_train_accuracy, log_test_accuracy)
print(tree_train_accuracy, tree_test_accuracy)
print(knn_train_accuracy, knn_test_accuracy)
print(hard_train_accuracy, hard_test_accuracy)
print(soft_train_accuracy, soft_test_accuracy)

0.7230769230769231 0.7017543859649122
1.0 0.6228070175438597
0.7714285714285715 0.6403508771929824
0.8351648351648352 0.7017543859649122
0.9648351648351648 0.6666666666666666


In [244]:
accuracy_list = [(tree_train_accuracy, tree_test_accuracy),
                 (log_train_accuracy, log_test_accuracy),
                 (knn_train_accuracy, knn_test_accuracy),
                 (hard_train_accuracy, hard_test_accuracy),
                 (soft_train_accuracy, soft_test_accuracy)]
print(accuracy_list)

[(1.0, 0.6228070175438597), (0.7230769230769231, 0.7017543859649122), (0.7714285714285715, 0.6403508771929824), (0.8351648351648352, 0.7017543859649122), (0.9648351648351648, 0.6666666666666666)]


In [245]:
with open('acc_vote.pkl', 'wb') as file:
    pickle.dump(accuracy_list, file)

In [246]:
classificator_list = [tree_clf, log_clf, knn_clf, hard_clf, soft_clf]

In [247]:
with open('vote.pkl', 'wb') as file:
    pickle.dump(classificator_list, file)

In [274]:
# bagging classifier
bag_clf = BaggingClassifier(n_estimators=30, bootstrap=True)
bag_clf.fit(X_train, y_train)
bag_train_predict = bag_clf.predict(X_train)
bag_test_predict = bag_clf.predict(X_test)

bag_train_accuracy = accuracy_score(y_train, bag_train_predict)
bag_test_accuracy = accuracy_score(y_test, bag_test_predict)

print(bag_train_accuracy, bag_test_accuracy)

1.0 0.956140350877193


In [282]:
# bagging 50% instances
bag05_clf = BaggingClassifier(n_estimators=30, max_samples=0.5, bootstrap=True)
bag05_clf.fit(X_train, y_train)
bag05_train_predict = bag05_clf.predict(X_train)
bag05_test_predict = bag05_clf.predict(X_test)

bag05_train_accuracy = accuracy_score(y_train, bag05_train_predict)
bag05_test_accuracy = accuracy_score(y_test, bag05_test_predict)

print(bag05_train_accuracy, bag05_test_accuracy)

0.9912087912087912 0.9649122807017544


In [281]:
# pasting classifier
pasting_clf = BaggingClassifier(n_estimators=30, bootstrap=False)
pasting_clf.fit(X_train, y_train)
pasting_train_predict = pasting_clf.predict(X_train)
pasting_test_predict = pasting_clf.predict(X_test)

pasting_train_accuracy = accuracy_score(y_train, pasting_train_predict)
pasting_test_accuracy = accuracy_score(y_test, pasting_test_predict)

print(pasting_train_accuracy, pasting_test_accuracy)

1.0 0.9385964912280702


In [283]:
# pasting 50% instances
pasting05_clf = BaggingClassifier(n_estimators=30, max_samples=0.5, bootstrap=False)
pasting05_clf.fit(X_train, y_train)
pasting05_train_predict = pasting05_clf.predict(X_train)
pasting05_test_predict = pasting05_clf.predict(X_test)

pasting05_train_accuracy = accuracy_score(y_train, pasting05_train_predict)
pasting05_test_accuracy = accuracy_score(y_test, pasting05_test_predict)

print(pasting05_train_accuracy, pasting05_test_accuracy)

0.9978021978021978 0.956140350877193


In [252]:
# random forrest classifier
rnd_clf = RandomForestClassifier(n_estimators=30)
rnd_clf.fit(X_train, y_train)
rnd_train_predict = rnd_clf.predict(X_train)
rnd_test_predict = rnd_clf.predict(X_test)

rnd_train_accuracy = accuracy_score(y_train, rnd_train_predict)
rnd_test_accuracy = accuracy_score(y_test, rnd_test_predict)

print(rnd_train_accuracy, rnd_test_accuracy)

0.9978021978021978 0.6754385964912281


In [253]:
# adaBoost classifier
ada_clf = AdaBoostClassifier(n_estimators=30)
ada_clf.fit(X_train, y_train)
ada_train_predict = ada_clf.predict(X_train)
ada_test_predict = ada_clf.predict(X_test)

ada_train_accuracy = accuracy_score(y_train, ada_train_predict)
ada_test_accuracy = accuracy_score(y_test, ada_test_predict)

print(ada_train_accuracy, ada_test_accuracy)

0.8 0.7368421052631579


In [254]:
# gradient classifier
gradient_clf = GradientBoostingClassifier(n_estimators=30)
gradient_clf.fit(X_train, y_train)
gradient_train_predict = gradient_clf.predict(X_train)
gradient_test_predict = gradient_clf.predict(X_test)

gradient_train_accuracy = accuracy_score(y_train, gradient_train_predict)
gradient_test_accuracy = accuracy_score(y_test, gradient_test_predict)

print(gradient_train_accuracy, gradient_test_accuracy)

0.8373626373626374 0.7105263157894737


In [255]:
bagging_accuracy_list = [
    (bag_train_accuracy, bag_test_accuracy),
    (bag05_train_accuracy, bag05_test_accuracy),
    (pasting_train_accuracy, pasting_test_accuracy),
    (pasting05_train_accuracy, pasting05_test_accuracy),
    (rnd_train_accuracy, rnd_test_accuracy),
    (ada_train_accuracy, ada_test_accuracy),
    (gradient_train_accuracy, gradient_test_accuracy)]

In [256]:
print(bagging_accuracy_list)

[(0.9956043956043956, 0.6929824561403509), (0.9846153846153847, 0.6403508771929824), (1.0, 0.6140350877192983), (1.0, 0.6228070175438597), (0.9978021978021978, 0.6754385964912281), (0.8, 0.7368421052631579), (0.8373626373626374, 0.7105263157894737)]


In [257]:
with open('acc_bag.pkl', 'wb') as file:
    pickle.dump(bagging_accuracy_list, file)

In [284]:
bagging_classificator_list = [bag_clf, bag05_clf, pasting_clf, pasting05_clf, rnd_clf,
                               ada_clf, gradient_clf]

In [285]:
print(bagging_classificator_list)

[BaggingClassifier(n_estimators=30), BaggingClassifier(max_samples=0.5, n_estimators=30), BaggingClassifier(bootstrap=False, n_estimators=30), BaggingClassifier(bootstrap=False, max_samples=0.5, n_estimators=30), RandomForestClassifier(n_estimators=30), AdaBoostClassifier(n_estimators=30), GradientBoostingClassifier(n_estimators=30)]


In [260]:
with open('bag.pkl', 'wb') as file:
    pickle.dump(bagging_classificator_list, file)

In [261]:
data_breast_cancer = datasets.load_breast_cancer(as_frame=True).frame
X = data_breast_cancer.drop('target', axis=1)
y = data_breast_cancer['target']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [262]:
pasting2_clf = BaggingClassifier(n_estimators=30, max_samples=0.5, max_features=2, bootstrap=False)
pasting2_clf.fit(X_train, y_train)
pasting2_train_predict = pasting2_clf.predict(X_train)
pasting2_test_predict = pasting2_clf.predict(X_test)

pasting2_train_accuracy = accuracy_score(y_train, pasting2_train_predict)
pasting2_test_accuracy = accuracy_score(y_test, pasting2_test_predict)

print(pasting2_train_accuracy, pasting2_test_accuracy)

1.0 0.956140350877193


In [263]:
pasting2_accuracy_list = [pasting2_train_accuracy, pasting2_test_accuracy]

In [264]:
with open('acc_fea.pkl', 'wb') as file:
    pickle.dump(pasting2_accuracy_list, file)

In [265]:
pasting2_clf_list = [pasting2_clf]

In [266]:
with open('fea.pkl', 'wb') as file:
    pickle.dump(pasting2_clf_list, file)

In [268]:
my_clf = BaggingClassifier()
my_clf.fit(X_train, y_train)
train_accuracy = []
test_accuracy = []
feature_names = []
for estimator in my_clf.estimators_:
    clf = BaggingClassifier(estimator, max_features=2)
    clf.fit(X_train, y_train)
    clf_train_predict = clf.predict(X_train)
    clf_test_predict = clf.predict(X_test)

    clf_train_accuracy = accuracy_score(y_train, clf_train_predict)
    clf_test_accuracy = accuracy_score(y_test, clf_test_predict)
    train_accuracy.append(clf_train_accuracy)
    test_accuracy.append(clf_test_accuracy)
    tmp_feature_names = []
    for name in clf.estimators_features_:
        tmp_tmp_feature_names = []
        for element in name:
            tmp_tmp_feature_names.append(X.columns[element])
        tmp_feature_names.append(tmp_tmp_feature_names)
    feature_names.append(tmp_feature_names)

In [269]:
df = pd.DataFrame({'train_accuracy': train_accuracy,
                   'test_accuracy': test_accuracy,
                   'features': feature_names})

In [270]:
df = df.sort_values(by=['test_accuracy', 'train_accuracy'], ascending=False)
df

Unnamed: 0,train_accuracy,test_accuracy,features
2,1.0,0.973684,"[[worst texture, worst smoothness], [mean symm..."
8,1.0,0.964912,"[[worst compactness, worst concavity], [worst ..."
9,1.0,0.964912,"[[compactness error, mean area], [mean concavi..."
3,0.997802,0.95614,"[[worst symmetry, mean radius], [worst texture..."
7,1.0,0.947368,"[[smoothness error, mean perimeter], [compactn..."
4,1.0,0.938596,"[[mean concavity, concave points error], [symm..."
6,0.995604,0.938596,"[[mean texture, mean symmetry], [mean compactn..."
0,1.0,0.929825,"[[worst smoothness, texture error], [mean conc..."
5,0.995604,0.921053,"[[concavity error, mean area], [compactness er..."
1,0.995604,0.850877,"[[smoothness error, worst fractal dimension], ..."


In [271]:
df.to_pickle('acc_fea_rank.pkl')