In [1]:
import numpy as np
import pandas as pd
import pickle
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import VotingClassifier
from sklearn.ensemble import BaggingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

In [2]:
data_breast = datasets.load_breast_cancer(as_frame=True)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(data_breast.data,data_breast.target,test_size=0.2,random_state=42)

In [4]:
X_train2 = X_train[['mean texture', 'mean symmetry']]
X_test2 = X_test[['mean texture', 'mean symmetry']]

In [5]:
log_clf = LogisticRegression()
tree_clf = DecisionTreeClassifier()
knn_clf = KNeighborsClassifier()

hard_clf = VotingClassifier(
    estimators=[('lr', log_clf),
                ('tr', tree_clf),
                ('knn', knn_clf)],
    voting='hard')

soft_clf = VotingClassifier(
    estimators=[('lr', log_clf),
                ('tr', tree_clf),
                ('knn', knn_clf)],
    voting='soft')

In [6]:
classifiers = [tree_clf,log_clf,knn_clf,hard_clf,soft_clf]
acc = []
for clf in classifiers:
    clf.fit(X_train2, y_train)
    first_val = accuracy_score(y_train, clf.predict(X_train2))
    second_val = accuracy_score(y_test, clf.predict(X_test2))
    acc.append((first_val,second_val))

In [7]:
acc

[(1.0, 0.6228070175438597),
 (0.7230769230769231, 0.7017543859649122),
 (0.7714285714285715, 0.6403508771929824),
 (0.8351648351648352, 0.7017543859649122),
 (0.9648351648351648, 0.6754385964912281)]

In [8]:
with open('acc_vote.pkl','wb') as f:
    pickle.dump(acc,f)

In [9]:
with open('vote.pkl','wb') as f:
    pickle.dump(classifiers,f)

In [10]:
bag_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30,
                            bootstrap=True)
bag50_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30,
                              max_samples=0.5, bootstrap=True)
pas_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30,
                            bootstrap=False)
pas50_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30,
                              max_samples=0.5, bootstrap=False)
rnd_clf = RandomForestClassifier(n_estimators=30)
ada_clf = AdaBoostClassifier(n_estimators=30)
gbc_clf = GradientBoostingClassifier(n_estimators=30)

In [11]:
classif = [bag_clf,bag50_clf,pas_clf,pas50_clf,rnd_clf,ada_clf,gbc_clf]
accurac = []
for clf in classif:
    clf.fit(X_train2, y_train)
    first_val = accuracy_score(y_train, clf.predict(X_train2))
    second_val = accuracy_score(y_test, clf.predict(X_test2))
    accurac.append((first_val,second_val))

In [12]:
accurac

[(0.9978021978021978, 0.6228070175438597),
 (0.9362637362637363, 0.6842105263157895),
 (1.0, 0.6228070175438597),
 (0.9604395604395605, 0.6666666666666666),
 (0.9956043956043956, 0.7017543859649122),
 (0.8, 0.7368421052631579),
 (0.8373626373626374, 0.7105263157894737)]

In [13]:
with open('acc_bag.pkl','wb') as f:
    pickle.dump(accurac,f)

In [45]:
with open('bag.pkl','wb') as f:
    pickle.dump(classif,f)

In [46]:
bagrnd_clf = BaggingClassifier(DecisionTreeClassifier(), n_estimators=30, bootstrap=True, 
                               bootstrap_features=False, max_samples=0.5, max_features=2)
bagrnd_clf.fit(X_train,y_train)
fea_acc = [accuracy_score(y_train, bagrnd_clf.predict(X_train)),
           accuracy_score(y_test, bagrnd_clf.predict(X_test))]

In [47]:
fea_acc

[0.9846153846153847, 0.9736842105263158]

In [32]:
with open('acc_fea.pkl','wb') as f:
    pickle.dump(fea_acc,f)

In [33]:
with open('fea.pkl','wb') as f:
    pickle.dump([bagrnd_clf],f)

In [26]:
bagrnd_clf.estimators_features_

[array([1, 3]),
 array([11,  4]),
 array([18,  9]),
 array([ 2, 25]),
 array([ 8, 27]),
 array([15,  6]),
 array([13,  9]),
 array([ 9, 27]),
 array([10,  9]),
 array([26, 28]),
 array([16, 20]),
 array([15, 25]),
 array([ 2, 13]),
 array([17,  5]),
 array([14,  6]),
 array([19, 15]),
 array([27,  7]),
 array([0, 8]),
 array([24, 20]),
 array([22,  2]),
 array([10,  5]),
 array([6, 9]),
 array([4, 0]),
 array([ 4, 29]),
 array([14, 17]),
 array([22, 16]),
 array([16, 12]),
 array([3, 6]),
 array([15, 14]),
 array([26, 29])]

In [25]:
bagrnd_clf.estimators_

[DecisionTreeClassifier(random_state=2077105647),
 DecisionTreeClassifier(random_state=1025579058),
 DecisionTreeClassifier(random_state=1957417715),
 DecisionTreeClassifier(random_state=97947721),
 DecisionTreeClassifier(random_state=192557904),
 DecisionTreeClassifier(random_state=1539714998),
 DecisionTreeClassifier(random_state=1596220300),
 DecisionTreeClassifier(random_state=11838184),
 DecisionTreeClassifier(random_state=1724777228),
 DecisionTreeClassifier(random_state=552641819),
 DecisionTreeClassifier(random_state=1001666565),
 DecisionTreeClassifier(random_state=216921139),
 DecisionTreeClassifier(random_state=859582323),
 DecisionTreeClassifier(random_state=924612426),
 DecisionTreeClassifier(random_state=1342071699),
 DecisionTreeClassifier(random_state=829759867),
 DecisionTreeClassifier(random_state=442169386),
 DecisionTreeClassifier(random_state=1861291257),
 DecisionTreeClassifier(random_state=2134296100),
 DecisionTreeClassifier(random_state=803632456),
 DecisionTre

In [28]:
df = pd.DataFrame({'train_acc': pd.Series(dtype='float'),
                   'test_acc':  pd.Series(dtype='float'),
                   'feat_list': pd.Series(dtype='object')})

In [31]:
df.head()

Unnamed: 0,train_acc,test_acc,feat_list


In [52]:
for index in range(len(bagrnd_clf.estimators_)):
    x_train = X_train.iloc[bagrnd_clf.estimators_features_[index]]
    print(bagrnd_clf.estimators_features_[index])
    print(x_train)
    clf = bagrnd_clf.estimators_[index]
    # clf.fit(x_train,y_train)

[13 15]
     mean radius  mean texture  mean perimeter  mean area  mean smoothness  \
470        9.667         18.49           61.49      289.1          0.08946   
33        19.270         26.47          127.90     1162.0          0.09401   

     mean compactness  mean concavity  mean concave points  mean symmetry  \
470           0.06258         0.02948              0.01514         0.2238   
33            0.17190         0.16570              0.07593         0.1853   

     mean fractal dimension  ...  worst radius  worst texture  \
470                 0.06413  ...         11.14          25.62   
33                  0.06261  ...         24.15          30.90   

     worst perimeter  worst area  worst smoothness  worst compactness  \
470            70.88       385.2            0.1234             0.1542   
33            161.40      1813.0            0.1509             0.6590   

     worst concavity  worst concave points  worst symmetry  \
470           0.1277                0.0656     