In [12]:
import pandas as pd
from sklearn import datasets
# Load the breast-cancer dataset as pandas objects (as_frame=True).
data_breast_cancer = datasets.load_breast_cancer(as_frame=True)
from sklearn.model_selection import train_test_split

# Only two columns are used as features in this notebook.
# random_state pins the shuffle so that Restart & Run All reproduces the same
# 80/20 split, and therefore comparable accuracies, on every run.
X_train, X_test, y_train, y_test = train_test_split(
    data_breast_cancer['data'][['mean texture', 'mean symmetry']],
    data_breast_cancer['target'],
    test_size=0.2,
    random_state=42,
)

## 3.1 - 3.4

In [13]:
from sklearn.ensemble import VotingClassifier #ensemble
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
import pickle

# Base classifiers that take part in the vote: a decision tree, a linear
# model trained by SGD on the logistic loss, and k-nearest neighbours.
decision_tree_clf = DecisionTreeClassifier()
logistic_regression_clf = SGDClassifier(loss='log_loss')
knn_clf = KNeighborsClassifier()

# Build the hard- and soft-voting ensembles from the same three members;
# each ensemble receives its own freshly built estimator list.
ensemble_hard, ensemble_soft = (
    VotingClassifier(
        estimators=[('dt', decision_tree_clf),
                    ('lr', logistic_regression_clf),
                    ('knn', knn_clf)],
        voting=mode,
    )
    for mode in ('hard', 'soft')
)

# Train both ensembles on the training split.
ensemble_hard.fit(X_train, y_train)
ensemble_soft.fit(X_train, y_train)

In [14]:
# List of (train accuracy, test accuracy) tuples, one per classifier, in the
# order: decision tree, SGD logistic model, k-NN, hard voting, soft voting.
list_of_accuracy = []

# The standalone classifiers are fitted here (per the scikit-learn docs,
# VotingClassifier trains internal clones and leaves the originals untouched).
# Each one is fitted exactly ONCE and then scored on both splits, so the two
# numbers of a tuple always describe the same trained model. Re-fitting between
# the train and the test score would pair results from two different models
# whenever the estimator is randomised (unseeded tree / SGD).
for clf in (decision_tree_clf, logistic_regression_clf, knn_clf):
    clf.fit(X_train, y_train)
    list_of_accuracy.append((accuracy_score(y_train, clf.predict(X_train)),
                             accuracy_score(y_test, clf.predict(X_test))))

# The voting ensembles were already fitted in the previous cell.
for clf in (ensemble_hard, ensemble_soft):
    list_of_accuracy.append((accuracy_score(y_train, clf.predict(X_train)),
                             accuracy_score(y_test, clf.predict(X_test))))

# Save to pickle
with open('acc_vote.pkl', 'wb') as f:
    pickle.dump(list_of_accuracy, f)

for train_acc, test_acc in list_of_accuracy:
    print('train: {:.5f}, test: {:.5f}'.format(train_acc, test_acc))


train: 1.00000, test: 0.64912
train: 0.56923, test: 0.62281
train: 0.78242, test: 0.64912
train: 0.84176, test: 0.67544
train: 0.84396, test: 0.68421


In [15]:
# Persist the three base classifiers followed by both voting ensembles,
# in the same order as their accuracy tuples.
list_of_clf = [
    decision_tree_clf,
    logistic_regression_clf,
    knn_clf,
    ensemble_hard,
    ensemble_soft,
]
with open('vote.pkl', 'wb') as vote_file:
    pickle.dump(list_of_clf, vote_file)

## 3.5

In [16]:
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
# Unfitted decision tree used as the base learner of the ensembles below.
decision_tree = DecisionTreeClassifier()

# NOTE: the base learner is passed positionally on purpose. scikit-learn 1.2
# renamed the keyword `base_estimator` to `estimator` and 1.4 removed the old
# name; the first positional argument works on both sides of that rename.

# Bagging: 30 trees, default sampling settings.
bagging = BaggingClassifier(decision_tree, n_estimators=30)
bagging.fit(X_train, y_train)

# Bagging where every tree is trained on 50% of the training samples.
bagging_50 = BaggingClassifier(decision_tree, n_estimators=30, max_samples=0.5)
bagging_50.fit(X_train, y_train)

# Pasting: same as bagging but samples are drawn without replacement.
pasting = BaggingClassifier(decision_tree, n_estimators=30, bootstrap=False)
pasting.fit(X_train, y_train)

# Pasting with 50% of the training samples per tree.
pasting_50 = BaggingClassifier(decision_tree, n_estimators=30, bootstrap=False, max_samples=0.5)
pasting_50.fit(X_train, y_train)

# Random forest with 30 trees.
random_forest = RandomForestClassifier(n_estimators=30)
random_forest.fit(X_train, y_train)

# AdaBoost over the decision tree, 30 boosting rounds.
adaboost = AdaBoostClassifier(decision_tree, n_estimators=30)
adaboost.fit(X_train, y_train)

# Gradient boosting with 30 stages.
gradient_boosting = GradientBoostingClassifier(n_estimators=30)
gradient_boosting.fit(X_train, y_train)





In [17]:
# Ensembles in the order used for both the accuracy list and the pickle:
# bagging, bagging 50%, pasting, pasting 50%, random forest, AdaBoost,
# gradient boosting.
list_of_bag = [bagging, bagging_50, pasting, pasting_50, random_forest, adaboost, gradient_boosting]

# One (train accuracy, test accuracy) tuple per ensemble.
list_of_bagging = [
    (accuracy_score(y_train, model.predict(X_train)),
     accuracy_score(y_test, model.predict(X_test)))
    for model in list_of_bag
]

with open('acc_bag.pkl', 'wb') as acc_file:
    pickle.dump(list_of_bagging, acc_file)

for train_acc, test_acc in list_of_bagging:
    print('train: {:.5f}, test: {:.5f}'.format(train_acc, test_acc))

with open('bag.pkl', 'wb') as bag_file:
    pickle.dump(list_of_bag, bag_file)

train: 0.99121, test: 0.59649
train: 0.92088, test: 0.67544
train: 1.00000, test: 0.64912
train: 0.96703, test: 0.65789
train: 0.99780, test: 0.64912
train: 1.00000, test: 0.64035
train: 0.83736, test: 0.68421


## 3.7 - 3.8

In [18]:
# Unfitted base learner for the feature-sampling ensemble.
tree = DecisionTreeClassifier()

# Bagging with feature sampling: 30 trees, each trained on half of the
# training samples and on 2 features, both drawn with replacement.
# The base learner is passed positionally because scikit-learn 1.2 renamed the
# `base_estimator` keyword to `estimator` (old name removed in 1.4); the
# positional form works with either version.
# NOTE: this rebinds `bagging` from section 3.5; that model has already been
# written to bag.pkl by the time this cell runs.
bagging = BaggingClassifier(tree, n_estimators=30, max_samples=0.5, max_features=2, bootstrap=True, bootstrap_features=True)

# Fit
bagging.fit(X_train, y_train)



In [19]:
# [train accuracy, test accuracy] of the feature-sampling bagging ensemble.
# The scores are taken from `bagging` itself, i.e. the very model stored in
# fea.pkl below, rather than from the stand-alone template `tree`, so the two
# files describe the same classifier.
list_of_tree_acc = [accuracy_score(y_train, bagging.predict(X_train)),
                    accuracy_score(y_test, bagging.predict(X_test))]

with open('acc_fea.pkl', 'wb') as f:
    pickle.dump(list_of_tree_acc, f)

# The classifier is saved as a one-element list.
with open('fea.pkl', 'wb') as f:
    pickle.dump([bagging], f)

## 3.9

In [20]:
# Fitted sub-estimators of the feature-sampling ensemble and, for each of
# them, the column indices it was given.
estimators = bagging.estimators_
estimators_features = bagging.estimators_features_

train_scores = []
test_scores = []
features_names = []

# Score every sub-estimator on exactly the columns it was trained on.
# The loop variable is deliberately not called `tree`, so the notebook-level
# `tree` template is not overwritten when this cell runs.
for estimator, features in zip(estimators, estimators_features):
    # Plain arrays are passed to predict(): the sub-estimators appear to be
    # fitted on arrays without column names, so DataFrame input would only
    # trigger scikit-learn's feature-name mismatch warning.
    train_scores.append(accuracy_score(y_train, estimator.predict(X_train.iloc[:, features].to_numpy())))
    test_scores.append(accuracy_score(y_test, estimator.predict(X_test.iloc[:, features].to_numpy())))
    # Store readable column names (duplicates kept) instead of raw indices.
    features_names.append(X_train.columns[features].tolist())

# One row per sub-estimator: its train/test accuracy and the features it used.
df = pd.DataFrame({'train': train_scores, 'test': test_scores, 'features': features_names})
# Rank best-first by test accuracy, ties broken by train accuracy.
# Reassigning (instead of inplace=True) keeps the cell safe to re-run.
df = df.sort_values(by=['test', 'train'], ascending=False)

# Save to pickle
df.to_pickle('acc_fea_rank.pkl')

