In [1]:
import random
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.datasets import load_iris

In [2]:
iris = load_iris()
df = pd.DataFrame(data=np.c_[iris['data'], iris['target']], columns=iris['feature_names'] + ['class'])
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),class
0,5.1,3.5,1.4,0.2,0.0
1,4.9,3.0,1.4,0.2,0.0
2,4.7,3.2,1.3,0.2,0.0
3,4.6,3.1,1.5,0.2,0.0
4,5.0,3.6,1.4,0.2,0.0


In [3]:
def own_bagging(clf, n_estimators, X_train, y_train, X_test):
    data = X_train.join(y_train)
    agg_probabilities = []
    
    for i in range(n_estimators):
        subsample = data.sample(n=random.randint(1, data.shape[0]), replace=True)
        clf.fit(X_train, y_train)
        probabilities = clf.predict_proba(X_test)
        agg_probabilities.append(probabilities)
    
    agg_probabilities = np.mean(agg_probabilities, axis=0)
    return agg_probabilities

In [4]:
def predict_class(agg_probabilities):
    predictions = [np.argmax(probabilities) for probabilities in agg_probabilities]
    return predictions

In [5]:
from sklearn.metrics import confusion_matrix

def false_predictions(y_test, y_pred):
    falses = 0
    for pred_value, test_value in zip(y_pred, y_test):
        if pred_value != test_value:
            falses += 1
    return falses

In [6]:
from sklearn.model_selection import train_test_split

X = df.iloc[:, :-1]
y = df.iloc[:, -1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)

In [7]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis

classifiers = [LinearDiscriminantAnalysis(), DecisionTreeClassifier()]
names = ['LDA', 'Decision Tree']
n_estimator = [1, 2, 5, 10, 20, 50]

for name, clf in zip(names, classifiers):
    print('False predictions for {}:'.format(name))
    
    for n in n_estimator:
        agg_prob = own_bagging(clf, n, X_train, y_train, X_test)
        y_pred = predict_class(agg_prob)
        print('{} estimators: {}'.format(n, false_predictions(y_test, y_pred)))
    
    print('------------------\n')

False predictions for LDA:
1 estimators: 1
2 estimators: 1
5 estimators: 1
10 estimators: 1
20 estimators: 1
50 estimators: 1
------------------

False predictions for Decision Tree:
1 estimators: 3
2 estimators: 2
5 estimators: 2
10 estimators: 2
20 estimators: 2
50 estimators: 2
------------------

