#### Malwina Wojewoda
# Task 3.
Implement your own version of the bagging algorithm. You can use available implementations of decision trees (or other base learners). Compare the accuracy of bagging and of a single tree on two datasets, computing it over several different train-test splits.

In [1]:
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [73]:
# Load the SAheart table, then separate the 'chd' target from the features.
# NOTE(review): some distributions of this file carry a leading row-id
# column; if it is present it is kept as a feature here -- confirm against
# the CSV header.
df = pd.read_csv('data/SAheart.data')

# Encode family history as 1 for 'Present' and 0 for every other value.
df['famhist'] = (df['famhist'] == 'Present').astype('int64')

X = df.drop(columns='chd')
y = df['chd']

In [74]:
# Hold out 20% of the rows for testing; the fixed seed keeps the split stable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [69]:
# Baseline: one decision tree fitted on the whole training split.
clf = DecisionTreeClassifier().fit(X_train, y_train)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [70]:
accuracy

0.5591397849462365

In [76]:
def create_bootstrap_sample(X, y, random_state=None):
    """Draw one bootstrap replicate of a dataset.

    Samples ``X.shape[0]`` row positions uniformly with replacement and
    returns the matching rows of ``X`` and ``y``, so features and labels
    stay aligned.

    Args:
        X: Feature table exposing ``.shape`` and positional ``.iloc``
            indexing (e.g. a pandas DataFrame).
        y: Labels exposing ``len()`` and ``.iloc`` (e.g. a pandas Series),
            one per row of ``X``.
        random_state: Optional seed. ``None`` (the default) draws from
            NumPy's global random state, so ``np.random.seed`` still
            controls the result; any other value seeds a private generator
            and makes the sample reproducible on its own.

    Returns:
        Tuple ``(bootstrap_X, bootstrap_y)`` with the same number of rows
        as the input; individual rows may repeat or be absent.

    Raises:
        ValueError: If ``X`` and ``y`` do not have the same number of rows.
    """
    n_rows = X.shape[0]
    if len(y) != n_rows:
        raise ValueError(
            f"X and y must have the same number of rows, got {n_rows} and {len(y)}"
        )

    # Fall back to the module-level functions when unseeded so that existing
    # code relying on np.random.seed(...) keeps producing the same samples.
    rng = np.random if random_state is None else np.random.RandomState(random_state)
    bootstrap_indices = rng.choice(n_rows, size=n_rows, replace=True)
    return X.iloc[bootstrap_indices], y.iloc[bootstrap_indices]

# One bootstrap replicate of the training split, used by the next cell.
bootstrap_X, bootstrap_y = create_bootstrap_sample(X=X_train, y=y_train)

In [77]:
# Same single tree as before, but fitted on one bootstrap replicate only.
clf = DecisionTreeClassifier().fit(bootstrap_X, bootstrap_y)
y_pred = clf.predict(X_test)
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [78]:
accuracy

0.6236559139784946

In [79]:
def train_bagging_classifier(X, y, n_estimators=10):
    """Fit an ensemble of decision trees, each on its own bootstrap sample.

    Args:
        X: Training features; must support positional ``.iloc`` indexing
            (e.g. a pandas DataFrame).
        y: Training labels aligned with ``X`` (e.g. a pandas Series).
        n_estimators: Number of trees to fit; must be at least 1.

    Returns:
        List of ``n_estimators`` fitted ``DecisionTreeClassifier`` objects.

    Raises:
        ValueError: If ``n_estimators`` is smaller than 1.
    """
    if n_estimators < 1:
        raise ValueError(f"n_estimators must be at least 1, got {n_estimators}")

    trees = []
    for _ in range(n_estimators):
        # Reuse the shared helper instead of repeating the sampling code; it
        # makes one np.random.choice call per tree, exactly as before.
        bootstrap_X, bootstrap_y = create_bootstrap_sample(X, y)
        tree = DecisionTreeClassifier()
        tree.fit(bootstrap_X, bootstrap_y)
        trees.append(tree)

    return trees

In [80]:
# Fit the bagged ensemble on the training split (default number of trees).
trees = train_bagging_classifier(X=X_train, y=y_train)

In [85]:
def predict_bagging_classifier(X_test, estimators):
    """Predict class labels by averaging the estimators' class probabilities.

    Each estimator's probability columns are placed by its own ``classes_``
    before averaging, so an estimator whose bootstrap sample missed a class
    simply contributes zero probability for it. The result holds actual
    class labels rather than column positions.

    Args:
        X_test: Samples to classify, passed unchanged to every estimator's
            ``predict_proba``.
        estimators: Non-empty collection of fitted classifiers exposing
            ``classes_`` (sorted labels, as scikit-learn classifiers do)
            and ``predict_proba``.

    Returns:
        1-D NumPy array with one predicted label per sample.

    Raises:
        ValueError: If ``estimators`` is empty.
    """
    estimators = list(estimators)
    if not estimators:
        raise ValueError("estimators must contain at least one fitted model")

    # Sorted union of every label any estimator has seen.
    classes = np.unique(
        np.concatenate([np.asarray(est.classes_) for est in estimators])
    )

    summed = None
    for est in estimators:
        probs = np.asarray(est.predict_proba(X_test))
        if summed is None:
            summed = np.zeros((probs.shape[0], classes.shape[0]))
        # Map this estimator's columns onto the shared label order.
        columns = np.searchsorted(classes, est.classes_)
        summed[:, columns] += probs

    avg_probs = summed / len(estimators)
    return classes[np.argmax(avg_probs, axis=1)]

In [86]:
# Classify the held-out rows with the bagged trees.
y_pred = predict_bagging_classifier(X_test=X_test, estimators=trees)

In [87]:
# Share of held-out rows the ensemble labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=y_pred)

In [88]:
accuracy

0.6559139784946236

In [71]:
class BaggingTreesClassifier:
    """Bagging ensemble of decision trees with averaged class probabilities.

    Every tree is fitted on its own bootstrap sample (rows drawn with
    replacement) of the training data; predictions average the trees'
    class probabilities and pick the most probable label.

    Args:
        n_trees: Number of trees to fit; must be at least 1 when ``fit``
            is called.
        random_state: Optional seed. ``None`` (the default) draws bootstrap
            samples from NumPy's global random state and leaves the trees
            unseeded; any other value seeds both the sampling and every
            tree, making ``fit`` reproducible.
    """

    def __init__(self, n_trees=10, random_state=None):
        self.n_trees = n_trees
        self.random_state = random_state
        # Fitted trees; rebuilt from scratch on every call to fit().
        self.trees = []
        # Sorted unique training labels; None until fit() has run.
        self.classes_ = None

    def fit(self, X, y):
        """Fit ``n_trees`` decision trees on bootstrap samples of ``(X, y)``.

        Any previously fitted trees are discarded first, so calling ``fit``
        again replaces the ensemble instead of growing it.

        Args:
            X: Training features supporting ``.shape`` and positional
                ``.iloc`` indexing (e.g. a pandas DataFrame).
            y: Training labels supporting ``.iloc`` (e.g. a pandas Series),
                aligned with ``X``.

        Returns:
            This instance, so calls can be chained.

        Raises:
            ValueError: If ``n_trees`` is smaller than 1.
        """
        if self.n_trees < 1:
            raise ValueError(f"n_trees must be at least 1, got {self.n_trees}")

        seeded = self.random_state is not None
        # Unseeded runs keep using the module-level functions so that
        # np.random.seed(...) controls the sampling exactly as before.
        rng = np.random.RandomState(self.random_state) if seeded else np.random
        n_rows = X.shape[0]

        self.trees = []
        self.classes_ = np.unique(y)
        for _ in range(self.n_trees):
            idx = rng.choice(n_rows, size=n_rows, replace=True)
            tree_seed = rng.randint(np.iinfo(np.int32).max) if seeded else None
            tree = DecisionTreeClassifier(random_state=tree_seed)
            tree.fit(X.iloc[idx], y.iloc[idx])
            self.trees.append(tree)
        return self

    def predict_proba(self, X):
        """Return the trees' class probabilities averaged per sample.

        Columns follow ``self.classes_``. A tree whose bootstrap sample
        missed a class contributes zero probability for that class.

        Args:
            X: Samples to score, passed unchanged to every tree.

        Returns:
            Array of shape ``(n_samples, len(self.classes_))``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        if not self.trees:
            raise ValueError("BaggingTreesClassifier is not fitted; call fit() first")

        summed = None
        for tree in self.trees:
            tree_proba = np.asarray(tree.predict_proba(X))
            if summed is None:
                summed = np.zeros((tree_proba.shape[0], len(self.classes_)))
            # Place this tree's columns by label, not by position.
            columns = np.searchsorted(self.classes_, tree.classes_)
            summed[:, columns] += tree_proba
        return summed / len(self.trees)

    def predict(self, X):
        """Return the most probable class label for every sample in ``X``.

        Args:
            X: Samples to classify.

        Returns:
            1-D array of labels taken from ``self.classes_``.

        Raises:
            ValueError: If the model has not been fitted.
        """
        avg_proba = self.predict_proba(X)
        return self.classes_[np.argmax(avg_proba, axis=1)]

In [91]:
# Fit the class-based ensemble on the training split and label the test rows.
bagging_classifier = BaggingTreesClassifier(n_trees=10)
bagging_classifier.fit(X=X_train, y=y_train)
predictions = bagging_classifier.predict(X=X_test)

In [92]:
# Share of held-out rows the class-based ensemble labelled correctly.
accuracy = accuracy_score(y_true=y_test, y_pred=predictions)

In [93]:
accuracy

0.6451612903225806