In [24]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.tree import DecisionTreeClassifier
from sklearn.tree import DecisionTreeRegressor

In [None]:
class BaggingTrees:
    """Bootstrap-aggregated ("bagged") ensemble of scikit-learn decision trees.

    Every tree is fitted on a bootstrap resample of the training rows (as many
    rows as the training set, drawn with replacement). Predictions are combined
    by majority vote for classification or by averaging for regression.

    Parameters
    ----------
    n_trees : int, default=10
        Number of trees to fit.
    max_depth : int, default=5
        Forwarded to every tree as ``max_depth``.
    is_classifier : bool, default=True
        ``True`` builds ``DecisionTreeClassifier`` trees and votes;
        ``False`` builds ``DecisionTreeRegressor`` trees and averages.
    random_state : int or None, default=None
        Seed for the bootstrap draws and the per-tree seeds. With ``None``
        every call to ``fit`` produces a different ensemble.

    Attributes
    ----------
    trees : list
        The fitted trees; empty until ``fit`` has been called.
    """

    def __init__(self, n_trees=10, max_depth=5, is_classifier=True, random_state=None):
        self.n_trees = n_trees
        self.max_depth = max_depth
        self.trees = []
        self.is_classifier = is_classifier
        self.random_state = random_state

    @staticmethod
    def _take_rows(data, positions):
        """Select rows by integer position from a pandas object or an array-like."""
        if hasattr(data, "iloc"):
            return data.iloc[positions]
        return np.asarray(data)[positions]

    def fit(self, X, y):
        """Fit ``n_trees`` trees, each on its own bootstrap resample of (X, y).

        Parameters
        ----------
        X : pandas.DataFrame or array-like
            Feature rows.
        y : pandas.Series or array-like
            Targets, one per row of ``X`` and in the same order.

        Returns
        -------
        BaggingTrees
            ``self``, so calls can be chained.

        Raises
        ------
        ValueError
            If ``n_trees`` is smaller than 1, ``X`` has no rows, or ``X`` and
            ``y`` differ in length.
        """
        if self.n_trees < 1:
            raise ValueError(f"n_trees must be at least 1, got {self.n_trees}")
        n_rows = len(X)
        if n_rows == 0:
            raise ValueError("Cannot fit on an empty dataset")
        if len(y) != n_rows:
            raise ValueError(
                f"X and y must have the same number of rows, got {n_rows} and {len(y)}"
            )

        tree_cls = DecisionTreeClassifier if self.is_classifier else DecisionTreeRegressor
        rng = np.random.default_rng(self.random_state)
        max_seed = np.iinfo(np.int32).max

        # Build into a local list and assign at the end: refitting replaces the
        # previous ensemble instead of appending to it, and a failure part-way
        # through leaves the previously fitted trees untouched.
        fitted = []
        for _ in range(self.n_trees):
            # Positional resampling keeps each row paired with its own target
            # even when the index holds duplicate labels, and also works for
            # plain NumPy inputs.
            rows = rng.integers(0, n_rows, size=n_rows)
            tree = tree_cls(
                max_depth=self.max_depth,
                random_state=int(rng.integers(max_seed)),
            )
            tree.fit(self._take_rows(X, rows), self._take_rows(y, rows))
            fitted.append(tree)

        self.trees = fitted
        return self

    def _collect_predictions(self, X):
        """Return a DataFrame with one column of predictions per fitted tree."""
        if not self.trees:
            raise ValueError(
                "This BaggingTrees instance is not fitted yet; call fit() before predict()."
            )
        # Building the frame in one go avoids inserting columns one at a time.
        return pd.DataFrame(
            {i: tree.predict(X) for i, tree in enumerate(self.trees)}
        )

    def predict_classification(self, X):
        """Return the majority-vote label for every row of ``X``.

        The result is a ``pandas.Series`` with a fresh 0..n-1 index. When
        several labels tie, the first mode reported by pandas is used.
        """
        votes = self._collect_predictions(X)
        return votes.mode(axis=1)[0]

    def predict_regression(self, X):
        """Return the mean of the trees' predictions for every row of ``X``.

        The result is a ``pandas.Series`` with a fresh 0..n-1 index.
        """
        return self._collect_predictions(X).mean(axis=1)

    def predict(self, X):
        """Predict with the aggregation rule selected by ``is_classifier``.

        Raises
        ------
        ValueError
            If ``fit`` has not been called yet.
        """
        if self.is_classifier:
            return self.predict_classification(X)
        return self.predict_regression(X)
    

# Fitness

In [12]:
# Load the whitespace-delimited fitness data. The separator is a raw string so
# that the regex `\s+` is not parsed as an (invalid) Python string escape.
fitness = pd.read_csv('data/fitness.txt', sep=r'\s+')
fitness.head()

Unnamed: 0,Age,Weight,Oxygen,RunTime,RestPulse,RunPulse,MaxPulse
0,44,89.47,44.609,11.37,62,178,182
1,44,85.84,54.297,8.65,45,156,168
2,38,89.02,49.874,9.22,55,178,180
3,40,75.98,45.681,11.95,70,176,180
4,44,81.42,39.442,13.08,63,174,176


In [27]:
# Split the fitness table: "Oxygen" is the target y, every other column is a feature in X.
y = fitness["Oxygen"]
X = fitness.drop(columns=["Oxygen"])

# SA heart

In [29]:
# Read the SAheart file, discard its 'row.names' column and recode the
# categorical "famhist" column to 1 ("Present") / 0 ("Absent"); any other
# value would become NaN.
sa_data = (
    pd.read_csv('data/SAheart.data')
    .drop(columns='row.names')
    .assign(famhist=lambda frame: frame["famhist"].map({"Present": 1, "Absent": 0}))
)

In [30]:
# Show the prepared SAheart frame to check the dropped column and the recoded "famhist".
sa_data

Unnamed: 0,sbp,tobacco,ldl,adiposity,famhist,typea,obesity,alcohol,age,chd
0,160,12.00,5.73,23.11,1,49,25.30,97.20,52,1
1,144,0.01,4.41,28.61,0,55,28.87,2.06,63,1
2,118,0.08,3.48,32.28,1,52,29.14,3.81,46,0
3,170,7.50,6.41,38.03,1,51,31.99,24.26,58,1
4,134,13.60,3.50,27.78,1,60,25.99,57.34,49,1
...,...,...,...,...,...,...,...,...,...,...
457,214,0.40,5.98,31.72,0,64,28.45,0.00,58,0
458,182,4.20,4.41,32.10,0,52,28.61,18.72,52,1
459,108,3.00,1.59,15.23,0,40,20.09,26.64,55,0
460,118,5.40,11.61,30.79,0,64,27.35,23.97,40,0
