## Ensemble Learning

### Decision Tree

In [20]:
import numpy as np
import pandas as pd
from scipy.io import arff

In [3]:
class DecisionTree:
    """Single-split decision tree (a decision stump) for integer class labels.

    ``fit`` scans every feature/threshold pair and keeps the split with the
    highest information gain, storing the majority class found on each side.
    ``predict`` assigns ``value_left`` to rows whose split feature is
    ``<= threshold`` and ``value_right`` to all other rows.

    Labels must be non-negative integers because class counts are computed
    with ``np.bincount``.
    """

    def __init__(self):
        self.feature = None      # column index of the chosen split (None: no split)
        self.threshold = None    # rows with feature value <= threshold go left
        self.value_left = None   # class predicted for the left side
        self.value_right = None  # class predicted for the right side

    def fit(self, X, y):
        """Learn the best single split of ``X`` with respect to labels ``y``.

        Parameters
        ----------
        X : array-like of shape (n_samples, n_features)
            Numeric feature matrix.
        y : array-like of shape (n_samples,)
            Non-negative integer class labels.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        X = np.asarray(X)
        y = np.asarray(y)
        n_samples = len(y)
        if n_samples == 0:
            raise ValueError("Cannot fit DecisionTree on an empty training set.")

        # Reset first so a refit never keeps a split learned from older data.
        self.feature = None
        self.threshold = None
        # Fallback for data that cannot be split (e.g. every feature is
        # constant): predict the overall majority class on both sides.
        majority = np.bincount(y).argmax()
        self.value_left = majority
        self.value_right = majority

        # The parent entropy does not depend on the candidate split, so it is
        # computed once instead of once per threshold.
        parent_entropy = self._entropy(y)

        best_gain = -1
        for feature_index in range(X.shape[1]):
            column = X[:, feature_index]
            for threshold in np.unique(column):
                left_mask = column <= threshold
                left_y, right_y = y[left_mask], y[~left_mask]
                # A split that leaves one side empty separates nothing.
                if len(left_y) == 0 or len(right_y) == 0:
                    continue
                left_weight = len(left_y) / n_samples
                right_weight = 1 - left_weight
                children_entropy = (left_weight * self._entropy(left_y)
                                    + right_weight * self._entropy(right_y))
                gain = parent_entropy - children_entropy
                if gain > best_gain:
                    best_gain = gain
                    self.feature = feature_index
                    self.threshold = threshold
                    self.value_left = np.bincount(left_y).argmax()
                    self.value_right = np.bincount(right_y).argmax()

    def predict(self, X):
        """Return the predicted class for every row of ``X``.

        Raises
        ------
        ValueError
            If the tree has not been fitted yet.
        """
        if self.value_left is None:
            raise ValueError("DecisionTree must be fitted before calling predict.")
        X = np.asarray(X)
        if self.feature is None:
            # No usable split was found during fit: constant prediction.
            return np.full(X.shape[0], self.value_left)
        return np.where(X[:, self.feature] <= self.threshold, self.value_left, self.value_right)

    def _entropy(self, y):
        """Shannon entropy, in bits, of the class distribution of ``y``."""
        proportions = np.bincount(y) / len(y)
        # Classes with zero count contribute nothing and would yield log2(0).
        present = proportions[proportions > 0]
        return -np.sum(present * np.log2(present))

In [10]:
# Read the diabetes ARFF file into a DataFrame.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
file = r"E:\KHU\ML\Last\dataset_37_diabetes.arff"

data, meta = arff.loadarff(file)
df = pd.DataFrame(data)

# Every column except the last is a numeric feature.
X = df.iloc[:, :-1].values.astype(float)

# The last column holds the class label as bytes; decode each value and
# translate it to 1 (positive) / 0 (negative).
mapping = {'tested_positive': 1, 'tested_negative': 0}
y = np.array([mapping[raw.decode('utf-8')] for raw in df.iloc[:, -1].values])

In [13]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y.flatten(), test_size=0.2, random_state=42)

# Sanity check: report the shapes of the resulting splits.
print(f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}")

Shape of X_train: (614, 8), y_train: (614,)
Shape of X_test: (154, 8), y_test: (154,)


In [14]:
from sklearn.metrics import accuracy_score, f1_score

# Train the single-split tree on the training set.
Tree = DecisionTree()
Tree.fit(X_train, y_train)

# Score the held-out test set.
Tree_predictions = Tree.predict(X_test)

# Report accuracy and the weighted-average F1 score, to three decimals.
print(f"Decision Tree Accuracy: {accuracy_score(y_test, Tree_predictions):.3f}")
print(f"Decision Tree F1-Score: {f1_score(y_test, Tree_predictions, average='weighted'):.3f}")

Decision Tree Accuracy: 0.740
Decision Tree F1-Score: 0.742


### Random Forest

In [15]:
from sklearn.utils import resample

In [18]:
class RandomForest:
    """Bagging ensemble of ``DecisionTree`` stumps combined by majority vote.

    Each tree is trained on a bootstrap sample of the training rows (drawn
    with replacement, same size as the training set). Only the rows are
    randomized; every tree considers all features.

    Parameters
    ----------
    n_trees : int, default=7
        Number of trees to train.
    random_state : int, numpy.random.RandomState or None, default=None
        Seed or generator for the bootstrap sampling. Pass an int to get the
        same forest on every ``fit``. With ``None`` the sampling is left to
        ``resample``'s own default random source.
    """

    def __init__(self, n_trees=7, random_state=None):
        self.n_trees = n_trees
        self.random_state = random_state
        self.trees = []
        self._rng = None  # generator used by _bootstrap_samples; set in fit

    def fit(self, X, y):
        """Train ``n_trees`` trees, each on its own bootstrap sample of (X, y)."""
        self.trees = []
        # One generator per fit: handing the raw integer seed to every
        # resample call would make all bootstrap samples identical.
        self._rng = self._make_rng()
        for _ in range(self.n_trees):
            stump = DecisionTree()
            X_sample, y_sample = self._bootstrap_samples(X, y)
            stump.fit(X_sample, y_sample)
            self.trees.append(stump)

    def predict(self, X):
        """Return the majority-vote class over all trees for every row of ``X``.

        Raises
        ------
        ValueError
            If the forest has not been fitted yet.
        """
        if not self.trees:
            raise ValueError("RandomForest must be fitted before calling predict.")
        # Shape (n_trees, n_samples): one row of predictions per tree.
        stump_predictions = np.array([stump.predict(X) for stump in self.trees])
        return self._majority_vote(stump_predictions)

    def _make_rng(self):
        """Turn ``random_state`` into a generator (or None for the default)."""
        seed = self.random_state
        if seed is None or isinstance(seed, np.random.RandomState):
            return seed
        return np.random.RandomState(seed)

    def _bootstrap_samples(self, X, y):
        """Draw ``len(X)`` rows of (X, y) with replacement."""
        return resample(X, y, n_samples=len(X), replace=True, random_state=self._rng)

    def _majority_vote(self, predictions):
        """Pick the most frequent label in each column of ``predictions``.

        ``predictions`` holds one row per tree; ties go to the smallest label
        because ``argmax`` returns the first maximum of the bincount.
        """
        return np.apply_along_axis(lambda x: np.bincount(x).argmax(), axis=0, arr=predictions)


In [21]:
# Reload the diabetes ARFF file for the random forest experiment.
# NOTE(review): absolute, machine-specific path; adjust it when running elsewhere.
file = r"E:\KHU\ML\Last\dataset_37_diabetes.arff"

data, meta = arff.loadarff(file)
df = pd.DataFrame(data)


X = df.iloc[:, :-1].values
y = df.iloc[:, -1].values
X = X.astype(float)

# Reloading resets y to the raw byte labels, so they must be encoded as
# integers again: the classifiers count classes with np.bincount, which
# only accepts non-negative integers.
y = np.array([val.decode('utf-8') for val in y])
mapping = {'tested_positive': 1, 'tested_negative': 0}
y = np.array([mapping[val] for val in y])

In [22]:
# Hold out 20% of the rows as a test set; the fixed random_state makes the
# split identical on every run.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Sanity check: report the shapes of the resulting splits.
print(f"Shape of X_train: {X_train.shape}, y_train: {y_train.shape}")
print(f"Shape of X_test: {X_test.shape}, y_test: {y_test.shape}")

Shape of X_train: (614, 8), y_train: (614,)
Shape of X_test: (154, 8), y_test: (154,)


In [19]:
# Seed NumPy's global generator before training: the forest's bootstrap
# sampling draws from it by default, so without a seed the scores below
# change on every run.
np.random.seed(42)

rf_custom = RandomForest()
rf_custom.fit(X_train, y_train)

# Score the held-out test set with the majority vote of all trees.
rf_cust_predictions = rf_custom.predict(X_test)

# Report accuracy and the weighted-average F1 score, to three decimals.
print(f"Custom RF Accuracy: {accuracy_score(y_test, rf_cust_predictions):.3f}")
print(f"Custom RF F1-Score: {f1_score(y_test, rf_cust_predictions, average='weighted'):.3f}")

Custom RF Accuracy: 0.740
Custom RF F1-Score: 0.740
