Code for Bagging_Data_Loader

In [9]:
import numpy as np

def load_data(file_path):
    """
    Load data from a text file into a NumPy array.
    
    :param file_path: Path to the file containing the data.
    :return: NumPy array with the loaded data.
    """
    return np.loadtxt(file_path)

def bagging_data(data, n_trees):
    """
    Create bags of data for training each tree in a Random Forest.
    
    :param data: A NumPy array where rows are samples and the last column is the label.
    :param n_trees: The number of trees (bags) in the Random Forest.
    :return: A list of NumPy arrays, each array is a bagged sample of the original data.
    """
    n_samples = data.shape[0]
    bags = []

    for _ in range(n_trees):
        # Sampling with replacement
        indices = np.random.choice(n_samples, size=n_samples, replace=True)
        bag = data[indices]
        bags.append(bag)

    return bags

# Example usage
data = load_data('pa2train.txt')  # Load data from file
bags = bagging_data(data, n_trees=100)  # Create 100 bags for the Random Forest

# bags now contains 100 different subsets of the original data, each to be used to train a tree.


Code for random_forest_tuner

In [10]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def build_random_forest(bags, feature_indices, criterion='gini'):
    trees = []

    for i, bag in enumerate(bags):
        features_for_tree = feature_indices[i]
        X, y = bag[:, features_for_tree], bag[:, -1]  # Last column is the label

        # Reshape data if needed
        if X.ndim == 1:  # Check if X is 1D
            X = X.reshape(-1, 1)  # Reshape to 2D array

        tree = DecisionTreeClassifier(criterion=criterion)
        tree.fit(X, y)
        trees.append(tree)

    return trees


def evaluate_random_forest(trees, feature_indices, validation_data):
    """
    Evaluate a Random Forest on validation data.

    :param trees: List of trained decision tree classifiers.
    :param feature_indices: List of feature indices for each tree.
    :param validation_data: Validation dataset.
    :return: Accuracy of the Random Forest on the validation dataset.
    """
    predictions = []

    for tree, features in zip(trees, feature_indices):
        X_val = validation_data[:, features]

        # Ensure X_val is 2D
        if X_val.ndim == 1:
            X_val = X_val.reshape(-1, 1)

        predictions.append(tree.predict(X_val))
    
    # Majority voting
    predictions = np.array(predictions)
    final_prediction = np.round(predictions.mean(axis=0))
    accuracy = accuracy_score(validation_data[:, -1], final_prediction)
    return accuracy

def random_feature_indices(n_features, n_trees, min_features, max_features):
    """
    Generate random feature indices for each tree in the forest.

    :param n_features: Total number of features.
    :param n_trees: Number of trees in the forest.
    :param min_features: Minimum number of features to use for each tree.
    :param max_features: Maximum number of features to use for each tree.
    :return: List of feature indices for each tree.
    """
    feature_indices = []
    all_features = np.arange(n_features)

    # Ensuring all features are used at least once
    np.random.shuffle(all_features)
    feature_indices.extend(all_features[:n_trees])

    # Additional random feature selection for remaining trees
    for _ in range(n_trees - len(feature_indices)):
        n_features_for_tree = np.random.randint(min_features, max_features + 1)
        features_for_tree = np.random.choice(all_features, n_features_for_tree, replace=False)
        feature_indices.append(features_for_tree)

    return feature_indices

# Tuning hyperparameters
data = load_data('pa2train.txt')
validation_data = load_data('pa2validation.txt')
n_features = validation_data.shape[1] - 1
best_accuracy = 0
best_n_trees = 0
best_feature_indices = []

for n_trees in [10, 50, 100, 150, 200]:
    feature_indices = random_feature_indices(n_features, n_trees, 5, 15)
    bags = bagging_data(data, n_trees=n_trees)
    random_forest = build_random_forest(bags, feature_indices)
    accuracy = evaluate_random_forest(random_forest, feature_indices, validation_data)

    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_n_trees = n_trees
        best_feature_indices = feature_indices

# Best model
print(f"Best Number of Trees: {best_n_trees}, Best Accuracy: {best_accuracy}")

# Save best_n_trees to a file
with open('best_params.txt', 'w') as file:
    file.write(str(best_n_trees))

Best Number of Trees: 200, Best Accuracy: 0.881


Code for decision_tree_tuner

In [11]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score

def load_data(file_path):
    """
    Load data from a text file into a NumPy array.

    :param file_path: Path to the file containing the data.
    :return: NumPy array with the loaded data.
    """
    return np.loadtxt(file_path)

def tune_decision_tree(train_data, validation_data):
    """
    Tune a DecisionTreeClassifier using validation data.

    :param train_data: Training dataset.
    :param validation_data: Validation dataset.
    :return: Best classifier and its accuracy.
    """
    best_accuracy = 0
    best_classifier = None

    for max_depth in [5, 10, 15, None]:
        for min_samples_split in [2, 4, 6, 8]:
            for min_samples_leaf in [1, 2, 4, 6]:

                # Initialize and train the classifier
                clf = DecisionTreeClassifier(criterion='gini', max_depth=max_depth,
                                             min_samples_split=min_samples_split,
                                             min_samples_leaf=min_samples_leaf)
                clf.fit(train_data[:, :-1], train_data[:, -1])

                # Evaluate on validation data
                predictions = clf.predict(validation_data[:, :-1])
                accuracy = accuracy_score(validation_data[:, -1], predictions)

                if accuracy > best_accuracy:
                    best_accuracy = accuracy
                    best_classifier = clf

    return best_classifier, best_accuracy

# Load data
train_data = load_data('pa2train.txt')
validation_data = load_data('pa2validation.txt')

# Tune the classifier
best_clf, best_acc = tune_decision_tree(train_data, validation_data)

print(f"Best Decision Tree Classifier: {best_clf}")
print(f"Best Validation Accuracy: {best_acc}")


Best Decision Tree Classifier: DecisionTreeClassifier(max_depth=5)
Best Validation Accuracy: 0.884


Code for model_comparison

In [14]:
import numpy as np
from sklearn.metrics import accuracy_score
from sklearn.tree import DecisionTreeClassifier

# Load test data
test_data = load_data('pa2test.txt')

# Load and preprocess validation data to get the number of features
validation_data = load_data('pa2validation.txt')
n_features = validation_data.shape[1] - 1  # Subtract 1 for the label column

# Parameters from random forest tuning process
best_n_trees_rf = 100  # Replace with the best number of trees found in random forest tuning
best_min_features = 5
best_max_features = 15

# Parameters from decision tree tuning process
best_tree, _ = tune_decision_tree(load_data('pa2train.txt'), validation_data)  # Tuning process already selects best tree

# Build random forest
feature_indices_rf = random_feature_indices(n_features, best_n_trees_rf, best_min_features, best_max_features)
bags = bagging_data(load_data('pa2train.txt'), n_trees=best_n_trees_rf)
random_forest = build_random_forest(bags, feature_indices_rf)

# Evaluate random forest on test data
accuracy_rf = evaluate_random_forest(random_forest, feature_indices_rf, test_data)
print(f"Random Forest Accuracy on Test Data: {accuracy_rf}")

# Evaluate decision tree on test data
predictions_dt = best_tree.predict(test_data[:, :-1])
accuracy_dt = accuracy_score(test_data[:, -1], predictions_dt)
print(f"Decision Tree Accuracy on Test Data: {accuracy_dt}")

Random Forest Accuracy on Test Data: 0.876
Decision Tree Accuracy on Test Data: 0.885
