In [2]:
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score


In [3]:
def load_data(path):
    return np.loadtxt(path)

def bagging(D, num_trees):
    n = D.shape[0]
    bags = []

    for _ in range(n):
        idx = np.random.choice(n, n, replace=True)
        bags.append(D[idx])
    return bags

data = load_data('pa2train.txt')
bagging(data, 100)

[array([[1.600e+05, 1.000e+00, 1.000e+00, ..., 3.160e+02, 3.160e+02,
         0.000e+00],
        [2.400e+05, 2.000e+00, 1.000e+00, ..., 5.500e+03, 5.000e+03,
         0.000e+00],
        [5.000e+04, 2.000e+00, 0.000e+00, ..., 8.160e+02, 6.880e+02,
         1.000e+00],
        ...,
        [8.000e+04, 2.000e+00, 0.000e+00, ..., 3.960e+02, 5.460e+02,
         1.000e+00],
        [2.000e+05, 1.000e+00, 1.000e+00, ..., 0.000e+00, 6.054e+03,
         0.000e+00],
        [3.000e+04, 2.000e+00, 1.000e+00, ..., 1.000e+03, 2.000e+03,
         0.000e+00]]),
 array([[1.100e+05, 1.000e+00, 1.000e+00, ..., 3.600e+03, 3.142e+03,
         1.000e+00],
        [1.300e+05, 2.000e+00, 1.000e+00, ..., 0.000e+00, 0.000e+00,
         1.000e+00],
        [1.000e+05, 1.000e+00, 0.000e+00, ..., 2.000e+03, 2.000e+03,
         1.000e+00],
        ...,
        [1.100e+05, 2.000e+00, 1.000e+00, ..., 0.000e+00, 4.000e+03,
         1.000e+00],
        [5.000e+04, 1.000e+00, 1.000e+00, ..., 5.400e+02, 3.900e+02,
   

In [4]:
def random_forest(bags, y, criterion='gini'):
    forest = []

    for i in range(len(bags)):
        bag, y_tree = bags[i], y[i % len(y)]

        X_build, y_build = bag[:, y_tree], bag[:, -1]

        if X_build.ndim == 1:
            X_build = X_build.reshape(-1, 1)
        
        dct = DecisionTreeClassifier(criterion=criterion)
        dct.fit(X_build, y_build)
        forest.append(dct)
   
    return forest

def evaluate_forests(forest, y, D):
    computations = []

    for tree, y_tree in zip(forest, y):
        X_val = D[:, y_tree]

        if X_val.ndim == 1:
            X_val = X_val.reshape(-1, 1)
        
        compute = tree.predict(X_val)
        computations.append(compute)
    
    predictions = np.round(np.array(computations).mean(axis=0))
    return accuracy_score(D[:, -1], predictions)

def generate_random_y_index(y, num_trees, min_y, max_y):
    result = []
    y_build = np.arange(y)
    np.random.shuffle(y_build)

    result.extend(y_build[:num_trees])

    for i in range(num_trees - len(result)):
        random_y = np.random.randint(min_y, max_y + 1)
        result.append(np.random.choice(y_build, random_y, replace=False))
    return result


In [17]:
# Loading data
train_data = load_data('pa2train.txt')
validation_data = load_data('pa2validation.txt')

# Determining number of features
num_features = validation_data.shape[1] - 1

# Initializing variables for best model selection
best_accuracy = 0
best_num_trees = 0
best_feature_indices = []

# Hyperparameter search
for num_trees in [10, 50, 100, 150, 200]:
    feature_indices = generate_random_y_index(num_features, num_trees, 5, 15)
    bags = bagging(train_data, num_trees)
    forest = random_forest(bags, feature_indices)
    accuracy = evaluate_forests(forest, feature_indices, validation_data)

    # Selecting best model
    if accuracy > best_accuracy:
        best_accuracy = accuracy
        best_num_trees = num_trees
        best_feature_indices = feature_indices

print(f"Best # Trees: {best_num_trees}, Best Accuracy: {best_accuracy}")



Best Number of Trees: 150, Best Accuracy: 0.885


In [29]:
def tune_tree(train_data, validation_data):
    best_accuracy = 0
    best_tree = None

    for criterion in ['gini', 'entropy']:
        for max_depth in range(1, 11): 
            dct = DecisionTreeClassifier(criterion=criterion, max_depth=max_depth)
            dct.fit(train_data[:, :-1], train_data[:, -1]) 

            results = dct.predict(validation_data[:, :-1])
            accuracy = accuracy_score(validation_data[:, -1], results)

            if accuracy > best_accuracy:
                best_accuracy = accuracy
                best_tree = dct

    return best_tree, best_accuracy

In [32]:
train_data = load_data('pa2train.txt')
validation_data = load_data('pa2validation.txt')

best_tree, best_acc = tune_tree(train_data, validation_data)

print(f"Best Decision Tree Classifier: {best_tree}")
print(f"Best Accuracy In Validation: {best_acc}")

Best Decision Tree Classifier: DecisionTreeClassifier(max_depth=1)
Best Accuracy In Validation: 0.893


In [27]:
test_data = load_data('pa2test.txt')
validation_data = load_data('pa2validation.txt')
n_features = validation_data.shape[1] - 1 


best_n_trees_rf = 100
best_min_features = 5
best_max_features = 15


best_tree, _ = tune_tree(load_data('pa2train.txt'), validation_data)

feature_indices_rf = generate_random_y_index(n_features, best_n_trees_rf, best_min_features, best_max_features)
bags = bagging(load_data('pa2train.txt'), best_n_trees_rf)
random_forest = random_forest(bags, feature_indices_rf)

accuracy_rf = evaluate_forests(random_forest, feature_indices_rf, test_data)
print(f" Test Data Accuracy: {accuracy_rf}")

predictions_dt = best_tree.predict(test_data[:, :-1])
accuracy_dt = accuracy_score(test_data[:, -1], predictions_dt)
print(f"Accuracy on Decision Tree Test Data: {accuracy_dt}")

 Test Data Accuracy: 0.872
Accuracy on Decision Tree Test Data: 0.897
