# BOW

# In [None]:
import numpy as np
import pandas as pd

class DecisionTreeID3:
    """ID3 decision tree classifier that splits on exact attribute values.

    After ``fit`` the tree is stored in ``self.tree`` as nested dicts of the
    form ``{attribute: {value: subtree_or_label}}``; a leaf is a bare class
    label rather than a dict.
    """

    def __init__(self, max_depth=None):
        """Create an unfitted tree.

        Args:
            max_depth: Maximum number of splits along any root-to-leaf path,
                or ``None`` for no limit.
        """
        self.max_depth = max_depth
        self.tree = None

    def fit(self, features, target):
        """Build the tree from a feature DataFrame and its class labels.

        Args:
            features: DataFrame with one column per attribute. It is copied,
                never modified.
            target: Class labels, one per row of ``features``. May be a
                pandas Series (aligned to ``features`` by index on
                assignment) or any array-like of matching length.
        """
        data = features.copy()

        # The labels travel with the rows as an extra column. Keep the
        # Series' own name when it is usable; otherwise (unnamed Series,
        # plain array/list, or a name that would overwrite a feature column)
        # fall back to a placeholder that is guaranteed not to collide.
        target_column = getattr(target, "name", None)
        if target_column is None or target_column in features.columns:
            target_column = "__target__"
            while target_column in features.columns:
                target_column += "_"

        data[target_column] = target
        self.tree = self._id3(data, target_column, list(features.columns), depth=0)

    def _id3(self, data, target, attributes, depth):
        """Recursively grow a subtree for the rows in ``data``.

        Args:
            data: Rows reaching this node, including the ``target`` column.
            target: Name of the label column in ``data``.
            attributes: Attribute names still available for splitting.
            depth: Number of splits already made above this node.

        Returns:
            A class label (leaf) or a ``{attribute: {value: subtree}}`` dict.
        """
        labels = data[target]

        # Pure node: every row carries the same label.
        if len(np.unique(labels)) == 1:
            return labels.iloc[0]

        # Nothing left to split on, or depth budget spent: majority label.
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(attributes) == 0 or depth_exhausted:
            return labels.mode()[0]

        best_attribute = self._select_best_attribute(data, target, attributes)
        # An attribute is used at most once along a path.
        remaining = [name for name in attributes if name != best_attribute]

        branches = {}
        for value in np.unique(data[best_attribute]):
            subset = data[data[best_attribute] == value].drop(columns=[best_attribute])
            branches[value] = self._id3(subset, target, remaining, depth + 1)
        return {best_attribute: branches}

    def _select_best_attribute(self, data, target, attributes):
        """Return the attribute with the highest information gain.

        Ties go to the attribute that appears first in ``attributes``.
        """
        gains = {
            attribute: self._information_gain(data, target, attribute)
            for attribute in attributes
        }
        return max(gains, key=gains.get)

    def _information_gain(self, data, target, attribute):
        """Return the entropy reduction obtained by splitting on ``attribute``."""
        total_entropy = self._entropy(data[target])
        values, counts = np.unique(data[attribute], return_counts=True)
        n_rows = np.sum(counts)
        # Entropy of each partition, weighted by the share of rows it holds.
        weighted_entropy = sum(
            (count / n_rows) * self._entropy(data[data[attribute] == value][target])
            for value, count in zip(values, counts)
        )
        return total_entropy - weighted_entropy

    def _entropy(self, data):
        """Return the Shannon entropy (base 2) of a Series of labels.

        An empty Series has entropy 0.
        """
        if data.empty:
            return 0
        probabilities = data.value_counts() / len(data)
        return sum(-p * np.log2(p) if p != 0 else 0 for p in probabilities)

    def predict(self, features):
        """Predict a label for every row of ``features``.

        Args:
            features: DataFrame containing the attribute columns used in
                training.

        Returns:
            A NumPy array with one prediction per row. Rows that hit an
            attribute value not seen during training contribute ``np.nan``
            (NumPy may coerce it to match the dtype of the other labels).

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTreeID3 must be fitted before calling predict().")
        return np.array([self._predict(self.tree, row) for _, row in features.iterrows()])

    def _predict(self, tree, instance):
        """Walk ``tree`` for a single row and return the label it reaches.

        Returns ``np.nan`` when the row's value for a split attribute has no
        branch in the tree.
        """
        # Anything that is not a dict is a leaf label.
        if not isinstance(tree, dict):
            return tree
        attribute = next(iter(tree))
        branches = tree[attribute]
        value = instance[attribute]
        if value in branches:
            return self._predict(branches[value], instance)
        return np.nan  # Value never seen for this attribute during training

def load_data(filepaths):
    """Read one CSV, or a list of CSVs stacked row-wise, and split off the label.

    Args:
        filepaths: A single path accepted by ``pd.read_csv``, or a ``list``
            of such paths. A list is concatenated with a fresh 0..n-1 index.

    Returns:
        ``(features, target)``: every column after the first as a DataFrame,
        and the first column (taken to be the label) as a Series.
    """
    if not isinstance(filepaths, list):
        table = pd.read_csv(filepaths)
    else:
        table = pd.concat(
            [pd.read_csv(path) for path in filepaths],
            ignore_index=True,
        )

    # Column 0 holds the label; everything to its right is a feature.
    label_column = table.iloc[:, 0]
    feature_columns = table.iloc[:, 1:]
    return feature_columns, label_column

def calculate_accuracy(predictions, actual):
    """Return the fraction of predictions that equal the true labels.

    Both inputs are converted to NumPy arrays and compared position by
    position, so plain lists work the same as arrays or Series (comparing
    two lists directly with ``==`` would yield a single bool, not one
    result per element).

    Args:
        predictions: Array-like of predicted labels.
        actual: Array-like of true labels, same shape as ``predictions``.

    Returns:
        The mean of the element-wise equality, between 0 and 1. Empty inputs
        give NaN, as ``np.mean`` does for an empty array.

    Raises:
        ValueError: If the two inputs do not have the same shape.
    """
    predicted = np.asarray(predictions)
    expected = np.asarray(actual)
    if predicted.shape != expected.shape:
        raise ValueError(
            f"predictions and actual must have the same shape, "
            f"got {predicted.shape} and {expected.shape}"
        )
    return np.mean(predicted == expected)

def cross_validation_and_final_test(train_folder, test_filepath, depths, train_filepath='train.csv'):
    """Pick a tree depth by 5-fold cross-validation, then score it on the test set.

    For each candidate depth, a ``DecisionTreeID3`` is trained on four of the
    five split files and validated on the fifth, rotating through all five.
    The depth with the highest mean validation accuracy is then used to train
    a final tree on ``train_filepath``, which is evaluated on
    ``test_filepath``. Per-fold, per-depth and final accuracies are printed.

    Args:
        train_folder: Directory holding ``training00.csv`` .. ``training04.csv``.
        test_filepath: CSV with the held-out test data.
        depths: Iterable of ``max_depth`` values to try.
        train_filepath: CSV with the full training data used for the final
            model. Defaults to ``'train.csv'``.

    Returns:
        ``(best_depth, final_test_accuracy)``. ``best_depth`` is ``None`` if
        ``depths`` is empty, in which case the final tree has no depth limit.
    """
    split_files = [f"{train_folder}/training0{i}.csv" for i in range(5)]
    test_features, test_target = load_data(test_filepath)
    best_depth = None
    # Start below any reachable accuracy so that a depth is still selected
    # when every candidate averages exactly 0.
    best_accuracy = float("-inf")

    for depth in depths:
        accuracies = []
        for i, validation_file in enumerate(split_files):
            # Train on every split except the one held out for validation.
            train_files = [f for f in split_files if f != validation_file]
            train_features, train_target = load_data(train_files)
            validation_features, validation_target = load_data(validation_file)

            tree = DecisionTreeID3(max_depth=depth)
            tree.fit(train_features, train_target)
            predictions = tree.predict(validation_features)
            accuracy = calculate_accuracy(predictions, validation_target)
            accuracies.append(accuracy)
            print(f"Fold {i}, Depth {depth}, Validation Accuracy: {accuracy:.2f}")

        average_accuracy = np.mean(accuracies)
        print(f"Average Accuracy for Depth {depth}: {average_accuracy:.2f}")

        # Strict comparison: on a tie the earlier depth in `depths` is kept.
        if average_accuracy > best_accuracy:
            best_accuracy = average_accuracy
            best_depth = depth

    # Train the final model on the full training dataset with the best depth
    train_features, train_target = load_data(train_filepath)
    final_tree = DecisionTreeID3(max_depth=best_depth)
    final_tree.fit(train_features, train_target)
    final_predictions = final_tree.predict(test_features)
    final_test_accuracy = calculate_accuracy(final_predictions, test_target)
    print(f"Final Test Accuracy with Depth {best_depth}: {final_test_accuracy:.2f}")
    return best_depth, final_test_accuracy

# Script entry point: cross-validate depths 5 and 10 on the splits in
# "CVSplits", then report accuracy on "test.csv".
if __name__ == "__main__":
    cross_validation_and_final_test(
        train_folder="CVSplits",
        test_filepath="test.csv",
        depths=[5, 10],
    )


# faster maybe