### Import Libraries

In [1]:
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.metrics import f1_score
import numpy as np
import pandas as pd
import csv
from sklearn.metrics import accuracy_score

In [3]:
class ID3DecisionTree:
    def __init__(self, max_depth=None, min_samples_split=2, criterion="information_gain"):
        self.max_depth = max_depth
        self.min_samples_split = min_samples_split
        self.criterion = criterion
        self.tree = None

    def fit(self, X, y):
        self.tree = self._build_tree(X, y, depth=0)

    def predict(self, X):
        return np.array([self._predict_sample(sample, self.tree) for _, sample in X.iterrows()])

    def _build_tree(self, X, y, depth):
        if len(y) == 0:
            return None 
        if depth == self.max_depth or len(set(y)) == 1 or len(X) < self.min_samples_split:
            return np.bincount(y).argmax()

        best_feature, best_threshold = self._find_best_split(X, y)

        if best_feature is None:
            return np.bincount(y).argmax() 

        # Create a decision node
        left_index = X[best_feature] < best_threshold
        right_index = X[best_feature] >= best_threshold

        left_subtree = self._build_tree(X[left_index], y[left_index], depth + 1)
        right_subtree = self._build_tree(X[right_index], y[right_index], depth + 1)

        return {best_feature: {best_threshold: left_subtree, "else": right_subtree}}

    def _find_best_split(self, X, y):
        best_gain = -float("inf")
        best_feature = None
        best_threshold = None

        for feature in X.columns:
            thresholds = X[feature].unique()
            for threshold in thresholds: 
                gain = self._calculate_gain(X, y, feature, threshold)
                if gain > best_gain:
                    best_gain = gain
                    best_feature = feature
                    best_threshold = threshold

        return best_feature, best_threshold 

    def _calculate_gain(self, X, y, feature, threshold):
        if self.criterion == "information_gain":
            return self._information_gain(X, y, feature, threshold)
        elif self.criterion == "information_gain_ratio":
            return self._information_gain_ratio(X, y, feature, threshold)
        elif self.criterion == "gini_index":
            return self._gini_index(X, y, feature, threshold)

    def _information_gain(self, X, y, feature, threshold):
        total_entropy = self._entropy(y)
        left_index = X[feature] < threshold
        right_index = X[feature] >= threshold

        left_entropy = self._entropy(y[left_index])
        right_entropy = self._entropy(y[right_index])

        weighted_left_entropy = len(y[left_index]) / len(y) * left_entropy
        weighted_right_entropy = len(y[right_index]) / len(y) * right_entropy

        return total_entropy - (weighted_left_entropy + weighted_right_entropy)

    def _information_gain_ratio(self, X, y, feature, threshold):
        information_gain = self._information_gain(X, y, feature, threshold)
        intrinsic_value = self._entropy(y)

        if intrinsic_value == 0:
            return 0
        else:
            return information_gain / intrinsic_value

    def _gini_index(self, X, y, feature, threshold):
        total_gini = self._gini(y)
        left_index = X[feature] < threshold
        right_index = X[feature] >= threshold

        left_gini = self._gini(y[left_index])
        right_gini = self._gini(y[right_index])

        weighted_left_gini = len(y[left_index]) / len(y) * left_gini
        weighted_right_gini = len(y[right_index]) / len(y) * right_gini

        return total_gini - (weighted_left_gini + weighted_right_gini)

    def _entropy(self, y):
        value_counts = np.bincount(y)
        probabilities = value_counts / len(y)
        probabilities = probabilities[probabilities > 0]
        return -np.sum(probabilities * np.log2(probabilities))

    def _gini(self, y):
        value_counts = np.bincount(y)
        probabilities = value_counts / len(y)
        return 1 - np.sum(probabilities ** 2)

    def _predict_sample(self, sample, tree):
        if isinstance(tree, dict):
            feature = next(iter(tree))
            thresholds = tree[feature]
            threshold = next(iter(thresholds))
        
            if sample[feature] is None:
                return 0 

            if sample[feature] < threshold:
                return self._predict_sample(sample, thresholds[threshold])
            else:
                return self._predict_sample(sample, thresholds["else"])
        else:
            return tree


In [4]:
data = pd.read_csv("titanic.csv")
data=  data.drop(data.columns[0], axis=1)
data["age"] = data["age"].fillna(data["age"].mean())
data["embarked"] = data["embarked"].fillna(data["embarked"].mode()[0])


data['fare_bin'] = pd.qcut(data['fare'], 4, labels=False)
data = data.drop('fare', axis=1)
label_encoder = LabelEncoder()
data["sex"] = label_encoder.fit_transform(data["sex"])
data["embarked"] = label_encoder.fit_transform(data["embarked"])

X = data.drop("survived", axis=1)
y = data["survived"]

In [5]:
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.4, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_val, y_val, test_size=0.5, random_state=42)

In [6]:
best_params = {}
best_f1 = 0
results = []

for max_depth in [3, 5, 9, 11]:
    for min_samples_split in [2, 10, 30, 40]:
        for criterion in ["information_gain", "information_gain_ratio", "gini_index"]:
            tree = ID3DecisionTree(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion)
            tree.fit(X_train, y_train)
            y_pred_val = tree.predict(X_val)
            f1 = f1_score(y_val, y_pred_val)
            results.append([criterion, max_depth, min_samples_split, f1])
            
            if f1 > best_f1:
                best_params = {"max_depth": max_depth, "min_samples_split": min_samples_split, "criterion": criterion}
                best_f1 = f1

with open("results.csv", "w", newline="") as csvfile:
    fieldnames = ["criterion", "max_depth", "min_samples_split", "f1_score"]
    writer = csv.writer(csvfile)
    writer.writerow(fieldnames)
    writer.writerows(results)

print("Best parameters:", best_params)

Best parameters: {'max_depth': 9, 'min_samples_split': 10, 'criterion': 'gini_index'}


In [7]:
tree = ID3DecisionTree(**best_params)

best_test_f1 = 0
best_test_accuracy = 0
for max_depth in [3, 5, 9, 11]:
    for min_samples_split in [2, 10, 30, 40]:
        for criterion in ["information_gain", "information_gain_ratio", "gini_index"]:
            tree = ID3DecisionTree(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion)
            tree.fit(pd.concat([X_train, X_val]), pd.concat([y_train, y_val]))
            y_pred = tree.predict(X_test)
            y_pred = np.nan_to_num(y_pred)
            y_pred = y_pred.astype(float)
            y_pred = np.nan_to_num(y_pred, nan=0)
            f1 = f1_score(y_test, y_pred)
            accuracy = accuracy_score(y_test, y_pred)
            if f1 > best_test_f1:
                best_test_f1 = f1
            if accuracy > best_test_accuracy:
                best_test_accuracy = accuracy
            print("max_depth: ", max_depth, "min_samples_split: ", min_samples_split, "f1_score: ", f1, "accuracy: ", accuracy)

max_depth:  3 min_samples_split:  2 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  2 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  2 f1_score:  0.6896551724137931 accuracy:  0.7482517482517482
max_depth:  3 min_samples_split:  10 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  10 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  10 f1_score:  0.6896551724137931 accuracy:  0.7482517482517482
max_depth:  3 min_samples_split:  30 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  30 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  30 f1_score:  0.6896551724137931 accuracy:  0.7482517482517482
max_depth:  3 min_samples_split:  40 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 min_samples_split:  40 f1_score:  0.640625 accuracy:  0.6783216783216783
max_depth:  3 mi

In [8]:
print(best_test_accuracy)
print(best_test_f1)

0.8181818181818182
0.74
