In [10]:
import numpy as np
import pandas as pd


In [11]:

def calculate_entropy(y):
    """
    Calculate the entropy of a set of labels.

    Args:
        y (array-like): Target variable labels.

    Returns:
        float: Entropy value.
    """
    unique_labels, label_counts = np.unique(y, return_counts=True)
    probabilities = label_counts / len(y)
    entropy = -np.sum(probabilities * np.log2(probabilities))
    return entropy


# Example usage:
# Assuming X and y are your data and target variables.
# node_indices initially contains all indices, i.e., np.arange(len(y)).
# You can call the function like this:
# build_tree_recursive(X, y, node_indices=np.arange(len(y)), branch_name="Root", max_depth=3, current_depth=0)


In [12]:

def find_best_split(X, y, node_indices):
    """
    Find the best feature and value to split the dataset based on information gain.

    Args:
        X (ndarray): Data matrix of shape (n_samples, n_features).
        y (array-like): Target variable labels.
        node_indices (ndarray): List containing the active indices.

    Returns:
        tuple: Best feature index, best split value.
    """
    best_feature = None
    best_value = None
    best_information_gain = -1

    current_entropy = calculate_entropy(y[node_indices])

    for feature_index in range(X.shape[1]):
        unique_values = np.unique(X[node_indices, feature_index])

        for value in unique_values:
            left_indices = node_indices[X[node_indices, feature_index] <= value]
            right_indices = node_indices[X[node_indices, feature_index] > value]

            if len(left_indices) == 0 or len(right_indices) == 0:
                continue

            left_entropy = calculate_entropy(y[left_indices])
            right_entropy = calculate_entropy(y[right_indices])

            information_gain = current_entropy - (
                    len(left_indices) / len(node_indices) * left_entropy +
                    len(right_indices) / len(node_indices) * right_entropy
            )

            if information_gain > best_information_gain:
                best_information_gain = information_gain
                best_feature = feature_index
                best_value = value

    return best_feature, best_value



In [16]:

def build_tree_recursive(X, y, node_indices, branch_name, max_depth, current_depth):
    """
    Build a tree using the recursive algorithm that splits the dataset into 2 subgroups at each node.
    This function just prints the tree.

    Args:
        X (ndarray):            Data matrix of shape (n_samples, n_features).
        y (array-like):         List or ndarray with n_samples containing the target variable.
        node_indices (ndarray): List containing the active indices, i.e., the samples being considered in this step.
        branch_name (string):   Name of the branch. ['Root', 'Left', 'Right'].
        max_depth (int):        Max depth of the resulting tree.
        current_depth (int):    Current depth. Parameter used during recursive call.
    """
    unique_labels = np.unique(y[node_indices])
    print(f"{branch_name} Depth: {current_depth}, Samples: {len(node_indices)}, Class Distribution: {dict(zip(unique_labels, np.bincount(y[node_indices])))}")

    if current_depth == max_depth or len(unique_labels) == 1:
        return

    best_feature, best_value = find_best_split(X, y, node_indices)

    if best_feature is not None:
        print(f"{branch_name} Splitting at Feature {best_feature} with Value {best_value}")
        left_indices = node_indices[X[node_indices, best_feature] <= best_value]
        right_indices = node_indices[X[node_indices, best_feature] > best_value]

        build_tree_recursive(X, y, left_indices, branch_name="Left", max_depth=max_depth, current_depth=current_depth + 1)
        build_tree_recursive(X, y, right_indices, branch_name="Right", max_depth=max_depth, current_depth=current_depth + 1)



You will start by loading the dataset for this task. The dataset you have collected is as follows:

| glasses | Tall | Long hair | man/woman(0/1) |
|:-------:|:----:|:---------:|:--------------:|
|    1    |  1   |     1     |       1        |
|    0    |  0   |     1     |       1        |
|    1    |  0   |     0     |       0        |
|    1    |  0   |     0     |       0        |
|    0    |  0   |     1     |       1        |
|    1    |  1   |     1     |       0        |
|    1    |  0   |     0     |       0        |
|    0    |  1   |     1     |       1        |
|    0    |  1   |     1     |       1        |
|    0    |  1   |     1     |       0        |

In [19]:

# Create a DataFrame from the provided data
data = {
    'glasses': [1, 0, 1, 1, 0, 1, 1, 0, 0, 0],
    'Tall': [1, 0, 0, 0, 0, 1, 0, 1, 1, 1],
    'Long_hair': [1, 1, 0, 0, 1, 1, 0, 1, 1, 1],
    'man_woman': [1, 1, 0, 0, 1, 0, 0, 1, 1, 0]
}

df = pd.DataFrame(data)

# Display the DataFrame
print("Original Dataset:")
print(df)
print("\n")

# Convert DataFrame to NumPy arrays
X = df[['glasses', 'Tall', 'Long_hair']].values
y = df['man_woman'].values

# Run the build_tree_recursive function on the dataset
build_tree_recursive(X, y, node_indices=np.arange(len(y)), branch_name="Root", max_depth=3, current_depth=0)


Original Dataset:
   glasses  Tall  Long_hair  man_woman
0        1     1          1          1
1        0     0          1          1
2        1     0          0          0
3        1     0          0          0
4        0     0          1          1
5        1     1          1          0
6        1     0          0          0
7        0     1          1          1
8        0     1          1          1
9        0     1          1          0


Root Depth: 0, Samples: 10, Class Distribution: {0: 5, 1: 5}
Root Splitting at Feature 2 with Value 0
Left Depth: 1, Samples: 3, Class Distribution: {0: 3}
Right Depth: 1, Samples: 7, Class Distribution: {0: 2, 1: 5}
Right Splitting at Feature 1 with Value 0
Left Depth: 2, Samples: 2, Class Distribution: {1: 0}
Right Depth: 2, Samples: 5, Class Distribution: {0: 2, 1: 3}
Right Splitting at Feature 0 with Value 0
Left Depth: 3, Samples: 3, Class Distribution: {0: 1, 1: 2}
Right Depth: 3, Samples: 2, Class Distribution: {0: 1, 1: 1}
