In [1]:
import numpy as np

def entropy(y):
    """
    Shannon entropy, in bits, of the label collection ``y``.

    The probability of each distinct label is its number of occurrences
    divided by ``len(y)``.
    """
    _, label_counts = np.unique(y, return_counts=True)
    p = label_counts / len(y)
    return -(p * np.log2(p)).sum()


In [2]:

def information_gain(X, y, feature_idx):
    """
    Entropy reduction obtained by partitioning ``y`` on one column of ``X``.

    The result is the entropy of ``y`` minus the size-weighted sum of the
    entropies of the label subsets selected by each distinct value found in
    column ``feature_idx``.
    """
    baseline = entropy(y)

    column = X[:, feature_idx]
    n_rows = len(X)
    distinct, sizes = np.unique(column, return_counts=True)

    # Accumulate each partition's entropy, weighted by its share of the rows.
    remainder = 0
    for level, size in zip(distinct, sizes):
        remainder += size / n_rows * entropy(y[column == level])

    return baseline - remainder



In [3]:

def find_best_split(X, y):
    """
    Pick the column of ``X`` whose split yields the largest information gain.

    Returns the index of the first column achieving the highest gain, or
    ``None`` when ``X`` has no columns.
    """
    # Score every column first, then select the winner.
    gains = [information_gain(X, y, col) for col in range(X.shape[1])]

    chosen, top_gain = None, -1
    for col, gain in enumerate(gains):
        # Strict comparison keeps the earliest column on ties.
        if gain > top_gain:
            chosen, top_gain = col, gain
    return chosen


In [4]:

class DecisionTree:
    """
    Categorical decision tree grown greedily with information-gain splits.

    An internal node is a dict ``{feature_idx: {value: subtree}}`` holding one
    branch per distinct value of the chosen feature. A leaf is the majority
    class label of the training samples that reached it.

    Parameters
    ----------
    max_depth : int or None, default=None
        Maximum number of splits on any root-to-leaf path. ``None`` grows the
        tree until every leaf is pure or no feature can separate the
        remaining samples.

    Attributes
    ----------
    tree_ : dict or integer
        The fitted tree (a nested dict, or a bare label when the root is a
        leaf). Set by ``fit``.
    """

    def __init__(self, max_depth=None):
        self.max_depth = max_depth

    def fit(self, X, y):
        """
        Grow the tree from training data and store it in ``self.tree_``.

        Parameters
        ----------
        X : ndarray of shape (n_samples, n_features)
            Feature matrix; every column is treated as categorical.
        y : ndarray of shape (n_samples,)
            Class labels. They must be non-negative integers because leaves
            are computed with ``np.bincount``.

        Raises
        ------
        ValueError
            If ``y`` is empty.
        """
        if len(y) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")
        self.tree_ = self._grow_tree(X, y)

    @staticmethod
    def _majority_class(y):
        """Return the most frequent label in ``y`` (lowest label wins ties)."""
        return np.argmax(np.bincount(y))

    def _grow_tree(self, X, y, depth=0):
        """
        Recursively build the subtree for the samples ``(X, y)``.

        Returns a leaf label when the labels are pure, when ``max_depth`` is
        reached, or when no feature can partition the samples; otherwise
        returns a ``{feature_idx: {value: subtree}}`` dict.
        """
        depth_exhausted = self.max_depth is not None and depth >= self.max_depth
        if len(np.unique(y)) == 1 or depth_exhausted:
            return self._majority_class(y)

        # Only a column with at least two distinct values can partition the
        # samples. Splitting on a constant column would hand the identical
        # (X, y) to the child and recurse forever when rows with equal
        # features carry different labels.
        candidates = [
            idx for idx in range(X.shape[1]) if len(np.unique(X[:, idx])) > 1
        ]
        if not candidates:
            return self._majority_class(y)

        best = find_best_split(X[:, candidates], y)
        if best is None:
            return self._majority_class(y)
        # Map the position inside the candidate sub-matrix back to X's column.
        feature_idx = candidates[best]

        column = X[:, feature_idx]
        branches = {}
        for value in np.unique(column):
            mask = column == value
            # Every child receives strictly fewer samples, so recursion ends.
            branches[value] = self._grow_tree(X[mask], y[mask], depth + 1)
        return {feature_idx: branches}

    def predict(self, X):
        """
        Predict a class label for every row of ``X``.

        Returns
        -------
        ndarray
            One predicted label per row, in row order.
        """
        return np.array([self._predict(inputs) for inputs in X])

    def _predict(self, inputs):
        """
        Walk the fitted tree for a single sample and return its leaf label.

        Raises
        ------
        KeyError
            If the sample holds a feature value for which the visited node
            has no branch (a value not seen at that node during training).
        """
        node = self.tree_
        while isinstance(node, dict):
            # Each internal node has exactly one key: the feature it tests.
            feature_idx = next(iter(node))
            node = node[feature_idx][inputs[feature_idx]]
        return node

In [5]:

# XOR sanity check: all four combinations of two binary features, labelled
# with the exclusive-or of the two columns.
X = np.array([[a, b] for a in (0, 1) for b in (0, 1)])
y = X[:, 0] ^ X[:, 1]

tree = DecisionTree()
tree.fit(X, y)
# The tree should reproduce the training labels exactly.
print(tree.predict(X))


[0 1 1 0]


### Training Dataset

In [9]:
import numpy as np

# Training set: 14 rows, each holding four feature values (one integer and
# three strings) followed by a 'yes'/'no' class label in the last position.
# NOTE(review): the columns look like the textbook "buys_computer" example
# (age, income, student, credit_rating, label) — the column names are not
# stated anywhere in this notebook; confirm before relying on them.
data = [
    [30, 'high', 'no', 'fair', 'no'],
    [30, 'high', 'no', 'excellent', 'no'],
    [31, 'medium', 'no', 'fair', 'yes'],
    [40, 'low', 'no', 'fair', 'yes'],
    [40, 'low', 'yes', 'fair', 'yes'],
    [40, 'low', 'yes', 'excellent', 'no'],
    [31, 'medium', 'yes', 'excellent', 'yes'],
    [30, 'high', 'no', 'fair', 'no'],
    [30, 'medium', 'yes', 'fair', 'yes'],
    [31, 'medium', 'yes', 'excellent', 'yes'],
    [31, 'high', 'no', 'excellent', 'yes'],
    [40, 'medium', 'no', 'fair', 'yes'],
    [40, 'high', 'yes', 'fair', 'yes'],
    [31, 'medium', 'no', 'excellent', 'no']
]




In [10]:

# Integer code for each textual class label.
label_map = dict(yes=1, no=0)

# NumPy coerces the mixed int/str rows to a single string dtype.
data = np.array(data)

# Every column except the last is a feature; the last column is the label.
X = data[:, :-1]
y = np.array([label_map[row[-1]] for row in data])

print("Features:", X, "Labels:", y, sep="\n")


Features:
[['30' 'high' 'no' 'fair']
 ['30' 'high' 'no' 'excellent']
 ['31' 'medium' 'no' 'fair']
 ['40' 'low' 'no' 'fair']
 ['40' 'low' 'yes' 'fair']
 ['40' 'low' 'yes' 'excellent']
 ['31' 'medium' 'yes' 'excellent']
 ['30' 'high' 'no' 'fair']
 ['30' 'medium' 'yes' 'fair']
 ['31' 'medium' 'yes' 'excellent']
 ['31' 'high' 'no' 'excellent']
 ['40' 'medium' 'no' 'fair']
 ['40' 'high' 'yes' 'fair']
 ['31' 'medium' 'no' 'excellent']]
Labels:
[0 0 1 1 1 0 1 0 1 1 1 1 1 0]
