In [2]:

import math
from collections import Counter

def entropy(data):
    """Return the Shannon entropy, in bits, of the class labels in ``data``.

    The label of every row is read from its last element. With no rows
    there are no label frequencies to sum, so the result is 0.
    """
    tally = Counter(row[-1] for row in data)
    n_rows = sum(tally.values())
    # Relative frequency of each distinct label, in first-seen order.
    fractions = (hits / n_rows for hits in tally.values())
    return -sum(p * math.log2(p) for p in fractions)


In [3]:
def split_dataset(data, feature_index, value):
    """Select the rows whose ``feature_index`` column equals ``value``.

    Each selected row is returned as a new sequence with that column
    removed; the input rows themselves are not modified.
    """
    reduced = []
    for row in data:
        if row[feature_index] == value:
            head, tail = row[:feature_index], row[feature_index + 1:]
            reduced.append(head + tail)
    return reduced


In [4]:
def info_gain(data, feature_index):
    """Return the information gain from splitting ``data`` on one feature.

    The gain is the entropy of the whole dataset minus the size-weighted
    average entropy of the subsets that share a value in column
    ``feature_index``.

    Args:
        data: sequence of rows; the class label is the last element of
            each row (that is what ``entropy`` reads).
        feature_index: column whose values define the split. The values
            must be hashable.

    Returns:
        The entropy reduction as a number; 0 for empty ``data``.
    """
    base_entropy = entropy(data)
    total = len(data)

    # Group rows in a single pass instead of rescanning ``data`` once per
    # distinct value. A dict keeps first-seen order, so the floating-point
    # accumulation below is the same on every run (iterating a set of
    # strings is not stable across interpreter sessions).
    groups = {}
    for row in data:
        groups.setdefault(row[feature_index], []).append(row)

    weighted_entropy = 0
    for subset in groups.values():
        prob = len(subset) / total
        weighted_entropy += prob * entropy(subset)

    return base_entropy - weighted_entropy



In [5]:
def best_feature(data):
    """Return the column index of the feature with the largest information gain.

    Every column except the last one (the label) is scored with
    ``info_gain``. Ties go to the lowest index, and index 0 is returned
    when no feature has a gain strictly greater than zero.
    """
    feature_count = len(data[0]) - 1
    gains = [info_gain(data, column) for column in range(feature_count)]
    top_gain = max(gains, default=0)
    if top_gain > 0:
        # list.index yields the first column that reaches the maximum.
        return gains.index(top_gain)
    return 0






In [6]:
def majority_class(data):
    """Return the most frequent class label (last element of each row).

    When several labels share the top count, the one encountered first
    in ``data`` is returned.
    """
    tally = Counter(row[-1] for row in data)
    ranked = tally.most_common(1)
    winner, _hits = ranked[0]
    return winner

def id3(data, features, default=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Args:
        data: list of rows; each row holds the feature values followed by
            the class label in the last position.
        features: names of the feature columns, in column order, so that
            ``features[i]`` names column ``i`` of every row.
        default: value returned when ``data`` is empty, i.e. there are no
            examples to learn from. Defaults to ``None``.

    Returns:
        A class label when the node is a leaf, otherwise a dict of the
        form ``{feature_name: {feature_value: subtree, ...}}``.
    """
    # No examples at all: there is no label to read, so hand back the
    # caller-supplied default instead of failing on ``labels[0]``.
    if len(data) == 0:
        return default

    labels = [row[-1] for row in data]
    # Pure node: every row carries the same label.
    if labels.count(labels[0]) == len(labels):
        return labels[0]
    # Only the label column is left, so nothing remains to split on.
    if len(data[0]) == 1:
        return majority_class(data)

    best = best_feature(data)
    best_feat_name = features[best]
    # The chosen column is dropped from the rows by split_dataset, so drop
    # its name as well to keep names and columns aligned.
    sub_features = features[:best] + features[best+1:]

    # NOTE(review): when the remaining features cannot separate rows with
    # different labels, recursion continues through single-branch nodes
    # until only the label column is left and majority_class decides.
    branches = {}
    # dict.fromkeys de-duplicates while keeping first-seen order, which
    # makes the branch order reproducible from run to run.
    for value in dict.fromkeys(row[best] for row in data):
        subset = split_dataset(data, best, value)
        branches[value] = id3(subset, sub_features, default)
    return {best_feat_name: branches}


In [7]:
from pprint import pprint

# Example dataset: every row lists the feature values first and the class
# label ('Yes' / 'No') in the last position.
dataset = [
    ['Sunny', 'Hot', 'High', 'No'],
    ['Sunny', 'Hot', 'High', 'No'],
    ['Overcast', 'Hot', 'High', 'Yes'],
    ['Rainy', 'Mild', 'High', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'Yes'],
    ['Rainy', 'Cool', 'Normal', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Yes'],
    ['Sunny', 'Mild', 'High', 'No'],
]

# Names of the feature columns, in column order; id3 looks names up by
# column index, so the label column has no entry here.
features = ['Outlook', 'Temperature', 'Humidity']

# Build the tree and pretty-print the nested dictionaries.
tree = id3(dataset, features)
pprint(tree)


{'Outlook': {'Overcast': 'Yes',
             'Rainy': {'Temperature': {'Cool': {'Humidity': {'Normal': 'Yes'}},
                                       'Mild': 'Yes'}},
             'Sunny': 'No'}}
