In [1]:
import pandas as pd
import numpy as np
from math import log2

# Hand-entered loan-application records from the question, one tuple per applicant
data = pd.DataFrame(
    [
        ('High',   'High',   'Low',    'No',  'Yes'),
        ('Low',    'Medium', 'High',   'Yes', 'No'),
        ('Medium', 'High',   'Medium', 'No',  'Yes'),
        ('Low',    'Low',    'Medium', 'Yes', 'No'),
        ('High',   'Medium', 'Low',    'No',  'Yes'),
    ],
    columns=['CreditScore', 'IncomeLevel', 'LoanAmount', 'ExistingLoan', 'Approved'],
)

# Bare expression so the notebook renders the frame as the cell output
data

Unnamed: 0,CreditScore,IncomeLevel,LoanAmount,ExistingLoan,Approved
0,High,High,Low,No,Yes
1,Low,Medium,High,Yes,No
2,Medium,High,Medium,No,Yes
3,Low,Low,Medium,Yes,No
4,High,Medium,Low,No,Yes


In [2]:
# Entropy calculation function
def entropy(column):
    """Return the Shannon entropy, in bits, of the labels in ``column``.

    Parameters
    ----------
    column : array-like
        Class labels; anything ``np.unique`` accepts (list, array, Series).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` taken over the relative frequency ``p`` of each
        distinct label. An empty input yields zero.
    """
    _, counts = np.unique(column, return_counts=True)
    # The sample size is identical for every label, so compute it once
    # instead of re-summing the counts inside the loop.
    total = np.sum(counts)
    return -np.sum([(count / total) * log2(count / total) for count in counts])

# Baseline impurity of the class label (Approved), measured before any split
target_entropy = entropy(data.loc[:, 'Approved'])
print(f"Entropy of Approved: {target_entropy:.4f}")


Entropy of Approved: 0.9710


In [3]:
# Information Gain function
def info_gain(data, feature, target='Approved'):
    """Return the information gain obtained by splitting ``data`` on ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the ``feature`` and ``target`` columns.
    feature : str
        Name of the column to split on.
    target : str, default 'Approved'
        Name of the class-label column.

    Returns
    -------
    numpy.float64
        Entropy of ``target`` over all rows minus the size-weighted average
        entropy of ``target`` within each distinct value of ``feature``.
    """
    total_entropy = entropy(data[target])
    values, counts = np.unique(data[feature], return_counts=True)
    n_rows = np.sum(counts)

    # Pick each partition's labels with a boolean mask. Going through
    # ``data.where(...).dropna()`` would also discard rows that merely have a
    # missing value in some unrelated column (and upcast numeric columns), so
    # the partition scored would no longer match the weight given to it.
    weighted_entropy = np.sum([
        (count / n_rows) * entropy(data.loc[data[feature] == value, target])
        for value, count in zip(values, counts)
    ])

    return total_entropy - weighted_entropy

# Calculate info gain for each feature. The label column is excluded by name
# rather than by position, so the loop stays correct if 'Approved' is not the
# last column of the frame.
for column in [name for name in data.columns if name != 'Approved']:
    gain = info_gain(data, column)
    print(f"Information Gain for {column}: {gain:.4f}")


Information Gain for CreditScore: 0.9710
Information Gain for IncomeLevel: 0.5710
Information Gain for LoanAmount: 0.5710
Information Gain for ExistingLoan: 0.9710


In [4]:
# ID3 Algorithm: simplified version
def _majority_class(labels):
    """Return the most frequent value in ``labels`` (first in sorted order on ties)."""
    classes, counts = np.unique(labels, return_counts=True)
    return classes[np.argmax(counts)]


def id3(data, original_data, features, target='Approved', parent_node_class=None):
    """Recursively build an ID3 decision tree as nested dictionaries.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows that reached the current node.
    original_data : pandas.DataFrame
        Rows used as the fallback sample when ``data`` is empty. Recursive
        calls pass the parent node's rows here.
    features : list of str
        Column names still available for splitting.
    target : str, default 'Approved'
        Name of the class-label column.
    parent_node_class : object, optional
        Majority class of the parent node, returned as the leaf label when no
        features remain.

    Returns
    -------
    dict or label
        A leaf label, or ``{feature: {value: subtree, ...}}`` where each
        subtree is again a leaf label or a dictionary of the same shape.
    """
    # Empty partition. This has to be tested before the purity check: an empty
    # label column also has "<= 1" distinct values, and taking element [0] of
    # it would raise IndexError.
    if len(data) == 0:
        return _majority_class(original_data[target])

    # If all target values have same class, return that
    classes = np.unique(data[target])
    if len(classes) <= 1:
        return classes[0]

    # If no more features to split
    if len(features) == 0:
        # Without a parent label (e.g. a root call with no features) fall back
        # to this node's own majority instead of emitting a None leaf.
        if parent_node_class is None:
            return _majority_class(data[target])
        return parent_node_class

    parent_node_class = _majority_class(data[target])

    # Find the best feature (np.argmax keeps the first one on equal gains)
    gains = [info_gain(data, feature, target) for feature in features]
    best_feature = features[np.argmax(gains)]

    tree = {best_feature: {}}
    remaining_features = [f for f in features if f != best_feature]

    for value in np.unique(data[best_feature]):
        # Boolean mask keeps the matching rows intact; where().dropna() would
        # also drop rows with a missing value in any other column and coerce
        # numeric columns to float.
        subset = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            subset, data, remaining_features, target, parent_node_class
        )

    return tree

import pprint

# Candidate split attributes: every column except the class label
features = data.columns.tolist()
features.remove('Approved')

# Build the tree; at the root the full dataset is also the fallback sample
tree = id3(data, data, features)

print("Decision Tree:")
pprint.pprint(tree)


Decision Tree:
{'CreditScore': {'High': 'Yes', 'Low': 'No', 'Medium': 'Yes'}}
