In [1]:
import pandas as pd
import numpy as np
from math import log2
from pprint import pprint

In [2]:
# --- Step 1: Define the Dataset ---
# One tuple per employee, in the column order given below. 'Attrition' is the
# label the tree learns to predict; the other four columns are candidate
# split features.
data = pd.DataFrame(
    [
        ('High', 'Good', 5, 'No', 'No'),
        ('Low', 'Poor', 2, 'Yes', 'Yes'),
        ('Medium', 'Good', 3, 'No', 'No'),
        ('Low', 'Poor', 1, 'Yes', 'Yes'),
        ('High', 'Excellent', 6, 'No', 'No'),
    ],
    columns=[
        'JobSatisfaction',
        'WorkLifeBalance',
        'YearsAtCompany',
        'Overtime',
        'Attrition',
    ],
)

In [3]:
# --- Step 2: Entropy Calculation ---
def entropy(column):
    """Return the Shannon entropy, in bits, of the values in ``column``.

    Parameters
    ----------
    column : array-like
        Sequence of labels (anything ``np.unique`` accepts).

    Returns
    -------
    numpy.float64
        ``-sum(p * log2(p))`` over the relative frequency ``p`` of each
        distinct value. The result is zero when every entry is identical.
    """
    _, class_counts = np.unique(column, return_counts=True)
    n_total = np.sum(class_counts)

    # Collect one p * log2(p) term per distinct value, then negate the sum.
    terms = []
    for count in class_counts:
        probability = count / n_total
        terms.append(probability * log2(probability))

    return -np.sum(terms)


In [4]:
# --- Step 3: Information Gain ---
def info_gain(data, feature, target='Attrition'):
    """Return the information gain from splitting ``data`` on ``feature``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing both the ``feature`` and ``target`` columns.
    feature : str
        Column whose distinct values define the partitions.
    target : str, default 'Attrition'
        Column holding the class labels.

    Returns
    -------
    numpy.float64
        Entropy of ``target`` over all rows minus the size-weighted average
        entropy of ``target`` within each partition of ``feature``.
    """
    total_entropy = entropy(data[target])
    feature_values, value_counts = np.unique(data[feature], return_counts=True)
    n_rows = np.sum(value_counts)

    # One term per distinct feature value: partition weight * partition entropy.
    branch_terms = []
    for value, count in zip(feature_values, value_counts):
        partition_labels = data.loc[data[feature] == value, target]
        branch_terms.append((count / n_rows) * entropy(partition_labels))

    return total_entropy - np.sum(branch_terms)


In [5]:
# --- Step 4: ID3 Algorithm (Recursive Tree Builder) ---
def id3(data, original_data, features, target='Attrition', parent_class=None):
    """Recursively build an ID3 decision tree.

    Parameters
    ----------
    data : pandas.DataFrame
        Rows belonging to the current node.
    original_data : pandas.DataFrame
        Not read by this function; retained so existing call sites keep
        working. Recursive calls pass the current node's ``data`` here.
    features : list
        Column names still available for splitting.
    target : str, default 'Attrition'
        Column holding the class labels.
    parent_class : object, optional
        Majority label of the parent node, returned when ``data`` is empty.

    Returns
    -------
    dict or label
        Either a leaf label, or a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}``.
    """
    # The empty case must be tested first: an empty frame has zero distinct
    # labels, so a purity check placed ahead of it would try to read
    # ``iloc[0]`` from an empty Series and raise IndexError.
    if len(data) == 0:
        return parent_class

    target_values = data[target]
    labels, label_counts = np.unique(target_values, return_counts=True)

    # Pure node: every row carries the same label.
    if len(labels) == 1:
        return target_values.iloc[0]

    # Most frequent label at this node; ties resolve to the first label in
    # np.unique's sorted order because argmax returns the first maximum.
    majority_class = labels[np.argmax(label_counts)]

    # Nothing left to split on: fall back to the majority label.
    if len(features) == 0:
        return majority_class

    # Split on the feature with the highest information gain (first one wins
    # on ties, again via argmax).
    gains = [info_gain(data, f, target) for f in features]
    best_feature = features[np.argmax(gains)]
    remaining_features = [f for f in features if f != best_feature]

    tree = {best_feature: {}}
    for value in np.unique(data[best_feature]):
        sub_data = data[data[best_feature] == value]
        tree[best_feature][value] = id3(
            sub_data, data, remaining_features, target, majority_class
        )

    return tree


In [6]:
# --- Step 5: Compute Entropy and Information Gain ---
# Candidate split columns; every column of `data` except the target.
features = ['JobSatisfaction', 'WorkLifeBalance', 'YearsAtCompany', 'Overtime']

# Entropy of the target over the full dataset.
target_entropy = entropy(data['Attrition'])
print("Entropy of Attrition:", round(target_entropy, 5))

# Information gain of each candidate feature with respect to the target.
info_gains = {name: info_gain(data, name, 'Attrition') for name in features}

print("\nInformation Gain:")
for feature_name, gain in info_gains.items():
    print(f"{feature_name}: {round(gain, 5)}")


Entropy of Attrition: 0.97095

Information Gain:
JobSatisfaction: 0.97095
WorkLifeBalance: 0.97095
YearsAtCompany: 0.97095
Overtime: 0.97095


In [7]:
# --- Step 6: Build and Display the Decision Tree ---
# Grow the tree from the full dataset; the same frame is passed for both the
# node data and the original data at the root.
decision_tree = id3(
    data=data,
    original_data=data,
    features=features,
    target='Attrition',
)
print("\nFinal Decision Tree:")
pprint(decision_tree)


Final Decision Tree:
{'JobSatisfaction': {'High': 'No', 'Low': 'Yes', 'Medium': 'No'}}


In [8]:
# --- Step 7: Test Prediction Function ---
def predict(tree, sample):
    """Classify ``sample`` by walking a decision tree.

    Parameters
    ----------
    tree : dict or label
        Either a nested dict of the form
        ``{feature: {feature_value: subtree_or_label, ...}}`` or a bare leaf
        label (a tree that consists of a single leaf).
    sample : mapping
        Feature name -> feature value for the instance to classify. Only
        ``sample.get`` is used, so missing features are tolerated.

    Returns
    -------
    object
        The leaf label reached, or the string "Unknown" when the sample's
        value for a split feature has no branch in the tree (including when
        the feature is absent from ``sample``) or when a node is an empty dict.
    """
    node = tree
    # Descend until a non-dict (leaf label) is reached. A bare label passed as
    # the whole tree is returned as-is instead of failing on ``.keys()``.
    while isinstance(node, dict):
        if not node:
            # Empty node carries no split information.
            return "Unknown"

        # Each node holds exactly one split feature: its first key.
        feature = next(iter(node))
        branches = node[feature]
        value = sample.get(feature)

        if value not in branches:
            return "Unknown"
        node = branches[value]

    return node

In [9]:
# Test with new employee
# Feature values for a single unseen employee, keyed by column name.
sample_employee = dict(
    JobSatisfaction='Low',
    WorkLifeBalance='Poor',
    YearsAtCompany=2,
    Overtime='Yes',
)

print("\nPrediction for new employee:", predict(decision_tree, sample_employee))



Prediction for new employee: Yes
