In [187]:
import numpy as np

In [188]:
# Dataset
data = [
    {"Deadline": "Urgent", "Party": "Yes", "Lazy": "Yes"},
    {"Deadline": "Urgent", "Party": "No", "Lazy": "Yes"},
    {"Deadline": "Near", "Party": "Yes", "Lazy": "Yes"},
    {"Deadline": "None", "Party": "Yes", "Lazy": "No"},
    {"Deadline": "None", "Party": "No", "Lazy": "Yes"},
    {"Deadline": "None", "Party": "Yes", "Lazy": "No"},
    {"Deadline": "Near", "Party": "No", "Lazy": "No"},
    {"Deadline": "Near", "Party": "No", "Lazy": "Yes"},
    {"Deadline": "Urgent", "Party": "Yes", "Lazy": "Yes"},
    {"Deadline": "Urgent", "Party": "No", "Lazy": "No"},
]

classes = ["Party", "Study", "Party", "Party", "Pub", "Party", "Study", "TV", "Party", "Study"]
features = ["Deadline", "Party", "Lazy"]

In [189]:
data

[{'Deadline': 'Urgent', 'Party': 'Yes', 'Lazy': 'Yes'},
 {'Deadline': 'Urgent', 'Party': 'No', 'Lazy': 'Yes'},
 {'Deadline': 'Near', 'Party': 'Yes', 'Lazy': 'Yes'},
 {'Deadline': 'None', 'Party': 'Yes', 'Lazy': 'No'},
 {'Deadline': 'None', 'Party': 'No', 'Lazy': 'Yes'},
 {'Deadline': 'None', 'Party': 'Yes', 'Lazy': 'No'},
 {'Deadline': 'Near', 'Party': 'No', 'Lazy': 'No'},
 {'Deadline': 'Near', 'Party': 'No', 'Lazy': 'Yes'},
 {'Deadline': 'Urgent', 'Party': 'Yes', 'Lazy': 'Yes'},
 {'Deadline': 'Urgent', 'Party': 'No', 'Lazy': 'No'}]

In [190]:
def entropy_function(p):
    if p != 0:
        return -p * np.log2(p)
    else:
        return 0

In [198]:
def information_gain(data, classes, feature):
    number_of_data_points = len(data)
    unique_class_labels, count_each_class = np.unique(classes, return_counts=True) #np.unique() : finds the unique elements of the array

    # Calculate the root entropy
    root_entropy = sum(entropy_function(count_each_class[i] / number_of_data_points) for i in range(len(unique_class_labels)))
    print("Root Entropy H(S) =", root_entropy)
    print("Unique Classes:", unique_class_labels)
    print("Counts of each class:", count_each_class)

    # Calculate Weighted Entropy for Each Unique Value
                                        #creates a list of values for the specified feature
    values, feature_counts = np.unique([datapoint[feature] for datapoint in data], return_counts=True) # return_counts=True : return the counts of each unique value.
    weighted_entropy = 0

    #Note : values will contain the unique values of the feature, and feature_counts will contain how many times each value appears in the dataset.

    #calculating the weighted entropy of the subset
    for value, count in zip(values, feature_counts):
        subset_classes = [classes[i] for i in range(number_of_data_points) if data[i][feature] == value]
        unique_subset_labels, subset_counts = np.unique(subset_classes, return_counts=True)
        subset_entropy = sum(entropy_function(subset_counts[i] / count) for i in range(len(unique_subset_labels)))
        weighted_entropy += (count / number_of_data_points) * subset_entropy

    # Calculate information gain
    gain = root_entropy - weighted_entropy
    return gain

In [199]:
information_gain(data, classes, feature)

Root Entropy H(S) = 1.6854752972273344
Unique Classes: ['Party' 'Pub' 'Study' 'TV']
Counts of each class: [5 1 3 1]


np.float64(0.20998654701098762)

In [195]:
# Calculate and display information gain for each feature
information_gains = {}

for feature in features:
    info_gain = information_gain(data, classes, feature)
    print(f"Feature: {feature}")

    # List the unique values that the feature can take
    values, feature_counts = np.unique([datapoint[feature] for datapoint in data], return_counts=True)

    print(f"Unique values for {feature}: {values}")
    print(f"Counts for each unique value: {feature_counts}")
    print(f"Information Gain for '{feature}': {info_gain:.4f}\n")

Root Entropy H(S) = 1.6854752972273344
Unique Classes: ['Party' 'Pub' 'Study' 'TV']
Counts of each class: [5 1 3 1]
Feature: Deadline
Unique values for Deadline: ['Near' 'None' 'Urgent']
Counts for each unique value: [3 3 4]
Information Gain for 'Deadline': 0.5345

Root Entropy H(S) = 1.6854752972273344
Unique Classes: ['Party' 'Pub' 'Study' 'TV']
Counts of each class: [5 1 3 1]
Feature: Party
Unique values for Party: ['No' 'Yes']
Counts for each unique value: [5 5]
Information Gain for 'Party': 1.0000

Root Entropy H(S) = 1.6854752972273344
Unique Classes: ['Party' 'Pub' 'Study' 'TV']
Counts of each class: [5 1 3 1]
Feature: Lazy
Unique values for Lazy: ['No' 'Yes']
Counts for each unique value: [4 6]
Information Gain for 'Lazy': 0.2100



In [197]:
# Determine the best feature after displaying the gains
information_gains[feature] = info_gain
best_feature = max(information_gains, key=information_gains.get)
max_gain = information_gains[best_feature]

print(f"\nBest Feature: {best_feature} with Information Gain: {max_gain:.4f}")


Best Feature: Lazy with Information Gain: 0.2100
