In [15]:
import pandas as pd
import plotly.express as px
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import math
from collections import Counter, defaultdict

# Names of the attribute columns, in the order they appear in each row
labels = ['Outlook', 'Temperature', 'Humidity', 'Wind']

# Play-tennis toy data: four attribute values followed by the class label
dataset = [
    ['Sunny', 'Hot', 'High', 'Weak', 'No'],
    ['Sunny', 'Hot', 'High', 'Strong', 'No'],
    ['Overcast', 'Hot', 'High', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Cool', 'Normal', 'Strong', 'No'],
    ['Overcast', 'Cool', 'Normal', 'Strong', 'Yes'],
    ['Sunny', 'Mild', 'High', 'Weak', 'No'],
    ['Sunny', 'Cool', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'Normal', 'Weak', 'Yes'],
    ['Sunny', 'Mild', 'Normal', 'Strong', 'Yes'],
    ['Overcast', 'Mild', 'High', 'Strong', 'Yes'],
    ['Overcast', 'Hot', 'Normal', 'Weak', 'Yes'],
    ['Rain', 'Mild', 'High', 'Strong', 'No']
]

# Tabular copy of the rows; the class column is named 'PlayTennis'
df = pd.DataFrame(dataset, columns=[*labels, 'PlayTennis'])

# Near-square grid with one cell per attribute:
# columns = ceil(sqrt(n)), rows = just enough to hold the remainder
num_plots = len(labels)
cols = math.ceil(math.sqrt(num_plots))
rows = math.ceil(num_plots / cols)

# Each subplot is titled with the attribute it shows
subplot_titles = labels
fig = make_subplots(rows=rows, cols=cols, subplot_titles=subplot_titles)

def add_bar(fig, feature, row, col):
    """Add grouped bars of PlayTennis outcome counts for one feature to a subplot.

    Reads the module-level ``df`` and counts rows per (feature value, outcome)
    pair, then adds one ``go.Bar`` trace per outcome to ``fig`` at the given
    grid cell.

    Args:
        fig: Figure created with ``make_subplots``; modified in place.
        feature: Name of the ``df`` column to put on the x axis.
        row: Subplot row passed to ``fig.add_trace``.
        col: Subplot column passed to ``fig.add_trace``.

    Returns:
        None.
    """
    palette = px.colors.qualitative.Plotly
    # Use one figure-wide outcome order so every outcome keeps the same colour
    # and the same position inside a bar group in all subplots. Without an
    # explicit colour plotly cycles its default colours per trace, so the
    # legend would only match the first subplot.
    outcomes = sorted(df['PlayTennis'].unique())
    counts = df.groupby([feature, 'PlayTennis']).size().reset_index(name='Count')
    # Show each outcome in the legend exactly once: the first time it is drawn,
    # whichever subplot that happens to be.
    already_in_legend = {trace.legendgroup for trace in fig.data}

    for position, outcome in enumerate(outcomes):
        subset = counts[counts['PlayTennis'] == outcome]
        if subset.empty:
            # This outcome never occurs for this feature; nothing to draw.
            continue
        fig.add_trace(
            go.Bar(
                x=subset[feature],
                y=subset['Count'],
                name=outcome,
                legendgroup=outcome,
                marker_color=palette[position % len(palette)],
                showlegend=outcome not in already_in_legend
            ),
            row=row,
            col=col
        )

# Fill the grid left to right, then top to bottom, one attribute per cell
for idx, feature in enumerate(labels):
    # divmod yields 0-based (row, col); add_bar is given 1-based positions
    row, col = (offset + 1 for offset in divmod(idx, cols))
    add_bar(fig, feature, row, col)

# Canvas size grows with the grid (300 px per row and per column);
# bars that share an x value are drawn side by side
fig.update_layout(
    title_text="Categorical Features vs. PlayTennis",
    barmode='group',
    width=300 * cols,
    height=300 * rows
)

# Render the figure
fig.show()

In [None]:


def entropy(data):
    """Return the Shannon entropy (base 2) of the class labels in ``data``.

    The class label of each row is its last element. An empty ``data`` has no
    labels and yields 0.
    """
    total = len(data)
    class_counts = Counter(row[-1] for row in data)

    weighted_logs = []
    for count in class_counts.values():
        share = count / total
        weighted_logs.append(share * math.log2(share))

    return -sum(weighted_logs)

def split_dataset(data, attribute, value):
    """Select the rows whose column ``attribute`` equals ``value``.

    Each selected row is returned as a new list with that column removed;
    the rows in ``data`` are not modified.
    """
    matches = []
    for row in data:
        if row[attribute] == value:
            before, after = row[:attribute], row[attribute + 1:]
            matches.append(before + after)
    return matches

def choose_best_attribute(data):
    """Return the index of the attribute with the highest information gain.

    Every column except the last (the class label) is a candidate. The gain of
    a candidate is the entropy of ``data`` minus the size-weighted entropy of
    the subsets produced by splitting on each of its values.

    Args:
        data: List of rows; the last element of each row is the class label.

    Returns:
        The 0-based index of the best attribute. Ties go to the lowest index,
        so an attribute is still returned when no split improves on the base
        entropy (for example with contradictory rows). Returns -1 only when
        ``data`` is empty or its rows hold nothing but the class label.
    """
    if not data:
        return -1

    base_entropy = entropy(data)
    num_attributes = len(data[0]) - 1  # exclude class label
    total = len(data)

    def info_gain(index):
        # dict.fromkeys de-duplicates while keeping first-seen order, so the
        # floating-point accumulation is the same on every run (set order is not).
        remainder = 0.0
        for value in dict.fromkeys(row[index] for row in data):
            subset = split_dataset(data, index, value)
            remainder += len(subset) / total * entropy(subset)
        return base_entropy - remainder

    # max() returns the first maximal candidate; starting the search from a
    # threshold of 0 instead would reject every attribute whenever all gains
    # are 0 and hand the caller -1, which indexes the class-label column.
    return max(range(num_attributes), key=info_gain, default=-1)

def majority_class(data):
    """Return the class label (last element of a row) that occurs most often."""
    tally = Counter(row[-1] for row in data)
    top_entries = tally.most_common(1)
    winner, _ = top_entries[0]
    return winner

def id3(data, labels):
    """Build an ID3 decision tree from ``data``.

    Args:
        data: Non-empty list of rows; the last element of each row is the
            class label, the preceding elements are attribute values.
        labels: Attribute names, one per attribute column, in column order.

    Returns:
        A class label when the node is a leaf, otherwise a nested dict of the
        form ``{attribute_name: {attribute_value: subtree, ...}}``.
    """
    class_list = [row[-1] for row in data]

    # Pure node: every row carries the same class label.
    if class_list.count(class_list[0]) == len(class_list):
        return class_list[0]

    # Only the class column is left, so there is nothing to split on.
    if len(data[0]) == 1:
        return majority_class(data)

    best_attr = choose_best_attribute(data)
    if best_attr < 0:
        # No attribute was selected. Using a negative index below would
        # address the class-label column and make the tree branch on the
        # label itself, so stop here with a majority leaf.
        return majority_class(data)

    best_label = labels[best_attr]
    sub_labels = labels[:best_attr] + labels[best_attr + 1:]

    # dict.fromkeys de-duplicates while keeping first-seen order, so branches
    # are inserted in the same order on every run (set iteration order is not
    # stable across interpreter sessions for strings).
    branches = {}
    for value in dict.fromkeys(row[best_attr] for row in data):
        subset = split_dataset(data, best_attr, value)
        branches[value] = id3(subset, sub_labels)

    return {best_label: branches}


# Example usage: build the decision tree from the `dataset` rows and the
# attribute names in `labels`, both defined in the first cell. Reusing them
# keeps a single copy of the data, so the plot and the tree cannot drift apart.
tree = id3(dataset, labels)
print(tree)


{'Outlook': {'Overcast': 'Yes', 'Sunny': {'Humidity': {'Normal': 'Yes', 'High': 'No'}}, 'Rain': {'Wind': {'Weak': 'Yes', 'Strong': 'No'}}}}


In [4]:
# Demonstrate split_dataset: keep the rows whose column 0 ('Outlook') is
# 'Sunny'; the matched column is dropped from every returned row.
split_result = split_dataset(dataset, attribute=0, value='Sunny')
print(split_result)

[['Hot', 'High', 'Weak', 'No'], ['Hot', 'High', 'Strong', 'No'], ['Mild', 'High', 'Weak', 'No'], ['Cool', 'Normal', 'Weak', 'Yes'], ['Mild', 'Normal', 'Strong', 'Yes']]
