In [None]:
import pandas as pd
import numpy as np


In [None]:
# Load the training set from PlayTennis.csv (expected in the working directory).
data = pd.read_csv("PlayTennis.csv")

# Show the full table so the rows can be inspected by eye.
print(data)

     Outlook Temperature Humidity    Wind Play Tennis
0      Sunny         Hot     High    Weak          No
1      Sunny         Hot     High  Strong          No
2   Overcast         Hot     High    Weak         Yes
3       Rain        Mild     High    Weak         Yes
4       Rain        Cool   Normal    Weak         Yes
5       Rain        Cool   Normal  Strong          No
6   Overcast        Cool   Normal  Strong         Yes
7      Sunny        Mild     High    Weak          No
8      Sunny        Cool   Normal    Weak         Yes
9       Rain        Mild   Normal    Weak         Yes
10     Sunny        Mild   Normal  Strong         Yes
11  Overcast        Mild     High  Strong         Yes
12  Overcast         Hot   Normal    Weak         Yes
13      Rain        Mild     High  Strong          No


In [None]:
import pandas as pd
import math
from graphviz import Digraph

class Node:
    """A single node of the decision tree.

    Internal nodes carry the ``attribute`` they split on and map each
    observed attribute value to a child node through ``children``.
    Leaf nodes carry the predicted class label in ``result``.
    """

    def __init__(self, attribute=None, value=None, result=None):
        # Branches leaving this node, keyed by attribute value.
        self.children: dict = {}
        # Class label; stays None unless this node is a leaf.
        self.result = result
        self.value = value
        # Name of the column this node splits on (None for leaves).
        self.attribute = attribute

def entropy(data):
    """Return the Shannon entropy (in bits) of the 'Play Tennis' column.

    Args:
        data: DataFrame containing a 'Play Tennis' column of class labels.

    Returns:
        The entropy of the label distribution; 0 when there are no rows.
    """
    # How many rows carry each distinct label (e.g. Yes / No).
    label_counts = data['Play Tennis'].value_counts()
    n_rows = len(data)

    bits = 0
    for n_label in label_counts:
        p = n_label / n_rows
        bits -= p * math.log2(p)
    return bits

def average_entropy(data, attribute):
    """Return the weighted average entropy after splitting on ``attribute``.

    Each distinct value of the attribute (e.g. Sunny / Overcast / Rain for
    Outlook) selects a subset of rows; the entropy of every subset is
    weighted by the fraction of rows that fall into it.

    Args:
        data: DataFrame holding the attribute column and 'Play Tennis'.
        attribute: Name of the column to split on.

    Returns:
        The sum over attribute values of (subset share * subset entropy).
    """
    n_rows = len(data)
    weighted_sum = 0
    for level in data[attribute].unique():
        # Rows whose attribute equals this level, with all their labels.
        rows_at_level = data[data[attribute] == level]
        share = len(rows_at_level) / n_rows
        weighted_sum += share * entropy(rows_at_level)
    return weighted_sum

def information_gain(data, attribute):
    """Return the entropy reduction obtained by splitting on ``attribute``.

    Computed as the entropy of the whole ``data`` minus the weighted
    average entropy of the subsets produced by the split.
    """
    return entropy(data) - average_entropy(data, attribute)

def build_tree(data, attributes):
    """Recursively build a decision tree by greedy information gain.

    Args:
        data: DataFrame with the candidate attribute columns and the
            'Play Tennis' label column.
        attributes: Column names still available for splitting.

    Returns:
        The root ``Node`` of the (sub)tree built from ``data``.
    """
    labels = data['Play Tennis']

    # Pure subset: every row has the same label, so any row gives the leaf.
    if len(labels.unique()) == 1:
        return Node(result=labels.iloc[0])

    # Nothing left to split on: fall back to the majority label
    # (idxmax() returns the label with the highest count).
    if len(attributes) == 0:
        return Node(result=labels.value_counts().idxmax())

    # Pick the attribute with the highest gain; the first one wins ties.
    best_attribute, max_gain = None, -1
    for candidate in attributes:
        candidate_gain = information_gain(data, candidate)
        if candidate_gain > max_gain:
            best_attribute, max_gain = candidate, candidate_gain

    # Attributes the children may still split on (the chosen one is used up).
    remaining = [attr for attr in attributes if attr != best_attribute]

    root = Node(attribute=best_attribute)
    for branch_value in data[best_attribute].unique():
        branch_rows = data[data[best_attribute] == branch_value]
        if len(branch_rows) == 0:
            # The equality filter matched no rows (presumably only possible
            # for missing values); use the majority label of this node.
            root.children[branch_value] = Node(result=labels.value_counts().idxmax())
        else:
            root.children[branch_value] = build_tree(branch_rows, remaining)
    return root

def predict(node, instance):
    """Classify ``instance`` by walking the tree from ``node`` to a leaf.

    Args:
        node: Tree node to start from.
        instance: Mapping from attribute name to the instance's value.

    Returns:
        The label stored in the leaf that is reached, or None when the
        instance has a value for which the tree has no branch.
    """
    current = node
    while current.result is None:
        branch = instance[current.attribute]
        if branch not in current.children:
            return None
        current = current.children[branch]
    return current.result


def visualize_tree(node, dot=None):
    """Add the subtree rooted at ``node`` to a graph and return the graph.

    Args:
        node: Root of the (sub)tree to draw.
        dot: Graph to draw into; a new ``Digraph`` is created when omitted.

    Returns:
        The graph that was drawn into.
    """
    graph = Digraph() if dot is None else dot
    # id() gives each node object a unique graph identifier.
    node_id = str(id(node))

    if node.result is None:
        # Internal node: labelled with the attribute it splits on.
        graph.node(node_id, str(node.attribute))
        for branch_value, child in node.children.items():
            # Draw the child's subtree first, then the edge leading to it.
            visualize_tree(child, graph)
            graph.edge(node_id, str(id(child)), label=str(branch_value))
    else:
        # Leaf node: labelled with the predicted class.
        graph.node(node_id, str(node.result))

    return graph


def print_tree(node, depth=0):
    """Print the subtree rooted at ``node`` as an indented outline.

    Args:
        node: Root of the (sub)tree to print.
        depth: Current nesting level; each level indents by two spaces.
    """
    indent = '  ' * depth

    # Leaves print their label and end the recursion.
    if node.result is not None:
        print(indent, 'Result:', node.result)
        return

    print(indent, 'Attribute:', node.attribute)
    branch_indent = '  ' * (depth + 1)
    for branch_value, child in node.children.items():
        print(branch_indent, 'Value:', branch_value)
        print_tree(child, depth + 2)

# Example usage:
# Build the tree first: printing it needs root_node, which would otherwise be
# undefined (NameError) when this cell runs on a fresh kernel.
data = pd.read_csv("PlayTennis.csv")
# Every column except the last is a candidate attribute; the last column is
# taken to be the class label.
attributes = data.columns[:-1].tolist()
root_node = build_tree(data, attributes)

print("Decision Tree:")
print_tree(root_node)

# Render the tree to decision_tree.png; cleanup=True removes the intermediate
# DOT source file after rendering.
dot = visualize_tree(root_node)
dot.render("decision_tree", format="png", cleanup=True)


Decision Tree:
 Attribute: Outlook
   Value: Sunny
     Attribute: Humidity
       Value: High
         Result: No
       Value: Normal
         Result: Yes
   Value: Overcast
     Result: Yes
   Value: Rain
     Attribute: Wind
       Value: Weak
         Result: Yes
       Value: Strong
         Result: No


'decision_tree.png'