In [11]:
import pandas as pd
import math

# Sample dataset based on provided data
# 14 weather observations; "Decision" is the class label ("Yes"/"No") and the
# other four keys are the candidate split attributes.
# NOTE: "Temp." and "Humidity" are integers, but the helpers in this cell
# partition rows by exact equality (data[attribute] == value), so every
# distinct number is treated as its own category rather than as a threshold.
data = [
    {"Outlook": "Sunny", "Temp.": 85, "Humidity": 85, "Wind": "Weak", "Decision": "No"},
    {"Outlook": "Sunny", "Temp.": 80, "Humidity": 90, "Wind": "Strong", "Decision": "No"},
    {"Outlook": "Overcast", "Temp.": 83, "Humidity": 78, "Wind": "Weak", "Decision": "Yes"},
    {"Outlook": "Rain", "Temp.": 70, "Humidity": 96, "Wind": "Weak", "Decision": "Yes"},
    {"Outlook": "Rain", "Temp.": 68, "Humidity": 80, "Wind": "Weak", "Decision": "Yes"},
    {"Outlook": "Rain", "Temp.": 65, "Humidity": 70, "Wind": "Strong", "Decision": "No"},
    {"Outlook": "Overcast", "Temp.": 64, "Humidity": 65, "Wind": "Strong", "Decision": "Yes"},
    {"Outlook": "Sunny", "Temp.": 72, "Humidity": 95, "Wind": "Weak", "Decision": "No"},
    {"Outlook": "Sunny", "Temp.": 69, "Humidity": 70, "Wind": "Weak", "Decision": "Yes"},
    {"Outlook": "Rain", "Temp.": 75, "Humidity": 80, "Wind": "Weak", "Decision": "Yes"},
    {"Outlook": "Sunny", "Temp.": 75, "Humidity": 70, "Wind": "Strong", "Decision": "Yes"},
    {"Outlook": "Overcast", "Temp.": 72, "Humidity": 90, "Wind": "Strong", "Decision": "Yes"},
    {"Outlook": "Overcast", "Temp.": 81, "Humidity": 75, "Wind": "Weak", "Decision": "Yes"},
    {"Outlook": "Rain", "Temp.": 71, "Humidity": 80, "Wind": "Strong", "Decision": "No"}
]

# One DataFrame row per record above; the dict keys become the columns.
df = pd.DataFrame(data)

# Helper functions to calculate entropy, information gain, and split information
def label_frequency(data):
    """Count how often each class label occurs in ``data["Decision"]``.

    Returns a plain dict mapping label -> number of occurrences, with keys in
    order of first appearance.
    """
    tally = {}
    for outcome in data["Decision"]:
        tally[outcome] = tally.get(outcome, 0) + 1
    return tally

def entropy(data):
    """Return the base-2 Shannon entropy of the class labels in ``data``.

    Each label count from ``label_frequency`` is converted to a proportion of
    ``len(data)`` and contributes ``-p * log2(p)`` to the result. With no
    labels at all the loop does not run and 0 is returned.
    """
    n_rows = len(data)
    result = 0
    for occurrences in label_frequency(data).values():
        share = occurrences / n_rows
        # Accumulate by subtraction so the running value stays non-negative.
        result -= share * math.log2(share)
    return result

def information_gain(data, attribute):
    """Return the entropy reduction obtained by splitting ``data`` on ``attribute``.

    The rows are partitioned by each distinct value of the attribute column;
    the size-weighted entropy of those partitions is subtracted from the
    entropy of the whole of ``data``.
    """
    baseline = entropy(data)
    column = data[attribute]
    n_rows = len(data)
    conditional = 0
    for level in column.unique():
        part = data[column == level]
        conditional += (len(part) / n_rows) * entropy(part)
    return baseline - conditional

# Calculate split information (intrinsic value)
def split_information(data, attribute):
    """Return the split information (intrinsic value) of ``attribute``.

    This is the base-2 entropy of the partition sizes produced by splitting
    ``data`` on each distinct value of the attribute column; the class labels
    are not consulted.
    """
    column = data[attribute]
    n_rows = len(data)
    intrinsic = 0
    for level in column.unique():
        share = len(data[column == level]) / n_rows
        # A value that matches no row (share == 0) contributes nothing and
        # must not reach log2.
        if share > 0:
            intrinsic -= share * math.log2(share)
    return intrinsic

# Calculate gain ratio for C4.5
def gain_ratio(data, attribute):
    """Return information gain divided by split information for ``attribute``.

    When the split information is zero the ratio is undefined, so 0 is
    returned instead of dividing.
    """
    gain = information_gain(data, attribute)
    intrinsic = split_information(data, attribute)
    if intrinsic == 0:
        return 0  # Avoid division by zero
    return gain / intrinsic

def gini_impurity(data):
    """Return the Gini impurity of the class labels in ``data``.

    Starts from 1 and subtracts the squared proportion of every label
    reported by ``label_frequency``.
    """
    n_rows = len(data)
    impurity = 1
    for occurrences in label_frequency(data).values():
        impurity -= (occurrences / n_rows) ** 2
    return impurity

def gini_gain(data, attribute):
    """Return the drop in Gini impurity from splitting ``data`` on ``attribute``.

    The rows are partitioned by each distinct value of the attribute column;
    the size-weighted impurity of the partitions is subtracted from the
    impurity of the whole of ``data``.
    """
    baseline = gini_impurity(data)
    column = data[attribute]
    n_rows = len(data)
    after_split = 0
    for level in column.unique():
        part = data[column == level]
        after_split += (len(part) / n_rows) * gini_impurity(part)
    return baseline - after_split

# Function to find the best attribute based on selected criterion (ID3, C4.5, CART)
def best_split(data, attributes, criterion="ID3"):
    """Return the attribute in ``attributes`` with the highest split score.

    Args:
        data: DataFrame holding the attribute columns and the label column.
        attributes: Iterable of candidate column names.
        criterion: "ID3" (information gain), "C4.5" (gain ratio) or
            "CART" (Gini gain).

    Returns:
        The best-scoring attribute name; the first one wins ties. ``None``
        when ``attributes`` is empty.

    Raises:
        ValueError: If ``criterion`` is not one of the supported names. This
            is checked before any scoring, so it is raised even when
            ``attributes`` is empty.
    """
    # Resolve the scoring function once: this validates the criterion up
    # front and avoids re-dispatching on the string for every attribute.
    if criterion == "ID3":
        score = information_gain
    elif criterion == "C4.5":
        score = gain_ratio
    elif criterion == "CART":
        score = gini_gain
    else:
        raise ValueError("Invalid criterion specified. Choose 'ID3', 'C4.5', or 'CART'.")

    best_gain = -1
    best_attribute = None
    for attribute in attributes:
        gain = score(data, attribute)
        # Strict comparison keeps the earliest attribute on ties.
        if gain > best_gain:
            best_gain = gain
            best_attribute = attribute
    return best_attribute

# Recursive function to build the decision tree
def build_tree(data, attributes, criterion="ID3"):
    """Recursively build a decision tree over ``data``.

    A leaf is the bare class label. An internal node is a single-key dict
    ``{attribute: {value: subtree, ...}}`` with one branch per distinct value
    of the chosen attribute in ``data``.

    Args:
        data: DataFrame with the attribute columns and a "Decision" column.
        attributes: Column names still available for splitting.
        criterion: Passed through to ``best_split``.
    """
    labels = data["Decision"]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Nothing left to split on: fall back to the most common label.
    if len(attributes) == 0:
        return labels.mode()[0]

    chosen = best_split(data, attributes, criterion)
    leftover = [name for name in attributes if name != chosen]

    # One subtree per observed value of the chosen attribute.
    branches = {
        level: build_tree(data[data[chosen] == level], leftover, criterion)
        for level in data[chosen].unique()
    }
    return {chosen: branches}

# Function to classify a new sample
def classify(tree, sample):
    """Walk ``tree`` using the attribute values in ``sample`` and return the label.

    ``tree`` is either a leaf (any non-dict value, returned as is) or a
    single-key dict ``{attribute: {value: subtree}}``. ``sample`` is a dict of
    attribute -> value. If the sample's value for a node's attribute has no
    branch (including when the attribute is missing from ``sample``), the
    string "Unknown" is returned.
    """
    node = tree
    while isinstance(node, dict):
        feature = next(iter(node))
        branches = node[feature]
        observed = sample.get(feature)
        if observed not in branches:
            return "Unknown"
        node = branches[observed]
    return node

# Pretty print function to display the tree structure
def print_tree(tree, indent=""):
    """Print ``tree`` to stdout as an indented outline.

    Leaves are printed as ``Leaf: <label>``. Each internal node prints its
    attribute as ``[Feature: <name>]`` followed by one ``Value = <v>:`` line
    per branch, with the branch's subtree indented four more spaces.
    """
    if not isinstance(tree, dict):
        print(indent + "Leaf:", tree)
        return

    deeper = indent + "    "
    for feature_name, children in tree.items():
        print(f"{indent}[Feature: {feature_name}]")
        for branch_value, child in children.items():
            print(f"{indent}  Value = {branch_value}:")
            print_tree(child, deeper)

# Build the decision trees using ID3, C4.5, and CART
# All three trees are built from the same frame and the same candidate
# attributes; only the split-scoring criterion differs.
attributes = ["Outlook", "Temp.", "Humidity", "Wind"]
decision_tree_id3 = build_tree(df, attributes, criterion="ID3")
decision_tree_c4_5 = build_tree(df, attributes, criterion="C4.5")
decision_tree_cart = build_tree(df, attributes, criterion="CART")

# Sample for testing the classification
# The sample has no "Decision" key. classify() returns "Unknown" when one of
# its values does not match a branch that exists in the tree.
new_sample = {"Outlook": "Sunny", "Temp.": 72, "Humidity": 90, "Wind": "Weak"}
classification_id3 = classify(decision_tree_id3, new_sample)
classification_c4_5 = classify(decision_tree_c4_5, new_sample)
classification_cart = classify(decision_tree_cart, new_sample)

# Display the results
# For each criterion: the tree outline, then the label predicted for new_sample.
print("Decision Tree (ID3):")
print_tree(decision_tree_id3)
print("\nClassification of new sample (ID3):", classification_id3)

print("\nDecision Tree (C4.5):")
print_tree(decision_tree_c4_5)
print("\nClassification of new sample (C4.5):", classification_c4_5)

print("\nDecision Tree (CART):")
print_tree(decision_tree_cart)
print("\nClassification of new sample (CART):", classification_cart)


Decision Tree (ID3):
[Feature: Temp.]
  Value = 85:
    Leaf: No
  Value = 80:
    Leaf: No
  Value = 83:
    Leaf: Yes
  Value = 70:
    Leaf: Yes
  Value = 68:
    Leaf: Yes
  Value = 65:
    Leaf: No
  Value = 64:
    Leaf: Yes
  Value = 72:
    [Feature: Outlook]
      Value = Sunny:
        Leaf: No
      Value = Overcast:
        Leaf: Yes
  Value = 69:
    Leaf: Yes
  Value = 75:
    Leaf: Yes
  Value = 81:
    Leaf: Yes
  Value = 71:
    Leaf: No

Classification of new sample (ID3): No

Decision Tree (C4.5):
[Feature: Temp.]
  Value = 85:
    Leaf: No
  Value = 80:
    Leaf: No
  Value = 83:
    Leaf: Yes
  Value = 70:
    Leaf: Yes
  Value = 68:
    Leaf: Yes
  Value = 65:
    Leaf: No
  Value = 64:
    Leaf: Yes
  Value = 72:
    [Feature: Outlook]
      Value = Sunny:
        Leaf: No
      Value = Overcast:
        Leaf: Yes
  Value = 69:
    Leaf: Yes
  Value = 75:
    Leaf: Yes
  Value = 81:
    Leaf: Yes
  Value = 71:
    Leaf: No

Classification of new sample (C4.5): No

In [5]:
# Function to calculate accuracy of the model
def calculate_accuracy(tree, data, target="Decision"):
    """Return the percentage of rows in ``data`` that ``tree`` labels correctly.

    Args:
        tree: Decision tree (nested dicts with label leaves) handed to
            ``classify``.
        data: DataFrame whose rows hold the attribute values together with
            the true label.
        target: Name of the column holding the true label. Defaults to
            "Decision", which is the column this function originally
            hard-coded.

    Returns:
        Accuracy as a percentage between 0 and 100. An empty ``data`` yields
        0.0 rather than raising ``ZeroDivisionError``.
    """
    total = len(data)
    if total == 0:
        # No rows to score; avoid dividing by zero below.
        return 0.0

    correct_predictions = 0
    for _, row in data.iterrows():
        sample = row.to_dict()
        true_label = sample[target]
        # A prediction of "Unknown" (unseen attribute value) counts as a miss
        # unless the true label happens to be that same string.
        if classify(tree, sample) == true_label:
            correct_predictions += 1
    return correct_predictions / total * 100  # Accuracy in percentage

# Calculate accuracy for both C4.5 and CART decision trees
# NOTE(review): `df` is scored here; in the cell above the trees are built from
# `df` as well, so these would be training-set accuracies rather than held-out
# estimates -- confirm that `df` and the trees come from the same cell run.
accuracy_c4_5 = calculate_accuracy(decision_tree_c4_5, df)
accuracy_cart = calculate_accuracy(decision_tree_cart, df)

# Display the accuracy results
# calculate_accuracy already returns a percentage, so only the "%" suffix is added.
print(f"Accuracy of Decision Tree (C4.5): {accuracy_c4_5:.2f}%")
print(f"Accuracy of Decision Tree (CART): {accuracy_cart:.2f}%")


Accuracy of Decision Tree (C4.5): 100.00%
Accuracy of Decision Tree (CART): 100.00%


In [3]:
import pandas as pd
import math

# Sample dataset based on provided data
# Six loan records; "Loan Approved" is the class label ("Yes"/"No") and
# "Income" and "Credit" are the candidate split attributes.
data = [
    {"Income": "Low", "Credit": "Good", "Loan Approved": "Yes"},
    {"Income": "Low", "Credit": "Bad", "Loan Approved": "No"},
    {"Income": "Medium", "Credit": "Good", "Loan Approved": "Yes"},
    {"Income": "Medium", "Credit": "Bad", "Loan Approved": "Yes"},
    {"Income": "High", "Credit": "Good", "Loan Approved": "Yes"},
    {"Income": "High", "Credit": "Bad", "Loan Approved": "No"}
]

# One DataFrame row per record above; the dict keys become the columns.
df = pd.DataFrame(data)

# Function to calculate frequency of labels
def label_frequency(data):
    """Count how often each class label occurs in ``data["Loan Approved"]``.

    Returns a plain dict mapping label -> number of occurrences, with keys in
    order of first appearance.
    """
    tally = {}
    for outcome in data["Loan Approved"]:
        tally[outcome] = tally.get(outcome, 0) + 1
    return tally

# Function to calculate entropy (for C4.5)
def entropy(data):
    """Return the base-2 Shannon entropy of the class labels in ``data``.

    Each label count from ``label_frequency`` is converted to a proportion of
    ``len(data)`` and contributes ``-p * log2(p)`` to the result. With no
    labels at all the loop does not run and 0 is returned.
    """
    n_rows = len(data)
    result = 0
    for occurrences in label_frequency(data).values():
        share = occurrences / n_rows
        result -= share * math.log2(share)
    return result

# Function to calculate Gini impurity (for CART)
def gini_impurity(data):
    """Return the Gini impurity of the class labels in ``data``.

    Starts from 1 and subtracts the squared proportion of every label
    reported by ``label_frequency``.
    """
    n_rows = len(data)
    impurity = 1
    for occurrences in label_frequency(data).values():
        impurity -= (occurrences / n_rows) ** 2
    return impurity

# Function to calculate information gain (the entropy-based measure used by ID3; C4.5 normalises it into a gain ratio)
def information_gain(data, attribute):
    """Return the entropy reduction obtained by splitting ``data`` on ``attribute``.

    The rows are partitioned by each distinct value of the attribute column;
    the size-weighted entropy of those partitions is subtracted from the
    entropy of the whole of ``data``.
    """
    baseline = entropy(data)
    column = data[attribute]
    n_rows = len(data)
    conditional = 0
    for level in column.unique():
        part = data[column == level]
        conditional += (len(part) / n_rows) * entropy(part)
    return baseline - conditional

# Function to calculate Gini gain for CART
def gini_gain(data, attribute):
    """Return the drop in Gini impurity from splitting ``data`` on ``attribute``.

    The rows are partitioned by each distinct value of the attribute column;
    the size-weighted impurity of the partitions is subtracted from the
    impurity of the whole of ``data``.
    """
    baseline = gini_impurity(data)
    column = data[attribute]
    n_rows = len(data)
    after_split = 0
    for level in column.unique():
        part = data[column == level]
        after_split += (len(part) / n_rows) * gini_impurity(part)
    return baseline - after_split

# Function to find the best attribute to split on based on selected criterion
def _gain_ratio(data, attribute):
    """Return information gain divided by split information for ``attribute``."""
    n_rows = len(data)
    column = data[attribute]
    split_info = 0
    for value in column.unique():
        share = len(data[column == value]) / n_rows
        if share > 0:
            split_info -= share * math.log2(share)
    if split_info == 0:
        # A single-valued attribute has no split information; score it 0
        # instead of dividing by zero.
        return 0
    return information_gain(data, attribute) / split_info


def best_split(data, attributes, criterion="C4.5"):
    """Return the attribute in ``attributes`` with the highest split score.

    Args:
        data: DataFrame holding the attribute columns and the label column.
        attributes: Iterable of candidate column names.
        criterion: "C4.5" (gain ratio: information gain normalised by split
            information) or "CART" (Gini gain).

    Returns:
        The best-scoring attribute name; the first one wins ties. ``None``
        when ``attributes`` is empty.

    Raises:
        ValueError: If ``criterion`` is not one of the supported names. This
            is checked before any scoring, so it is raised even when
            ``attributes`` is empty.
    """
    # C4.5 ranks attributes by gain ratio; plain information gain is the ID3
    # measure and favours attributes with many distinct values.
    if criterion == "C4.5":
        score = _gain_ratio
    elif criterion == "CART":
        score = gini_gain
    else:
        raise ValueError("Invalid criterion specified. Choose 'C4.5' or 'CART'.")

    best_gain = -1
    best_attribute = None
    for attribute in attributes:
        gain = score(data, attribute)
        # Strict comparison keeps the earliest attribute on ties.
        if gain > best_gain:
            best_gain = gain
            best_attribute = attribute
    return best_attribute

# Recursive function to build the decision tree
def build_tree(data, attributes, criterion="C4.5"):
    """Recursively build a decision tree over ``data``.

    A leaf is the bare class label. An internal node is a single-key dict
    ``{attribute: {value: subtree, ...}}`` with one branch per distinct value
    of the chosen attribute in ``data``.

    Args:
        data: DataFrame with the attribute columns and a "Loan Approved" column.
        attributes: Column names still available for splitting.
        criterion: Passed through to ``best_split``.
    """
    labels = data["Loan Approved"]

    # Pure node: every row carries the same label.
    if len(labels.unique()) == 1:
        return labels.iloc[0]

    # Nothing left to split on: fall back to the most common label.
    if len(attributes) == 0:
        return labels.mode()[0]

    chosen = best_split(data, attributes, criterion)
    leftover = [name for name in attributes if name != chosen]

    # One subtree per observed value of the chosen attribute.
    branches = {
        level: build_tree(data[data[chosen] == level], leftover, criterion)
        for level in data[chosen].unique()
    }
    return {chosen: branches}

# Function to classify a new sample
def classify(tree, sample):
    """Walk ``tree`` using the attribute values in ``sample`` and return the label.

    ``tree`` is either a leaf (any non-dict value, returned as is) or a
    single-key dict ``{attribute: {value: subtree}}``. ``sample`` is a dict of
    attribute -> value. If the sample's value for a node's attribute has no
    branch (including when the attribute is missing from ``sample``), the
    string "Unknown" is returned.
    """
    node = tree
    while isinstance(node, dict):
        feature = next(iter(node))
        branches = node[feature]
        observed = sample.get(feature)
        if observed not in branches:
            return "Unknown"
        node = branches[observed]
    return node

# Building the decision trees using both C4.5 and CART
# Both trees are built from the same frame and candidate attributes; only the
# split-scoring criterion passed to build_tree differs.
attributes = ["Income", "Credit"]
decision_tree_c4_5 = build_tree(df, attributes, criterion="C4.5")
decision_tree_cart = build_tree(df, attributes, criterion="CART")

# Sample for testing the classification
# The sample has no "Loan Approved" key. classify() returns "Unknown" when one
# of its values does not match a branch that exists in the tree.
new_sample = {"Income": "Medium", "Credit": "Good"}
classification_c4_5 = classify(decision_tree_c4_5, new_sample)
classification_cart = classify(decision_tree_cart, new_sample)

# Display the results
# The trees are printed as raw nested dicts ({attribute: {value: subtree}}).
print("Decision Tree (C4.5):", decision_tree_c4_5)
print("Classification of new sample (C4.5):", classification_c4_5)
print("\nDecision Tree (CART):", decision_tree_cart)
print("Classification of new sample (CART):", classification_cart)


Decision Tree (C4.5): {'Credit': {'Good': 'Yes', 'Bad': {'Income': {'Low': 'No', 'Medium': 'Yes', 'High': 'No'}}}}
Classification of new sample (C4.5): Yes

Decision Tree (CART): {'Credit': {'Good': 'Yes', 'Bad': {'Income': {'Low': 'No', 'Medium': 'Yes', 'High': 'No'}}}}
Classification of new sample (CART): Yes
