In [1]:
import pandas as pd
import numpy as np
import math
from collections import Counter
import pprint

# Sample data
data = {
    'Outlook': ['Sunny', 'Sunny', 'Overcast', 'Rain', 'Rain', 'Rain',
                'Overcast', 'Sunny', 'Sunny', 'Rain', 'Sunny', 
                'Overcast', 'Overcast', 'Rain'],
    'Temp.': [85, 80, 83, 70, 68, 65, 64, 72, 69, 75, 75, 72, 81, 71],
    'Humidity': [85, 90, 78, 96, 80, 70, 65, 95, 70, 80, 70, 90, 75, 80],
    'Wind': ['Weak', 'Strong', 'Weak', 'Weak', 'Weak', 'Strong',
             'Strong', 'Weak', 'Weak', 'Weak', 'Strong', 'Strong', 
             'Weak', 'Strong'],
    'Decision': ['No', 'No', 'Yes', 'Yes', 'Yes', 'No', 'Yes',
                 'No', 'Yes', 'Yes', 'Yes', 'Yes', 'Yes', 'No']
}

# Create DataFrame
df = pd.DataFrame(data)

# Function to calculate entropy
def entropy(labels):
    total = len(labels)
    if total == 0:
        return 0
    label_counts = Counter(labels)
    return -sum((count / total) * math.log2(count / total) for count in label_counts.values())

# Function to calculate information gain
def information_gain(df, attribute):
    total_entropy = entropy(df['Decision'])
    total = len(df)
    
    # Weighted entropy for splits
    weighted_entropy = 0
    for value in df[attribute].unique():
        subset = df[df[attribute] == value]
        weighted_entropy += (len(subset) / total) * entropy(subset['Decision'])
    
    return total_entropy - weighted_entropy

# Function to build the decision tree
def build_tree(df, attributes):
    labels = df['Decision'].values
    # Leaf node if all labels are the same
    if len(set(labels)) == 1:
        return labels[0]
    
    # Leaf node if no attributes are left
    if not attributes:
        return Counter(labels).most_common(1)[0][0]
    
    # Find the best attribute to split
    best_attr = max(attributes, key=lambda attr: information_gain(df, attr))
    tree = {best_attr: {}}
    
    for value in df[best_attr].unique():
        subset = df[df[best_attr] == value]
        if subset.empty:
            tree[best_attr][value] = Counter(labels).most_common(1)[0][0]
        else:
            new_attributes = [attr for attr in attributes if attr != best_attr]
            tree[best_attr][value] = build_tree(subset, new_attributes)
    
    return tree

# Function to classify a new sample
def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree  # Leaf node
    attribute = next(iter(tree))
    value = sample[attribute]
    if value in tree[attribute]:
        return classify(tree[attribute][value], sample)
    else:
        return None  # Unknown value

# Build the decision tree
attributes = ['Outlook', 'Temp.', 'Humidity', 'Wind']
decision_tree = build_tree(df, attributes)

# Print the tree
import pprint
pprint.pprint(decision_tree)

# Classify a new sample
new_sample = {'Outlook': 'Sunny', 'Temp.': 75, 'Humidity': 70, 'Wind': 'Weak'}
decision = classify(decision_tree, new_sample)
print(f"The decision for the new sample is: {decision}")


{'Temp.': {64: 'Yes',
           65: 'No',
           68: 'Yes',
           69: 'Yes',
           70: 'Yes',
           71: 'No',
           72: {'Outlook': {'Overcast': 'Yes', 'Sunny': 'No'}},
           75: 'Yes',
           80: 'No',
           81: 'Yes',
           83: 'Yes',
           85: 'No'}}
The decision for the new sample is: Yes


In [2]:
# Function to calculate Gini Impurity
def gini_impurity(data):
    total = len(data)
    if total == 0:
        return 0
    label_counts = Counter(data['Decision'])
    return 1 - sum((count / total) ** 2 for count in label_counts.values())

# Function to split dataset
def split_dataset(df, attribute, value):
    if isinstance(value, (int, float)):  # Numerical attribute
        left_split = df[df[attribute] <= value]
        right_split = df[df[attribute] > value]
    else:  # Categorical attribute
        left_split = df[df[attribute] == value]
        right_split = df[df[attribute] != value]
    
    return left_split, right_split

# Function to find the best split
def best_split(df, attributes):
    best_gini = float('inf')
    best_attribute = None
    best_value = None
    
    for attribute in attributes:
        values = df[attribute].unique()
        for value in values:
            left_split, right_split = split_dataset(df, attribute, value)
            gini_left = gini_impurity(left_split)
            gini_right = gini_impurity(right_split)
            gini = (len(left_split) / len(df)) * gini_left + (len(right_split) / len(df)) * gini_right
            
            if gini < best_gini:
                best_gini = gini
                best_attribute = attribute
                best_value = value
    
    return best_attribute, best_value

# Function to build the decision tree
def build_tree(df, attributes):
    labels = df['Decision'].values
    
    # Stop if all labels are the same
    if len(set(labels)) == 1:
        return labels[0]
    
    # Stop if no attributes are left to split
    if not attributes:
        return Counter(labels).most_common(1)[0][0]
    
    best_attribute, best_value = best_split(df, attributes)
    tree = {best_attribute: {}}
    
    # Recursively build the tree
    left_split, right_split = split_dataset(df, best_attribute, best_value)
    
    tree[best_attribute]['<= ' + str(best_value)] = build_tree(left_split, [attr for attr in attributes if attr != best_attribute])
    tree[best_attribute]['> ' + str(best_value)] = build_tree(right_split, [attr for attr in attributes if attr != best_attribute])
    
    return tree

# Function to classify a new sample
def classify(tree, sample):
    if not isinstance(tree, dict):
        return tree  # Leaf node
    attribute = next(iter(tree))
    value = sample[attribute]

    for key in tree[attribute]:
        if isinstance(value, str):  # Categorical attribute
            if key.startswith('<=') and value == key[3:]:
                return classify(tree[attribute][key], sample)
            elif key.startswith('>') and value != key[2:]:
                return classify(tree[attribute][key], sample)
        else:  # Numerical attribute
            threshold = float(key[3:]) if key.startswith('<=') else float(key[2:])
            if (key.startswith('<=') and value <= threshold) or (key.startswith('>') and value > threshold):
                return classify(tree[attribute][key], sample)
    
    return None 

# Build the decision tree
attributes = ['Outlook', 'Temp.', 'Humidity', 'Wind']
decision_tree = build_tree(df, attributes)

# Print the tree
pprint.pprint(decision_tree)

# Classify a new sample
new_sample = {'Outlook': 'Sunny', 'Temp.': 75, 'Humidity': 70, 'Wind': 'Weak'}
decision = classify(decision_tree, new_sample)
print(f"The decision for the new sample is: {decision}")


{'Outlook': {'<= Overcast': 'Yes',
             '> Overcast': {'Temp.': {'<= 75': 'Yes',
                                      '> 75': {'Wind': {'<= Weak': {'Humidity': {'<= 85': 'No',
                                                                                 '> 85': 'Yes'}},
                                                        '> Weak': 'No'}}}}}}
The decision for the new sample is: Yes


In [3]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.preprocessing import LabelEncoder

# Sample dataset
data = {
    'Income': ['Low', 'Low', 'Medium', 'Medium', 'High', 'High'],
    'Credit': ['Good', 'Bad', 'Good', 'Bad', 'Good', 'Bad'],
    'Loan Approved': ['Yes', 'No', 'Yes', 'Yes', 'Yes', 'No']
}

# Convert to DataFrame
df = pd.DataFrame(data)

# Encode categorical variables (Income, Credit, Loan Approved) to numeric labels
label_encoders = {}
for column in df.columns:
    le = LabelEncoder()
    df[column] = le.fit_transform(df[column])
    label_encoders[column] = le

# Split data into features (X) and target (y)
X = df[['Income', 'Credit']]
y = df['Loan Approved']

# Train CART Decision Tree (Gini index by default)
cart_model = DecisionTreeClassifier(criterion='gini')
cart_model.fit(X, y)

# Train C4.5-like Decision Tree (using Entropy)
c45_model = DecisionTreeClassifier(criterion='entropy')
c45_model.fit(X, y)

# Test the models with a new sample
new_sample = pd.DataFrame({'Income': ['Medium'], 'Credit': ['Good']})

# Encode the new sample
new_sample_encoded = new_sample.copy()
for column in new_sample.columns:
    new_sample_encoded[column] = label_encoders[column].transform(new_sample[column])

# Make predictions
cart_prediction = cart_model.predict(new_sample_encoded)
c45_prediction = c45_model.predict(new_sample_encoded)

# Decode predictions back to Yes/No
cart_prediction_decoded = label_encoders['Loan Approved'].inverse_transform(cart_prediction)
c45_prediction_decoded = label_encoders['Loan Approved'].inverse_transform(c45_prediction)

print(f"CART Prediction for {new_sample.to_dict()}: {cart_prediction_decoded[0]}")
print(f"C4.5 Prediction for {new_sample.to_dict()}: {c45_prediction_decoded[0]}")


CART Prediction for {'Income': {0: 'Medium'}, 'Credit': {0: 'Good'}}: Yes
C4.5 Prediction for {'Income': {0: 'Medium'}, 'Credit': {0: 'Good'}}: Yes
