<a href="https://colab.research.google.com/github/oelboussouni11/LifeManager/blob/main/tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import pandas as pd
import random

# Load the spreadsheet into a DataFrame
file_path = "data.xlsx"  # Update this with your actual file path
df = pd.read_excel(file_path, engine="openpyxl")  # openpyxl is requested explicitly as the reader for the .xlsx file

# Separate the column names from the rows.
# NOTE: class_counts() reads row[-1] as the label, so the label is expected
# to be the last column of the sheet.
header = df.columns.tolist()  # Column names; Question.__repr__ uses them to print readable rules
data_list = df.values.tolist()  # List of lists: one inner list per spreadsheet row

# Function to split the data into training (70%) and testing (30%)
def split_data(data, train_ratio=0.7):
    """Randomly split `data` into a training part and a testing part.

    The rows are shuffled in a private copy, so the caller's sequence keeps
    its original order. Any iterable of rows is accepted.

    Args:
        data: Iterable of rows to split.
        train_ratio: Fraction of the rows that goes to the training part.

    Returns:
        A `(training_data, testing_data)` tuple of lists. The training list
        holds the first `int(len(data) * train_ratio)` shuffled rows and the
        testing list holds the rest.
    """
    shuffled = list(data)  # Copy first: random.shuffle reorders its argument in place
    random.shuffle(shuffled)
    split_index = int(len(shuffled) * train_ratio)  # Truncates toward zero
    return shuffled[:split_index], shuffled[split_index:]

# Split the dataset with split_data's default ratio (70% training / 30% testing)
training_data, testing_data = split_data(data_list)

# Optional, currently disabled: slice the splits (the second line would cap
# the test set at 100 rows) — presumably kept around for quicker experiments.
# training_data = training_data[:]
# testing_data = testing_data[:100]

# Count how many rows carry each label and return the tally as a dict
def class_counts(rows):
    """Tally the labels found in `rows`.

    The label of a row is its last element. Keys appear in the order in
    which each label is first seen.

    Args:
        rows: Iterable of indexable rows.

    Returns:
        A dict mapping each label to the number of rows carrying it.
    """
    tally = {}
    for record in rows:
        label = record[-1]
        if label in tally:
            tally[label] += 1
        else:
            tally[label] = 1
    return tally

# Check if a value is numeric
def is_numeric(value):
    """Return True when `value` is an instance of `int` or `float`."""
    return isinstance(value, (int, float))

# Class to represent a decision rule
class Question:
    """A test applied to one column of a row.

    Numeric thresholds are compared with `>=`; every other value is
    compared with `==`.

    Attributes are set in `__init__`:
        column: Index of the column the rule inspects.
        value: Threshold (numeric) or category the column is compared to.
    """

    def __init__(self, column, value):
        self.column = column
        self.value = value

    def match(self, example):
        """Return whether `example` satisfies this rule.

        The ordering comparison is used only when both the row's value and
        the rule's value are numeric. A column that mixes numbers and text
        would otherwise raise TypeError on `>=`; such pairs fall back to
        equality instead.
        """
        val = example[self.column]
        if is_numeric(val) and is_numeric(self.value):
            return val >= self.value
        return val == self.value

    def __repr__(self):
        # The operator shown follows the rule's own value, matching match().
        condition = ">=" if is_numeric(self.value) else "=="
        return f"Is {header[self.column]} {condition} {self.value}?"

# Partition dataset based on a question
def partition(rows, question):
    """Split `rows` by whether they satisfy `question`.

    Args:
        rows: Iterable of rows.
        question: Object whose `match(row)` result decides the side.

    Returns:
        A `(true_rows, false_rows)` tuple of lists, each keeping the
        original relative order of its rows.
    """
    matched = []
    unmatched = []
    for record in rows:
        if question.match(record):
            matched.append(record)
        else:
            unmatched.append(record)
    return matched, unmatched

# Calculate Gini impurity
def gini(rows):
    """Return the Gini impurity of `rows`.

    Computed as one minus the sum of the squared share of each label, with
    labels tallied by `class_counts`. With no rows there are no labels to
    sum over, so the result is 1.
    """
    label_counts = class_counts(rows)
    # Lazy on purpose: len(rows) is only evaluated when a label exists.
    squared_shares = ((n / len(rows)) ** 2 for n in label_counts.values())
    return 1 - sum(squared_shares)

# Calculate information gain
def info_gain(left, right, current_impurity):
    """Return how much a split lowers impurity.

    The result is `current_impurity` minus the Gini impurity of each side,
    where each side is weighted by its share of the combined rows.

    Args:
        left: Rows on one side of the split.
        right: Rows on the other side of the split.
        current_impurity: Impurity of the rows before splitting.
    """
    left_weight = len(left) / (len(left) + len(right))
    left_term = left_weight * gini(left)
    right_term = (1 - left_weight) * gini(right)
    return current_impurity - left_term - right_term

# Find the best question to split data
def find_best_split(rows):
    """Search every column/value pair for the split with the highest gain.

    Each distinct value of each feature column (all columns but the last)
    is tried as a `Question`. Candidates that leave one side empty are
    skipped. On equal gain the candidate seen later wins.

    Args:
        rows: Non-empty list of rows whose last element is the label.

    Returns:
        A `(best_gain, best_question)` tuple. `best_question` is None when
        no candidate produced two non-empty sides.
    """
    top_gain = 0
    top_question = None
    baseline = gini(rows)
    feature_count = len(rows[0]) - 1  # Last column holds the label

    for col in range(feature_count):
        for candidate in {record[col] for record in rows}:
            rule = Question(col, candidate)
            matched, unmatched = partition(rows, rule)
            if matched and unmatched:
                gain = info_gain(matched, unmatched, baseline)
                if gain >= top_gain:
                    top_gain = gain
                    top_question = rule
    return top_gain, top_question

# Leaf node (end of decision tree)
class Leaf:
    """Terminal node of the tree.

    `predictions` holds the label tally, as produced by `class_counts`, of
    the rows handed to the constructor.
    """

    def __init__(self, rows):
        label_tally = class_counts(rows)
        self.predictions = label_tally

# Decision node (splitting point)
class Decision_Node:
    """Inner node of the tree: a question plus the subtree for each answer."""

    def __init__(self, question, true_branch, false_branch):
        # question: rule asked at this node
        # true_branch / false_branch: subtree followed when the rule matches / does not match
        self.question, self.true_branch, self.false_branch = (
            question,
            true_branch,
            false_branch,
        )

# Build decision tree recursively
def build_tree(rows):
    """Grow a decision tree for `rows`.

    The best split is looked up with `find_best_split`. When it yields a
    gain, the rows are partitioned and a subtree is built for each side.
    When the gain is zero the rows become a `Leaf`.

    Returns:
        A `Decision_Node` or a `Leaf`.
    """
    gain, question = find_best_split(rows)
    if gain != 0:
        matched, unmatched = partition(rows, question)
        yes_subtree = build_tree(matched)
        no_subtree = build_tree(unmatched)
        return Decision_Node(question, yes_subtree, no_subtree)
    return Leaf(rows)

# Print decision tree structure
def print_tree(node, spacing=""):
    """Print the tree rooted at `node` as an indented outline.

    Args:
        node: A `Leaf` or a node with `question`, `true_branch` and
            `false_branch` attributes.
        spacing: Prefix written before every line of this subtree.
    """
    if not isinstance(node, Leaf):
        print(f"{spacing}🔍 {node.question}")
        # The true branch keeps a vertical bar in its prefix, the false branch does not.
        print(f"{spacing}├── True:")
        true_prefix = spacing + "│   "
        print_tree(node.true_branch, true_prefix)
        print(f"{spacing}└── False:")
        false_prefix = spacing + "    "
        print_tree(node.false_branch, false_prefix)
        return
    print(f"{spacing}📌 Predict: {node.predictions}")

# Classify a new example
def classify(row, node):
    """Walk the tree from `node` and return the label tally `row` ends on.

    At each inner node the node's question decides which branch to follow.
    The walk stops at the first `Leaf`.

    Returns:
        The `predictions` dict of the leaf that was reached.
    """
    current = node
    while not isinstance(current, Leaf):
        if current.question.match(row):
            current = current.true_branch
        else:
            current = current.false_branch
    return current.predictions

# Get the predicted label from classification result
def get_prediction(class_counts_dict):
    """Return the label with the highest count.

    On equal counts the label that comes first in the dict wins.

    Args:
        class_counts_dict: Dict mapping labels to counts.
    """
    best_label, _ = max(class_counts_dict.items(), key=lambda item: item[1])
    return best_label

if __name__ == '__main__':
    # Train on the training split and show the resulting tree.
    my_tree = build_tree(training_data)
    print_tree(my_tree)

    # Print actual vs predicted values. Each test row is classified exactly
    # once and the result feeds both the report line and the accuracy count.
    correct_predictions = 0
    for row in testing_data:
        prediction = get_prediction(classify(row, my_tree))
        print(f"Actual: {row[-1]}, Predicted: {prediction}")
        if prediction == row[-1]:
            correct_predictions += 1

    # Compute accuracy; an empty test split has no meaningful percentage,
    # so report that instead of dividing by zero.
    if testing_data:
        accuracy = correct_predictions / len(testing_data) * 100
        print(f"\nModel Accuracy: {accuracy:.2f}%")
    else:
        print("\nModel Accuracy: n/a (no test rows)")


🔍 Is MajorAxisLength >= 277.57139915259285?
├── True:
│   🔍 Is MinorAxisLength >= 210.59271513371448?
│   ├── True:
│   │   🔍 Is EquivDiameter >= 386.0211352743979?
│   │   ├── True:
│   │   │   📌 Predict: {'BOMBAY': 364}
│   │   └── False:
│   │       🔍 Is ShapeFactor3 >= 0.6101963145737259?
│   │       ├── True:
│   │       │   🔍 Is roundness >= 0.8746267109580121?
│   │       │   ├── True:
│   │       │   │   🔍 Is Perimeter >= 907.665?
│   │       │   │   ├── True:
│   │       │   │   │   🔍 Is ShapeFactor3 >= 0.6655293476884537?
│   │       │   │   │   ├── True:
│   │       │   │   │   │   🔍 Is ShapeFactor4 >= 0.9942415041797283?
│   │       │   │   │   │   ├── True:
│   │       │   │   │   │   │   📌 Predict: {'BARBUNYA': 7}
│   │       │   │   │   │   └── False:
│   │       │   │   │   │       📌 Predict: {'CALI': 1}
│   │       │   │   │   └── False:
│   │       │   │   │       🔍 Is ShapeFactor4 >= 0.9964398974784006?
│   │       │   │   │       ├── True:
│   │       │   │   │     