In [13]:
import pandas as pd
import numpy as np

from functions._gini import calculate_gini_impurity
from functions._best_threshold import find_optimal_threshold

In [14]:
def calculate_accuracy(predictions, true_labels):
    """Return the percentage of predictions that equal their true label.

    Parameters
    ----------
    predictions : sized iterable
        Predicted labels, in the same order as ``true_labels``.
    true_labels : iterable
        Ground-truth labels (list, array, pandas Series, ...). Only the
        values are used; any index is ignored.

    Returns
    -------
    float
        Accuracy on a 0-100 scale; ``0.0`` when there are no predictions.

    Raises
    ------
    ValueError
        If the two inputs do not contain the same number of elements.
    """
    # Materialise the labels so their length can be compared; zip() alone
    # would silently drop the surplus elements of the longer input.
    true_labels = list(true_labels)
    total = len(predictions)
    if total != len(true_labels):
        raise ValueError(
            f"predictions and true_labels differ in length: {total} != {len(true_labels)}"
        )

    # Nothing to score: avoid dividing by zero.
    if total == 0:
        return 0.0

    correct_predictions = sum(pred == true_label for pred, true_label in zip(predictions, true_labels))
    return correct_predictions / total * 100

## Sample Dataset

In [15]:
# Toy dataset: one numeric feature and a two-class ('A' / 'B') label per row.
data = np.array([4.9, 5.0, 5.5, 5.7, 6.0, 6.2, 6.5, 6.8])
labels = np.array(list('AABABBBA'))

# Combine both arrays into a single frame and display it.
sample_df = pd.DataFrame(dict(feature=data, target=labels))
sample_df

Unnamed: 0,feature,target
0,4.9,A
1,5.0,A
2,5.5,B
3,5.7,A
4,6.0,B
5,6.2,B
6,6.5,B
7,6.8,A


## Import your dataset (default: Iris dataset)

In [2]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Load Iris into a frame (one column per feature name) and append the target values.
iris_data = load_iris()
iris_df = pd.DataFrame(
    iris_data.data,
    columns=iris_data.feature_names,
).assign(target=iris_data.target)

# Shuffle every row (frac=1) with a fixed seed so the order is reproducible.
iris = iris_df.sample(frac=1, random_state=42)


In [3]:
# The whole frame is passed as the first argument, so X_train / X_test keep the
# 'target' column; y_train / y_test are one-column frames holding only 'target'.
X_train, X_test, y_train, y_test = train_test_split(
    iris,
    iris.loc[:, ['target']],
    test_size=0.2,
    random_state=42,
)

X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
26,5.0,3.4,1.6,0.4,0
56,6.3,3.3,4.7,1.6,1
39,5.1,3.4,1.5,0.2,0
12,4.8,3.0,1.4,0.1,0
86,6.7,3.1,4.7,1.5,1


## Main function to split our continuous dataset

In [27]:
def numeric_best_splits(data, target):
    """Recursively build a binary decision tree as nested dictionaries.

    The feature with the lowest value reported by ``calculate_gini_impurity``
    is split at the threshold returned by ``find_optimal_threshold``. Rows
    with ``feature <= threshold`` go to the 'left' subtree, rows with
    ``feature > threshold`` to the 'right' subtree.

    Parameters
    ----------
    data : pandas.DataFrame
        Feature columns plus the ``target`` column.
    target : str
        Name of the label column in ``data``.

    Returns
    -------
    dict
        ``{'type': 'node', 'feature': ..., 'threshold': ..., 'subtrees':
        {'left': ..., 'right': ...}}`` where every subtree is either another
        node of the same shape or ``{'type': 'leaf', 'prediction': label}``.

    Notes
    -----
    A side becomes a leaf when it holds a single class. It also becomes a
    leaf, predicting the most frequent label of ``data``, when the split is
    degenerate: the side is empty, or it still contains every row of
    ``data``. Without that fallback an empty side raises ``IndexError`` and a
    split that separates nothing recurses on identical input forever.
    """
    # Assumes the helper returns a mapping {feature name: impurity}; only its
    # .items() are used here.
    impurities = calculate_gini_impurity(data, target)
    min_key, _ = min(impurities.items(), key=lambda x: x[1])

    threshold = find_optimal_threshold(data[min_key], data[target])

    tree = {
        'type': 'node',
        'feature': min_key,
        'threshold': threshold,
        'subtrees': {},
    }

    for direction in ['left', 'right']:
        mask = data[min_key] <= threshold if direction == 'left' else data[min_key] > threshold
        subset_data = data[mask]
        unique_values = subset_data[target].unique()

        if len(unique_values) == 1:
            # Pure subset: its only label is the prediction.
            tree['subtrees'][direction] = {'type': 'leaf', 'prediction': unique_values[0]}
        elif len(subset_data) == 0 or len(subset_data) == len(data):
            # Degenerate split (nothing separated): stop here and predict the
            # most frequent label among the rows that reached this node.
            majority = data[target].mode().iloc[0]
            tree['subtrees'][direction] = {'type': 'leaf', 'prediction': majority}
        else:
            tree['subtrees'][direction] = numeric_best_splits(subset_data, target)

    return tree

# Build the tree for the toy dataset and display the resulting nested dict.
tree = numeric_best_splits(sample_df, 'target')
tree

{'type': 'node',
 'feature': 'feature',
 'threshold': 5.0,
 'subtrees': {'left': {'type': 'leaf', 'prediction': 'A'},
  'right': {'type': 'node',
   'feature': 'feature',
   'threshold': 6.5,
   'subtrees': {'left': {'type': 'node',
     'feature': 'feature',
     'threshold': 5.7,
     'subtrees': {'left': {'type': 'node',
       'feature': 'feature',
       'threshold': 5.5,
       'subtrees': {'left': {'type': 'leaf', 'prediction': 'B'},
        'right': {'type': 'leaf', 'prediction': 'A'}}},
      'right': {'type': 'leaf', 'prediction': 'B'}}},
    'right': {'type': 'leaf', 'prediction': 'A'}}}}}

In [29]:
from functions._numeric_best_splits import numeric_best_splits

class ContinousDecisionClassifier:
    """Decision-tree classifier for continuous features.

    The tree is the nested dictionary produced by ``numeric_best_splits``:
    internal nodes carry 'feature', 'threshold' and 'subtrees'
    ('left' / 'right'); leaves carry 'prediction'.
    """

    def __init__(self):
        # Set by fit(); None until the model has been trained.
        self.tree = None

    def fit(self, data, target):
        """Build the tree from ``data`` and remember it on the instance.

        Parameters
        ----------
        data : pandas.DataFrame
            Training rows, including the ``target`` column.
        target : str
            Name of the label column.

        Returns
        -------
        dict
            The fitted tree (also stored as ``self.tree``).
        """
        self.tree = numeric_best_splits(data, target)
        return self.tree

    def _resolve_tree(self, tree):
        """Return ``tree`` if given, else the fitted tree; raise if neither exists."""
        if tree is not None:
            return tree
        if self.tree is None:
            raise ValueError("No tree available: call fit() first or pass a tree explicitly.")
        return self.tree

    def predict_sample(self, sample, tree=None):
        """Predict the label of one sample.

        Parameters
        ----------
        sample : mapping
            Feature name -> value for a single row.
        tree : dict, optional
            Tree to walk. Defaults to the tree stored by ``fit``.

        Returns
        -------
        The 'prediction' stored in the leaf the sample ends up in.

        Raises
        ------
        ValueError
            If no tree is passed and the model has not been fitted.
        """
        node = self._resolve_tree(tree)
        # Walk down iteratively: values <= threshold go left, the rest go right.
        while node['type'] != 'leaf':
            branch = 'left' if sample[node['feature']] <= node['threshold'] else 'right'
            node = node['subtrees'][branch]
        return node['prediction']

    def predict(self, data, tree=None):
        """Predict a label for every row of ``data``.

        Parameters
        ----------
        data : pandas.DataFrame
            Rows to classify; must contain every feature the tree splits on.
        tree : dict, optional
            Tree to use. Defaults to the tree stored by ``fit``.

        Returns
        -------
        list
            One prediction per row, in row order.

        Raises
        ------
        ValueError
            If no tree is passed and the model has not been fitted.
        """
        tree = self._resolve_tree(tree)
        return [
            self.predict_sample(sample, tree)
            for sample in data.to_dict(orient='records')
        ]
    
# Train on the training split; the frame still carries the 'target' column.
model = ContinousDecisionClassifier()
tree = model.fit(X_train, 'target')

# Score the trained tree on the test rows.
test_predictions = model.predict(X_test, tree)
test_accuracy = calculate_accuracy(test_predictions, y_test['target'])

# Place the predictions next to the true labels, keyed by the original row index.
predictions_df = pd.DataFrame(test_predictions, index=X_test.index, columns=['Prediction'])
result_df = pd.concat((y_test, predictions_df), axis='columns')

print("Accuracy on X_test:", test_accuracy, "%")
display(result_df)

Accuracy on X_test: 93.33333333333333 %


Unnamed: 0,target,Prediction
101,2,2
55,1,1
79,1,1
5,0,0
148,2,2
15,0,0
94,1,1
74,1,1
47,0,0
35,0,0


### To render the tree manually, save the generated DOT source in a plain-text file (no extension needed),
### then run this command to generate the tree as a PDF:
### dot -Tpdf name_of_file -o name_of_file.pdf

In [30]:
import graphviz

def create_tree(dot, tree, parent=None, prefix='', edge_label=None):
    """Add ``tree`` (and, recursively, its subtrees) to a graphviz graph.

    Parameters
    ----------
    dot : graphviz.Digraph (or any object with ``node`` / ``edge`` methods)
        Graph that receives the nodes and edges.
    tree : dict
        Either ``{'type': 'node', 'feature', 'threshold', 'subtrees'}`` or
        ``{'type': 'leaf', 'prediction'}``.
    parent : str, optional
        Graph id of the parent node; ``None`` for the root (no incoming edge).
    prefix : str, optional
        Graph id of this node. Children get ``prefix + 'L'`` / ``prefix + 'R'``,
        so ids encode the path from the root.
    edge_label : str, optional
        Label for the edge from ``parent`` to this node. The recursion passes
        'left' or 'right' so each edge shows the branch actually taken.
    """
    kind = tree['type']
    if kind == 'node':
        feature = tree['feature']
        threshold = tree['threshold']
        dot.node(prefix, f'{feature} <= {threshold}')
    elif kind == 'leaf':
        prediction = tree['prediction']
        dot.node(prefix, f'prediction: {prediction}')
    else:
        # Unknown node types are ignored, as before.
        return

    # The root has no parent, so it gets no incoming edge (also true for a
    # tree that consists of a single leaf).
    if parent is not None:
        dot.edge(parent, prefix, label=edge_label)

    if kind == 'node':
        for direction, suffix in (('left', 'L'), ('right', 'R')):
            create_tree(
                dot,
                tree['subtrees'][direction],
                parent=prefix,
                prefix=prefix + suffix,
                edge_label=direction,
            )
        
def visualize_decision_tree(tree_dict, output_file, view=True, fmt='pdf'):
    """Render a decision-tree dictionary to a file with graphviz.

    Parameters
    ----------
    tree_dict : dict
        Tree in the nested 'node' / 'leaf' dictionary format that
        ``create_tree`` walks.
    output_file : str
        Output path passed to ``Digraph.render``; graphviz appends the
        format extension to the rendered file.
    view : bool, optional
        Open the rendered file in the default viewer (default ``True``, the
        previous hard-coded behaviour). Pass ``False`` on headless machines.
    fmt : str, optional
        Graphviz output format (default ``'pdf'``).
    """
    dot = graphviz.Digraph(format=fmt)
    create_tree(dot, tree_dict)
    dot.render(output_file, view=view)

# Render the current `tree` to an output file named after 'decision_tree_2'.
visualize_decision_tree(tree, 'decision_tree_2')
