In [3]:
import pandas as pd
import numpy as np

from  _gini import calculate_gini_impurity

In [17]:
def gini_impurity(labels):
    """Return the Gini impurity of a collection of class labels.

    The impurity is ``1 - sum(p_i ** 2)``, where ``p_i`` is the fraction of
    samples that carry class ``i``. A pure set scores ``0.0``.

    Parameters
    ----------
    labels : pandas.Series or array-like
        Class label of each sample. Anything that is not already a Series
        (list, tuple, numpy array) is wrapped in one so it can be counted.

    Returns
    -------
    float
        Impurity of ``labels``. An empty collection is treated as pure and
        yields ``0.0``.

    Notes
    -----
    ``value_counts()`` leaves missing labels out of the class counts while
    ``len()`` still counts them, so NaN labels inflate the result. Drop or
    encode missing labels before calling.
    """
    total_samples = len(labels)
    if total_samples == 0:
        # No samples means no class mixture; without this guard the
        # accumulator below would be returned untouched at 1.0.
        return 0.0

    if not isinstance(labels, pd.Series):
        labels = pd.Series(labels)

    gini = 1.0
    for count in labels.value_counts():
        p_i = count / total_samples
        gini -= p_i ** 2

    return gini

def find_optimal_threshold(feature_values, labels):
    """Find the split threshold with the lowest weighted Gini impurity.

    Every distinct value of the feature is tried as a candidate ``t``. The
    samples are partitioned into ``feature <= t`` and ``feature > t``, and
    each side's impurity is weighted by its sample count.

    Parameters
    ----------
    feature_values : array-like
        Feature value of each sample; converted to a numpy array internally.
    labels : pandas.Series
        Class label of each sample, positionally aligned with
        ``feature_values``.

    Returns
    -------
    scalar or None
        The first candidate (in order of appearance) that reaches the lowest
        weighted impurity, or ``None`` when no candidate puts at least one
        sample on each side (empty input, constant feature, all-NaN feature).
    """
    values = np.asarray(feature_values)
    n_samples = len(labels)
    best_gini = float('inf')
    best_threshold = None

    # pd.unique keeps first-appearance order, so ties are broken exactly as
    # when iterating the raw values, without re-scoring duplicates.
    for threshold in pd.unique(values):
        left_mask = values <= threshold
        right_mask = values > threshold
        n_left = int(left_mask.sum())
        n_right = int(right_mask.sum())

        # A candidate that leaves one side empty (the maximum value, or NaN)
        # is not a split. Scoring it would let a no-op partition tie with or
        # beat real ones, and callers would then recurse on unchanged data.
        if n_left == 0 or n_right == 0:
            continue

        left_gini = gini_impurity(labels[left_mask])
        right_gini = gini_impurity(labels[right_mask])
        weighted_gini = (left_gini * n_left + right_gini * n_right) / n_samples

        if weighted_gini < best_gini:
            best_gini = weighted_gini
            best_threshold = threshold

    return best_threshold

def _report_split(split_df, target, label, parent_size):
    """Display one child partition and recurse into it only if that can make progress.

    ``label`` is the printed prefix (e.g. ``"Split 1"``) and ``parent_size``
    is the row count of the frame the partition was cut from.
    """
    print(f"{label} - Data:")
    display(split_df)
    unique_values = split_df[target].unique()

    if len(unique_values) == 0:
        # Nothing landed on this side; there is no class to report.
        print(f"{label} - Empty split, nothing to classify")
    elif len(unique_values) == 1:
        print(f"{label} - Only one target value found: {unique_values[0]}")
    elif len(split_df) >= parent_size:
        # The child is the whole parent, so recursing would repeat the same
        # call forever.
        print(f"{label} - Multiple target values found, but the split did not separate the data; stopping")
    else:
        print(f"{label} - Multiple target values found, further splitting:")
        split_based_on_gini_impurity(split_df, target)


def split_based_on_gini_impurity(data, target):
    """Recursively partition ``data`` on the lowest-impurity feature and print each step.

    At every level the per-column impurities from ``calculate_gini_impurity``
    are printed and the feature column with the smallest value is chosen.
    A column with more than two distinct values is split at the threshold
    from ``find_optimal_threshold``; otherwise one partition is made per
    distinct value. Each partition is displayed and split again until it
    holds a single target value or cannot be separated further.

    Parameters
    ----------
    data : pandas.DataFrame
        Samples to split; must contain the ``target`` column.
    target : str
        Name of the label column in ``data``.

    Returns
    -------
    None
        The function only prints and displays; no tree is returned.
    """
    print("##############################")
    print("Data for Splitting:")
    display(data)

    lowest_impurity = calculate_gini_impurity(data, target)
    print(f"Lowest impurity of the data is: {lowest_impurity}")

    # The impurity mapping also carries an entry for the target column.
    # It must never be a split candidate, or a tie would split on the label.
    candidates = {key: value for key, value in lowest_impurity.items() if key != target}
    if not candidates:
        print("No feature available for splitting")
        print("##############################")
        return

    min_key, min_value = min(candidates.items(), key=lambda x: x[1])
    print(f"Next split based on feature: {min_key}, Gini Impurity: {min_value}")

    print("##############################")

    unique_values = data[min_key].unique()
    parent_size = len(data)

    # Check if the feature is continuous (not just two unique values)
    if len(unique_values) > 2:
        print("Continous varible detected")
        feature_values = data[min_key].values
        threshold = find_optimal_threshold(feature_values, data[target])
        print(f"Optimal Threshold for {min_key}: {threshold}")

        if threshold is None:
            # No usable threshold was found; comparing against None would raise.
            print("No valid threshold found, stopping")
            print("##############################")
            return

        left_mask = feature_values <= threshold
        right_mask = feature_values > threshold

        _report_split(data[left_mask], target, "Split 1", parent_size)

        print("##############################")

        _report_split(data[right_mask], target, "Split 2", parent_size)
    else:
        print("categorical varibale detected")
        for i, value in enumerate(unique_values, 1):
            _report_split(data[data[min_key] == value], target, f"Split {i}", parent_size)

    print("##############################")

In [9]:
## IRIS DATASET ## 

In [6]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split

# Fetch the Iris data bundled with scikit-learn.
iris = load_iris()

# Wrap the feature matrix and the class codes in DataFrames so that columns
# can be addressed by name.
X_df = pd.DataFrame(iris.data, columns=iris.feature_names)
y_df = pd.DataFrame(iris.target, columns=["target"])

# The splitter looks the label up by column name, so the label is kept
# alongside the features in a single frame.
X_df_with_target = pd.concat((X_df, y_df), axis=1)

# Hold out 20% of the rows; the fixed seed keeps the partition reproducible.
# X_train / X_test therefore still contain the "target" column.
X_train, X_test, y_train, y_test = train_test_split(
    X_df_with_target,
    y_df,
    test_size=0.2,
    random_state=42,
)
X_train.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
22,4.6,3.6,1.0,0.2,0
15,5.7,4.4,1.5,0.4,0
65,6.7,3.1,4.4,1.4,1
11,4.8,3.4,1.6,0.2,0
42,4.4,3.2,1.3,0.2,0


In [7]:
# Walk the Iris training rows through the recursive Gini splitter.
split_based_on_gini_impurity(data=X_train, target="target")

##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
22,4.6,3.6,1.0,0.2,0
15,5.7,4.4,1.5,0.4,0
65,6.7,3.1,4.4,1.4,1
11,4.8,3.4,1.6,0.2,0
42,4.4,3.2,1.3,0.2,0
...,...,...,...,...,...
71,6.1,2.8,4.0,1.3,1
106,4.9,2.5,4.5,1.7,2
14,5.8,4.0,1.2,0.2,0
92,5.8,2.6,4.0,1.2,1


Lowest impurity of the data is: {'target': 0.6665277777777777, 'sepal length (cm)': 0.31856481481481486, 'sepal width (cm)': 0.45222222222222225, 'petal length (cm)': 0.06749999999999999, 'petal width (cm)': 0.07539682539682538}
Next split based on feature: petal length (cm), Gini Impurity: 0.06749999999999999
##############################
Optimal Threshold for petal length (cm): 1.9
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
22,4.6,3.6,1.0,0.2,0
15,5.7,4.4,1.5,0.4,0
11,4.8,3.4,1.6,0.2,0
42,4.4,3.2,1.3,0.2,0
27,5.2,3.5,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
32,5.2,4.1,1.5,0.1,0
16,5.4,3.9,1.3,0.4,0
10,5.4,3.7,1.5,0.2,0
0,5.1,3.5,1.4,0.2,0


Split 1 - Only one target value found: 0
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
65,6.7,3.1,4.4,1.4,1
146,6.3,2.5,5.0,1.9,2
51,6.4,3.2,4.5,1.5,1
142,5.8,2.7,5.1,1.9,2
85,6.0,3.4,4.5,1.6,1
...,...,...,...,...,...
149,5.9,3.0,5.1,1.8,2
71,6.1,2.8,4.0,1.3,1
106,4.9,2.5,4.5,1.7,2
92,5.8,2.6,4.0,1.2,1


Split 2 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
65,6.7,3.1,4.4,1.4,1
146,6.3,2.5,5.0,1.9,2
51,6.4,3.2,4.5,1.5,1
142,5.8,2.7,5.1,1.9,2
85,6.0,3.4,4.5,1.6,1
...,...,...,...,...,...
149,5.9,3.0,5.1,1.8,2
71,6.1,2.8,4.0,1.3,1
106,4.9,2.5,4.5,1.7,2
92,5.8,2.6,4.0,1.2,1


Lowest impurity of the data is: {'target': 0.49968749999999995, 'sepal length (cm)': 0.30166666666666664, 'sepal width (cm)': 0.40201330532212876, 'petal length (cm)': 0.10124999999999998, 'petal width (cm)': 0.11309523809523807}
Next split based on feature: petal length (cm), Gini Impurity: 0.10124999999999998
##############################
Optimal Threshold for petal length (cm): 4.7
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
65,6.7,3.1,4.4,1.4,1
51,6.4,3.2,4.5,1.5,1
85,6.0,3.4,4.5,1.6,1
86,6.7,3.1,4.7,1.5,1
81,5.5,2.4,3.7,1.0,1
75,6.6,3.0,4.4,1.4,1
96,5.7,2.9,4.2,1.3,1
66,5.6,3.0,4.5,1.5,1
67,5.8,2.7,4.1,1.0,1
60,5.0,2.0,3.5,1.0,1


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
65,6.7,3.1,4.4,1.4,1
51,6.4,3.2,4.5,1.5,1
85,6.0,3.4,4.5,1.6,1
86,6.7,3.1,4.7,1.5,1
81,5.5,2.4,3.7,1.0,1
75,6.6,3.0,4.4,1.4,1
96,5.7,2.9,4.2,1.3,1
66,5.6,3.0,4.5,1.5,1
67,5.8,2.7,4.1,1.0,1
60,5.0,2.0,3.5,1.0,1


Lowest impurity of the data is: {'target': 0.05259313367421467, 'sepal length (cm)': 0.02702702702702703, 'sepal width (cm)': 0.03603603603603604, 'petal length (cm)': 0.043243243243243225, 'petal width (cm)': 0.0}
Next split based on feature: petal width (cm), Gini Impurity: 0.0
##############################
Optimal Threshold for petal width (cm): 1.6
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
65,6.7,3.1,4.4,1.4,1
51,6.4,3.2,4.5,1.5,1
85,6.0,3.4,4.5,1.6,1
86,6.7,3.1,4.7,1.5,1
81,5.5,2.4,3.7,1.0,1
75,6.6,3.0,4.4,1.4,1
96,5.7,2.9,4.2,1.3,1
66,5.6,3.0,4.5,1.5,1
67,5.8,2.7,4.1,1.0,1
60,5.0,2.0,3.5,1.0,1


Split 1 - Only one target value found: 1
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
106,4.9,2.5,4.5,1.7,2


Split 2 - Only one target value found: 2
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
146,6.3,2.5,5.0,1.9,2
142,5.8,2.7,5.1,1.9,2
133,6.3,2.8,5.1,1.5,2
137,6.4,3.1,5.5,1.8,2
109,7.2,3.6,6.1,2.5,2
105,7.6,3.0,6.6,2.1,2
122,7.7,2.8,6.7,2.0,2
123,6.3,2.7,4.9,1.8,2
117,7.7,3.8,6.7,2.2,2
113,5.7,2.5,5.0,2.0,2


Split 2 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
146,6.3,2.5,5.0,1.9,2
142,5.8,2.7,5.1,1.9,2
133,6.3,2.8,5.1,1.5,2
137,6.4,3.1,5.5,1.8,2
109,7.2,3.6,6.1,2.5,2
105,7.6,3.0,6.6,2.1,2
122,7.7,2.8,6.7,2.0,2
123,6.3,2.7,4.9,1.8,2
117,7.7,3.8,6.7,2.2,2
113,5.7,2.5,5.0,2.0,2


Lowest impurity of the data is: {'target': 0.2055164954029205, 'sepal length (cm)': 0.16002214839424145, 'sepal width (cm)': 0.17984496124031005, 'petal length (cm)': 0.1511627906976744, 'petal width (cm)': 0.11162790697674417}
Next split based on feature: petal width (cm), Gini Impurity: 0.11162790697674417
##############################
Optimal Threshold for petal width (cm): 1.7
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
119,6.0,2.2,5.0,1.5,2
72,6.3,2.5,4.9,1.5,1
129,7.2,3.0,5.8,1.6,2
52,6.9,3.1,4.9,1.5,1


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
119,6.0,2.2,5.0,1.5,2
72,6.3,2.5,4.9,1.5,1
129,7.2,3.0,5.8,1.6,2
52,6.9,3.1,4.9,1.5,1


Lowest impurity of the data is: {'target': 0.5, 'sepal length (cm)': 0.25, 'sepal width (cm)': 0.125, 'petal length (cm)': 0.25, 'petal width (cm)': 0.375}
Next split based on feature: sepal width (cm), Gini Impurity: 0.125
##############################
Optimal Threshold for sepal width (cm): 3.0
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
119,6.0,2.2,5.0,1.5,2
72,6.3,2.5,4.9,1.5,1
129,7.2,3.0,5.8,1.6,2


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
119,6.0,2.2,5.0,1.5,2
72,6.3,2.5,4.9,1.5,1
129,7.2,3.0,5.8,1.6,2


Lowest impurity of the data is: {'target': 0.48979591836734704, 'sepal length (cm)': 0.2857142857142857, 'sepal width (cm)': 0.14285714285714285, 'petal length (cm)': 0.2857142857142857, 'petal width (cm)': 0.3333333333333333}
Next split based on feature: sepal width (cm), Gini Impurity: 0.14285714285714285
##############################
Optimal Threshold for sepal width (cm): 2.2
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
119,6.0,2.2,5.0,1.5,2


Split 1 - Only one target value found: 2
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1
129,7.2,3.0,5.8,1.6,2


Split 2 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1
129,7.2,3.0,5.8,1.6,2


Lowest impurity of the data is: {'target': 0.5, 'sepal length (cm)': 0.16666666666666666, 'sepal width (cm)': 0.16666666666666666, 'petal length (cm)': 0.16666666666666666, 'petal width (cm)': 0.3333333333333333}
Next split based on feature: sepal length (cm), Gini Impurity: 0.16666666666666666
##############################
Optimal Threshold for sepal length (cm): 6.7
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
77,6.7,3.0,5.0,1.7,1
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Lowest impurity of the data is: {'target': 0.48, 'sepal length (cm)': 0.2, 'sepal width (cm)': 0.0, 'petal length (cm)': 0.2, 'petal width (cm)': 0.2}
Next split based on feature: sepal width (cm), Gini Impurity: 0.0
##############################
Optimal Threshold for sepal width (cm): 2.8
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Lowest impurity of the data is: {'target': 0.5, 'sepal length (cm)': 0.25, 'sepal width (cm)': 0.0, 'petal length (cm)': 0.25, 'petal width (cm)': 0.25}
Next split based on feature: sepal width (cm), Gini Impurity: 0.0
##############################
Optimal Threshold for sepal width (cm): 2.7
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
83,6.0,2.7,5.1,1.6,1
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Lowest impurity of the data is: {'target': 0.4444444444444445, 'sepal length (cm)': 0.0, 'sepal width (cm)': 0.0, 'petal length (cm)': 0.0, 'petal width (cm)': 0.0}
Next split based on feature: sepal length (cm), Gini Impurity: 0.0
##############################
Optimal Threshold for sepal length (cm): 6.0
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
83,6.0,2.7,5.1,1.6,1


Split 1 - Only one target value found: 1
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Split 2 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
134,6.1,2.6,5.6,1.4,2
72,6.3,2.5,4.9,1.5,1


Lowest impurity of the data is: {'target': 0.5, 'sepal length (cm)': 0.0, 'sepal width (cm)': 0.0, 'petal length (cm)': 0.0, 'petal width (cm)': 0.0}
Next split based on feature: sepal length (cm), Gini Impurity: 0.0
##############################
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
134,6.1,2.6,5.6,1.4,2


Split 1 - Only one target value found: 2
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
72,6.3,2.5,4.9,1.5,1


Split 2 - Only one target value found: 1
##############################
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
133,6.3,2.8,5.1,1.5,2


Split 2 - Only one target value found: 2
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
77,6.7,3.0,5.0,1.7,1


Split 2 - Only one target value found: 1
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
129,7.2,3.0,5.8,1.6,2


Split 2 - Only one target value found: 2
##############################
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
52,6.9,3.1,4.9,1.5,1


Split 2 - Only one target value found: 1
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
146,6.3,2.5,5.0,1.9,2
142,5.8,2.7,5.1,1.9,2
137,6.4,3.1,5.5,1.8,2
109,7.2,3.6,6.1,2.5,2
105,7.6,3.0,6.6,2.1,2
122,7.7,2.8,6.7,2.0,2
123,6.3,2.7,4.9,1.8,2
117,7.7,3.8,6.7,2.2,2
113,5.7,2.5,5.0,2.0,2
138,6.0,3.0,4.8,1.8,2


Split 2 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
146,6.3,2.5,5.0,1.9,2
142,5.8,2.7,5.1,1.9,2
137,6.4,3.1,5.5,1.8,2
109,7.2,3.6,6.1,2.5,2
105,7.6,3.0,6.6,2.1,2
122,7.7,2.8,6.7,2.0,2
123,6.3,2.7,4.9,1.8,2
117,7.7,3.8,6.7,2.2,2
113,5.7,2.5,5.0,2.0,2
138,6.0,3.0,4.8,1.8,2


Lowest impurity of the data is: {'target': 0.05551020408163266, 'sepal length (cm)': 0.02857142857142857, 'sepal width (cm)': 0.04285714285714286, 'petal length (cm)': 0.0380952380952381, 'petal width (cm)': 0.05142857142857141}
Next split based on feature: sepal length (cm), Gini Impurity: 0.02857142857142857
##############################
Optimal Threshold for sepal length (cm): 5.9
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
142,5.8,2.7,5.1,1.9,2
113,5.7,2.5,5.0,2.0,2
101,5.8,2.7,5.1,1.9,2
114,5.8,2.8,5.1,2.4,2
70,5.9,3.2,4.8,1.8,1
121,5.6,2.8,4.9,2.0,2
149,5.9,3.0,5.1,1.8,2


Split 1 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
142,5.8,2.7,5.1,1.9,2
113,5.7,2.5,5.0,2.0,2
101,5.8,2.7,5.1,1.9,2
114,5.8,2.8,5.1,2.4,2
70,5.9,3.2,4.8,1.8,1
121,5.6,2.8,4.9,2.0,2
149,5.9,3.0,5.1,1.8,2


Lowest impurity of the data is: {'target': 0.24489795918367355, 'sepal length (cm)': 0.14285714285714285, 'sepal width (cm)': 0.0, 'petal length (cm)': 0.0, 'petal width (cm)': 0.14285714285714285}
Next split based on feature: sepal width (cm), Gini Impurity: 0.0
##############################
Optimal Threshold for sepal width (cm): 3.0
Split 1 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
142,5.8,2.7,5.1,1.9,2
113,5.7,2.5,5.0,2.0,2
101,5.8,2.7,5.1,1.9,2
114,5.8,2.8,5.1,2.4,2
121,5.6,2.8,4.9,2.0,2
149,5.9,3.0,5.1,1.8,2


Split 1 - Only one target value found: 2
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
70,5.9,3.2,4.8,1.8,1


Split 2 - Only one target value found: 1
##############################
##############################
Split 2 - Data:


Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
146,6.3,2.5,5.0,1.9,2
137,6.4,3.1,5.5,1.8,2
109,7.2,3.6,6.1,2.5,2
105,7.6,3.0,6.6,2.1,2
122,7.7,2.8,6.7,2.0,2
123,6.3,2.7,4.9,1.8,2
117,7.7,3.8,6.7,2.2,2
138,6.0,3.0,4.8,1.8,2
148,6.2,3.4,5.4,2.3,2
111,6.4,2.7,5.3,1.9,2


Split 2 - Only one target value found: 2
##############################
##############################
##############################
##############################


In [10]:
## House dataset ##

In [15]:
# Read the house-category table. The label column stays in the frame because
# the splitter looks it up by name.
data = pd.read_csv("gpt_house_category.csv")

# Hold out 20% of the rows with a fixed seed; X_train / X_test keep the
# "House Category" column.
X_train, X_test, y_train, y_test = train_test_split(
    data,
    data["House Category"],
    test_size=0.2,
    random_state=42,
)

X_train.head()

Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
55,2800,4,Medium House
88,2900,3,Small House
26,1600,3,Small House
42,1700,3,Small House
69,1800,2,Small House


In [18]:
# Walk the house training rows through the recursive Gini splitter.
split_based_on_gini_impurity(data=X_train, target="House Category")

##############################
Data for Splitting:


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
55,2800,4,Medium House
88,2900,3,Small House
26,1600,3,Small House
42,1700,3,Small House
69,1800,2,Small House
...,...,...,...
60,2300,5,Large House
71,1500,3,Small House
14,1200,2,Small House
92,1700,5,Large House


Lowest impurity of the data is: {'House Category': 0.6303124999999999, 'Size (sq. ft.)': 0.4219642857142857, 'Number of Bedrooms': 0.0}
Next split based on feature: Number of Bedrooms, Gini Impurity: 0.0
##############################
Continous varible detected
Optimal Threshold for Number of Bedrooms: 3
Split 1 - Data:


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
88,2900,3,Small House
26,1600,3,Small House
42,1700,3,Small House
69,1800,2,Small House
72,2500,3,Small House
11,1400,2,Small House
93,2400,3,Small House
66,2600,3,Small House
35,1400,2,Small House
49,2000,3,Small House


Split 1 - Only one target value found: Small House
##############################
Split 2 - Data:


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
55,2800,4,Medium House
15,2600,5,Large House
40,3500,5,Large House
96,1400,5,Large House
9,2000,4,Medium House
47,2500,4,Medium House
85,2800,4,Medium House
28,2000,4,Medium House
5,1900,4,Medium House
65,2000,4,Medium House


Split 2 - Multiple target values found, further splitting:
##############################
Data for Splitting:


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
55,2800,4,Medium House
15,2600,5,Large House
40,3500,5,Large House
96,1400,5,Large House
9,2000,4,Medium House
47,2500,4,Medium House
85,2800,4,Medium House
28,2000,4,Medium House
5,1900,4,Medium House
65,2000,4,Medium House


Lowest impurity of the data is: {'House Category': 0.4973230220107079, 'Size (sq. ft.)': 0.22764227642276424, 'Number of Bedrooms': 0.0}
Next split based on feature: Number of Bedrooms, Gini Impurity: 0.0
##############################
categorical varibale detected
Split 1 - Data:


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
55,2800,4,Medium House
9,2000,4,Medium House
47,2500,4,Medium House
85,2800,4,Medium House
28,2000,4,Medium House
5,1900,4,Medium House
65,2000,4,Medium House
62,2900,4,Medium House
13,2300,4,Medium House
89,3100,4,Medium House


Split 1 - Only one target value found: Medium House
Split 2 - Data:


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
15,2600,5,Large House
40,3500,5,Large House
96,1400,5,Large House
16,2800,5,Large House
34,3300,5,Large House
95,1600,5,Large House
17,2900,5,Large House
8,2400,5,Large House
54,1900,5,Large House
43,3300,5,Large House


Split 2 - Only one target value found: Large House
##############################
##############################
