In [5]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from _entropy import calculate_entropy
from  _gini import calculate_gini_impurity

In [6]:
cinema_tennis = pd.read_csv('cinema_shopping_tenis_decision.csv')
cinema_tennis

Unnamed: 0,Weather,Parents,Money,Decision
0,Sunny,Yes,Rich,Cinema
1,Sunny,No,Rich,Tennis
2,Windy,Yes,Rich,Cinema
3,Rainy,Yes,Poor,Cinema
4,Rainy,No,Rich,Stay In
5,Rainy,Yes,Poor,Cinema
6,Windy,No,Poor,Cinema
7,Windy,No,Rich,Shopping
8,Windy,Yes,Rich,Cinema
9,Sunny,No,Rich,Tennis


In [2]:
gpt_house = pd.read_csv('gpt_house_category.csv')
gpt_house

Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
0,1500,3,Small House
1,1800,4,Medium House
2,2200,3,Small House
3,1300,2,Small House
4,2500,5,Large House
...,...,...,...
95,1600,5,Large House
96,1400,5,Large House
97,2700,2,Small House
98,2300,3,Small House


In [160]:
iris = load_iris()

# Create a DataFrame for features (X) and target (y)
X_df = pd.DataFrame(data=iris.data, columns=iris.feature_names)
y_df = pd.DataFrame(data=iris.target, columns=["target"])
iris = pd.concat([X_df, y_df], axis=1)

iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,2
146,6.3,2.5,5.0,1.9,2
147,6.5,3.0,5.2,2.0,2
148,6.2,3.4,5.4,2.3,2


In [3]:
def split_based_on_gini_impurity(data, target):
    print("#############Split#################")
    lowest_impurity = calculate_gini_impurity(data, target)
    
    min_key, min_value = min(lowest_impurity.items(), key=lambda x: x[1])
    print(lowest_impurity)
    print(f"Next split based on feature: {min_key}, Gini Impurity: {min_value}")
    unique_values = data[min_key].unique()
    
    data_splits = {value: data[data[min_key] == value] for value in unique_values}
    
    for i, split_df in enumerate(data_splits.values(), 1):
        unique_values = split_df[target].unique()
        if len(unique_values) > 1:
            split_based_on_gini_impurity(split_df, target)
        else:
            # print(f"Split {i} - Only one target value found: {unique_values[0]}")
            display(split_df)

In [4]:
split_based_on_gini_impurity(data= gpt_house, target= 'House Category')

#############Split#################
{'House Category': 0.6374000000000001, 'Size (sq. ft.)': 0.45659523809523805, 'Number of Bedrooms': 0.0}
Next split based on feature: Number of Bedrooms, Gini Impurity: 0.0


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
0,1500,3,Small House
2,2200,3,Small House
6,2100,3,Small House
7,1700,3,Small House
10,1600,3,Small House
18,1750,3,Small House
19,1850,3,Small House
21,2050,3,Small House
24,2200,3,Small House
25,2100,3,Small House


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
1,1800,4,Medium House
5,1900,4,Medium House
9,2000,4,Medium House
13,2300,4,Medium House
20,1950,4,Medium House
28,2000,4,Medium House
31,1900,4,Medium House
37,2700,4,Medium House
47,2500,4,Medium House
48,2400,4,Medium House


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
3,1300,2,Small House
11,1400,2,Small House
14,1200,2,Small House
27,1350,2,Small House
33,1800,2,Small House
35,1400,2,Small House
36,1950,2,Small House
51,2000,2,Small House
64,1800,2,Small House
69,1800,2,Small House


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
4,2500,5,Large House
8,2400,5,Large House
12,2700,5,Large House
15,2600,5,Large House
16,2800,5,Large House
17,2900,5,Large House
22,3000,5,Large House
23,3100,5,Large House
29,3000,5,Large House
30,2400,5,Large House


In [10]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt

class DecisionTree:
    def __init__(self):
        self.tree = None

    def fit(self, data, target):
        self.tree = self._build_tree(data, target)

    def _build_tree(self, data, target):
        unique_values = data[target].unique()
        if len(unique_values) == 1:
            # If all target values are the same, create a leaf node
            return {'type': 'leaf', 'prediction': unique_values[0]}

        # Find the feature that minimizes the Gini Impurity
        lowest_impurity = calculate_gini_impurity(data, target)
        min_key, min_value = min(lowest_impurity.items(), key=lambda x: x[1])

        # Split the data based on the selected feature
        data_splits = {value: data[data[min_key] == value] for value in data[min_key].unique()}

        # Recursively build subtrees
        subtrees = {}
        for value, split_df in data_splits.items():
            subtrees[value] = self._build_tree(split_df.drop(columns=[min_key]), target)

        return {'type': 'node', 'feature': min_key, 'subtrees': subtrees}

    def _predict_instance(self, instance, node):
        if node['type'] == 'leaf':
            return node['prediction']

        feature_value = instance[node['feature']]
        if feature_value not in node['subtrees']:
            # If the value of the feature is not present in the training data, return a default prediction
            return 0

        return self._predict_instance(instance, node['subtrees'][feature_value])

    def predict(self, features):
        predictions = []
        for _, instance in features.iterrows():
            prediction = self._predict_instance(instance, self.tree)
            predictions.append(prediction)
        return predictions

###############Training####################
tree = DecisionTree()
tree.fit(cinema_tennis, 'Decision')

X_train = cinema_tennis.drop('Decision', axis=1) 
predict_train = tree.predict(X_train)
print(predict_train)
###############Training ENDS####################

# Custom row for prediction (only features)
custom_features = pd.DataFrame({
    'Weather': ['Sunny'],
    'Parents': ['No'],
    'Money': ['Poor']
})

prediction = tree.predict(custom_features)

print("Prediction for the custom row:")
print(prediction)



['Cinema', 'Tennis', 'Cinema', 'Cinema', 'Stay In', 'Cinema', 'Cinema', 'Shopping', 'Cinema', 'Tennis']
Prediction for the custom row:
['Tennis']


TypeError: {'type': 'node', 'feature': 'Parents', 'subtrees': {'Yes': {'type': 'leaf', 'prediction': 'Cinema'}, 'No': {'type': 'node', 'feature': 'Weather', 'subtrees': {'Sunny': {'type': 'leaf', 'prediction': 'Tennis'}, 'Rainy': {'type': 'leaf', 'prediction': 'Stay In'}, 'Windy': {'type': 'node', 'feature': 'Money', 'subtrees': {'Poor': {'type': 'leaf', 'prediction': 'Cinema'}, 'Rich': {'type': 'leaf', 'prediction': 'Shopping'}}}}}}} is not an estimator instance.

<Figure size 1500x1000 with 0 Axes>

In [169]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

tree.fit( iris, 'target' )
test_predictions = tree.predict(X_test)

# Convert test_predictions list to DataFrame with the same index as y_test
test_predictions_df = pd.DataFrame(test_predictions, index=y_test.index, columns=["Predicted"])

# Combine y_test and test_predictions_df into a single DataFrame
results_df = pd.concat([y_test, test_predictions_df], axis=1)
results_df

Unnamed: 0,target,Predicted
73,1,1
18,0,0
118,2,2
78,1,1
76,1,1
31,0,0
64,1,1
141,2,2
68,1,1
82,1,1
