In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from _entropy import calculate_entropy
from  _gini import calculate_gini_impurity

In [9]:
# Load the weekend-activity table from CSV into a DataFrame; the 'Decision'
# column is the label that a later cell fits the tree on.
cinema_tennis = pd.read_csv('cinema_shopping_tenis_decision.csv')
# Bare last expression so the notebook renders the whole frame.
cinema_tennis

Unnamed: 0,Weather,Parents,Money,Decision
0,Sunny,Yes,Rich,Cinema
1,Sunny,No,Rich,Tennis
2,Windy,Yes,Rich,Cinema
3,Rainy,Yes,Poor,Cinema
4,Rainy,No,Rich,Stay In
5,Rainy,Yes,Poor,Cinema
6,Windy,No,Poor,Cinema
7,Windy,No,Rich,Shopping
8,Windy,Yes,Rich,Cinema
9,Sunny,No,Rich,Tennis


In [10]:
# Read the house-category table and shuffle every row (frac=1) in one chained
# expression; the fixed seed keeps the shuffled order reproducible.
gpt_house = (
    pd.read_csv('gpt_house_category.csv')
    .sample(frac=1, random_state=42)
)
gpt_house

Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
83,2500,5,Large House
53,3200,3,Small House
70,1700,4,Medium House
45,1800,3,Small House
44,3200,5,Large House
...,...,...,...
60,2300,5,Large House
71,1500,3,Small House
14,1200,2,Small House
92,1700,5,Large House


In [4]:
# Keep the raw scikit-learn Bunch under its own name so that `iris` only ever
# refers to the shuffled DataFrame that later cells consume.
iris_bunch = load_iris()

# Feature matrix (one column per measurement) and the integer class label,
# each as its own frame.
X_df = pd.DataFrame(data=iris_bunch.data, columns=iris_bunch.feature_names)
y_df = pd.DataFrame(data=iris_bunch.target, columns=["target"])

# Put features and label side by side, then shuffle all rows (frac=1) with a
# fixed seed so the ordering is reproducible.
iris = pd.concat([X_df, y_df], axis=1).sample(frac=1, random_state=42)

iris

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
73,6.1,2.8,4.7,1.2,1
18,5.7,3.8,1.7,0.3,0
118,7.7,2.6,6.9,2.3,2
78,6.0,2.9,4.5,1.5,1
76,6.8,2.8,4.8,1.4,1
...,...,...,...,...,...
71,6.1,2.8,4.0,1.3,1
106,4.9,2.5,4.5,1.7,2
14,5.8,4.0,1.2,0.2,0
92,5.8,2.6,4.0,1.2,1


In [11]:
def split_based_on_gini_impurity(data, target):
    """Recursively partition ``data`` on its lowest-Gini feature and display the result.

    A banner line is printed on every call. The frame is then split on the
    feature with the smallest impurity, one subset per distinct value of that
    feature. A subset whose ``target`` column holds a single label is shown with
    ``display``; a subset that still mixes labels is split again.

    Columns are never removed between levels, so only features that still take
    more than one value in ``data`` are considered. If none is left, the labels
    cannot be separated any further and the mixed subset is displayed as-is
    instead of recursing forever.

    Args:
        data: DataFrame holding the feature columns and the ``target`` column.
        target: Name of the label column.

    Returns:
        None. The function only prints and displays.

    Note:
        ``calculate_gini_impurity`` is imported from elsewhere; it is assumed to
        return a mapping of feature column name -> impurity (it is consumed via
        ``.items()``) -- confirm against its definition.
    """
    print("#############Split#################")
    impurity_by_feature = calculate_gini_impurity(data, target)

    # A feature that is constant in this subset would reproduce the subset
    # unchanged, so it can never make progress; leave it out of the ranking.
    splittable = {
        feature: impurity
        for feature, impurity in impurity_by_feature.items()
        if feature in data.columns and data[feature].nunique(dropna=False) > 1
    }
    if not splittable:
        # Labels are still mixed but nothing can tell the rows apart.
        display(data)
        return

    # Ties keep the first feature in the mapping's order.
    min_key = min(splittable, key=splittable.get)

    for value in data[min_key].unique():
        subset = data[data[min_key] == value]
        if len(subset[target].unique()) > 1:
            split_based_on_gini_impurity(subset, target)
        else:
            display(subset)

In [13]:
# Walk the house data, splitting on the lowest-impurity feature until every
# displayed group carries a single 'House Category' label.
split_based_on_gini_impurity(data=gpt_house, target='House Category')

#############Split#################


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
83,2500,5,Large House
44,3200,5,Large House
22,3000,5,Large House
80,2200,5,Large House
30,2400,5,Large House
73,1600,5,Large House
4,2500,5,Large House
76,2000,5,Large House
77,2700,5,Large House
12,2700,5,Large House


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
53,3200,3,Small House
45,1800,3,Small House
39,2200,3,Small House
10,1600,3,Small House
0,1500,3,Small House
18,1750,3,Small House
90,1800,3,Small House
88,2900,3,Small House
26,1600,3,Small House
42,1700,3,Small House


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
70,1700,4,Medium House
31,1900,4,Medium House
55,2800,4,Medium House
9,2000,4,Medium House
47,2500,4,Medium House
85,2800,4,Medium House
28,2000,4,Medium House
5,1900,4,Medium House
65,2000,4,Medium House
62,2900,4,Medium House


Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
33,1800,2,Small House
69,1800,2,Small House
11,1400,2,Small House
35,1400,2,Small House
27,1350,2,Small House
3,1300,2,Small House
64,1800,2,Small House
36,1950,2,Small House
97,2700,2,Small House
79,2400,2,Small House


In [5]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

class DecisionTree:
    """Decision tree over categorical feature values, split greedily by Gini impurity.

    The fitted tree is a nested dict. A leaf is
    ``{'type': 'leaf', 'prediction': label}``; an internal node is
    ``{'type': 'node', 'feature': column, 'subtrees': {value: child}}`` with one
    child per distinct value of ``column`` seen while fitting. Branches are
    matched by exact equality, so numeric columns are treated as categories.
    """

    def __init__(self):
        self.tree = None
        # Most frequent training label; returned when a sample carries a feature
        # value that no branch was built for. Stays None until fit() has run.
        self.default_prediction = None

    def fit(self, data, target):
        """Build the tree from ``data`` and return it.

        Args:
            data: DataFrame containing the feature columns and the label column.
            target: Name of the label column.

        Returns:
            The nested-dict tree, also stored on ``self.tree``.

        Raises:
            ValueError: If ``data`` has no rows.
            KeyError: If ``target`` is not a column of ``data``.
        """
        if len(data) == 0:
            raise ValueError("Cannot fit a decision tree on an empty dataset.")
        self.default_prediction = self._majority_label(data[target])
        self.tree = self._build_tree(data, target)
        return self.tree

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value in ``labels`` (first mode on a tie)."""
        return labels.mode(dropna=False).iloc[0]

    def _build_tree(self, data, target):
        """Recursively build the subtree for the rows in ``data``.

        The column chosen at each level is dropped before recursing, so every
        feature is used at most once along a path.

        Note:
            ``calculate_gini_impurity`` is imported from elsewhere; it is assumed
            to return a mapping of feature column name -> impurity (it is
            consumed via ``.items()``) -- confirm against its definition.
        """
        unique_values = data[target].unique()
        if len(unique_values) == 1:
            # Every row agrees on the label: nothing left to split.
            return {'type': 'leaf', 'prediction': unique_values[0]}

        # Labels still disagree but every feature has been used up (rows with
        # identical features and different labels): settle on the majority.
        feature_columns = [column for column in data.columns if column != target]
        if not feature_columns:
            return {'type': 'leaf', 'prediction': self._majority_label(data[target])}

        impurity_by_feature = calculate_gini_impurity(data, target)
        if len(impurity_by_feature) == 0:
            # The helper ranked nothing, so there is no split to make.
            return {'type': 'leaf', 'prediction': self._majority_label(data[target])}

        # Lowest impurity wins; ties keep the first feature in the mapping's order.
        min_key, _ = min(impurity_by_feature.items(), key=lambda item: item[1])

        # One child per distinct value, built on the matching rows with the
        # chosen column removed.
        subtrees = {
            value: self._build_tree(
                data[data[min_key] == value].drop(columns=[min_key]), target
            )
            for value in data[min_key].unique()
        }
        return {'type': 'node', 'feature': min_key, 'subtrees': subtrees}

    def _predict_instance(self, instance, node):
        """Follow ``instance`` down from ``node`` and return the predicted label.

        Args:
            instance: Row-like object indexable by feature name.
            node: Leaf or internal-node dict as produced by ``_build_tree``.

        Returns:
            The leaf's label, or ``self.default_prediction`` (the majority
            training label, ``None`` if ``fit`` never ran) when the instance has
            a feature value with no matching branch.
        """
        if node['type'] == 'leaf':
            return node['prediction']

        feature_value = instance[node['feature']]
        if feature_value not in node['subtrees']:
            # Value never seen for this feature during training. Fall back to a
            # real label instead of a hard-coded 0, which is not a valid class
            # for string targets and silently means "class 0" for integer ones.
            return self.default_prediction

        return self._predict_instance(instance, node['subtrees'][feature_value])

    def predict(self, features):
        """Predict a label for every row of ``features``.

        Args:
            features: DataFrame with the feature columns used during ``fit``.

        Returns:
            A list with one prediction per row, in row order.

        Raises:
            RuntimeError: If called before ``fit``.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit().")
        return [
            self._predict_instance(instance, self.tree)
            for _, instance in features.iterrows()
        ]

# --- Fit on the cinema/tennis table -----------------------------------------
# Both names point at the same frame, so anything evaluated against x_test
# measures fit on the training rows, not generalisation.
x_train = x_test = cinema_tennis

model = DecisionTree()
tree = model.fit(x_train, 'Decision')

# To see predictions for these rows, pass the feature columns only:
#   model.predict(x_test.drop(columns='Decision'))
tree

{'type': 'node',
 'feature': 'Parents',
 'subtrees': {'Yes': {'type': 'leaf', 'prediction': 'Cinema'},
  'No': {'type': 'node',
   'feature': 'Weather',
   'subtrees': {'Sunny': {'type': 'leaf', 'prediction': 'Tennis'},
    'Rainy': {'type': 'leaf', 'prediction': 'Stay In'},
    'Windy': {'type': 'node',
     'feature': 'Money',
     'subtrees': {'Poor': {'type': 'leaf', 'prediction': 'Cinema'},
      'Rich': {'type': 'leaf', 'prediction': 'Shopping'}}}}}}}

In [None]:
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hold out the last six shuffled rows; everything before them is training data.
x_train = iris.iloc[:-6]
x_test_x = iris.iloc[-6:]                  # held-out rows, label column included
x_test = x_test_x.drop(columns='target')   # the same rows, features only

# Refit the existing model on the iris training rows and predict the hold-out.
tree = model.fit(x_train, 'target')
test_predictions = model.predict(x_test)

# Wrap the predictions in a one-column frame that shares the held-out rows'
# index so they line up with the true labels.
test_predictions_df = pd.DataFrame(
    {"Predicted": test_predictions}, index=x_test_x.index
)

# True label next to predicted label, one row per held-out sample.
results_df = x_test_x[['target']].join(test_predictions_df)
display(tree, results_df)