In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from _entropy import calculate_entropy
from  _gini import calculate_gini_impurity

In [7]:
# Load the cinema / shopping / tennis decision table from CSV into a DataFrame.
# NOTE(review): the rendered output shows an 'Unnamed: 0' column, so the file
# presumably contains a saved index; passing index_col=0 would avoid it, but
# confirm first that calculate_gini_impurity does not depend on that column.
cinema_tennis = pd.read_csv('cinema_shopping_tenis_decision.csv')
# Bare last expression: the notebook renders the frame as the cell output.
cinema_tennis

Unnamed: 0,Weather,Parents,Money,Decision
0,Sunny,Yes,Rich,Cinema
1,Sunny,No,Rich,Tennis
2,Windy,Yes,Rich,Cinema
3,Rainy,Yes,Poor,Cinema
4,Rainy,No,Rich,Stay In
5,Rainy,Yes,Poor,Cinema
6,Windy,No,Poor,Cinema
7,Windy,No,Rich,Shopping
8,Windy,Yes,Rich,Cinema
9,Sunny,No,Rich,Tennis


In [5]:
# Load the exercise table (diet, pulse, time, kind columns per the output
# below) from CSV into a DataFrame.
exercise = pd.read_csv('exercise.csv')
# Bare last expression: the notebook renders the frame as the cell output.
exercise

Unnamed: 0,diet,pulse,time,kind
0,low fat,85,1 min,rest
1,low fat,85,15 min,rest
2,low fat,88,30 min,rest
3,low fat,90,1 min,rest
4,low fat,92,15 min,rest
...,...,...,...,...
85,no fat,135,15 min,running
86,no fat,130,30 min,running
87,no fat,99,1 min,running
88,no fat,111,15 min,running


In [188]:
# Load the house-category table (size, number of bedrooms, category per the
# output below) from CSV into a DataFrame.
gpt_house = pd.read_csv('gpt_house_category.csv')
# Bare last expression: the notebook renders the frame as the cell output.
gpt_house

Unnamed: 0,Size (sq. ft.),Number of Bedrooms,House Category
0,1500,3,Small House
1,1800,4,Medium House
2,2200,3,Small House
3,1300,2,Small House
4,2500,5,Large House
...,...,...,...
95,1600,5,Large House
96,1400,5,Large House
97,2700,2,Small House
98,2300,3,Small House


In [6]:
def split_based_on_gini_impurity(data, target):
    """Recursively partition ``data`` and display every final partition.

    At each level the feature with the lowest score returned by
    ``calculate_gini_impurity(data, target)`` is selected, ``data`` is
    partitioned by that feature's distinct values, and every partition whose
    ``target`` column still holds more than one label is split again.
    Partitions that are not split further are rendered with ``display``.

    Args:
        data: pandas DataFrame holding the feature columns and ``target``.
        target: name of the label column.

    Returns:
        None. Results are emitted through ``print`` and ``display`` only.
    """
    print("#############Split#################")
    # The helper's result is consumed as a ``feature -> score`` mapping.
    impurity_by_feature = calculate_gini_impurity(data, target)

    # Nothing to split on: show what is left instead of letting min() raise
    # on an empty collection.
    if len(impurity_by_feature) == 0:
        display(data)
        return

    split_feature, _ = min(impurity_by_feature.items(), key=lambda item: item[1])

    for value in data[split_feature].unique():
        partition = data[data[split_feature] == value]
        still_mixed = len(partition[target].unique()) > 1
        # Recurse only when the partition is strictly smaller than its parent.
        # If the best feature has a single distinct value the partition is the
        # parent itself (e.g. identical rows carrying different labels) and
        # recursing again would never terminate.
        if still_mixed and len(partition) < len(data):
            split_based_on_gini_impurity(partition, target)
        else:
            display(partition)

In [7]:
# Walk the cinema/tennis table: each recursion level prints a "Split" banner
# and every partition that is not split further is displayed below.
split_based_on_gini_impurity(data= cinema_tennis, target= 'Decision')

#############Split#################


Unnamed: 0,Weather,Parents,Money,Decision
0,Sunny,Yes,Rich,Cinema
2,Windy,Yes,Rich,Cinema
3,Rainy,Yes,Poor,Cinema
5,Rainy,Yes,Poor,Cinema
8,Windy,Yes,Rich,Cinema


#############Split#################


Unnamed: 0,Weather,Parents,Money,Decision
1,Sunny,No,Rich,Tennis
9,Sunny,No,Rich,Tennis


Unnamed: 0,Weather,Parents,Money,Decision
4,Rainy,No,Rich,Stay In


#############Split#################


Unnamed: 0,Weather,Parents,Money,Decision
6,Windy,No,Poor,Cinema


Unnamed: 0,Weather,Parents,Money,Decision
7,Windy,No,Rich,Shopping


In [10]:
from sklearn.tree import DecisionTreeClassifier, plot_tree
import matplotlib.pyplot as plt
from sklearn.tree import plot_tree

class DecisionTree:
    """Decision tree over categorical features, stored as nested dicts.

    Each level splits on the feature with the lowest score returned by
    ``calculate_gini_impurity``. Leaves have the form
    ``{'type': 'leaf', 'prediction': label}`` and internal nodes the form
    ``{'type': 'node', 'feature': column, 'subtrees': {value: child}}``.
    """

    def __init__(self):
        # Root of the fitted tree; None until fit() has been called.
        self.tree = None
        # Most frequent training label; returned when a row cannot be routed.
        self._fallback_prediction = None

    def fit(self, data, target):
        """Build the tree from ``data`` and return it.

        Args:
            data: pandas DataFrame with the feature columns and ``target``.
            target: name of the label column.

        Returns:
            The nested-dict tree, which is also stored on ``self.tree``.

        Raises:
            ValueError: if ``data`` has no rows.
        """
        if len(data) == 0:
            raise ValueError("cannot fit a decision tree on an empty dataset")
        self._fallback_prediction = self._majority_label(data[target])
        self.tree = self._build_tree(data, target)
        return self.tree

    @staticmethod
    def _majority_label(labels):
        """Return the most frequent value of the ``labels`` Series."""
        return labels.value_counts().idxmax()

    def _build_tree(self, data, target):
        """Recursively build the subtree for ``data``; returns a node dict."""
        labels = data[target].unique()
        if len(labels) == 1:
            return {'type': 'leaf', 'prediction': labels[0]}

        majority_leaf = {'type': 'leaf', 'prediction': self._majority_label(data[target])}

        # Every feature is used up but the labels still disagree (rows with
        # identical features and different labels): settle on the majority
        # label instead of asking for a split that cannot exist.
        if all(column == target for column in data.columns):
            return majority_leaf

        impurity_by_feature = calculate_gini_impurity(data, target)
        # min() raises on an empty collection, so stop here as well when the
        # helper reports no candidate feature.
        if len(impurity_by_feature) == 0:
            return majority_leaf

        split_feature, _ = min(impurity_by_feature.items(), key=lambda item: item[1])

        subtrees = {}
        # NaN never compares equal to itself, so a NaN split value would
        # always select zero rows; skip it rather than recurse on an empty frame.
        for value in data[split_feature].dropna().unique():
            partition = data[data[split_feature] == value]
            # Dropping the used column guarantees the recursion terminates.
            subtrees[value] = self._build_tree(partition.drop(columns=[split_feature]), target)
        return {'type': 'node', 'feature': split_feature, 'subtrees': subtrees}

    def _predict_instance(self, instance, node):
        """Route one row from ``node`` down to a leaf and return its label.

        If the row holds a feature value that the tree has no branch for,
        the most frequent training label recorded by ``fit`` is returned
        (``None`` when the tree was assigned without calling ``fit``).
        """
        while node['type'] != 'leaf':
            branches = node['subtrees']
            feature_value = instance[node['feature']]
            if feature_value not in branches:
                return getattr(self, '_fallback_prediction', None)
            node = branches[feature_value]
        return node['prediction']

    def predict(self, features):
        """Predict a label for every row of the ``features`` DataFrame.

        Returns:
            A list with one prediction per row, in row order.

        Raises:
            RuntimeError: if no tree has been fitted yet.
        """
        if self.tree is None:
            raise RuntimeError("DecisionTree.predict() called before fit()")
        return [self._predict_instance(row, self.tree) for _, row in features.iterrows()]

###############Training####################
# Fit the hand-written DecisionTree on the cinema/tennis table; fit() returns
# the nested-dict tree, which is kept in `tree` and shown as the cell output.
model = DecisionTree()
tree = model.fit(cinema_tennis, 'Decision')
tree
###############Training ENDS####################
# NOTE(review): the lines below are disabled scratch code. They reference
# `x_test` / `x_train`, which are not defined in the cells shown here, and mix
# the 'Decision' and 'House Category' targets — confirm intent before reviving.
# x_test_x = x_test.drop('Decision', axis=1) 
# prediction = model.predict(x_test_x)
# x_train
# type(tree), display(tree)
# display(prediction), display(x_test['House Category'])

{'type': 'node',
 'feature': 'Parents',
 'subtrees': {'Yes': {'type': 'leaf', 'prediction': 'Cinema'},
  'No': {'type': 'node',
   'feature': 'Weather',
   'subtrees': {'Sunny': {'type': 'leaf', 'prediction': 'Tennis'},
    'Rainy': {'type': 'leaf', 'prediction': 'Stay In'},
    'Windy': {'type': 'node',
     'feature': 'Money',
     'subtrees': {'Poor': {'type': 'leaf', 'prediction': 'Cinema'},
      'Rich': {'type': 'leaf', 'prediction': 'Shopping'}}}}}}}