In [1]:
import os
import sys
sys.path.insert(0, os.path.abspath(".."))

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# from sklearn.model_selection import train_test_split
# from sklearn.datasets import make_blobs

# from svm.soft_margin_svm import SoftMarginSVM
# from preprocessing.ohe import OneHotEncoder
# from linear_models.classification.softmax_regression import SoftMaxRegression
# from neighbors.knn_classifier import KNeighborsClassifier
# from linear_models.classification.logistic_regression import LogisticRegression
# from linear_models.regression.linear_regression import LinearRegression
# from linear_models.classification.least_squares_classifier import LeastSquareClassifier
# from classification.perceptron import Perceptron

In [2]:
def visualize_model(x_train, labels, model, degree = 1):
    f = plt.figure(figsize = (8,8))
    x1_min = np.min(x_train[:,0])
    x1_max = np.max(x_train[:,0])
    x2_min = np.min(x_train[:,1])
    x2_max = np.max(x_train[:,1])
    
    x1_test, x2_test = np.meshgrid(np.linspace(x1_min, x1_max, 100), 
                                   np.linspace(x2_min, x2_max, 100))
    x_test_poly = np.array([x1_test, x2_test]).reshape(2, -1).T
    y_test = model.predict(x_test_poly)

    flat_labels = labels.flatten()
    sns.scatterplot(data = x_train, x = x_train[:,0], y = x_train[:,1], hue = labels)
    plt.contourf(x1_test, x2_test, y_test.reshape(100,100), alpha = 0.5, levels = np.linspace(0,1,3))
    plt.xlabel('$x_1$')
    plt.ylabel('$x_2$')
    plt.xlim(x1_min, x1_max)
    plt.ylim(x2_min, x2_max)
    plt.gca().set_aspect('equal', adjustable = 'box')
    plt.show()

In [3]:
from sklearn.datasets import load_iris
X, y = load_iris(return_X_y = True)

In [4]:
# correct this code
class Node():
    def __init__(self, 
                 attribute_name = None,
                 threshold = None,
                 left_child = None, 
                 right_child = None):
        self.attribute_name = attribute_name
        if attribute_name:
            self.threshold = threshold
            self.left_child = Node()
            self.right_child = Node()
        

In [85]:
class DecisionTreeClassifier():
    def __init__(self, 
                 min_samples_split = 4,
                 max_depth = 5):
        self.min_samples_split = min_samples_split
        self.max_depth = max_depth

    def make_tree(self,  X, y, depth=0):
        if len(X) > self.min_samples_split and self.max_depth != depth:
            best_split = self.get_best_split(X, y)
            if best_split['Information_gain'] > 0:
                left_child = self.make_tree(best_split['X_left_data'], 
                                                 best_split['y_left_data'],
                                                 depth + 1)
                right_child = self.make_tree(best_split['X_right_data'], 
                                                  best_split['y_right_data'],
                                                  depth + 1)
                return Node(
                    attribute_name = best_split['Attribute'],
                    threshold = best_split['Threshold'],
                    left_child = left_child,
                    right_child = right_child
                )

        leaf_value = self.get_leaf_value(y)
        print(leaf_value)
        return Node(attribute_name = leaf_value)
    
    def get_best_split(self, X, y):
        max_information_gain = -float('inf')
        best_threshold = 0
        attribute = 0
        
        for i in range(X.shape[1]):
            uniq_thresholds = np.unique(X[:, i])
            for threshold in uniq_thresholds:
                X_left, X_right, y_left, y_right = self.split(X, y, i, threshold)
                information_gain = self.calculate_information_gain(y, y_left, y_right)
                if information_gain > max_information_gain:
                    attribute = i
                    max_information_gain = information_gain
                    best_threshold = threshold
                    X_left_data = X_left
                    X_right_data = X_right
                    y_left_data = y_left
                    y_right_data = y_right
        best_split = {
            'Attribute': attribute,
            'Information_gain': max_information_gain,
            'Threshold': best_threshold,
            'X_left_data': X_left_data,
            'X_right_data': X_right_data,
            'y_left_data': y_left_data,
            'y_right_data': y_right_data
        }
        return best_split
    
    def split(self, X, y, i, threshold):
        temp_d = np.column_stack((X, y))
        d_left = temp_d[temp_d[:, i] <= threshold]
        d_right = temp_d[temp_d[:, i] > threshold]
        return d_left[:,:-1], d_right[:,:-1], d_left[:,-1], d_right[:,-1]
    
    def get_leaf_value(self, y):
        try:
            return np.argmax(np.unique(y, return_counts=True)[0])
        except:
            return None
    
    def calculate_information_gain(self, y, y_left, y_right):
        wleft = len(y_left)/len(y)
        wright = len(y_right)/len(y)
        return self.gini_index(y) - ((wleft*self.gini_index(y_left)) + (wright*self.gini_index(y_right)))
    
    def gini_index(self, y):
        classes = np.unique(y)
        gini = 0
        for cl in classes:
            pr = len(y[y==cl]) / len(y)
            gini = gini + pr**2
        return 1 - gini
    
    def print_tree(self, node):
        if node.left_child == None and node.right_child == None:
            return [(node.attribute_name)]
        else:
            return [(node.threshold, node.attribute_name, node.threshold)] + self.print_tree(node.left_child) +  self.print_tree(node.right_child)

In [86]:
dt = DecisionTreeClassifier(max_depth = 10)
tree = dt.make_tree(X, y)
tl = dt.print_tree(tree)

0
0
0
0
1
1
0


In [84]:
tl

[(1.9, 2, 1.9),
 0,
 (1.7, 3, 1.7),
 (4.9, 2, 4.9),
 (1.6, 3, 1.6),
 0,
 0,
 (1.5, 3, 1.5),
 0,
 1,
 (4.8, 2, 4.8),
 1,
 0]

In [60]:
print(dt.tree[2].left_child)

None


In [32]:
print(tree.attribute_name)

2
