In [17]:
import numpy as np
import pandas as pd
from statistics import mode

In [36]:
class TreeNode:
    def __init__(self, ids = None, children = [], entropy = 0, depth = 0):
        self.ids = ids #index of data point in this node
        self.children = children #list of its child node
        self.entropy = entropy 
        self.depth = depth #distance to root node
        self.order = None #keeping order by which child node is splitted into
        self.split_attribute = None #the attribute chosen to split data
        self.label = None #label of node if the code is leaf
    def set_properties(self, split_attribute, order):
        self.split_attribute = split_attribute
        self.order = order
    def set_label(self, label):
        self.label = label


In [37]:
def entropy(prob):
    prob = np.array(prob)
    zero_idx = prob.nonzero()[0]
    prob_non_zero = prob[zero_idx]
    entropy = -np.sum(prob_non_zero*np.log(prob_non_zero))
    return entropy


In [85]:
class DecisionTree:
    def __init__(self, 
                 min_gain: float = 1e-4, 
                 min_sample_split: int = 2,
                 max_depth: int = 5):
        """
        Parameter:
        min_gain: the minimum gain that split must have in order for the tree to grow
        min_sample_split: mimimum sample of node to continue split
        max_depth: maximum depth of the 
        
        """
        self.min_gain = min_gain
        self.min_sample_split = min_sample_split
        self.max_depth = max_depth 
        self.root = None
        self.N = 0
    def fit(self, data, target):
        """Function to check each node and split if necessary"""
        self.N = data.shape[0]
        self.data = data
        self.attributes = list(data)
        self.target = target
        self.labels = target.unique()
        ids = range(self.N)
        self.root = TreeNode(ids = ids, entropy=self._entropy(self.target), depth = 0)
        queue = [self.root]
        #LIFO check each root that last append on queue until there is no root left
        while queue:
            node = queue.pop()
            if node.depth < self.max_depth or node.entropy < self.min_gain:
                node.children = self._split(node)
                if not node.children:
                    self._get_label(node)
                queue += node.children
            else: 
                self._get_label(node)
    def _entropy(self, target):
        """Calculate entropy of the node S"""
        if len(target) == 0:
            return 0
        _, freq = np.unique(self.target, return_counts = True)
        prob = freq/float(freq.sum())
        return -np.sum(prob*np.log(prob))
    def _information_gain(self, node, childNode):
        """"
        Calculate information gain of a Node if splitted to ChildNode
        Parameters:
        node: parent node to be splitted
        childNode: list of lists of index of ChildNode
        target: (ndarray) label of node
        
        """
        n = len(node.ids)
        sum_entropy = 0
        for ls in childNode:
            p = len(ls)/n
            sum_entropy += p * self._entropy(ls)
        gain = node.entropy - sum_entropy
        return gain
    def _get_label(self, node):
        node.set_label(self.target[node.ids].mode()[0]) 
    def _split(self, node):
        ids = node.ids
        best_split = []
        best_gain = None
        best_attribute = None
        order = None
        sub_data = self.data.iloc[ids, :]
        for i, att in enumerate(self.attributes):
            unique_val = self.data.iloc[ids, i].unique().tolist()
            if len(unique_val) == 1: continue
            # unique_val = self.data[att].unique()
            split = []
            for val in unique_val:
                sub_data_index = sub_data.index[sub_data[att]==val].tolist()
                # split += sub_data_index
                split.append(sub_data_index)
            if min(map(len, split)) < self.min_sample_split: continue
            gain = self._information_gain(node, split)
            if gain < self.min_gain: continue
            if gain > best_gain:
                best_gain = gain
                best_split = split
                best_attribute = att
                order = unique_val
                # node.set_property(best_attribute, order)
        if best_attribute:
            node.set_property(best_attribute, order)
            childnode = [TreeNode(ids=ids, entropy=self._entropy(self.target[ids]), depth=node.depth + 1) for ids in best_split]
            return childnode

# Return empty list if no valid split was found
        return []
                # childnode = [TreeNode(ids = ids, entropy=self._entropy(self.target[ids]), depth = node.depth + 1) for ids in split]
            # return childnode
    def predict(self, new_data):
        n = new_data.shape[0]
        label = [None]*n
        for i in range(n):
            sample = new_data.iloc[i, :]
            node = self.root
            while node.children:
                val = sample[node.split_attribute]
                try:
                    child_index = node.order.index(val)
                    node = node.children[child_index]
                except ValueError:
                    node = node.children[0]
            label[i] = node.label
        return np.array(label)



In [43]:
df = pd.read_csv('weather.csv')

In [None]:
df

Unnamed: 0,outlook,temperature,humidity,wind,play
0,sunny,hot,high,weak,no
1,sunny,hot,high,strong,no
2,overcast,hot,high,weak,yes
3,rainy,mild,high,weak,yes
4,rainy,cool,normal,weak,yes
5,rainy,cool,normal,strong,no
6,overcast,cool,normal,strong,yes
7,sunny,mild,high,weak,no
8,sunny,cool,normal,weak,yes
9,rainy,mild,normal,weak,yes


In [45]:
data = df.iloc[:, :4]
target = df['play']

In [49]:
target.unique()

array(['no', 'yes'], dtype=object)

In [54]:
data.shape[0]

14

In [52]:
ls = [1, 2]
ad = [4, 3]

In [53]:
ls + ad

[1, 2, 4, 3]

In [86]:
model = DecisionTree()
model.fit(data, target)

In [87]:
y_pred = model.predict(data)


In [88]:
y_pred

array(['yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes', 'yes',
       'yes', 'yes', 'yes', 'yes', 'yes'], dtype='<U3')

In [10]:
prob = [0, 0.1, 0.2, 0, 0.3, 0.4]

In [15]:
data = pd.DataFrame({"inx": [0, 1, 2, 3],
                    "feature1": ['pov1', 'pov1', 'neg1', 'pov1'], 
                     "feature2": ['high', 'high', 'low', 'high'], 
                     "target": ['neg', 'neg', 'pov', 'pov']})

In [16]:
data.feature1.mode()

0    pov1
Name: feature1, dtype: object

In [16]:
vals = data['feature1'].unique().tolist()

In [34]:
data.shape[0]

4

In [18]:
split = []
for val in vals:
    ids = data.index[data['feature1']==val].tolist()
    split.append(ids)

In [19]:
split

[[0, 1, 3], [2]]

In [42]:
np.unique(target, return_counts=True)

(array(['negative', 'positive'], dtype='<U8'), array([3, 2]))

In [5]:
data.iloc[[1, 2],:]

Unnamed: 0,inx,feature1,feature2,target
1,1,pov1,high,neg
2,2,neg1,low,pov


In [35]:
target = ['positive', 'negative', 'negative', 'positive', 'negative']

In [41]:
dict((i, target.count(i)) for i in target)

{'positive': 2, 'negative': 3}

In [76]:
values = data.iloc[[1, 2, 3], 2].unique().tolist()

In [77]:
values

['high', 'low']

In [55]:
_, freqs = np.unique(target, return_counts=True)

In [56]:
freqs

array([3, 2])

In [53]:
node = queue.pop()
node

3

In [50]:
prob = np.array(prob)
prob*prob

array([0.  , 0.01, 0.04, 0.  , 0.09, 0.16])

In [43]:
np.log(0.1)

-2.3025850929940455

In [5]:
opcl = dict(('()', '[]', '{}'))

In [3]:
class MathUtils:
    @staticmethod
    def add(x, y):
        return x + y
    
    @staticmethod
    def is_prime(num):
        if num < 2:
            return False
        for i in range(2, int(num**0.5) + 1):
            if num % i == 0:
                return False
        return True

In [4]:
# Call via the class
result1 = MathUtils.add(5, 3)  # Returns 8
is_prime = MathUtils.is_prime(17)  # Returns True

# Call via an instance
math_obj = MathUtils()
result2 = math_obj.add(10, 7)  # Returns 17

In [16]:
class Employee:
    count_emp = 0
    def __init__(self, name, salary):
        self.name = name
        self.salary = salary
        Employee.count_emp += 1
    def display(self):
        return f"Employee {self.name} has the salary of {self.salary}"
    @classmethod
    def from_string(cls, emp_string):
        name, salary = emp_string.split("-")
        return cls(name, float(salary))
    @classmethod
    def count(cls):
        return cls.count_emp


In [18]:
emp1 = Employee("Joan", 1000)

In [19]:
Employee.count()

1

In [20]:
emp1.count()

1

In [21]:
import numpy as np

In [32]:
prob = [0, 0.1, 0.2, 0, 0.3]
prob = np.array(prob)
non_zero_prob = prob[np.array(prob).nonzero()[0]]

In [33]:
non_zero_prob

array([0.1, 0.2, 0.3])