In [1]:
import numpy as np
import pandas as pd

In [12]:
#read data iris data.csv
col_names = ['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type']
data = pd.read_csv('iris.data.csv',skiprows=1,header=None, names=col_names)
data.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,4.9,3.0,1.4,0.2,Iris-setosa
1,4.7,3.2,1.3,0.2,Iris-setosa


In [13]:
##change name of type columns
data.iloc[:,4] = data.iloc[:,4].apply(lambda x:0 if x =='Iris-setosa' else(2 if x == 'Iris-virginica' else 1))
data.head(2)

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,type
0,4.9,3.0,1.4,0.2,0
1,4.7,3.2,1.3,0.2,0


### Build classification decision tree 

### Node class 

In [14]:
#Node
'''
We have two types of Node --> internal (with condition to figure out leaf or continue internal node)
Leaf node: only have value --> figure out class of this node (0,1,2 in this case (iris dataset))
Internal node --> attribute_name (actually it should be feature_index) 0,1,2,3 stand for sepal, petal length,width
thresold of this attribute ex 3.0, left( left tree), right (right tree), info_gain (infomation_gain of this --> serve print)
'''
class Node():
    def __init__(self,attribute_name=None,thresold=None,left=None,right=None,info_gain=None,value=None):
        #decision node (internal node)
        self.attribute_name = attribute_name
        self.thresold = thresold
        self.left = left
        self.right = right
        self.info_gain = info_gain
        
        #for leaf node
        self.value = value

### Tree class 

In [15]:
class DecisionTree_Classify():
    def __init__(self,element_min_split=2,max_dept=2):
        #stop conditions
        self.element_min_split = element_min_split
        self.max_dept = max_dept   
        
        #Root node
        root = None
        
    '''
    Calculale entropy of this data
    input: data (array with only labels element EX: [0,0,1,1,0,0,2,1,0,1,1])
    output: entropy of this data array
    Be careful!!: output may be have more than 2, like above, have 0,1, and 2...
    '''
    def entropy(self,data):
        en = 0
        y_labels = np.unique([data])
        
        for y_cl in y_labels:
            p_cls = len(data[data==y_cl])/len(data)
            en+= -p_cls*np.log2(p_cls)
        return en
    
    '''
    Calculale gini index of this data
    input: data (array with only labels element EX: [0,0,1,1,0,0,2,1,0,1,1])
    output: gini index of this data array
    Why? calculate p_cls**2 may be more efficent than np.log2(p_cls)
    '''
    def gini_index(self,data):
        gini = 0
        y_labels = np.unique([data])
        
        for y_cl in y_labels:
            p_cls = len(data[data==y_cl])/len(data)
            gini += p_cls**2
        
        return 1-gini
    
    '''
    Calculate information gain of this tree
    input: parent, left_tree, right_tree, type(default is gini for better calculation)
    here data is a dataframe --> slice it into 1 dimension array with only class labels
    output: information gain of that
    '''
    def information_gain(self,parent,left,right,type='gini'):
        y_parent = parent.iloc[:,4]
        y_left = left.iloc[:,4]
        y_right = right.iloc[:,4]
        
        weight_l = len(left)/ len(parent)
        weight_r = len(right) / len(parent)
        
#         print(weight_l, weight_r,len(left),len(right))
        
        if (type=='gini'):
            return self.gini_index(y_parent)-(weight_l*self.gini_index(y_left) + weight_r*self.gini_index(y_right))
        else:
            return self.entropy(y_parent)-(weight_l*self.entropy(y_left) + weight_r*self.entropy(y_right))
    
    '''
    Get best spilt in this data (dataframe), base on information gain!!
    input: data to split, num_elements, num_attributes for better loop
    output: return dict include attribute_name,thresold,left,right,info_gain (base on max info_gain) of this data (tree)
    Explain: Try to find attribute_name with thresold for get max info_gain (loop all elements in this data)
    '''
    def get_best_split(self,data,num_elements,num_attributes):
        info_gain = -float("inf")
        info_temp = 0
        
        best_split = {}
        #Loop all elements
        #Loop elements
        for i in range(num_elements):
            #Loop attributes
            for j in range(num_attributes):
                left_data = data[data.iloc[:,j] <= data.iloc[i,j]]
                right_data = data[data.iloc[:,j] > data.iloc[i,j]]
                
                if (len(left_data)>0 and len(right_data)>0):
                    info_temp = self.information_gain(data,left_data,right_data,'gini')
                    if (info_temp>info_gain):
                        best_split['attribute_name'] = j
                        best_split['thresold'] = data.iloc[i,j]
                        best_split['left'] = left_data
                        best_split['right'] = right_data
                        best_split['info_gain'] = info_temp
                        info_gain = info_temp
                        
                    
        return best_split
    
    '''
    Build this decision tree
    input: data (dataframe, cur_dept --> need to satisfy stop condition)
    output: decision tree
    Explain: get best split to data and we have left, right tree, then again build with left, right tree
    until stop condition have been satisfied
    Attention: if stop condition have been satisfied, this is leaf --> declare node with value (left)
    '''
    def build_tree(self,data,cur_dept =0):
        #Take discrete datas X,y
        X = data.iloc[:,0:4]
        y = data.iloc[:,4]
        
        #take num_elements and num_attributes
        num_elements = X.shape[0]
        num_attributes = X.shape[1]
        
        #Running when not satifies stop conditions
        #If satifies stop condition --> this is left --> output node
        if num_elements>self.element_min_split and cur_dept<self.max_dept:
            best_split = self.get_best_split(data,num_elements,num_attributes)
            left = best_split['left']
            right = best_split['right']
            if (len(left)>0) and (len(right)>0):
                left_sub_tree = self.build_tree(left,cur_dept+1)
                right_sub_tree = self.build_tree(right,cur_dept+1);
                
                return Node(best_split['attribute_name'],best_split['thresold'],left_sub_tree
                           ,right_sub_tree,best_split['info_gain'])
            
        #left nodes
        return Node(value = self.value_left_node(y))
    
    '''
    when stop conditions have been satisfied, we had array with multi class labels,
    choose most apperance in this to left value
    input: Y (array with class labels)
    output: class with most apperance
    '''
    def value_left_node(self,Y):
        Y = list(Y)
        return max(Y, key=Y.count)
    
    def fit(self,X,y):
        '''train the tree'''
        
        dataset = np.concatenate((X, y), axis=1)
        dataset = pd.DataFrame(dataset, columns=data.columns)
#         print(dataset)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        ''' function to predict new dataset '''
        
        preditions = [self.make_prediction(x, self.root) for x in X]
        return preditions
    
    def make_prediction(self, x, tree):
        ''' function to predict a single data point '''
        
        if tree.value!=None: return tree.value
        feature_val = x[tree.attribute_name]
        if feature_val<=tree.thresold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)
    
    def print_tree(self,tree=None):
        if not tree:
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print('Condition: ', data.columns[tree.attribute_name] , '<= ', tree.thresold,'? IN FOR GAIN: ',tree.info_gain)
            print('======LEFT SIDE=========')
            self.print_tree(tree.left)
            
            print('======RIGHT SIDE=========')
            self.print_tree(tree.right)

In [16]:
data.columns

Index(['sepal_length', 'sepal_width', 'petal_length', 'petal_width', 'type'], dtype='object')

In [17]:
X = data.iloc[:, :-1].values
Y = data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=41)
X_train.shape,y_train.shape

((119, 4), (119, 1))

In [18]:
tree = DecisionTree_Classify(element_min_split=3,max_dept=3)
tree.fit(X_train,y_train)
tree.print_tree()

Condition:  petal_length <=  1.7 ? IN FOR GAIN:  0.33904421497105636
Condition:  sepal_length <=  5.7 ? IN FOR GAIN:  0.0
Condition:  petal_length <=  1.5 ? IN FOR GAIN:  0.0
0
0
0
Condition:  petal_width <=  1.5 ? IN FOR GAIN:  0.40269559500328744
Condition:  petal_length <=  4.9 ? IN FOR GAIN:  0.04996712689020377
1
2
Condition:  petal_length <=  4.8 ? IN FOR GAIN:  0.040912933220625364
2
2


In [29]:
Y_pred = tree.predict(X_test) 

from sklearn.metrics import accuracy_score
accuracy_score(y_test, Y_pred)

ValueError: Classification metrics can't handle a mix of unknown and multiclass targets

In [27]:
y_test

array([[0],
       [0],
       [0],
       [0],
       [1],
       [2],
       [2],
       [1],
       [2],
       [1],
       [2],
       [1],
       [2],
       [2],
       [2],
       [1],
       [2],
       [1],
       [0],
       [2],
       [2],
       [0],
       [0],
       [1],
       [1],
       [2],
       [2],
       [1],
       [0],
       [2]], dtype=object)

In [34]:
np.flatten(y_test)

AttributeError: module 'numpy' has no attribute 'flatten'