### When a decision tree is used for regression, we use variance reduction to decide which split is better
Almost everything else is the same as for a classification tree!

In [1]:
import numpy as np
import pandas as pd

In [3]:
# Load the airfoil noise data set; the preview below shows the first five rows.
data = pd.read_csv('airfoil_noise_data.csv')
data.head()

Unnamed: 0,x0,x1,x2,x3,x4,y
0,800,0.0,0.3048,71.3,0.002663,126.201
1,1000,0.0,0.3048,71.3,0.002663,125.201
2,1250,0.0,0.3048,71.3,0.002663,125.951
3,1600,0.0,0.3048,71.3,0.002663,127.591
4,2000,0.0,0.3048,71.3,0.002663,127.461


### Decision tree for regression 

In [95]:
class Node():
    """One node of the regression tree.

    There are two kinds of node:

    * Internal (decision) node: holds the split condition used to route a
      sample further down the tree --
        feature_index      -- positional index of the feature tested
                              (0, 1, 2, 3 stand for x0, x1, x2, x3),
        thresold           -- threshold for that feature, e.g. 3.0,
        left / right       -- the left and right sub-trees,
        variance_reduction -- variance reduction of this split (kept so it
                              can be printed).
    * Leaf node: holds only `value`, the value carried by the leaf.
    """

    def __init__(self,feature_index=None,thresold=None,left=None,right=None,variance_reduction=None,value=None):
        # Leaf payload; left as None on decision nodes.
        self.value = value

        # Split description and children; left as None on leaf nodes.
        self.feature_index, self.thresold = feature_index, thresold
        self.left, self.right = left, right
        self.variance_reduction = variance_reduction

In [100]:
class DecisionTree_Regression():
    """Regression tree grown by greedily maximising variance reduction.

    Training data is handled internally as a DataFrame whose last column is
    the target and whose remaining columns are the features. Every leaf
    predicts the mean target of the training rows that reached it.
    """

    def __init__(self,element_min_split=2,max_dept=2):
        """Store the stop conditions and start with an empty tree.

        element_min_split: a node holding fewer rows than this becomes a leaf.
        max_dept: deepest level (the root is level 0) at which a node may
            still be split, so the tree can contain up to ``max_dept + 1``
            levels of decision nodes.
        """
        #stop conditions
        self.element_min_split = element_min_split
        self.max_dept = max_dept

        #Root node; stays None until fit() has built the tree
        self.root = None
        #Column names (features first, target last) recorded by fit() and
        #used by print_tree() to label the split conditions
        self.columns = None

    def cal_variance_reduction(self,parent,left,right):
        """Return the variance reduction of splitting `parent` into `left`/`right`.

        input: parent, left, right -- DataFrames whose last column is the target
        output: var(parent) - (w_left * var(left) + w_right * var(right)),
            where each weight is the child's share of the parent's rows and
            the variances are the ones np.var reports for the target column.
        """
        y_parent = parent.iloc[:,-1]
        y_left = left.iloc[:,-1]
        y_right = right.iloc[:,-1]

        weight_l = len(left) / len(parent)
        weight_r = len(right) / len(parent)

        return np.var(y_parent) - (weight_l*np.var(y_left) + weight_r*np.var(y_right))

    def get_best_split(self,data,num_elements,num_attributes):
        """Find the (feature, threshold) pair with the largest variance reduction.

        input: data (DataFrame, target in the last column), num_elements
            (unused, kept for interface compatibility), num_attributes
            (number of leading feature columns to try)
        output: dict with keys 'feature_index', 'thresold', 'left', 'right'
            and 'variance_reduction' describing the best split, or an empty
            dict when no threshold leaves rows on both sides.
        Explain: every unique value of every feature is tried as a threshold;
            rows with feature <= threshold go left, rows with feature >
            threshold go right.
        """
        best_reduction = -float("inf")
        best_split = {}

        #Loop attributes
        for j in range(num_attributes):
            feature_values = data.iloc[:,j]
            #Loop candidate thresholds of this attribute
            for thresold in np.unique(feature_values):
                left_data = data[feature_values <= thresold]
                right_data = data[feature_values > thresold]

                #A split that leaves one side empty separates nothing
                if len(left_data) == 0 or len(right_data) == 0:
                    continue

                reduction = self.cal_variance_reduction(data,left_data,right_data)
                if reduction > best_reduction:
                    best_reduction = reduction
                    best_split = {
                        'feature_index': j,
                        'thresold': thresold,
                        'left': left_data,
                        'right': right_data,
                        'variance_reduction': reduction,
                    }

        return best_split

    def build_tree(self,data,cur_dept =0):
        """Recursively build the (sub-)tree for `data` and return its root Node.

        input: data (DataFrame, target in the last column), cur_dept (level
            of the node being built, needed for the stop condition)
        output: a decision Node with its two sub-trees, or a leaf Node
        Explain: take the best split of the data, then build the left and
            right sub-trees the same way until a stop condition is met.
        Attention: when a stop condition is met, or no valid split exists,
            the node is a leaf carrying the mean of the target column.
        """
        num_elements = data.shape[0]
        num_attributes = data.shape[1] - 1
        y = data.iloc[:,-1]

        #Keep splitting only while the stop conditions are not satisfied
        if num_elements>=self.element_min_split and cur_dept<=self.max_dept:
            best_split = self.get_best_split(data,num_elements,num_attributes)
            #An empty dict means no threshold separates the rows (for example
            #every feature is constant), so fall through and emit a leaf
            if best_split:
                left_sub_tree = self.build_tree(best_split['left'],cur_dept+1)
                right_sub_tree = self.build_tree(best_split['right'],cur_dept+1)

                return Node(best_split['feature_index'],best_split['thresold'],left_sub_tree
                           ,right_sub_tree,best_split['variance_reduction'])

        #leaf node
        return Node(value = self.value_left_node(y))

    def value_left_node(self,Y):
        """Value of a leaf node.

        input: Y (target values of the rows in the leaf)
        output: mean of this Y
        """
        return np.mean(Y)

    def fit(self,X,y):
        """Train the tree.

        input: X (2-D features, array-like or DataFrame), y (single target,
            either 1-D or a column of shape (n, 1))
        Column names are taken from X when it is a DataFrame and otherwise
        generated as x0, x1, ...; the target column is named 'y'.
        """
        if hasattr(X, 'columns'):
            feature_names = [str(name) for name in X.columns]
        else:
            feature_names = None

        X = np.asarray(X)
        y = np.asarray(y)
        if y.ndim == 1:
            #concatenate along axis=1 needs a column, not a flat vector
            y = y.reshape(-1, 1)
        if feature_names is None:
            feature_names = ['x{}'.format(i) for i in range(X.shape[1])]

        self.columns = feature_names + ['y']
        dataset = np.concatenate((X, y), axis=1)
        dataset = pd.DataFrame(dataset, columns=self.columns)
        self.root = self.build_tree(dataset)

    def predict(self, X):
        """Predict every row of X and return the predictions as a list."""
        self._check_fitted()
        #np.asarray makes iteration row-wise even when X is a DataFrame
        preditions = [self.make_prediction(x, self.root) for x in np.asarray(X)]
        return preditions

    def make_prediction(self, x, tree):
        """Predict a single data point by walking from `tree` down to a leaf."""
        if tree.value is not None:
            return tree.value
        feature_val = x[tree.feature_index]
        if feature_val<=tree.thresold:
            return self.make_prediction(x, tree.left)
        else:
            return self.make_prediction(x, tree.right)

    def print_tree(self,tree=None):
        """Print the tree in pre-order, starting at `tree` (default: the root).

        A leaf prints its value; a decision node prints its condition and
        variance reduction, then its left and right sub-trees.
        """
        if tree is None:
            self._check_fitted()
            tree = self.root
        if tree.value is not None:
            print(tree.value)
        else:
            print('Condition: ', self._feature_name(tree.feature_index) , '<= ', tree.thresold,'? IN FOR GAIN: ',tree.variance_reduction)
            print('======LEFT SIDE=========')
            self.print_tree(tree.left)

            print('======RIGHT SIDE=========')
            self.print_tree(tree.right)

    def _feature_name(self, feature_index):
        """Label for a feature: the name recorded by fit(), else 'x<index>'."""
        if self.columns is not None and feature_index < len(self.columns):
            return self.columns[feature_index]
        return 'x{}'.format(feature_index)

    def _check_fitted(self):
        """Raise a clear error when the tree is used before fit() was called."""
        if self.root is None:
            raise RuntimeError('This DecisionTree_Regression is not fitted yet; call fit(X, y) first.')

In [101]:
# Check the column order: the feature columns come first and the target `y` is last.
data.columns

Index(['x0', 'x1', 'x2', 'x3', 'x4', 'y'], dtype='object')

In [102]:
# Features are every column except the last one.
X = data.iloc[:, :-1].values
# The target is the last column, reshaped to a column vector (n, 1).
Y = data.iloc[:, -1].values.reshape(-1,1)
from sklearn.model_selection import train_test_split
# Hold out 20% of the rows for testing; random_state is fixed so the split is repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=.2, random_state=41)
X_train.shape,y_train.shape

((1202, 5), (1202, 1))

In [103]:
# Train a tree on the training split and print the learned conditions and leaf values.
nam1 = DecisionTree_Regression(element_min_split=3,max_dept=3)
nam1.fit(X_train,y_train)
nam1.print_tree()

Condition:  x0 <=  3150.0 ? IN FOR GAIN:  7.132048702017748
Condition:  x4 <=  0.0337792 ? IN FOR GAIN:  3.590330569067664
Condition:  x3 <=  55.5 ? IN FOR GAIN:  1.17898999813184
Condition:  x4 <=  0.00251435 ? IN FOR GAIN:  1.614396721819876
128.9919833333333
125.90953579676673
Condition:  x1 <=  15.4 ? IN FOR GAIN:  2.2342245360792994
129.39160280373832
123.80422222222222
Condition:  x0 <=  1250.0 ? IN FOR GAIN:  9.970884020498868
Condition:  x4 <=  0.0483159 ? IN FOR GAIN:  6.35527515982486
124.38024528301887
118.30039999999998
Condition:  x3 <=  39.6 ? IN FOR GAIN:  5.036286657241031
113.58091666666667
118.07284615384616
Condition:  x4 <=  0.00146332 ? IN FOR GAIN:  29.08299210506528
Condition:  x0 <=  8000.0 ? IN FOR GAIN:  11.886497073996964
Condition:  x2 <=  0.0508 ? IN FOR GAIN:  7.608945827689519
134.04247500000002
127.33581818181818
Condition:  x4 <=  0.00076193 ? IN FOR GAIN:  10.6229193224008
128.94078571428574
122.40768750000001
Condition:  x4 <=  0.0229028 ? IN FOR GAIN

In [104]:
# Predict the held-out rows and report the root mean squared error.
Y_pred = nam1.predict(X_test) 
from sklearn.metrics import mean_squared_error
np.sqrt(mean_squared_error(y_test, Y_pred))

4.851358097184457

In [66]:
# Rebuild the training split as one DataFrame (features followed by the target)
# so that individual splits can be checked by hand in the cells below.
dataset = np.concatenate((X_train, y_train), axis=1)
dataset = pd.DataFrame(dataset, columns=data.columns)
dataset

Unnamed: 0,x0,x1,x2,x3,x4,y
0,400.0,17.4,0.0254,55.5,0.016571,115.461
1,630.0,3.3,0.1016,55.5,0.002211,126.599
2,1250.0,17.4,0.0254,31.7,0.017663,128.306
3,315.0,19.7,0.0508,71.3,0.034118,121.235
4,500.0,15.6,0.1016,39.6,0.052849,115.304
...,...,...,...,...,...,...
1197,250.0,7.3,0.2286,55.5,0.011171,136.384
1198,400.0,2.0,0.2286,71.3,0.002930,125.116
1199,1250.0,4.0,0.2286,55.5,0.004286,131.264
1200,1000.0,9.5,0.0254,31.7,0.004614,131.346


In [45]:
def cal_variance_reduction(parent,left,right):
    """Variance reduction obtained by splitting `parent` into `left` and `right`.

    Each argument is a DataFrame whose last column holds the target. The
    result is the parent's target variance minus the sum of the children's
    target variances, each weighted by the child's share of the parent's rows.
    """
    target_parent, target_left, target_right = (
        frame.iloc[:,-1] for frame in (parent, left, right)
    )

    n_parent = len(parent)
    share_left = len(left) / n_parent
    share_right = len(right) / n_parent

    children_variance = share_left*np.var(target_left) + share_right*np.var(target_right)
    return np.var(target_parent) - children_variance

In [46]:
# Sanity check of the helper: split the full data set into its first 100 rows and the rest.
test1 = data
test2 = data.iloc[:100,:]
test3 = data.iloc[100:,:]
cal_variance_reduction(test1,test2,test3)

0.9265198453675794

In [82]:
# Recompute by hand the split printed first by print_tree (x0 <= 3150) on the
# training DataFrame; the result should equal the gain printed for that condition.
test4 = dataset
test5 = dataset[dataset['x0']<=3150]
test6 = dataset[dataset['x0']>3150]
cal_variance_reduction(test4,test5,test6)

7.132048702017748

In [83]:
# Inspect the column at positional index 4 (feature x4).
data.iloc[:,4]

0       0.002663
1       0.002663
2       0.002663
3       0.002663
4       0.002663
          ...   
1498    0.052849
1499    0.052849
1500    0.052849
1501    0.052849
1502    0.052849
Name: x4, Length: 1503, dtype: float64