In [1]:
import numpy as np
import pandas as pd 
import matplotlib.pyplot as plt

In [2]:
df_train = pd.read_csv("Train.csv")

In [3]:
df_train.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5,target
0,0.293416,-0.945599,-0.421105,0.406816,0.525662,-82.154667
1,-0.836084,-0.189228,-0.776403,-1.053831,0.597997,-48.89796
2,0.236425,0.132836,-0.147723,0.699854,-0.187364,77.270371
3,0.175312,0.143194,-0.581111,-0.122107,-1.292168,-2.988581
4,-1.693011,0.542712,-2.798729,-0.686723,1.244077,-37.596722


In [4]:
df_test = pd.read_csv("Test.csv")

In [5]:
df_test.head()

Unnamed: 0,feature_1,feature_2,feature_3,feature_4,feature_5
0,1.015254,2.076209,-0.266435,-2.418088,-0.980503
1,-0.375021,0.953185,0.626719,0.704406,-0.355489
2,-1.024452,0.962991,-0.407942,-1.861274,0.455201
3,-2.489841,0.544802,0.601219,-0.607021,-1.314286
4,-0.384675,-0.833624,1.358552,-0.547932,0.411925


In [19]:
def entropy(data, ycol):
    return ((data[ycol] - data[ycol].mean())**2).sum()/data.shape[0]

In [37]:
def divide(data, fkey, fval):
    left_data, right_data = data[data[fkey] <= fval], data[data[fkey] > fval]
    left_data.reset_index(drop = True, inplace = True)
    right_data.reset_index(drop = True, inplace = True)
    return left_data, right_data

def information_gain(data, fkey, fval, ycol):
    left_data, right_data = divide(data, fkey, fval)
    init_ent = entropy(data, ycol)
    fin_ent = (left_data.shape[0]/data.shape[0])*entropy(left_data, ycol) + (right_data.shape[0]/data.shape[0])*entropy(right_data, ycol)
    return init_ent - fin_ent

In [38]:
for c in df_train.columns:
    print(c, information_gain(df_train, c, df_train[c].mean(), "target"))

feature_1 407.26714141265074
feature_2 5696.688762832873
feature_3 6.845237399778853
feature_4 1179.9454009885412
feature_5 1.0717422265697678
target 7729.003529683958


In [92]:
class DecisionTree:
    def __init__(self, depth = 0, max_depth = 5):
        self.depth = depth
        self.max_depth = max_depth
        self.target = None
        self.left = None
        self.right = None
        self.fkey = None
        self.favl = None
    
    def train(self, data, xcols, ycol):
        info_gain = []
        for c in xcols:
            info_gain.append(information_gain(data, c, data[c].mean(), ycol))
        
        self.fkey = xcols[np.argmax(info_gain)]
        self.fval = data[self.fkey].mean()
        
        print("Selected Feature:", self.fkey, " Depth: ", self.depth)
        
        self.target = data[ycol].mean()
        left_data, right_data = divide(data, self.fkey, self.fval)
        
        if len(left_data) == 0 or len(right_data) == 0 or self.depth > self.max_depth:
            return
        
        self.left = DecisionTree(self.depth + 1, self.max_depth)
        self.left.train(left_data, xcols, ycol)
        
        self.right = DecisionTree(self.depth + 1, self.max_depth)
        self.right.train(right_data, xcols, ycol)
        
        return
    
    def predict(self, x_test):
        if x_test[self.fkey] <= self.fval:
            if self.left == None:
                return self.target
            return self.left.predict(x_test)
        
        
        if self.right == None:
            return self.target
        return self.right.predict(x_test)

In [134]:
model = DecisionTree(max_depth = 7)

In [135]:
xcols = df_train.columns[:-1]
ycol = df_train.columns[-1]

In [136]:
model.train(df_train, xcols, ycol)

Selected Feature: feature_2  Depth:  0
Selected Feature: feature_2  Depth:  1
Selected Feature: feature_4  Depth:  2
Selected Feature: feature_2  Depth:  3
Selected Feature: feature_2  Depth:  4
Selected Feature: feature_2  Depth:  5
Selected Feature: feature_3  Depth:  6
Selected Feature: feature_2  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_3  Depth:  8
Selected Feature: feature_3  Depth:  8
Selected Feature: feature_4  Depth:  6
Selected Feature: feature_4  Depth:  7
Selected Feature: feature_3  Depth:  8
Selected Feature: feature_5  Depth:  8
Selected Feature: feature_3  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_1  Depth:  5
Selected Feature: feature_2  Depth:  6
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_3  Depth:  8
Selected Feature: feature

Selected Feature: feature_1  Depth:  5
Selected Feature: feature_4  Depth:  6
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_5  Depth:  8
Selected Feature: feature_5  Depth:  7
Selected Feature: feature_3  Depth:  8
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_2  Depth:  6
Selected Feature: feature_2  Depth:  7
Selected Feature: feature_2  Depth:  8
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_4  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_1  Depth:  4
Selected Feature: feature_4  Depth:  5
Selected Feature: feature_3  Depth:  6
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_5  Depth:  8
Selected Feature: feature_4  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_1  Depth:  6
Selected Feature: feature

Selected Feature: feature_2  Depth:  7
Selected Feature: feature_2  Depth:  8
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_2  Depth:  6
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_3  Depth:  8
Selected Feature: feature_3  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_2  Depth:  8
Selected Feature: feature_2  Depth:  3
Selected Feature: feature_4  Depth:  4
Selected Feature: feature_1  Depth:  5
Selected Feature: feature_2  Depth:  6
Selected Feature: feature_2  Depth:  7
Selected Feature: feature_1  Depth:  8
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_4  Depth:  8
Selected Feature: feature_1  Depth:  6
Selected Feature: feature_1  Depth:  7
Selected Feature: feature_5  Depth:  8
Selected Feature: feature_2  Depth:  8
Selected Feature: feature_2  Depth:  7
Selected Feature: feature

In [137]:
pred = []
for i in range(len(df_train)):
    pred.append(model.predict(df_train.iloc[i]))

In [138]:
def r2Score(Y, Yp):
    num = ((Y - Yp)**2).sum()
    den = ((Y - Y.mean())**2).sum()    
    return 1 - (num/den)

def adjusted_r2Score(Y, Yp, n, p):
    r2 = r2Score(Y, Yp)
    return 1 - ((1-r2)*(n-1)/(n-p-1))

In [139]:
pred = np.array(pred)

In [140]:
r2Score(df_train["target"].to_numpy() ,pred)

0.9548091223979985

In [141]:
n, p = df_train.shape[0], df_train.shape[1] - 1

In [142]:
adjusted_r2Score(df_train["target"].to_numpy() ,pred, n, p)

0.9546673693314929