<a href="https://colab.research.google.com/github/mihirsh73/Decision_tree/blob/master/Decision_tree.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

In [0]:
df = pd.read_csv("elec.csv.txt")

In [0]:
df.head()

Unnamed: 0,0,0.056443,0.439155,0.003467,0.422915,0.414912,1
0,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
1,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,1
3,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0
4,0.106383,0.041161,0.207528,0.003467,0.422915,0.414912,0


In [0]:
df.columns

Index(['0', '0.056443', '0.439155', '0.003467', '0.422915', '0.414912', '1'], dtype='object')

In [0]:
selected_cols = ['0', '0.056443', '0.439155', '0.003467', '0.422915', '0.414912', '1']

In [0]:
data = df.loc[:, selected_cols]

In [0]:
data.head()

Unnamed: 0,0,0.056443,0.439155,0.003467,0.422915,0.414912,1
0,0.021277,0.051699,0.415055,0.003467,0.422915,0.414912,1
1,0.042553,0.051489,0.385004,0.003467,0.422915,0.414912,1
2,0.06383,0.045485,0.314639,0.003467,0.422915,0.414912,1
3,0.085106,0.042482,0.251116,0.003467,0.422915,0.414912,0
4,0.106383,0.041161,0.207528,0.003467,0.422915,0.414912,0


In [0]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45311 entries, 0 to 45310
Data columns (total 7 columns):
0           45311 non-null float64
0.056443    45311 non-null float64
0.439155    45311 non-null float64
0.003467    45311 non-null float64
0.422915    45311 non-null float64
0.414912    45311 non-null float64
1           45311 non-null int64
dtypes: float64(6), int64(1)
memory usage: 2.4 MB


In [0]:
X = data.loc[:, ['0', '0.056443', '0.439155', '0.003467', '0.422915', '0.414912']]
y = data['1']

In [0]:
def entropy(column):
    vals, counts = np.unique(column, return_counts=True)
    total = len(column)
    
    acc = 0
    for count in counts:
        acc += (count/total)*np.log2(count/total)

    return -acc

In [0]:
entropy(y)

0.9835038175322905

In [0]:
def info_gain(X, y, label):
    
    pivot = np.mean(X[label])
    
    y_left = y[X[label] < pivot]
    y_right = y[X[label] >= pivot]
    
    if (len(y_left) == 0) or (len(y_right) == 0):
        return -1000
    
    left = (len(y_left)/len(y))*entropy(y_left)
    right = (len(y_right)/len(y))*entropy(y_right)
    
    return entropy(y) - left - right

In [0]:
for label in X.columns:
    print(label, info_gain(X, y, label))

0 0.008215294695439523
0.056443 0.15776620316384515
0.439155 0.049116588813863826
0.003467 0.06394496391917251
0.422915 0.017679471065185193
0.414912 0.008035286353810456


In [0]:
class Node:
    def __init__(self, label=None, value=None, result=None):
        self.label = label
        self.value = value
        self.result = result

In [0]:
class DecisionTree:
    
    def __init__(self, max_depth=10):
        self.max_depth = max_depth
        
    def fit(self, X, y):
        self.root = self.generate(X, y, self.max_depth)
        
    def generate(self, X, y, depth):
        if depth == 1:
            return Node(result=np.mean(y))
        
        gains = []
        for label in X.columns:
            gain = info_gain(X, y, label)
            gains.append((gain, label))
  
        selected_label = max(gains)[1]
    
        pivot = np.mean(X[selected_label])
        left = (X[selected_label] < pivot)
        right = (X[selected_label] >= pivot)
        X_left, y_left = X[left], y[left]
        X_right, y_right = X[right], y[right]

        node = Node(selected_label, pivot)
        node.left = self.generate(X_left, y_left, depth-1)
        node.right = self.generate(X_right, y_right, depth-1)
        
        return node
    
    def display(self, node, indent=0):
        if node.label == None:
            if node.result < .5:
                print("\t"*indent, "UP :(")
            else:
                print("\t"*indent, "DOWN :)))))")
            return
        
        print(indent*"\t", node.label, node.value)
        self.display(node.left, indent+1)
        self.display(node.right, indent+1)
        
        
    def predict_point(self, row, node):
        if node.label == None:
            return node.result
        
        if row[node.label] < node.value:
            return self.predict_point(row, node.left)
        else:
            return self.predict_point(row, node.right)
        
    def predict(self, X):
        y = []
        for index, row in X.iterrows():
            res = int(self.predict_point(row, self.root) > .5)
            y.append(res)
        
        return np.array(y)
    
    def score(self, X, y):
        yp = self.predict(X)
        
        return np.sum(y.values == yp) / len(y)

In [0]:
X_train, X_test, y_train, y_test = train_test_split(
...     X, y, test_size=0.33, random_state=42)

In [0]:
model = DecisionTree()
model.fit(X_train, y_train)

In [0]:
model.score(X_test, y_test)

0.7815822911790277

In [0]:
model.score(X_train, y_train)

0.7914553000856447