In [1]:
import pandas as pd
import numpy as np
from math import log2

In [33]:
# Functions

def Gini(labels, weights = None):
    """Return the (weighted) Gini index of a collection of labels.

    Parameters
    ----------
    labels : sequence of hashable values
        Class label of every example.
    weights : sequence of numbers, optional
        One weight per example, in the same order as ``labels``. When
        omitted, every example gets weight 1. Lists, numpy arrays and
        pandas Series are all accepted.

    Returns
    -------
    number
        ``1 - sum_c p_c**2`` where ``p_c`` is the share of the total weight
        carried by class ``c``. For empty ``labels`` the sum is empty and
        the result is 1.
    """
    # `is None` rather than `== None`: comparing an array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if weights is None:
        weights = [1] * len(labels)

    # Accumulate the weight of each class and the overall weight in one pass.
    class_weight = {}
    total = 0
    for label, weight in zip(labels, weights):
        class_weight[label] = class_weight.get(label, 0) + weight
        total += weight

    return 1 - sum((w / total) ** 2 for w in class_weight.values())

#################################################################


def majority(labels, weights = None):
    """Return the (weighted) majority error of a collection of labels.

    Parameters
    ----------
    labels : sequence of hashable values
        Class label of every example.
    weights : sequence of numbers, optional
        One weight per example, in the same order as ``labels``. When
        omitted, every example gets weight 1. Lists, numpy arrays and
        pandas Series are all accepted.

    Returns
    -------
    number
        ``1 - w_max / w_total`` where ``w_max`` is the weight of the
        heaviest class. Returns 0.0 when there are no labels or the total
        weight is zero, instead of dividing by zero.
    """
    # `is None` rather than `== None`: comparing an array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if weights is None:
        weights = [1] * len(labels)

    class_weight = {}
    total = 0
    for label, weight in zip(labels, weights):
        class_weight[label] = class_weight.get(label, 0) + weight
        total += weight

    # Nothing to misclassify: report zero error rather than raising.
    if not class_weight or total == 0:
        return 0.0

    return 1 - max(class_weight.values()) / total

#################################################################

def entropy(labels, weights = None):
    """Return the (weighted) Shannon entropy, in bits, of a collection of labels.

    Parameters
    ----------
    labels : sequence of hashable values
        Class label of every example.
    weights : sequence of numbers, optional
        One weight per example, in the same order as ``labels``. When
        omitted, every example gets weight 1. Lists, numpy arrays and
        pandas Series are all accepted.

    Returns
    -------
    number
        ``sum_c p_c * log2(1 / p_c)`` where ``p_c`` is the share of the
        total weight carried by class ``c``. Classes whose accumulated
        weight is not positive contribute nothing. Returns 0 for empty
        ``labels``.
    """
    # `is None` rather than `== None`: comparing an array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if weights is None:
        weights = [1] * len(labels)

    class_weight = {}
    total = 0
    for label, weight in zip(labels, weights):
        class_weight[label] = class_weight.get(label, 0) + weight
        total += weight

    result = 0
    for w in class_weight.values():
        # A class with zero weight has p = 0; by the usual convention
        # 0 * log(1/0) = 0, and skipping it avoids dividing by zero.
        if w > 0:
            result += (w / total) * log2(total / w)

    return result


#################################################################

def Majority(labels, weights = None, ignor = None):
    """Find the label carrying the largest total weight.

    Parameters
    ----------
    labels : sequence of hashable values
        Label (or attribute value) of every example.
    weights : sequence of numbers, optional
        One weight per example, in the same order as ``labels``. When
        omitted, every example gets weight 1. Lists, numpy arrays and
        pandas Series are all accepted.
    ignor : optional
        A label whose examples contribute no weight, so it can only win
        when nothing else has positive weight. ``None`` disables this.

    Returns
    -------
    tuple
        ``(label, n_distinct)``: the heaviest label (the first one
        encountered wins ties) and the number of distinct labels seen.
        The ignored label is still counted in ``n_distinct``. For empty
        ``labels`` the result is ``(None, 0)``.
    """
    # `is None` rather than `== None`: comparing an array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if weights is None:
        weights = [1] * len(labels)

    class_weight = {}
    for label, weight in zip(labels, weights):
        # Register every label, even the ignored one, so it is counted.
        class_weight.setdefault(label, 0)
        if ignor is None or label != ignor:
            class_weight[label] += weight

    best_label = None
    best_weight = -1
    for label, weight in class_weight.items():
        # Strict comparison keeps the first-seen label on ties.
        if weight > best_weight:
            best_weight = weight
            best_label = label

    return (best_label, len(class_weight))



#################################################################


def Entropy_given_attribute(T, L, a, W = None, Entropy_function = entropy):
    """Return the weighted impurity of the labels after splitting on one attribute.

    Parameters
    ----------
    T : sequence of rows
        Examples; ``T[i][a]`` is the value of attribute ``a`` for example ``i``.
    L : sequence
        Label of every example, in the same order as ``T``.
    a : index or key
        The attribute to split on.
    W : sequence of numbers, optional
        One weight per example. When omitted, every example gets weight 1.
        Lists, numpy arrays and pandas Series are all accepted.
    Entropy_function : callable, optional
        Impurity measure called as ``Entropy_function(labels, weights)``.

    Returns
    -------
    tuple
        ``(impurity, values)``: the impurity of each subset averaged with
        the subset's share of the total weight, and the list of distinct
        values of attribute ``a`` in order of first appearance.
    """
    # `is None` rather than `== None`: comparing an array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if W is None:
        W = [1] * len(L)

    # Group labels and weights by the value the attribute takes.
    split_l = {}
    split_w = {}
    for row, label, weight in zip(T, L, W):
        value = row[a]
        split_l.setdefault(value, []).append(label)
        split_w.setdefault(value, []).append(weight)

    sum_W = sum(W)
    En = 0
    for value in split_w:
        En += sum(split_w[value]) * Entropy_function(split_l[value], split_w[value]) / sum_W

    return (En, list(split_w.keys()))


#################################################################


class DecisionTree(object):
    """Decision tree over categorical attributes, built recursively.

    Every node stores ``label`` (the weighted majority label of the examples
    that reached it) and ``leaf``. Internal nodes additionally store
    ``att_split`` (the attribute tested at the node) and ``Tree`` (a dict
    mapping each observed attribute value to a child ``DecisionTree``).
    """

    def __init__(self, train, labels, attributes, depth = -1, weights = None, 
                 Entropy_function = entropy):
        """Build the (sub)tree for the given examples.

        Parameters
        ----------
        train : sequence of rows
            Examples; ``train[i][a]`` is the value of attribute ``a``.
        labels : sequence
            Label of every example.
        attributes : sequence
            Attributes still available for splitting. It is not modified.
        depth : int, optional
            Number of further splits allowed below this node. The node
            becomes a leaf when it reaches 0; the default -1 never reaches
            0 while being decremented, so it imposes no limit.
        weights : sequence of numbers, optional
            One weight per example, passed through to the helper functions.
        Entropy_function : callable, optional
            Impurity measure called as ``Entropy_function(labels, weights)``.
        """
        self.Entropy_function = Entropy_function
        self.leaf = False
        self.label, n_values = Majority(labels, weights)

        # Stop when nothing is left to split on, the node is pure, or the
        # depth budget is exhausted.
        if len(attributes) == 0 or n_values == 1 or depth == 0:
            self.leaf = True
            return

        self.att_split, _ = self.best_att(train, labels, attributes, weights)

        train_s, labels_s, weight_s = split(train, labels, self.att_split, weights)

        # Work on a copy: removing from and re-appending to the caller's
        # list would reorder it, and the order decides ties in `best_att`.
        remaining = list(attributes)
        remaining.remove(self.att_split)

        self.Tree = {}
        for v in train_s:
            self.Tree[v] = DecisionTree(train_s[v], labels_s[v], remaining, depth - 1,
                                        weight_s[v], Entropy_function)

    def predict(self, instance):
        """Return the predicted label for one example.

        Walks down the tree following the example's attribute values. If a
        value was never seen at a node during construction, that node's
        majority label is returned.
        """
        if self.leaf:
            return self.label

        child = self.Tree.get(instance[self.att_split])
        if child is not None:
            return child.predict(instance)

        return self.label

    def best_att(self, train, labels, attributes, weights):
        """Pick the attribute with the largest impurity reduction.

        Returns
        -------
        tuple
            ``(attribute, values)``: the chosen attribute and the distinct
            values it takes in ``train``. The first attribute wins ties.
            ``(None, None)`` if ``attributes`` is empty.
        """
        # Measure the parent with the same impurity function used for the
        # children so that the difference is a proper gain.
        parent_impurity = self.Entropy_function(labels, weights)

        best_gain = float('-inf')
        Best = None
        Best_values = None

        for attribute in attributes:
            child_impurity, values = Entropy_given_attribute(train, labels, attribute,
                                                             weights, self.Entropy_function)
            gain = parent_impurity - child_impurity
            if gain > best_gain:
                best_gain = gain
                Best = attribute
                Best_values = values

        return (Best, Best_values)

#################################################################    
    
def split(train, label, attribute, weights = None):
    """Partition examples, labels and weights by the value of one attribute.

    Parameters
    ----------
    train : sequence of rows
        Examples; ``row[attribute]`` is the value used for grouping.
    label : sequence
        Label of every example, in the same order as ``train``.
    attribute : index or key
        The attribute to group by.
    weights : sequence of numbers, optional
        One weight per example. When omitted, every example gets weight 1.
        Lists, numpy arrays and pandas Series are all accepted.

    Returns
    -------
    tuple of dict
        ``(rows, labels, weights)``, each mapping an attribute value to the
        list of rows / labels / weights of the examples having that value,
        in their original order.
    """
    # `is None` rather than `== None`: comparing an array with `==` is
    # element-wise and cannot be used as an `if` condition.
    if weights is None:
        weights = [1] * len(label)

    split_t = {}
    split_l = {}
    split_w = {}

    for row, row_label, row_weight in zip(train, label, weights):
        value = row[attribute]
        split_t.setdefault(value, []).append(row)
        split_l.setdefault(value, []).append(row_label)
        split_w.setdefault(value, []).append(row_weight)

    return (split_t, split_l, split_w)

In [34]:
# txtfile = open('bank/data-desc.txt', 'r')
# print(txtfile.read())

In [35]:
# Column name -> kind of value it holds, listed in the order the columns
# appear in the data files. `C` and `types` are derived from this mapping.
dic = {
    'age': 'numeric',
    'job': 'categorical',
    'marital': 'categorical',
    'education': 'categorical',
    'default': 'binary',
    'balance': 'numeric',
    'housing': 'binary',
    'loan': 'binary',
    'contact': 'categorical',
    'day': 'numeric',
    'month': 'categorical',
    'duration': 'numeric',
    'campaign': 'numeric',
    'pdays': 'numeric',
    'previous': 'numeric',
    'poutcome': 'categorical',
    'y': 'binary',
}
C = list(dic.keys())
types = list(dic.values())

In [36]:
# Read both splits, naming the columns from `C`.
train, test = (pd.read_csv(path, names = C)
               for path in ('bank/train.csv', 'bank/test.csv'))
train.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,unknown,5,may,114,2,-1,0,unknown,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,unknown,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,unknown,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,unknown,yes


In [37]:
# Display the column names in order.
C

['age',
 'job',
 'marital',
 'education',
 'default',
 'balance',
 'housing',
 'loan',
 'contact',
 'day',
 'month',
 'duration',
 'campaign',
 'pdays',
 'previous',
 'poutcome',
 'y']

In [42]:
# Replace every 'unknown' entry in the training frame with the most frequent
# other value of the same column.
# NOTE(review): only `train` is imputed here; `test` keeps its 'unknown'
# entries - confirm that this asymmetry is intended.
for att in C:
    unknown_rows = train[att] == 'unknown'
    # Columns without any 'unknown' need neither the count nor the write.
    if not unknown_rows.any():
        continue
    maj = Majority(train[att], ignor = 'unknown')[0]
    # One masked assignment instead of a Python loop over every row.
    train.loc[unknown_rows, att] = maj
            

In [43]:
# Show the first rows of the training frame after the 'unknown' replacement.
train.head() 

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,41,services,married,secondary,no,0,yes,no,cellular,5,may,114,2,-1,0,failure,no
1,48,blue-collar,single,secondary,no,312,yes,yes,cellular,3,feb,369,2,-1,0,failure,no
2,55,technician,married,secondary,no,1938,no,yes,cellular,18,aug,193,1,386,3,success,yes
3,54,admin.,married,tertiary,no,59,yes,no,cellular,10,jul,268,1,-1,0,failure,no
4,34,management,single,tertiary,no,2646,no,no,cellular,14,apr,142,1,-1,0,failure,yes


In [44]:
# Turn every numeric column into a yes/no column by thresholding at the
# median of the *training* data; the same threshold is applied to the test
# data. Other columns are copied unchanged.
median_dict = {}
Train_new, Test_new = pd.DataFrame(), pd.DataFrame()
for name in C:
    if dic[name] != 'numeric':
        Train_new[name] = train[name]
        Test_new[name] = test[name]
        continue

    M = train[name].median()
    median_dict[name] = M
    # The new column name records the threshold that was used.
    binarized_name = name+ '>' + str(M)
    Train_new[binarized_name] = np.where(train[name]  > M, "yes", 'no')
    Test_new[binarized_name] = np.where(test[name]  > M, "yes", 'no')

In [45]:
# Convert the training frame to plain Python lists: every row's leading
# values become the feature list, its last value becomes the label.
# `itertuples` walks the rows once, avoiding a label lookup per row.
Train = []
Label = []
for row in Train_new.itertuples(index = False, name = None):
    Train.append(list(row[:-1]))
    Label.append(row[-1])

In [46]:
# Convert the test frame to plain Python lists: every row's leading values
# become the feature list, its last value becomes the label.
# `itertuples` walks the rows once, avoiding a label lookup per row.
Test = []
Test_Label = []
for row in Test_new.itertuples(index = False, name = None):
    Test.append(list(row[:-1]))
    Test_Label.append(row[-1])

In [47]:
# Positions of the feature columns: every column of `C` except the last.
attributes = [position for position in range(len(C) - 1)]

In [48]:
# Display the attribute positions.
attributes

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15]

In [49]:
def prediction(data, label, tree = None):
    """Return the fraction of examples the tree classifies correctly.

    Parameters
    ----------
    data : sequence of rows
        Examples to classify; each is passed to ``tree.predict``.
    label : sequence
        Expected label of every example, in the same order as ``data``.
    tree : object with a ``predict(row)`` method, optional
        The model to evaluate. When omitted, the global ``A`` is used, which
        keeps existing ``prediction(data, label)`` calls working.

    Returns
    -------
    float
        Mean of the element-wise equality between ``label`` and the
        predictions (the accuracy, not the error).
    """
    if tree is None:
        # Backward-compatible fallback on the notebook-level model.
        tree = A
    pred = [tree.predict(row) for row in data]
    return (np.array(label) == np.array(pred)).mean()

In [50]:
# Training error for each impurity measure and each depth limit 1..15.
for f in (entropy, majority, Gini):
    print("the method is baces on ", f, "index")
    for depth in range(1, 16):
        # `prediction` evaluates the global `A`, so the tree is bound to that name.
        A = DecisionTree(Train, Label, attributes, depth = depth, Entropy_function = f)
        train_error = 1-prediction(Train, Label)
        print("for depth = {}, error for training is {}".format(depth, train_error))

the method is baces on  <function entropy at 0x7f9c80a1b160> index
for depth = 1, error for training is 0.11919999999999997
for depth = 2, error for training is 0.10599999999999998
for depth = 3, error for training is 0.10219999999999996
for depth = 4, error for training is 0.08760000000000001
for depth = 5, error for training is 0.06979999999999997
for depth = 6, error for training is 0.05579999999999996
for depth = 7, error for training is 0.04479999999999995
for depth = 8, error for training is 0.03600000000000003
for depth = 9, error for training is 0.03059999999999996
for depth = 10, error for training is 0.02400000000000002
for depth = 11, error for training is 0.021199999999999997
for depth = 12, error for training is 0.01859999999999995
for depth = 13, error for training is 0.018399999999999972
for depth = 14, error for training is 0.018399999999999972
for depth = 15, error for training is 0.018000000000000016
the method is baces on  <function majority at 0x7f9c80a1b0d0> index


In [51]:
# Test error for each impurity measure and each depth limit 1..15.
for f in (entropy, majority, Gini):
    print("the method is baces on ", f, "index")
    for depth in range(1, 16):
        # `prediction` evaluates the global `A`, so the tree is bound to that name.
        A = DecisionTree(Train, Label, attributes, depth = depth, Entropy_function = f)
        test_error = 1-prediction(Test, Test_Label)
        print("for depth = {}, error for test is {}".format(depth, test_error))

the method is baces on  <function entropy at 0x7f9c80a1b160> index
for depth = 1, error for test is 0.12480000000000002
for depth = 2, error for test is 0.11140000000000005
for depth = 3, error for test is 0.11240000000000006
for depth = 4, error for test is 0.12280000000000002
for depth = 5, error for test is 0.12439999999999996
for depth = 6, error for test is 0.12680000000000002
for depth = 7, error for test is 0.12880000000000003
for depth = 8, error for test is 0.13
for depth = 9, error for test is 0.13019999999999998
for depth = 10, error for test is 0.13080000000000003
for depth = 11, error for test is 0.13180000000000003
for depth = 12, error for test is 0.13160000000000005
for depth = 13, error for test is 0.13139999999999996
for depth = 14, error for test is 0.13139999999999996
for depth = 15, error for test is 0.13139999999999996
the method is baces on  <function majority at 0x7f9c80a1b0d0> index
for depth = 1, error for test is 0.11660000000000004
for depth = 2, error for t