In [1]:
import numpy as np

In [292]:
class DecisionTreeTrain:
    def __init__(self):
        """Class to train decision tree
        """
    
    def decision_tree_learning(self,training_data,depth=0):
        """Method to create decision tree based on the training data input

        Args:
            depth (int): starting depth of decision tree (default to be zero)
        
        Returns:
            decision_tree (dict): decision tree output in dictionary
        """

        labels = np.unique(training_data[:,-1])
        if len(labels) == 1:
            return ({'attribute': None, 'value': np.floor(labels[0]), 'left': None, 'right': None, 'depth':depth}, depth)
        else:
            split_attribute, split = self.find_split(training_data)
            left_data = training_data[training_data[:,split_attribute-1]<=split]
            right_data = training_data[training_data[:,split_attribute-1]>split]
            left_branch, left_depth = self.decision_tree_learning(left_data,depth+1)
            right_branch, right_depth = self.decision_tree_learning(right_data, depth+1)
            node = {'attribute': np.floor(split_attribute), 'value': split, 'left': left_branch, 'right': right_branch, 'depth':depth+1}
        return (node, max(left_depth,right_depth))


    def find_split(self, data):
        """Method to find split point of attributes that gives the highest information gain

        Args:
            data (np.array): an numpy array of training dataset with each column representing a attribute and last column representing the label
        
        Returns:
            best_attribute (int): attribute resulting in highest information gain
            best_split_value (float): split value resulting in hightest information gain
        """
        
        attributes = np.shape(data)[-1]-1
        overall_entropy = self.calculate_entropy(data)
        best_information_gain = -np.Inf
        best_attribute = None
        best_split_value = None

        for attribute in range(1,attributes+1):
            potential_splits = np.sort(data[:,attribute-1])
            for split in np.unique(potential_splits):
                left = data[data[:,attribute-1]<=split]
                right = data[data[:,attribute-1]>split]
                information_gain = (
                    overall_entropy - 
                    len(left)/len(data)*self.calculate_entropy(left) - 
                    len(right)/len(data)*self.calculate_entropy(right)
                    )
                if information_gain > best_information_gain:
                    best_information_gain = information_gain
                    best_attribute = attribute
                    best_split_value = split
        return (best_attribute,best_split_value)
    
    def calculate_entropy(self, data,):
        """calculate_entropy for a given data extract

        Args:
            data (np.array): an numpy array of training dataset with each column representing a attribute and last column representing the label
        
        Returns:
            entropy (float)
        """
        labels = np.unique(data[:,-1])
        entropy = 0
        for label in labels:
            label_occurance = len(data[data[:,-1]==label])
            data_size = len(data)
            entropy += -label_occurance/data_size * np.log2(label_occurance/data_size)
        return entropy

In [298]:
data = np.loadtxt("/Users/ganweiwang/Library/CloudStorage/OneDrive-ImperialCollegeLondon/MSc_AI/introml/cw1/cw1/wifi_db/clean_dataset.txt")

In [299]:
dt = DecisionTreeTrain()

In [300]:
len(data[data[:,0]<=-55])

1012

In [301]:
a,b = dt.decision_tree_learning(data)

In [302]:
a

{'attribute': 1.0,
 'value': -55.0,
 'left': {'attribute': 5.0,
  'value': -60.0,
  'left': {'attribute': 4.0,
   'value': -56.0,
   'left': {'attribute': 3.0,
    'value': -56.0,
    'left': {'attribute': None,
     'value': 1.0,
     'left': None,
     'right': None,
     'depth': 4},
    'right': {'attribute': 7.0,
     'value': -86.0,
     'left': {'attribute': 5.0,
      'value': -63.0,
      'left': {'attribute': 6.0,
       'value': -86.0,
       'left': {'attribute': 1.0,
        'value': -60.0,
        'left': {'attribute': None,
         'value': 4.0,
         'left': None,
         'right': None,
         'depth': 8},
        'right': {'attribute': None,
         'value': 3.0,
         'left': None,
         'right': None,
         'depth': 8},
        'depth': 8},
       'right': {'attribute': None,
        'value': 1.0,
        'left': None,
        'right': None,
        'depth': 7},
       'depth': 7},
      'right': {'attribute': None,
       'value': 4.0,
       'left'

In [273]:
a,b,c,d = dt.find_split(data)
print(a,b,c,d)
len(c)

1 -55.0 [[-59. -53. -51. ... -79. -87.   4.]
 [-66. -53. -59. ... -81. -79.   1.]
 [-62. -58. -52. ... -87. -88.   4.]
 ...
 [-72. -59. -65. ... -84. -91.   1.]
 [-57. -54. -56. ... -79. -82.   1.]
 [-56. -52. -50. ... -85. -88.   3.]] [[-41. -57. -63. ... -66. -65.   2.]
 [-37. -55. -55. ... -66. -69.   2.]
 [-40. -52. -49. ... -69. -65.   2.]
 ...
 [-51. -52. -52. ... -79. -87.   3.]
 [-38. -51. -53. ... -72. -73.   2.]
 [-46. -54. -47. ... -80. -73.   3.]]


1012

In [233]:
e,f,g,h = dt.find_split(c)

In [235]:
len(g)

515

In [236]:
i,j,k,l = dt.find_split(g)

In [237]:
len(k)

504

In [239]:
m,n,o,p = dt.find_split(k)

In [241]:
len(o)

462

In [242]:
q,r,s,t = dt.find_split(o)

In [244]:
len(s)

334

In [246]:
print(a,e,i,m,q)

1 5 4 3 7


In [249]:
len(s[s[:,-1]!=1])

30

In [250]:
u,v,w,x = dt.find_split(s)

In [251]:
len(w)

132

In [252]:
print(w)

[[-63. -58. -64. ... -87. -87.   4.]
 [-58. -59. -56. ... -82. -87.   1.]
 [-60. -58. -60. ... -84. -88.   1.]
 ...
 [-64. -54. -65. ... -85. -89.   1.]
 [-60. -58. -57. ... -80. -89.   1.]
 [-72. -59. -65. ... -84. -91.   1.]]


In [255]:
len((w[w[:,-1]!=1]))

14