# Decision Tree

### Information Gain
Find the attribute that produces the highest information gain.

+ Information Entropy:
$Ent(D) = -\sum_{k=1}^{|\gamma|}{p_k \log_2{p_k}}$, where $p_k$ is the proportion of samples in the sample set $D$ that belong to the $k$-th class and $|\gamma|$ is the number of classes

+ Conditional Entropy:
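$Ent(Y|X) = \sum_{i=1}^n{p_i Ent(Y|X=x_i)}$, where $p_i$ is the proportion of samples with $X = x_i$ and $n$ is the number of distinct values of $X$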

+ Information Gain:
$Gain(D, A) = Ent(D) - Ent(D|A)$

## Code
### 1. Import packages

In [1]:
from collections import Counter

import cv2
import numpy as np
import pandas as pd
# NOTE: sklearn.cross_validation was removed in scikit-learn 0.20; on newer
# versions train_test_split is provided by sklearn.model_selection.
from sklearn.cross_validation import train_test_split
from sklearn.metrics import accuracy_score




## 2. Define useful functions
+ information entropy
+ conditional entropy

In [2]:
def information_entropy(x):
    '''
        Compute the entropy (in bits) of the empirical distribution of x,
        i.e. -sum(p * log2(p)) over the distinct values of x.

        x: 1D array (or any sized iterable) with discrete, hashable elements

        Returns 0.0 for an empty input.
    '''
    total = len(x)
    if total == 0:
        return 0.0
    # Counter keeps first-seen order, so the terms are accumulated in the
    # order in which the distinct values first appear in x.
    counts = Counter(x)
    return sum(-freq / total * np.log2(freq / total) for freq in counts.values())

def conditional_entropy(y, x):
    r'''
        Compute $Ent(Y|X) = \sum_{i=1}^n{p_i Ent(Y|X=x_i)}$

        y: 1D array of labels
        x: 1D array of attribute values, aligned element-wise with y

        Returns the entropy of y inside each group of equal x, weighted by the
        fraction of samples falling in that group (0.0 when both are empty).
    '''
    # Coerce up front so plain lists work too: `x == v` has to be an
    # element-wise boolean mask, which only ndarrays provide.
    x = np.asarray(x)
    y = np.asarray(y)
    res = 0.0
    for v in set(x):
        sub_y = y[x == v] # select the subset of y according to x
        res += len(sub_y) / len(y) * information_entropy(sub_y)
    return res

# Sanity check: entropy of 100 random integers drawn from [0, 10).
# No seed is set here, so the printed value differs from run to run.
x = np.random.randint(0, 10, size=100)
print('information_entropy:', information_entropy(x))

# Sanity check: entropy of y conditioned on a two-valued x (x and y are rebound).
x = np.array([1, 0, 1, 1, 0, 1])
y = np.array([1, 0, 3, 0, 3, 0])
print('conditional_entropy:', conditional_entropy(y, x))

information_entropy: 3.1880516698605565
conditional_entropy: 1.3333333333333333


## 3. Prepare data

In [3]:
def binarization(img):
    '''
        Map an image to 0/1 values: pixel = 0 if value > 50 else 1.

        img: array of pixel intensities (cast to uint8 before thresholding)

        Returns a new uint8 array with the same shape as img; img itself is
        not modified.
    '''
    pixels = np.asarray(img).astype(np.uint8)
    # Same mapping as an inverse binary threshold at 50 with max value 1,
    # expressed in NumPy so the result does not depend on OpenCV writing
    # into an output buffer in place.
    return (pixels <= 50).astype(np.uint8)

# Load the training CSV, using its first row as the column header.
raw_data = pd.read_csv('../data/train.csv', header=0)
data = raw_data.values
imgs = data[0:, 1:] # for one row, the first column is the label followed by the image data
labels = data[:, 0]

# Binarize every image row, overwriting the row in `imgs` with its 0/1 version
for index, img in enumerate(imgs):
    imgs[index] = binarization(img)
    

# Use 2/3 of the data as the training set and 1/3 as the test set
x_train, x_test, y_train, y_test = train_test_split(imgs, labels, test_size=0.33, random_state=23323)

## 4. Build model

In [4]:
class Node(object):
    '''
        A node of the decision tree.

        type_: 'leaf' or 'internal'
        label: class returned by a leaf; on an internal node it is an optional
               fallback returned when a sample's attribute value has no child
        attribute: index of the feature an internal node splits on
    '''
    def __init__(self, type_, label=None, attribute=None):
        self.children = dict() # mapping from specific value to corresponding subtrees
        self.type = type_ # leaf or internal
        self.attribute = attribute # The attribute used for dividing data. In the case of MNIST, its value is between 0 and 784(exclusive)
        self.label = label # The class the node is most likely to belong to

    def predict(self, feature):
        '''
            Walk from this node down to a leaf and return the leaf's label.

            feature: 1D sequence indexed by attribute

            If an internal node has no child for the sample's attribute value,
            that node's own label is returned when it has one; otherwise a
            KeyError is raised.
        '''
        node = self
        while node.type != 'leaf':
            value = feature[node.attribute]
            child = node.children.get(value)
            if child is None:
                # Value never seen at this node while the tree was built.
                if node.label is None:
                    raise KeyError(value)
                return node.label
            node = child
        return node.label

    def add_child(self, key, child):
        '''
            Attach `child` as the subtree followed when the split attribute equals `key`.
        '''
        self.children[key] = child
        

## 5. Train

In [5]:
def train1(features, labels, attributes):
    '''
        Recursively build a decision tree, splitting at each node on the
        attribute with the highest information gain.

        Note: features might be a subset of training data

        features: (B, 784)
        labels: (B,)
        attributes: list of column indices still available for splitting

        Returns the Node at the top of the (sub)tree.
        Raises ValueError if labels is empty.
    '''
    features = np.asarray(features)
    labels = np.asarray(labels)
    if len(labels) == 0:
        raise ValueError('cannot build a tree from an empty sample set')

    label_counts = Counter(labels)

    # If the labels are the same, return as a leaf node
    if len(label_counts) == 1:
        return Node('leaf', next(iter(label_counts)))

    # If there is no candidate attributes, return as a leaf node whose label is the most common label
    # (ties go to the label that appears first in `labels`)
    common_label = max(label_counts, key=label_counts.get)
    if len(attributes) == 0:
        return Node('leaf', common_label)

    # Calculate the information entropy for the current set
    ent_d = information_entropy(labels)

    # Information gain of each remaining attribute
    gains = [
        (ent_d - conditional_entropy(labels, features[:, attribute]), attribute)
        for attribute in attributes
    ]
    # Select the attribute with the highest information gain; comparing the
    # (gain, attribute) tuples breaks ties in favour of the larger index.
    _, selected_attribute = max(gains)
    remain_attributes = list(attributes)
    remain_attributes.remove(selected_attribute)

    # The majority label is stored on the internal node as well, so it is
    # available as a fallback for attribute values without a child.
    node = Node('internal', label=common_label, attribute=selected_attribute)

    # Select the corresponding subset from training data for each value in the chosen attribute
    column = features[:, selected_attribute]
    for val in np.unique(column):
        mask = column == val
        # Recursive with new subsets
        node.add_child(val, train1(features[mask], labels[mask], remain_attributes))

    return node

    

def train(features, labels):
    '''
        Build a decision tree with every column of features as a candidate attribute.

        features: (B, F) array
        labels: (B,)

        Returns the root Node produced by train1.
    '''
    # Take the attribute count from the data rather than assuming 784 columns.
    n_attributes = np.asarray(features).shape[1]
    return train1(features, labels, list(range(n_attributes)))

# Build the tree from the training split; `root` is read by predict() below.
root = train(x_train, y_train)

## 6. Evaluate

In [6]:
def predict(features, tree=None): # tree.predict receive 1D array, thus we need to traverse batch
    '''
        Predict a label for every row of features.

        features: iterable of 1D feature arrays
        tree: object with a predict(feature) method; defaults to the
              module-level `root` when omitted

        Returns a numpy array with one prediction per row.
    '''
    if tree is None:
        tree = root
    return np.asarray([tree.predict(feature) for feature in features])

y_predicted = predict(x_test)
# accuracy_score is documented as (y_true, y_pred). Accuracy is the fraction
# of matching entries, so the printed score is the same in either order.
score = accuracy_score(y_test, y_predicted)
print(score)

0.8603896103896104
