# Bagging with Random Forest
The goal of this project is to implement random forrest with gini measure as the splitting loss function

In [1]:
import numpy as np
import pandas as pd

## 1. Modified Decision Tree
We start with an implementation of the decision tree with gini splitting loss function
This is essentially the same code as the one in ../2_decision_tree

In [97]:
# gini_indx: np.array --> float
# group is a list of integers, where each integer represents a class type
def gini_index(groups):
    # count all samples at split point
    n_instances = sum([len(group) for group in groups])
    
    # sum weighted Gini index for each group
    gini = 1.0
    for group in groups:
        size = float(len(group))
        # avoid divide by zero
        if size == 0:
            continue

        # score the group based on the score for each class
        unique, counts = np.unique(group, return_counts=True)
        #print(counts)
        score = np.sum((counts / size) ** 2)
 
        # weight the group score by its relative size
        gini -= score * (size / n_instances)
        
    return gini
 
# test Gini values
# print(gini_index(np.array([[1, 0], [1, 0, 1]])))
# print(gini_index(np.array([[0, 0], [1, 1, 1]])))

# try_split: list of int, float, np.array --> np.array, np.array
# split data using the feature in data[:, index] 
# if feature < value => that row in data is in group 1 (left_index)
# else that row in data is in group 2 (right_index)
# returns the index of the two groups in data
def try_split(index, value, data):
    feature = data[:, index]
    
    left_index = np.where(feature < value)[0]
    right_index = np.where(feature >= value)[0]
    
    return left_index, right_index       

# get_binary_split: np.array, np.array, np.array --> set
# perform greedy split of data. Trained with target labels
# the features are supplied (randomly in the random forrest)
# returns a set containing: gini, best index (i.e. feature to split), value at which split occurs, and
# the best left and right, b_left, b_right split.
def get_binary_split(data, labels, features):
    b_index, b_gini, b_value, b_left, b_right = np.nan, 1, np.nan, None, None
    
    for index in features:
        for row in data:
            left_index, right_index = try_split(index, row[index], data)
            gini = gini_index([labels[left_index], labels[right_index]])
            
            if gini < b_gini:
                b_index, b_gini, b_value, b_left, b_right = index, gini, row[index], left_index, right_index
    
    return {'gini': b_gini, 'index':b_index, 'value':b_value, 'left':b_left, 'right':b_right}  

class DecisionTree(object):
    
    # default depth = 1 and no features is passed
    def __init__(self, depth=1, features=np.array([])):
        self.depth = depth
        self.node = None
        self.left = None
        self.right = None
        self.features = features
        
    def most_freq(self, arr):
        unique, counts = np.unique(arr, return_counts=True)
        ind = np.argmax(counts)
        return unique[ind]
    
    # data is an np.array of size N times D
    # N = number of samples
    # D = number of features
    # labels is an np.array of size N times 1
    def train(self, data, labels):
        if self.depth == 0:
            self.node = (self.most_freq(labels), 0, None, None)
        else:
            if self.features.any():
                splited_data = get_binary_split(data, labels, self.features)
            else:
                splited_data = get_binary_split(data, labels, np.arange(len(data[0])))                
            
            #print(len(splited_data['left']))
            #print(len(splited_data['right']))
            
            if (len(splited_data['left']) == 0) or (len(splited_data['right']) ==0):
                self.depth = 0
                self.node = (self.most_freq(labels), 0, None, None)
            else:
                self.node = (None, splited_data['gini'], splited_data['index'], splited_data['value'])
                self.left = DecisionTree(self.depth-1)
                self.left.train(data[splited_data['left']], labels[splited_data['left']])
                
                self.right = DecisionTree(self.depth-1)
                self.right.train(data[splited_data['right']], labels[splited_data['right']])  
    
    # predict_single: self, np.array --> int
    # row is a np.array of size D
    def predict_single(self, row):
        if self.depth == 0:
            return int(self.node[0])
        else:
            index = self.node[2]
            value = self.node[-1]

            if row[index] < value:
                return self.left.predict_single(row)
            else:
                return self.right.predict_single(row)
                    
    def predict(self, X):
        l = X.shape[0]
        ypred = np.empty(l, dtype=int)
        
        for i in range(0,l):
            ypred[i] = self.predict_single(X[i])
        
        return ypred
    
    def show_structure(self, max_depth=-1):
        if max_depth == -1:
            max_depth = self.depth
            
        if self.depth > 0:
            tabs = '\t' * (max_depth-self.depth)
            m = '{}[Depth {}: index={}, value={}, gini={}]'.format(tabs,
                                                                   max_depth-self.depth+1, 
                                                                   self.node[2],
                                                                   self.node[3],
                                                                   round(self.node[1],3))
            print(m)
            if self.left != None:
                self.left.show_structure(max_depth)
            if self.right != None:
                self.right.show_structure(max_depth)       
        

## 2. Random Forest

In [113]:
class RForest(object):
    def __init__(self, depth=1, num_bags=1, bag_size_ratio=0.6):
        self.depth = depth
        self.num_bags = num_bags
        self.bag_size_ratio = bag_size_ratio
        self.trees = []
        
    # train: self, np.array, np.array --> void
    def train(self, data, labels):
        n = data.shape[0]
        nprime = int(self.bag_size_ratio * n)
        
        for i in range(0, self.num_bags):
            index = np.random.randint(n, size=nprime)
            sample_data = data[index]
            sample_label = labels[index]
            
            features = np.random.randint(len(data[0]), size=np.random.randint(len(data[0])))
            
            T = DecisionTree(depth=self.depth, features=features)
            T.train(sample_data, sample_label)
            
            self.trees.append(T)
    
    # predict: self, np.array --> np.array
    def predict(self, X):
        l = X.shape[0]
        ypred = np.empty(l, dtype=int)
        x = []
        
        for T in self.trees:
            x.append(T.predict(X).reshape(l,1))
        
        results = np.concatenate(x, axis=0)
        
        #print(results)
        
        for i in range(0,l):
            # find the most frequent occurence
            counts = np.bincount(results[i])
            ypred[i] = np.argmax(counts)
            
        
        
        return ypred
        

## 3. Predictions
## 3.1 Banknotes 

In [114]:
# Load the data
# data is first downloweded into DATA_PATH from 
# http://archive.ics.uci.edu/ml/machine-learning-databases/00267/data_banknote_authentication.txt
import os

DATA_PATH = 'banknote'
FILE_NAME = 'data_banknote_authentication.txt'

def load_data(data_path=DATA_PATH, file_name=FILE_NAME):
    file_path = os.path.join(data_path, file_name)
    data = pd.read_csv(file_path)
    
    return data.values[:, :-1], data.values[:,-1]

DATA, LABELS = load_data()
LABELS = LABELS * 1.0

train_set = DATA[:1234, :]
train_labels = LABELS[:1234]

test_set = DATA[1234:,:]
test_labels = LABELS[1234:]

In [118]:
bank_auth = RForest(depth=4, num_bags=3)

In [119]:
bank_auth.train(train_set, train_labels)

In [120]:
ypred = bank_auth.predict(test_set)
e = len(test_labels[test_labels != ypred])/len(test_labels)
e # error rate

0.0072992700729927005

Previous bench mark is: 0.0364963503649635. Give slight improvement

## 3.2 Iris

In [121]:
from sklearn.datasets import load_iris

def prep_train_test(rate=0.9):
    iris = load_iris()
    X = iris.data
    y = iris.target
    ind = []
    
    for i in range(3):
        ind.append(np.random.choice(50, int(50*rate), replace=False) + 50*i)
    
    train_ind = np.concatenate(ind)
    test_ind = np.setdiff1d(np.arange(150), train_ind)
        
    
    return X[train_ind], y[train_ind], X[test_ind], y[test_ind]

iris_train_data, iris_train_labels, iris_test_data,iris_test_labels = prep_train_test(0.9)

In [129]:
iris_clf = RForest(depth=4, num_bags=5)
iris_clf.train(iris_train_data, iris_train_labels)

In [130]:
ypred = iris_clf.predict(iris_test_data)
e = len(iris_test_labels[iris_test_labels != ypred])/len(iris_test_labels)
e # error rate

0.06666666666666667

previous bench mark is: 0.06666666666666667. probably just Bayes error at this point.