In [3]:
import csv 
import sys
import numpy as np
import random

In [4]:
def load_file(filename):
    """Read a comma-separated file into a list of rows.

    Every row is returned as the list of strings produced by ``csv.reader``;
    no type conversion happens here. Completely blank lines (which
    ``csv.reader`` yields as empty lists) are skipped, so a stray empty line
    does not become an empty row that later column indexing would fail on.

    Args:
        filename: Path of the CSV file to read.

    Returns:
        A list with one list of field strings per non-blank line.

    Raises:
        SystemExit: If the csv module reports a parse error; the exit message
            contains the file name, the line number and the error.
        OSError: If the file cannot be opened.
    """
    dataset = []
    # newline='' is what the csv module documents for file objects handed to
    # csv.reader, so newlines inside quoted fields are interpreted correctly.
    with open(filename, 'r', newline='') as csvfile:
        reader = csv.reader(csvfile, delimiter=',')
        try:
            for row in reader:
                if row:  # skip blank lines
                    dataset.append(row)
        except csv.Error as e:
            sys.exit('file %s, line %d: %s' % (filename, reader.line_num, e))
    return dataset
     
# Load the raw rows of the sonar data file; load_file returns them as parsed by csv.reader.
dataset = load_file('sonar.all-data.csv')

In [5]:
def convert_float(data, nb_lines, size_row):
    """Turn the leading columns of the first ``nb_lines`` rows into floats.

    For each of those rows, columns ``0`` to ``size_row - 2`` are replaced by
    ``float(value)``; the column at ``size_row - 1`` is left as it is.
    The rows are modified in place and the same ``data`` object is returned.
    """
    # Walk the cells row by row, column by column.
    cells = (
        (line_idx, col_idx)
        for line_idx in range(nb_lines)
        for col_idx in range(size_row - 1)
    )
    for line_idx, col_idx in cells:
        data[line_idx][col_idx] = float(data[line_idx][col_idx])
    return data

# Convert every column except the last one of each row to float (rows are modified in place).
dataset = convert_float(dataset, len(dataset), len(dataset[0]))
#print(dataset[0])

In [6]:
def convert_class_values(data):
    """Replace the class label in the last column of every row by an integer.

    Codes are handed out in order of first appearance: the first distinct
    label becomes 0, the next new label 1, and so on. Every row receives the
    code of *its own* label, so the encoding is correct even when the classes
    are interleaved in the file.

    Args:
        data: List of rows (lists); the last element of each row is the
            label. Labels must be hashable. The rows are modified in place.

    Returns:
        The same ``data`` object, with encoded labels.
    """
    codes = {}
    for row in data:
        label = row[-1]
        if label not in codes:
            # Next free code == number of labels seen so far.
            codes[label] = len(codes)
        row[-1] = codes[label]
    return data

# Replace the label in the last column of every row by an integer code.
dataset = convert_class_values(dataset)

In [11]:
def gini_index(groups, class_values):
    """Compute the size-weighted Gini impurity of a candidate split.

    For every non-empty group the impurity is ``1 - sum(p_c ** 2)`` where
    ``p_c`` is the share of rows in the group whose last element equals class
    ``c``. Each group's impurity is weighted by the group's share of all rows.

    Args:
        groups: Iterable of row lists (e.g. the left and right group); the
            last element of every row is its class value.
        class_values: The class values to take into account.

    Returns:
        The weighted impurity as a float; ``0.0`` when all groups are empty.
    """
    gini = 0.0
    nb_total = sum([len(group) for group in groups])

    for group in groups:  # left and right group
        size = len(group)
        if size == 0:
            continue  # an empty group contributes nothing (and avoids 0/0)

        # The labels do not depend on the class being counted, so extract
        # them once per group instead of once per class value.
        labels = [row[-1] for row in group]
        score = 0
        for class_value in class_values:
            p = labels.count(class_value) / float(size)
            score += p * p
        # Weight the group's impurity by its relative size.
        gini += (1 - score) * (size / float(nb_total))
    return gini

def get_gini(sub_dataset, row_nb, index_col, class_values=None):
    """Split ``sub_dataset`` on one pivot value and score the split.

    The pivot is the value found in column ``index_col`` of row ``row_nb``.
    Rows whose value in that column is strictly smaller than the pivot go to
    the left group, all other rows (including the pivot row) go to the right.

    Args:
        sub_dataset: List of rows; the last element of a row is its class.
        row_nb: Index of the row that provides the pivot value.
        index_col: Index of the column to split on.
        class_values: Class values passed on to ``gini_index``. When omitted,
            the distinct labels present in ``sub_dataset`` are used (in order
            of first appearance), so the score is meaningful for any label
            set rather than only for 0/1. Labels must be hashable in that
            case.

    Returns:
        A tuple ``(gini, groups)`` where ``gini`` is the value returned by
        ``gini_index`` and ``groups`` is ``{'left': [...], 'right': [...]}``.
    """
    value_pivot = sub_dataset[row_nb][index_col]

    left, right = [], []
    for row in sub_dataset:
        if row[index_col] < value_pivot:
            left.append(row)
        else:
            right.append(row)

    if class_values is None:
        # dict.fromkeys removes duplicates while keeping first-seen order.
        class_values = list(dict.fromkeys(row[-1] for row in sub_dataset))

    gini = gini_index([left, right], class_values)
    return gini, {'left': left, 'right': right}

#gini, left, right = get_gini(dataset,4,5)

In [12]:
# Return tree node 
# Groups of the node contain two sub groups : left and right
def split(sub_dataset):
    """Create a tree node by splitting ``sub_dataset`` on one random feature.

    A feature column is drawn at random among all columns except the last
    one (the class). Every row's value in that column is then tried as pivot
    via ``get_gini`` and the pivot with the lowest score wins; on ties the
    earliest row is kept.

    The global random generator is deliberately not re-seeded here, so a
    caller can make runs reproducible with ``random.seed(...)``.

    Args:
        sub_dataset: Non-empty list of rows; the last element of a row is its
            class.

    Returns:
        A dict with the keys ``'feature'`` (chosen column index), ``'row'``
        (index of the pivot row), ``'value'`` (pivot value) and ``'groups'``
        (``{'left': [...], 'right': [...]}`` as returned by ``get_gini``).

    Raises:
        ValueError: If ``sub_dataset`` is empty.
    """
    if len(sub_dataset) == 0:
        raise ValueError('cannot split an empty dataset')

    # Use the width of the rows actually being split rather than a global
    # variable; the last column holds the class and is never a candidate.
    index_col = random.randrange(len(sub_dataset[0]) - 1)

    gini_ref = None
    row_ref = 0
    value_ref = None
    group_ref = {}

    # Determine the best row/value to split on.
    for row_nb in range(len(sub_dataset)):
        gini_tmp, group_tmp = get_gini(sub_dataset, row_nb, index_col)
        if gini_ref is None or gini_tmp < gini_ref:
            gini_ref = gini_tmp
            group_ref = group_tmp
            value_ref = sub_dataset[row_nb][index_col]
            row_ref = row_nb

    return {'feature': index_col, 'row': row_ref, 'value': value_ref, 'groups': group_ref}

#split(dataset)

In [13]:
def terminal_node(sub_dataset):
    """Return the class value (last element of a row) that occurs most often
    among the rows of ``sub_dataset``."""
    labels = []
    for record in sub_dataset:
        labels.append(record[-1])
    distinct_labels = set(labels)
    return max(distinct_labels, key=lambda label: labels.count(label))

In [24]:
def build_tree(node, dataset, nb_features, num_feature):
    """Grow the sub-tree below ``node`` in place.

    ``node`` is a dict as returned by ``split``; its ``'groups'`` entry holds
    the ``'left'`` and ``'right'`` row lists. For each side a ``'left'`` /
    ``'right'`` entry is added to ``node``:

    * while ``num_feature < nb_features`` a non-empty side is split again and
      grown recursively with ``num_feature + 1``;
    * otherwise a non-empty side becomes a leaf holding its majority class;
    * an empty side becomes a leaf holding the majority class of the other
      side, so both children always exist and ``predict`` never hits a
      missing key.

    Args:
        node: Node dict to extend; modified in place.
        dataset: Not used by this function; kept so existing calls still work.
        nb_features: Level limit; no further split happens once
            ``num_feature`` reaches it.
        num_feature: Level of ``node``.

    Returns:
        None. If both groups are empty, ``node`` is left unchanged.
    """
    groups = node['groups']
    # Left is handled (including its whole sub-tree) before right.
    for side, other in (('left', 'right'), ('right', 'left')):
        rows = groups[side]
        if len(rows) == 0:
            # No rows reached this side: fall back to the other side's
            # majority class instead of leaving the child undefined.
            if len(groups[other]) > 0:
                node[side] = terminal_node(groups[other])
        elif num_feature < nb_features:
            node[side] = split(rows)
            build_tree(node[side], rows, nb_features, num_feature + 1)
        else:
            # terminal node
            node[side] = terminal_node(rows)
    
# Build one example tree: split the full data set to get the root node, then
# grow it with nb_features=3, starting at num_feature=1.
root=split(dataset)
#print(root.indexes)
build_tree(root,dataset,3,1)

In [25]:
# randomforest
def randomforest(dataset, nb_trees = 5, nb_features = 5):
    """Build ``nb_trees`` trees on ``dataset`` and return their root nodes.

    Each tree starts from ``split(dataset)`` and is grown by ``build_tree``
    with ``nb_features`` as limit, starting at level 1.
    """
    def grow_one():
        # One tree: create the root node, then grow it in place.
        tree_root = split(dataset)
        build_tree(tree_root, dataset, nb_features, 1)
        return tree_root

    return [grow_one() for _ in range(nb_trees)]

# Build five trees on the full data set.
trees = randomforest( dataset, nb_trees = 5, nb_features=5)
#print(trees)

In [20]:
def predict(node, row):
    """Walk down the tree from ``node`` and return the leaf value for ``row``.

    At every node the row's value in column ``node['feature']`` is compared
    with ``node['value']``: strictly smaller goes to ``'left'``, anything
    else to ``'right'``. A child that is a dict is another node; any other
    child is the prediction.
    """
    current = node
    while True:
        goes_left = row[current['feature']] < current['value']
        child = current['left'] if goes_left else current['right']
        if not isinstance(child, dict):
            return child
        current = child
        
# Predict the class of the first row with the single example tree.
value = predict(root, dataset[0])
print(value)

0


In [23]:
def bagging_prediction(trees, row):
    """Let every tree predict ``row``, print the individual predictions and
    return the value predicted most often."""
    votes = []
    for tree in trees:
        votes.append(predict(tree, row))
    print(votes)
    candidates = set(votes)
    return max(candidates, key=lambda label: votes.count(label))

# Majority vote of all trees for the first row; the cell displays the returned class.
bagging_prediction(trees, dataset[0])

[0, 1, 0, 1, 1]


1

In [279]:
print(dataset[0])

[0.02, 0.0371, 0.0428, 0.0207, 0.0954, 0.0986, 0.1539, 0.1601, 0.3109, 0.2111, 0.1609, 0.1582, 0.2238, 0.0645, 0.066, 0.2273, 0.31, 0.2999, 0.5078, 0.4797, 0.5783, 0.5071, 0.4328, 0.555, 0.6711, 0.6415, 0.7104, 0.808, 0.6791, 0.3857, 0.1307, 0.2604, 0.5121, 0.7547, 0.8537, 0.8507, 0.6692, 0.6097, 0.4943, 0.2744, 0.051, 0.2834, 0.2825, 0.4256, 0.2641, 0.1386, 0.1051, 0.1343, 0.0383, 0.0324, 0.0232, 0.0027, 0.0065, 0.0159, 0.0072, 0.0167, 0.018, 0.0084, 0.009, 0.0032, 0]
