In [2]:
import numpy as np
import pandas as pd
import math

In [3]:
#df = pd.read_csv('benchmark.csv', sep = ';')

In [4]:
df = pd.read_csv('pima.tsv', sep = '\t')

In [5]:
targetColumn = 'target'
modeValue = df[targetColumn].mode()[0]
dfWithoutTarget = df.loc[:, df.columns != targetColumn]

In [6]:
def normalize(data):
    return (data - min(data))/ (max(data) - min(data))

In [7]:
for feature in dfWithoutTarget.columns:
    df[feature] = normalize(df[feature])

In [8]:
class Tree:
    
    left = None
    right = None
    
    def setAttribute(self, attribute):
        self.attribute = attribute

    def setValue(self, value):
        self.value = value

    def setLeft(self, left):
        self.left = left

    def setRight(self, right):
        self.right = right
        
    def traverse(self, row):
        if(self.left and self.right):
            if(row[self.attribute] < self.value):
                return self.left.traverse(row)
            else:
                return self.right.traverse(row)
        else:
            return self.value
        
    def __str__(self, depth=0):
        
        print('\t' * (2*depth), end='')
        print('['+self.attribute+']')
        
        if(self.left and self.right):
            print('\t' * ((2*depth)+1) + '<', end='')
            print(self.value)
            self.left.__str__(depth+1)
            
            print('\t' * ((2*depth)+1) + '>=', end='')
            print(self.value)
            self.right.__str__(depth+1)
        else:
            print('\t' * ((2*depth)+1), end='')
            print('(', end='')
            print(self.value, end='')
            print(')')
            
        return 'Decision Tree'
        

In [9]:
class RandomForest:
    
    def __init__(self, nTree, targetColumn, maxDepth):
        self.nTree = nTree
        self.targetColumn = targetColumn
        self.maxDepth = maxDepth
        
    def columnsSubset(self, columnValues, m):
        if m == len(columnValues):
            return columnValues
        columnsSubSet = np.random.choice(columnValues, m, replace=False)
        return columnsSubSet
    
    def entropy(self, df, parentNodeSize):
        if(len(df) == 0):
            return 1.0
        
        classByAttribute = df.groupby(df[self.targetColumn]).size()

        differentClasses = len(classByAttribute.index)
        totalSum = classByAttribute.sum()
        entropy = 0
        for label in range(differentClasses):
            labelSum = classByAttribute.iloc[label]
            labelProbability = labelSum/totalSum
            entropy -= labelProbability * (math.log(labelProbability,2))

        try:
            return entropy * (totalSum/ parentNodeSize)
        except:
            return 1.0
    
    def generatePartitions(self, df, attribute):
        partitions = list()
        df.sort_values(attribute, inplace=True)
        bestSplitEntropy = 999
        bestSplitValue = 0
        for i in range(1,len(df)):
            if(df[targetColumn].iloc[i] != df[targetColumn].iloc[i-1]):
                splitValue = (df[attribute].iloc[i] + df[attribute].iloc[i-1])/2
            
                if(splitValue==0):
                    splitValue += 0.00001
                if(splitValue==1):
                    splitValue -= 0.00001
                    
                leftData = df[df[attribute] < splitValue]
                leftEntropy = self.entropy(leftData, len(leftData))
                rightData = df[df[attribute] >= splitValue]
                rightEntropy = self.entropy(rightData, len(rightData))
                valueEntropy = leftEntropy + rightEntropy
                if(valueEntropy < bestSplitEntropy):
                    besSplitEntropy = valueEntropy
                    bestSplitValue = splitValue
        
        partitions.append(df[df[attribute] < bestSplitValue])
        partitions.append(df[df[attribute] >= bestSplitValue])
        
        
        
        
#         if(len(partitions[0])==0 or len(partitions[1])==0):
#             import pdb; pdb.set_trace()



        
        return (bestSplitValue, partitions)

    def decisionTree(self, df, depth, treeNode):
        classByAttribute = df.groupby(df[self.targetColumn]).size()
        parentNodeSize = classByAttribute.sum()
        parentNodeEntropy = self.entropy(df, parentNodeSize)
        

        if(parentNodeEntropy == 0 or depth == self.maxDepth):
            leaf = Tree()
            leaf.setAttribute(self.targetColumn)
            if(len(df) > 0):
                leaf.setValue(df[self.targetColumn].mode()[0])
            else:
                leaf.setValue(modeValue)
            return leaf

        mAttributes = round(math.sqrt(len(df.columns)))
        attributes = df.loc[:, df.columns != self.targetColumn].columns
        columnsSubSet = self.columnsSubset(attributes, mAttributes)
        gainDict = {}
        for attribute in columnsSubSet:
            subsetEntropy = 0
            partitions = self.generatePartitions(df, attribute)
            for partition in partitions[1]:
                    attributeEntropy = self.entropy(partition, parentNodeSize)
                    subsetEntropy += attributeEntropy
            gainDict[attribute] = parentNodeEntropy - subsetEntropy

        highestGain = max(gainDict, key=lambda key: gainDict[key])

        partitions = self.generatePartitions(df, highestGain)
        
        
#         if(len(partitions[1][0])==0 or len(partitions[1][1])==0):
#             import pdb; pdb.set_trace()
            
            
            
        newNode = Tree()
        newNode.setAttribute(highestGain)
        newNode.setValue(partitions[0])

        #if it's the root\n"
        if(depth == 0):
            treeNode = newNode

        left = self.decisionTree(partitions[1][0], depth+1, newNode)
        newNode.setLeft(left)

        right = self.decisionTree(partitions[1][1], depth+1, newNode)
        newNode.setRight(right)

        return newNode
    
    def createBootstraps(self, df):
        bootstraps = list()
        for i in range(self.nTree):
            bootstraps.append(df.sample(frac=1, replace=True))

        return bootstraps
                     
    def train(self, df):
        randomForest = list()
        bootstraps = self.createBootstraps(df)
        for i in range(self.nTree):
            print('Training Tree ', i)
            tree = Tree()
            trainData = bootstraps[i]
            tree = self.decisionTree(trainData, 0, tree)
            randomForest.append(tree)
                     
        self.randomForest = randomForest
        
    def predict(self, df):
        predictedValues = list()
        for index, row in df.iterrows():
            votes = list()
            for tree in self.randomForest:
                vote = tree.traverse(row)
                votes.append(vote)
            
            prediction = max(votes, key=votes.count)
            predictedValues.append(prediction)
            
        return np.array(predictedValues)
    

In [10]:
def generateFolds(df, k, targetColumn):
    
    targetClasses = df.groupby(df[targetColumn]).size().index.values
    
    partitionsByClass = list()
    for targetClass in targetClasses:
        classData = df[df[targetColumn]==targetClass]
        classData = classData.sample(frac=1)
        partitionsByClass.append(classData)
        
    proportions = list()
    for i in range(len(targetClasses)):
        step = (len(partitionsByClass[i])//k)
        proportions.append((step, 0, step))
    
    folds = {}
    for fold in range(k):
        folds[fold] = pd.DataFrame()
        for i in range(len(targetClasses)):
            step = proportions[i][0]
            posStart = proportions[i][1]
            posEnd = proportions[i][2]
            classData = partitionsByClass[i].iloc[posStart:posEnd, :]
            folds[fold] = pd.concat([folds[fold], classData])
            proportions[i] = (step, posEnd, posEnd + step)

    return folds

In [11]:
def accuracy(actualValues, predictions):
    return np.mean(actualValues == predictions)

In [12]:
def f1score(actualValues, predictions):
    truePositives = 0
    falsePositives = 0
    falseNegatives = 0
    beta = 1

    for i in range(len(actualValues)):
        if (actualValues[i] == 1 and predictions[i] == 1):
            truePositives += 1
            
        if (actualValues[i] == 1 and predictions[i] == 0):
            falseNegatives += 1
            
        if (actualValues[i] == 0 and predictions[i] == 1):
            falsePositives += 1
    try:        
        precision = truePositives / (truePositives + falsePositives)
    except:
        precision = np.NaN
        
    try:
        recall = truePositives / (truePositives + falseNegatives)
    except:
        recall = np.NaN
    
    try:
        f1score = (1 + (beta ** 2)) * ((precision * recall)/(((beta ** 2) * precision) + recall))
    except:
        f1score = np.NaN
        
        
    return (precision, recall, f1score)

In [13]:
def kfold(model, df, k, targetColumn):
    # shuffle and separate data
    df = df.sample(frac=1)
    
    folds = generateFolds(df, k, targetColumn)
    accuracies = []
    precisions = []
    recalls = []
    f1scores = []

    for testIndex in range(k):
        print('Fold ', testIndex)
        test = folds[testIndex]
        train = []
        
        for key in folds.keys():
            if key != testIndex:
                train.append(folds[key])
        
        train = pd.concat(train)
        model.train(train)
        
        predictions = model.predict(test.loc[:, df.columns != targetColumn])
        accuracies.append(accuracy(test[targetColumn], predictions))
        metrics = f1score(test[targetColumn].values, predictions)
        precisions. append(metrics[0])
        recalls. append(metrics[1])
        f1scores.append(metrics[2])
    
    accAvg = np.mean(accuracies)
    accStd = np.std(accuracies)
    precisionAvg = np.mean(precisions)
    recallsAvg = np.mean(recalls)
    f1Avg = np.mean([x for x in f1scores if not np.isnan(x)])
    f1Std = np.std([x for x in f1scores if not np.isnan(x)])
    print ('Accuracy Average:', accAvg)
    print ('Accuracy Standard Deviation : ', accStd)
    print ('Precision Average:', precisionAvg)
    print ('Recall Average:', recallsAvg)
    print ('F1Score Average: ', f1Avg)
    print ('F1Score Standard Deviation : ', f1Std)
        
    return accAvg, accStd, f1Avg, f1Std

In [14]:
model = RandomForest(5, targetColumn, 7)

In [15]:
model.nTree

5

In [None]:
train = kfold(model, df, 10, targetColumn)

Training Tree  0
5


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy


Training Tree  1
5
Training Tree  2
5
Training Tree  3
5
Training Tree  4
5
Training Tree  5
5
Training Tree  6
5
Training Tree  7
5
Training Tree  8
5
