In [None]:
import numpy as np

def splitData(data):
    """Shuffle the rows of ``data`` and split them into train and test sets.

    The caller's array is not modified: a copy is taken and shuffled in place
    along its first axis with ``np.random.shuffle``. One third of the rows
    (rounded down) are assigned to the test set and the rest to the training
    set.

    Returns
    -------
    tuple
        ``(trainSet, testSet)`` - the leading and trailing row slices of the
        shuffled copy.
    """
    rows = data.copy()
    np.random.shuffle(rows)

    totalRows = rows.shape[0]
    # Index of the first test row: everything before it is training data.
    cut = totalRows - int(totalRows / 3)
    return rows[:cut], rows[cut:]


def separateInputOutput(dataset):
    """Split a 2-D dataset into its input columns and its output column.

    Column 3 is returned as the output. Columns 0-2, 4-6 and 8-9 are joined
    side by side (in that order) to form the inputs; column 7 is not used.

    Returns
    -------
    tuple
        ``(inputData, outputData)``.
    """
    inputColumnRanges = ((0, 3), (4, 7), (8, 10))
    inputBlocks = [dataset[:, first:last] for first, last in inputColumnRanges]
    return np.concatenate(inputBlocks, axis=1), dataset[:, 3]


def getInputAttributeDict():
    """Return a fresh dict mapping input-column index (0-7) to attribute name.

    A new dictionary is built on every call, so callers may mutate the result
    without affecting later calls.
    """
    attributeNames = (
        'calorific_value',
        'nitrogen',
        'turbidity',
        'alcohol',
        'sugars',
        'bitterness',
        'colour',
        'degree_of_fermentation',
    )
    return dict(enumerate(attributeNames))


def normaliseData(data, normType = "range"):
    """Rescale every column of ``data`` independently.

    Parameters
    ----------
    data : array-like
        Numeric values with samples along the first axis. Object arrays that
        hold numbers are accepted; values are converted to ``float``.
    normType : str
        ``"range"`` maps each column to ``(x - min) / (max - min)``.
        ``"z"`` maps each column to ``(x - mean) / std`` where ``std`` is the
        sample standard deviation (``n - 1`` denominator, the same definition
        as ``getStandardDeviation``).

    Returns
    -------
    numpy.ndarray
        Float array with the same shape as ``data``. A column with no spread
        (constant values, or a single sample for ``"z"``) is mapped to zeros
        rather than dividing by zero.

    Raises
    ------
    ValueError
        If ``normType`` is not ``"range"`` or ``"z"``. (Previously an
        unrecognised value silently produced an all-zero array.)
    """
    if normType not in ("range", "z"):
        raise ValueError(
            "Unknown normType {!r}; expected 'range' or 'z'.".format(normType))

    values = np.asarray(data, dtype=float)

    if normType == "range":
        offset = values.min(axis=0)
        spread = values.max(axis=0) - offset
    else:
        offset = values.mean(axis=0)
        if values.shape[0] > 1:
            spread = values.std(axis=0, ddof=1)
        else:
            # The sample standard deviation is undefined for one sample.
            spread = np.zeros_like(offset)

    # Guard against division by zero: a column without spread becomes all 0.
    spread = np.where(spread > 0, spread, 1.0)
    return (values - offset) / spread


def getStandardDeviation(dataArray):
    """Return the sample standard deviation of a 1-D array.

    The squared deviations from the mean are summed and divided by ``n - 1``
    (where ``n`` is ``dataArray.shape[0]``) before taking the square root.
    """
    centre = dataArray.mean()
    squaredDeviationTotal = sum((sample - centre) ** 2 for sample in dataArray)
    degreesOfFreedom = dataArray.shape[0] - 1
    return (squaredDeviationTotal / degreesOfFreedom) ** 0.5


def getUniqueClassCount(listToCount):
    """Count how often each distinct item occurs.

    Parameters
    ----------
    listToCount : iterable
        Items to tally (e.g. class labels). Any iterable is accepted.

    Returns
    -------
    tuple
        ``(counts, classes)`` - two lists of equal length where ``counts[k]``
        is the number of occurrences of ``classes[k]``. Classes appear in
        order of first occurrence.
    """
    items = list(listToCount)
    try:
        # Single pass; dicts preserve insertion order, so the classes come
        # out in first-seen order.
        tally = {}
        for item in items:
            tally[item] = tally.get(item, 0) + 1
    except TypeError:
        # Unhashable items cannot be dict keys: fall back to a linear scan.
        classes = []
        for item in items:
            if item not in classes:
                classes.append(item)
        return [items.count(itemClass) for itemClass in classes], classes
    return list(tally.values()), list(tally.keys())


def getLeafGini(counts):
    """Return the Gini impurity for a group described by its class counts.

    Starting from 1, the squared proportion of every class
    (``count / sum(counts)``) is subtracted in turn. An empty ``counts``
    yields 1 because nothing is subtracted.
    """
    sampleTotal = np.sum(counts)
    impurity = 1
    for classShare in (classCount / sampleTotal for classCount in counts):
        impurity -= classShare ** 2
    return impurity


def getNodeGini(sortedList, threshold):
    """Return the weighted Gini impurity of splitting at ``threshold``.

    ``sortedList`` holds ``(attributeValue, label)`` pairs. Labels whose
    attribute value is strictly below ``threshold`` form one group and all
    other labels form the second. The Gini impurity of each group is weighted
    by the fraction of pairs it contains and the two terms are added.
    """
    belowLabels = []
    otherLabels = []
    for pair in sortedList:
        target = belowLabels if pair[0] < threshold else otherLabels
        target.append(pair[1])

    belowCounts, _ = getUniqueClassCount(belowLabels)
    otherCounts, _ = getUniqueClassCount(otherLabels)
    belowGini = getLeafGini(belowCounts)
    otherGini = getLeafGini(otherCounts)

    pairTotal = len(sortedList)
    belowTerm = (np.sum(belowCounts) / pairTotal) * belowGini
    otherTerm = (np.sum(otherCounts) / pairTotal) * otherGini
    return belowTerm + otherTerm


def getMinGiniAndThreshold(inputData, outputData, alreadyUsedAttributes):
    """Find the best split threshold for every attribute that is still unused.

    For each attribute index from ``getInputAttributeDict()`` that is not in
    ``alreadyUsedAttributes``, the midpoints between consecutive sorted
    attribute values are tried as thresholds and scored with ``getNodeGini``.

    Parameters
    ----------
    inputData : 2-D array, one row per sample and one column per attribute.
    outputData : sequence of labels, aligned with the rows of ``inputData``.
    alreadyUsedAttributes : iterable of attribute indexes to skip.

    Returns
    -------
    tuple
        ``(inputAttributesMinGinis, attributeThresholds)`` - parallel lists
        with one entry per remaining attribute, in ascending attribute-index
        order: the lowest weighted Gini found and the threshold achieving it
        (the first one on ties).

    Raises
    ------
    ValueError
        If an index in ``alreadyUsedAttributes`` is not a known attribute, or
        if fewer than two samples are supplied (no threshold can be formed).
    """
    candidateIndexes = list(getInputAttributeDict())
    for usedIndex in alreadyUsedAttributes:
        candidateIndexes.remove(usedIndex)

    inputAttributesMinGinis = []
    attributeThresholds = []
    for attributeIndex in candidateIndexes:
        # Pair every sample's attribute value with its label exactly once and
        # order the pairs by value. (Matching pairs back to sorted values by
        # equality would emit k*k pairs for a value occurring k times and
        # distort the Gini weights.)
        sortedPairs = sorted(zip(inputData[:, attributeIndex], outputData),
                             key=lambda pair: pair[0])
        sortedValues = [pair[0] for pair in sortedPairs]

        # Candidate thresholds: midpoints between neighbouring sorted values.
        testThresholds = [(lower + upper) / 2
                          for lower, upper in zip(sortedValues, sortedValues[1:])]
        if not testThresholds:
            raise ValueError("At least two samples are required to search for a split threshold.")

        nodeGinis = [getNodeGini(sortedPairs, testThresh) for testThresh in testThresholds]
        minNodeGini = min(nodeGinis)

        inputAttributesMinGinis.append(minNodeGini)
        attributeThresholds.append(testThresholds[nodeGinis.index(minNodeGini)])
    return inputAttributesMinGinis, attributeThresholds


def createTree(trainIn,trainOut):
    """Build a decision tree from training inputs/labels and return its root.

    Delegates to ``recursiveBranch`` with no current node (which makes it
    create the root), an empty list of used attributes and a parent Gini of 1.
    """
    return recursiveBranch(trainIn, trainOut, 1, [], None)
    

def recursiveBranch(inputData, outputData, parentGini, attributesUsed, currentNode):
    """Grow the (sub)tree for the given samples.

    Parameters
    ----------
    inputData : 2-D array of samples (rows) by attributes (columns).
    outputData : labels aligned with the rows of ``inputData``.
    parentGini : weighted Gini of the split that produced this node.
    attributesUsed : list of attribute indexes already used on the path to
        this node; the attribute chosen here is appended to it in place.
    currentNode : ``Node`` to fill in, or ``None`` to create the root.

    Returns
    -------
    The newly created root ``Node`` when ``currentNode`` is ``None``;
    otherwise ``None`` (``currentNode`` is modified in place).

    A node becomes a leaf - its value is set to ``outputData`` - when the
    samples are all of one class, when every attribute has been used, when the
    best available split has a higher weighted Gini than ``parentGini``, or
    when the chosen split would put every sample on the same side. The first
    three checks are skipped for the root, which always attempts a split.
    """
    isRoot = currentNode is None

    if not isRoot:
        # Pure node: only one class left.
        if len(getUniqueClassCount(list(outputData))[0]) == 1:
            currentNode.setValue(outputData)
            return
        # No attribute left to split on along this path.
        if len(attributesUsed) == len(getInputAttributeDict()):
            currentNode.setValue(outputData)
            return

    try:
        minAttributeGinis, attributeThresholds = getMinGiniAndThreshold(inputData, outputData, attributesUsed)
        nodeAttributeGini = min(minAttributeGinis)
    except Exception as error:
        # Print the state that led to the failure, then let the error
        # propagate instead of leaving a half-built tree behind.
        print(repr(error))
        print("Attributes Used = {}".format(attributesUsed))
        print("OutputData:", outputData)
        print(len(getUniqueClassCount(list(outputData))[0]))
        print(len(attributesUsed), len(getInputAttributeDict()))
        raise

    # If the best new split is worse than the parent's, stop here.
    if not isRoot and parentGini < nodeAttributeGini:
        currentNode.setValue(outputData)
        return

    # The Gini/threshold lists are indexed by position among the *remaining*
    # attributes, so map that position back to the real attribute index.
    bestPosition = minAttributeGinis.index(nodeAttributeGini)
    remainingIndexes = [index for index in getInputAttributeDict() if index not in attributesUsed]
    nodeAttributeIndex = remainingIndexes[bestPosition]
    nodeThreshold = attributeThresholds[bestPosition]

    # A split that sends every sample to the same side separates nothing and
    # would hand an empty array to the recursion; make a leaf instead.
    goesLeft = [sampleValue < nodeThreshold for sampleValue in inputData[:, nodeAttributeIndex]]
    if all(goesLeft) or not any(goesLeft):
        if isRoot:
            return Node(None, outputData)
        currentNode.setValue(outputData)
        return

    attributesUsed.append(nodeAttributeIndex)
    if isRoot:
        currentNode = Node(nodeAttributeIndex, nodeThreshold)
    else:
        currentNode.setValue(nodeThreshold)
        currentNode.setIndex(nodeAttributeIndex)

    branchData(inputData, outputData, nodeAttributeGini, nodeThreshold, nodeAttributeIndex, attributesUsed, currentNode)
    return currentNode if isRoot else None


def branchData(X, y, splitAttributeGini, splitValue, splitAttributeIndex, attributesUsed, currentNode):
    """Partition the samples at a split and grow both child subtrees.

    Rows of ``X`` whose value in column ``splitAttributeIndex`` is strictly
    below ``splitValue`` go left; all other rows go right. Two fresh ``Node``
    objects are attached to ``currentNode`` and ``recursiveBranch`` is run on
    the left partition first, then on the right one. Each side receives its
    own copy of ``attributesUsed`` so the branches do not affect each other,
    and ``splitAttributeGini`` is passed down as the parent Gini.

    Returns
    -------
    tuple
        ``(lessThan_X, lessThan_y, greaterThan_X, greaterThan_y)`` as numpy
        arrays.
    """
    leftRows, leftLabels = [], []
    rightRows, rightLabels = [], []
    for rowNumber in range(len(X)):
        if X[rowNumber, splitAttributeIndex] < splitValue:
            rows, labels = leftRows, leftLabels
        else:
            rows, labels = rightRows, rightLabels
        rows.append(X[rowNumber, :])
        labels.append(y[rowNumber])

    # Independent copies: attributes consumed in one branch stay available
    # to the other.
    usedOnLeft = attributesUsed.copy()
    usedOnRight = attributesUsed.copy()

    currentNode.addLeftChild(Node())
    currentNode.addRightChild(Node())

    # Left subtree first.
    leftRows = np.asarray(leftRows)
    leftLabels = np.asarray(leftLabels)
    recursiveBranch(leftRows, leftLabels, splitAttributeGini, usedOnLeft, currentNode.getLeftChild())

    # Then the right subtree.
    rightRows = np.asarray(rightRows)
    rightLabels = np.asarray(rightLabels)
    recursiveBranch(rightRows, rightLabels, splitAttributeGini, usedOnRight, currentNode.getRightChild())

    return leftRows, leftLabels, rightRows, rightLabels


In [None]:
class Node:
    """A node of a binary decision tree.

    ``index`` is the attribute index the node splits on and ``value`` holds
    either the split threshold (internal node) or the stored classification
    data (leaf). A node counts as a leaf when it has no children and no
    attribute index.
    """

    def __init__(self,index=None, value=None):
        self.left = None
        self.right = None
        self.index = index
        self.value = value  # threshold, or the classification e.g. "ale"

    def addLeftChild(self,child):
        """Attach ``child`` as the left child."""
        self.left = child

    def getLeftChild(self):
        """Return the left child (``None`` if absent)."""
        return self.left

    def addRightChild(self,child):
        """Attach ``child`` as the right child."""
        self.right = child

    def getRightChild(self):
        """Return the right child (``None`` if absent)."""
        return self.right

    def printTree(self):
        """Print ``getData()`` of every node in in-order (left, self, right)."""
        if self.left is not None:
            self.left.printTree()
        print(self.getData())
        if self.right is not None:
            self.right.printTree()

    def getValue(self):
        """Return the stored value."""
        return self.value

    def setValue(self, value):
        """Replace the stored value."""
        self.value = value

    def setIndex(self, index):
        """Replace the attribute index."""
        self.index = index

    def getData(self):
        """Return the ``(index, value)`` pair."""
        return self.index,self.value

    def isLeaf(self):
        """Return True when the node has no children and no attribute index."""
        # Identity checks: an attribute index of 0 is a valid, non-leaf index.
        return self.left is None and self.right is None and self.index is None

In [None]:
def accuracy(yPredicted,yActual):
    """Return the percentage of predictions that equal the actual labels.

    Positions ``0 .. len(yPredicted) - 1`` are compared element by element and
    the number of matches is scaled by ``100 / len(yActual)``.
    """
    matches = sum(
        1
        for position in range(len(yPredicted))
        if yPredicted[position] == yActual[position]
    )
    return matches * 100 / len(yActual)

def randomForestClassifierScratch(x,y,N):
    """Train ``N`` decision trees, each on its own bootstrap sample of (x, y).

    Every tree is built by ``createTree`` from the arrays of a fresh
    ``bootstrapData`` draw.

    Returns
    -------
    numpy.ndarray
        Array holding the root nodes of the trained trees.
    """
    roots = []
    for _ in range(N):
        sampleX, sampleY = bootstrapData(x, y)
        roots.append(createTree(np.array(sampleX), np.array(sampleY)))
    return np.array(roots)

def bootstrapData(X,Y):
    """Draw ``len(X)`` samples from (X, Y) uniformly at random with replacement.

    One index is drawn per output sample with ``np.random.randint(len(X))``;
    the same index selects the entry from ``X`` and from ``Y`` so pairs stay
    aligned.

    Returns
    -------
    tuple
        ``(newX, newY)`` as two Python lists.
    """
    drawnIndexes = [np.random.randint(len(X)) for _ in range(len(X))]
    resampledX = [X[k] for k in drawnIndexes]
    resampledY = [Y[k] for k in drawnIndexes]
    return resampledX, resampledY

def predictForest(X,forest):
    """Predict a class for every sample in ``X`` by majority vote of the trees.

    Each tree in ``forest`` votes via ``predictTree``; the class with the
    highest vote count wins (on ties, the class that was voted for first).

    Returns
    -------
    list
        One predicted class per sample, in sample order.
    """
    predictions = []
    for row in range(len(X)):
        votes = [predictTree(X[row], tree) for tree in forest]
        voteCounts, votedClasses = getUniqueClassCount(list(votes))
        winner = votedClasses[voteCounts.index(max(voteCounts))]
        predictions.append(winner)
    return predictions


def predictTree(x,root):
    """Classify one sample ``x`` by walking the tree from ``root`` to a leaf.

    At each non-leaf node the sample's value for the node's attribute index is
    compared to the node's threshold: strictly smaller goes left, anything
    else goes right. The most frequent class among the labels stored in the
    reached leaf is returned.
    """
    node = root
    while not node.isLeaf():
        attributeIndex, threshold = node.getData()
        if x[attributeIndex] < threshold:
            node = node.getLeftChild()
        else:
            node = node.getRightChild()

    # The leaf's value is the collection of training labels that reached it.
    leafLabels = node.getData()[1]
    labelCounts, labelClasses = getUniqueClassCount(list(leafLabels))
    return labelClasses[labelCounts.index(max(labelCounts))]

In [None]:
# Read in data: one sample per line, attributes separated by tabs.
with open("beer.txt", 'r') as f:
    # Skip blank lines (e.g. the empty string left after a trailing newline);
    # they would otherwise become one-element records and make the dataset
    # ragged.
    lines = [line for line in f.read().splitlines() if line.strip()]

# Split columns and convert numeric fields from string to float; fields that
# are not numbers are kept as strings.
samples = []
for lineStr in lines:
    sample = []
    for string in lineStr.split('\t'):
        try:
            sample.append(float(string))
        except ValueError:
            sample.append(string)
    # Object dtype lets floats and strings share one row.
    samples.append(np.array(sample, dtype=object))
dataset = np.asarray(samples)


# Random train/test split, then separate inputs from the output column.
trainSet, testSet = splitData(dataset)

testIn, testOut = separateInputOutput(testSet)
trainIn, trainOut = separateInputOutput(trainSet)


In [None]:
from sklearn.ensemble import RandomForestClassifier
from sklearn import tree
import matplotlib.pyplot as plt
import matplotlib.patches as mpatches

In [None]:
# N trees plot: for each forest size, train the from-scratch forest and
# scikit-learn's RandomForestClassifier on the same split and record the
# train/test accuracy (in %) of both.
ns = np.array([1, 5, 10, 50, 100, 200, 500])

accuracies_scratch_train, accuracies_scratch_test = np.zeros(ns.shape), np.zeros(ns.shape)
accuracies_scikit_train, accuracies_scikit_test = np.zeros(ns.shape), np.zeros(ns.shape)

for i, n in enumerate(ns):
    # From-scratch implementation.
    scratchForest = randomForestClassifierScratch(trainIn, trainOut, n)
    scratchPredictions_test = predictForest(testIn, scratchForest)
    accuracies_scratch_test[i] = accuracy(scratchPredictions_test, testOut)
    scratchPredictions_train = predictForest(trainIn, scratchForest)
    accuracies_scratch_train[i] = accuracy(scratchPredictions_train, trainOut)

    # scikit-learn reference with the same number of trees.
    scikitForest = RandomForestClassifier(n_estimators=n, max_features=None).fit(trainIn, trainOut)
    scikitPredictions_test = scikitForest.predict(testIn)
    accuracies_scikit_test[i] = accuracy(scikitPredictions_test, testOut)
    scikitPredictions_train = scikitForest.predict(trainIn)
    accuracies_scikit_train[i] = accuracy(scikitPredictions_train, trainOut)

    print("{} Trees:\nFrom scratch accuracy train/test = {:.3f}/{:.3f}\nScikit accuracy train/test = {:.3f}/{:.3f}\n".format(
        n,
        accuracies_scratch_train[i], accuracies_scratch_test[i],
        accuracies_scikit_train[i], accuracies_scikit_test[i]))

In [None]:
# Accuracy of both forests (train and test) against the number of trees.
fig = plt.figure()
fig.set_size_inches(10, 5)

# Tree counts are plotted as strings so the x axis is categorical (evenly
# spaced) rather than numeric.
for series, lineFormat, legendText in (
    (accuracies_scratch_test, 'ko-', "Random Forest From Scratch Test Accuracy"),
    (accuracies_scikit_test, 'mo-', "Random Forest From Scikit Learn Test Accuracy"),
    (accuracies_scratch_train, 'ks--', "Random Forest From Scratch Training Accuracy"),
    (accuracies_scikit_train, 'ms--', "Random Forest From Scikit Learn Training Accuracy"),
):
    plt.plot(ns.astype('str'), list(series), lineFormat, label=legendText)

plt.legend()
plt.ylabel('Classification Accuracy (% Correct)')
plt.xlabel('Number of Trees (n)')
plt.ylim([0,108])
plt.xticks(np.arange(ns.shape[0]),ns)
plt.title("Classifier Accuracy vs Number of Trees in Forest")

# Value labels: the test labels are stacked below the scikit test point and
# the training labels above the scikit training point.
for i in range(ns.shape[0]):
    labelX = i-0.1
    plt.annotate("{:.2f}".format(accuracies_scratch_test[i]), (labelX, accuracies_scikit_test[i]-4))
    plt.annotate("{:.2f}".format(accuracies_scikit_test[i]), (labelX, accuracies_scikit_test[i]-6), color='m')
    plt.annotate("{:.2f}".format(accuracies_scratch_train[i]), (labelX, accuracies_scikit_train[i]+2))
    plt.annotate("{:.2f}".format(accuracies_scikit_train[i]), (labelX, accuracies_scikit_train[i]+4), color='m')
plt.show()

In [None]:
# Single-tree comparison: scikit-learn's DecisionTreeClassifier vs the
# from-scratch tree, each evaluated on 10 fresh random train/test splits.
preformaceSklearn = []
preformaceScratch = []

for i in range(10):
    trainData, testData = splitData(dataset)
    xtestIn, yTestOut = separateInputOutput(testData)
    xtrainIn, yTrainOut = separateInputOutput(trainData)

    # Tree developed from scratch. (A comprehension is used so the sample
    # loop no longer reuses the outer loop variable `i`.)
    treeScratch = createTree(xtrainIn, yTrainOut)
    predictedYScratch = [predictTree(sample, treeScratch) for sample in xtestIn]
    preformaceScratch.append(accuracy(predictedYScratch, yTestOut))

    # Sklearn
    treeSklearn = tree.DecisionTreeClassifier()
    treeSklearn = treeSklearn.fit(xtrainIn, yTrainOut)
    sklearnTestPrdictionsTree = treeSklearn.predict(xtestIn)
    preformaceSklearn.append(accuracy(sklearnTestPrdictionsTree, yTestOut))

In [None]:
# Per-iteration test accuracy of the two single-tree implementations.
fig = plt.figure()
fig.set_size_inches(10, 5)

plt.plot(preformaceScratch, c='black', marker='.', label="Decision Tree From Scratch")
plt.plot(preformaceSklearn, c='purple', marker='.', label="Decision Tree From Scikit Learn")
plt.legend()
plt.title(label = 'Scikit Learn vs Scratch Implementation')
plt.ylim([50,100])
plt.xlabel('Iteration')
plt.ylabel('Preformance')

# Value labels are written at fixed heights (60 and 63) under each iteration.
for i, scratchScore in enumerate(preformaceScratch):
    plt.annotate("{:.2f}".format(scratchScore), (i-0.1, 60))
    plt.annotate("{:.2f}".format(preformaceSklearn[i]), (i-0.1, 63), color='m')

plt.show()

averageSklearn = sum(preformaceSklearn)/len(preformaceSklearn)
averageScratch = sum(preformaceScratch)/len(preformaceScratch)
print("Average Performance using Scikit Learn Decision Tree Implementation","{:.4f}%".format(averageSklearn))
print("Average Performance using Decision Tree Implementation Made from Scratch","{:.4f}%".format(averageScratch))

In [None]:
# Effect of input normalisation on the from-scratch forest: `reps` repetitions,
# each training one forest of `n_trees` trees per normalisation variant.
reps = 10
n_trees = 50

noNorm, rangeNorm, zNorm = [], [], []

# NOTE(review): the train and test inputs are normalised by separate
# normaliseData calls, i.e. each with its own column statistics.
trainIn_rangeNorm = normaliseData(trainIn, 'range')
testIn_rangeNorm = normaliseData(testIn, 'range')

trainIn_zNorm = normaliseData(trainIn, 'z')
testIn_zNorm = normaliseData(testIn, 'z')

for i in range(reps):
    print("Iteration",i)

    # Same order as the result lists: none, range, z.
    for message, variantTrainIn, variantTestIn, results in (
        ("No norm accuracy = ", trainIn, testIn, noNorm),
        ("Range norm accuracy = ", trainIn_rangeNorm, testIn_rangeNorm, rangeNorm),
        ("Z norm accuracy = ", trainIn_zNorm, testIn_zNorm, zNorm),
    ):
        forest = randomForestClassifierScratch(variantTrainIn, trainOut, n_trees)
        predicty = predictForest(variantTestIn, forest)
        acc = accuracy(predicty, testOut)
        print(message, acc)
        results.append(acc)


In [None]:
# MEANS (three decimal places, like the other reports in this notebook; the
# previous "{:3f}" only set a minimum field width of 3).
print("Mean no norm = {:.3f}".format(np.array(noNorm).mean()))
print("Mean range norm = {:.3f}".format(np.array(rangeNorm).mean()))
print("Mean z norm = {:.3f}".format(np.array(zNorm).mean()))

# PLOT
fig = plt.figure()
fig.set_figwidth(10)
fig.set_figheight(5)
plt.title("Effect Of Normalisation On Random Forest Beer Classifier Accuracy")
plt.plot(noNorm, 'ko-', label='No Normalisation')
plt.plot(rangeNorm, 'bo-', label='Range Normalisation')
plt.plot(zNorm, 'go-', label='Z Normalisation')
plt.ylim([50,100])
plt.xlabel("Test #")
plt.ylabel("Classification Accuracy (% correct)")
# Tick labels 1..N as exact integers (np.int was removed from NumPy, and
# truncating linspace floats is fragile).
plt.xticks(np.arange(len(noNorm)), np.arange(1, len(noNorm) + 1))
# Single legend call, after all three series have been plotted.
plt.legend()
# Value labels for all three series are stacked below the range-norm point.
for i in range(len(noNorm)):
    label_noNorm = "{:.2f}".format(noNorm[i])
    plt.annotate(label_noNorm,(i-0.1,rangeNorm[i]-3), color='k')

    label_rangeNorm = "{:.2f}".format(rangeNorm[i])
    plt.annotate(label_rangeNorm,(i-0.1,rangeNorm[i]-7), color='b')

    label_zNorm = "{:.2f}".format(zNorm[i])
    plt.annotate(label_zNorm,(i-0.1,rangeNorm[i]-5), color='g')
plt.show()

In [None]:
# Forest comparison: from-scratch forest vs scikit-learn's
# RandomForestClassifier (50 trees each) on 10 fresh random train/test splits.
preformaceSklearnForest, preformaceScratchForest = [], []

for i in range(10):
    trainData, testData = splitData(dataset)
    xtestIn, yTestOut = separateInputOutput(testData)
    xtrainIn, yTrainOut = separateInputOutput(trainData)

    # Scratch implementation of forest classifier
    forestTest = randomForestClassifierScratch(xtrainIn, yTrainOut, 50)
    predictYval = predictForest(xtestIn, forestTest)
    preformaceScratchForest.append(accuracy(predictYval, yTestOut))

    # sklearn
    rfcSklearnTest = RandomForestClassifier(n_estimators=50, max_features=None).fit(xtrainIn, yTrainOut)
    sklearnTrainPredictionsTest = rfcSklearnTest.predict(xtestIn)
    preformaceSklearnForest.append(accuracy(sklearnTrainPredictionsTest, yTestOut))

In [None]:
# Per-iteration test accuracy of the two random-forest implementations.
fig = plt.figure()
fig.set_size_inches(10, 5)

plt.plot(preformaceScratchForest, c='black', marker='.', label="Random Forest From Scratch")
plt.plot(preformaceSklearnForest, c='purple', marker='.', label="Random Forest From Scikit Learn")
plt.title(label = 'Scikit Learn vs Scratch Implementation')
plt.legend()
plt.ylim([50,100])
plt.xlabel('Iteration')
plt.ylabel('Preformance')

# Value labels are written at fixed heights (60 and 63) under each iteration.
for i, sklearnScore in enumerate(preformaceSklearnForest):
    plt.annotate("{:.2f}".format(preformaceScratchForest[i]), (i-0.1, 60))
    plt.annotate("{:.2f}".format(sklearnScore), (i-0.1, 63), color='m')

plt.show()

averageSklearnForest = sum(preformaceSklearnForest)/len(preformaceSklearnForest)
averageScratchForest = sum(preformaceScratchForest)/len(preformaceScratchForest)
print("Average Performance using Scikit Learn Randon Forest Implementation","{:.4f}%".format(averageSklearnForest))
print("Average Performance using Randon Forest Implementation Made from Scratch","{:.4f}%".format(averageScratchForest))