In [3]:
from pandas.io.parsers import read_csv
import numpy as np

np.random.seed(0) # REMOVE FOR FINAL SOLUTION


In [57]:
#Data Splitted.
def splitData(data):
    shuffled = data.copy()
    np.random.shuffle(shuffled)

    testCount = int(shuffled.shape[0]*1/3)
    trainCount = shuffled.shape[0]-testCount

    trainSet = shuffled[0:trainCount]
    testSet = shuffled[trainCount:]

    return trainSet, testSet

def separateInputOutput(dataset):
    outputData = dataset[:,3] # Output
    inputData = np.concatenate((dataset[:,0:3],dataset[:,4:7],dataset[:,8:10]),axis=1) # Inputs
    return inputData, outputData

def getInputAttributeDict():
    return {   0 : 'calorific_value',
                    1 : 'nitrogen',
                    2 : 'turbidity',
                    3 : 'alcohol',
                    4 : 'sugars',
                    5 : 'bitterness',
                    6 : 'colour',
                    7 : 'degree_of_fermentation'} .copy()
    


#ML ALG
def initializeWeights(x,y):
    N = x.shape[0]
    
def normaliseData(dataArray, normType = "range"):
    normalisedDataList = []

    if normType == "range":
        maxVal = dataArray.max()
        minVal = dataArray.min()
        for val in dataArray:
            normalisedDataList.append((val-minVal)/(maxVal-minVal))
        return np.array(normalisedDataList)

    elif normType == "z":
        mean = dataArray.mean()
        stdDev =  getStandardDeviation(dataArray)
        for val in dataArray:
            normalisedDataList.append((val-mean)/stdDev)
        return np.array(normalisedDataList)

def getStandardDeviation(dataArray):
    n = dataArray.shape[0]
    mean = dataArray.mean()
    sumOfSquareDiffs = 0
    for val in dataArray:
        sumOfSquareDiffs += (val - mean)**2

    variance = sumOfSquareDiffs/(n-1)
    stdDev = variance**(1/2)
    return stdDev


In [10]:
def getUniqueClassCount(listToCount):
    classes = []
    for item in listToCount:
        if item not in classes:
            classes.append(item)

    counts = []
    for itemClass in classes:
        counts.append(listToCount.count(itemClass))
    return counts

def getLeafGini(counts):
    total = np.sum(counts)
    giniLeaf = 1
    for count in counts:
        giniLeaf -= (count/total)**2
    return giniLeaf

def getNodeGini(sortedList, threshold):
        lessThan = []
        greaterThan = []

        for i in range(len(sortedList)):
            x = sortedList[i][0]
            # print(x, threshold)
            if x < threshold:
                lessThan.append(sortedList[i][1])
            else:
                greaterThan.append(sortedList[i][1])

        countsLT = getUniqueClassCount(lessThan)
        countsGT = getUniqueClassCount(greaterThan)
        giniLT = getLeafGini(countsLT)
        giniGT = getLeafGini(countsGT)
        # print(giniLT,giniGT)
        # print(np.sum(countsLT),np.sum(countsGT))
        totalCount = len(sortedList)
        # print(totalCount)
        nodeGini = ((np.sum(countsLT)/totalCount)*giniLT) + ((np.sum(countsGT)/totalCount)*giniGT)
        # print(giniNode)
        return nodeGini


In [113]:
def getMinGiniAndThreshold(inputData, outputData, alreadyUsedAttributes):
    inputAttributesMinGinis = []
    attributeThresholds = []
    attributeIndexes = list(getInputAttributeDict())
    # print(attributeIndexes)
    for i in alreadyUsedAttributes:
        attributeIndexes.remove(i)
    # print(attributeIndexes)

    for i in attributeIndexes:
        attribute = inputData[:, i].copy() # Single attribute from input data

        inputOutputPairs = []
        for j,sample in enumerate(attribute):
            inputOutputPairs.append((sample,trainOut[j]))
        attribute.sort()
        sortedInputOutputPairs = [tuple for x in attribute for tuple in inputOutputPairs if tuple[0] == x]

        testThresholds = []
        for j in range(len(attribute)-1):
            testThresholds.append((attribute[j]+attribute[j+1])/2)

        nodeGinis = []
        for testThresh in testThresholds:
            nodeGinis.append(getNodeGini(sortedInputOutputPairs, testThresh))

        minNodeGini = min(nodeGinis)
        minGiniIndex = nodeGinis.index(minNodeGini)
        attributeThreshold = testThresholds[minGiniIndex]

        inputAttributesMinGinis.append(minNodeGini)
        attributeThresholds.append(attributeThreshold)
    return inputAttributesMinGinis, attributeThresholds
   

In [124]:
def createTree(trainIn,trainOut):

    attributesUsed = []
    recursiveBranch(trainIn,trainOut, 1, attributesUsed)

    # minAttributeGinis, attributeThresholds = getMinGiniAndThreshold(trainIn,trainOut, [4,1])
    # print(minAttributeGinis, attributeThresholds)

    # rootAttributeGini = min(minAttributeGinis)
    # rootAttributeIndex = minAttributeGinis.index(rootAttributeGini)
    # print("Root is input data attribute", rootAttributeIndex)

    # rootThreshold = attributeThresholds[rootAttributeIndex]
    # unusedBranchAttributes = list(getInputAttributeDict())
    # print(unusedBranchAttributes)
    # unusedBranchAttributes.remove(rootAttributeIndex)
    # print(unusedBranchAttributes)

    # ltX,lty,gtX,gty = branchData(trainIn, trainOut, rootAttributeGini, rootThreshold, rootAttributeIndex, unusedBranchAttributes) #generate root
    return

def recursiveBranch(inputData, outputData, parentGini, attributesUsed):
    print(inputData.shape[0],"- Attributes Used =", attributesUsed)
    minAttributeGinis, attributeThresholds = getMinGiniAndThreshold(inputData, outputData, attributesUsed)
    nodeAttributeGini = min(minAttributeGinis)
    print(minAttributeGinis)
    attributeIndexes = list(getInputAttributeDict())
    print(attributeIndexes)
    for i in attributesUsed:
        attributeIndexes.remove(i)
    print(attributeIndexes)
    nodeAttributeIndex = attributeIndexes[minAttributeGinis.index(nodeAttributeGini)]
    print(nodeAttributeIndex)
    if parentGini < nodeAttributeGini: # If none of the new ginis is less than parent gini, exit recursion
        print("Leaf node has min gini = {} from attribute {}. Parent has gini = {}".format(nodeAttributeGini, nodeAttributeIndex, parentGini))
        return
    else:            # Continue recursion
        nodeThreshold = attributeThresholds[minAttributeGinis.index(nodeAttributeGini)]
        attributesUsed.append(nodeAttributeIndex)
        print(attributesUsed)
        print("Node has min gini = {} from attribute {} thresholding at {}, parent has gini = {}.".format(nodeAttributeGini, nodeAttributeIndex, nodeThreshold,parentGini))

    ltX,lty,gtX,gty = branchData(inputData, outputData, nodeAttributeGini, nodeThreshold, nodeAttributeIndex, attributesUsed)
    return
#     for i in range(inputAttributes.shape[1]):       # For each input attribute
#         splitVal, minGini = getMinGiniAndThreshold(inputAttributes[i], outputClasses, i)     
#         inputAttributesMinGinis.append(minGini)
#         splitVals.append(splitVal)
#         # print(splitVal, minGini)
#     # print("Root is input data attribute", ginis.index(min(ginis)))

#     minGini = min(inputAttributesMinGinis)
#     nextSplitAttributeIndex = inputAttributesMinGinis.index(minGini)
#     nextSplitVal = splitVals[nextSplitAttributeIndex]
#     ltX,lty,gtX,gty = branchData(trainIn, trainOut, minGini, nextSplitAttributeIndex, nextSplitVal)



In [125]:
def branchData(X, y, splitAttributeGini, splitValue, splitAttributeIndex, attributesUsed):
    lessThan_X = []
    lessThan_y = []
    greaterThan_X = []
    greaterThan_y = []
    
    for i in range(len(X)):
        if X[i,splitAttributeIndex] < splitValue:
            lessThan_X.append(X[i,:])
            lessThan_y.append(y[i])
        else:
            greaterThan_X.append(X[i,:])
            greaterThan_y.append(y[i])

    # Call recursively? :
        # if not at max depth:
            # Get min gini of new data for all attribs except this node's splitAttrib
            # if not less than splitAttributeGini, call recursive function
    
    # left branch first:
    # lessThan_X = np.delete(np.asarray(lessThan_X), splitAttributeIndex, 1)
    lessThan_X = np.asarray(lessThan_X)
    lessThan_y = np.asarray(lessThan_y)
    print("Branching left...")
    recursiveBranch(lessThan_X, lessThan_y, splitAttributeGini, attributesUsed)
    print("Finished left branch.")

    # Right branch
    # greaterThan_X = np.delete(np.asarray(greaterThan_X), splitAttributeIndex, 1)
    greaterThan_X = np.asarray(greaterThan_X)
    greaterThan_y = np.asarray(greaterThan_y)
    print("Branching right...")
    recursiveBranch(greaterThan_X, greaterThan_y, splitAttributeGini, attributesUsed)
    print("Finished right branch.")

    return lessThan_X,lessThan_y,greaterThan_X,greaterThan_y
        

In [126]:
#Reading in data
dataframe = read_csv("beer.txt",delim_whitespace=True,header=None)

# Preparing Data
dataset = dataframe.values
trainSet, testSet = splitData(dataset)

testIn, testOut = separateInputOutput(testSet)
trainIn, trainOut = separateInputOutput(trainSet)

N =trainIn.shape[0]
Wi = 1/N
# print(getInputAttributeDict()[1])

#running ml alg.
createTree(trainIn,trainOut) #ML ALG RAN HERE


103 - Attributes Used = []
[0.6463396362110814, 0.37448216088992786, 0.4698901098901098, 0.44495051048242545, 0.54784189271396, 0.46586136825468505, 0.4425598235862234, 0.6266308873878041]
[0, 1, 2, 3, 4, 5, 6, 7]
[0, 1, 2, 3, 4, 5, 6, 7]
1
[1]
Node has min gini = 0.37448216088992786 from attribute 1 thresholding at 0.35975925249999996, parent has gini = 1.
Branching left...
66 - Attributes Used = [1]
[0.629560915275201, 0.6383493050159718, 0.6100371631117079, 0.6346666666666666, 0.6274410774410774, 0.634336788942052, 0.6186274509803922]
[0, 1, 2, 3, 4, 5, 6, 7]
[0, 2, 3, 4, 5, 6, 7]
3
Leaf node has min gini = 0.6100371631117079 from attribute 3. Parent has gini = 0.37448216088992786
Finished left branch.
Branching right...
37 - Attributes Used = [1]
[0.6118863049095609, 0.6137123745819398, 0.6347701149425288, 0.6158366841942469, 0.6180306180306181, 0.5974227224533072, 0.6162162162162163]
[0, 1, 2, 3, 4, 5, 6, 7]
[0, 2, 3, 4, 5, 6, 7]
6
Leaf node has min gini = 0.5974227224533072 from 

In [127]:
class Node:
    def __init__(self,index,value,left,right):
        self.index = index
        self.value = value
        self.left = left
        self.right = right

    def addLeftChild(self,child):
        self.left = Node(child)
    
    def addRightChild(self,child):
        self.right = Node(child)
    
    def printTree(self):
        if self.left:
            self.left.printTree()
        print(self.index,self.value)
        if self.right:
            self.right.printTree()