In [35]:
from pandas.io.parsers import read_csv
import numpy as np

np.random.seed(0) # REMOVE FOR FINAL SOLUTION


In [36]:
#Data Splitted.
def splitData(data):
    shuffled = data.copy()
    np.random.shuffle(shuffled)

    testCount = int(shuffled.shape[0]*1/3)
    trainCount = shuffled.shape[0]-testCount

    trainSet = shuffled[0:trainCount]
    testSet = shuffled[trainCount:]

    return trainSet, testSet

def separateInputOutput(dataset):
    outputData = dataset[:,3] # Output
    inputData = np.concatenate((dataset[:,0:3],dataset[:,4:7],dataset[:,8:10]),axis=1) # Inputs
    return inputData, outputData

#ML ALG
def initializeWeights(x,y):
    N = x.shape[0]
    
def normaliseData(dataArray, normType = "range"):
    normalisedDataList = []

    if normType == "range":
        maxVal = dataArray.max()
        minVal = dataArray.min()
        for val in dataArray:
            normalisedDataList.append((val-minVal)/(maxVal-minVal))
        return np.array(normalisedDataList)

    elif normType == "z":
        mean = dataArray.mean()
        stdDev =  getStandardDeviation(dataArray)
        for val in dataArray:
            normalisedDataList.append((val-mean)/stdDev)
        return np.array(normalisedDataList)

def getStandardDeviation(dataArray):
    n = dataArray.shape[0]
    mean = dataArray.mean()
    sumOfSquareDiffs = 0
    for val in dataArray:
        sumOfSquareDiffs += (val - mean)**2

    variance = sumOfSquareDiffs/(n-1)
    stdDev = variance**(1/2)
    return stdDev


In [37]:
def getUniqueClassCount(listToCount):
    classes = []
    for item in listToCount:
        if item not in classes:
            classes.append(item)

    counts = []
    for itemClass in classes:
        counts.append(listToCount.count(itemClass))
    return counts

def getGiniLeaf(counts):
    total = np.sum(counts)
    giniLeaf = 1
    for count in counts:
        giniLeaf -= (count/total)**2
    return giniLeaf

def getGiniNode(sortedList, threshold):
        lessThan = []
        greaterThan = []

        for i in range(len(sortedList)):
            x = sortedList[i][0]
            # print(x, threshold)
            if x < threshold:
                lessThan.append(sortedList[i][1])
            else:
                greaterThan.append(sortedList[i][1])

        countsLT = getUniqueClassCount(lessThan)
        countsGT = getUniqueClassCount(greaterThan)
        giniLT = getGiniLeaf(countsLT)
        giniGT = getGiniLeaf(countsGT)
        # print(giniLT,giniGT)
        # print(np.sum(countsLT),np.sum(countsGT))
        totalCount = len(sortedList)
        # print(totalCount)
        giniNode = ((np.sum(countsLT)/totalCount)*giniLT) + ((np.sum(countsGT)/totalCount)*giniGT)
        # print(giniNode)
        return giniNode


In [38]:
def getSplitValue(trainIn, trainOut, attributeIndex):
    attribute = trainIn[:, attributeIndex].copy() # Output
    combined = []
    for j,sample in enumerate(attribute):
        combined.append((sample,trainOut[j]))
    attribute.sort()
    sortedList = [tuple for x in attribute for tuple in combined if tuple[0] == x]


    thresholds = []
    for j in range(len(attribute)-1):
        thresholds.append((attribute[j]+attribute[j+1])/2)

    giniNodes = []
    for threshold in thresholds:
        giniNodes.append(getGiniNode(sortedList, threshold))
    # print(giniNodes)
    minNodeGini = min(giniNodes)
    minGiniIndex = giniNodes.index(minNodeGini)
    splitVal = thresholds[minGiniIndex]
    return splitVal, minNodeGini
   

In [42]:
def train(trainIn,trainOut):
    ginis = []
    splitVals = []
    for i in range(trainIn.shape[1]):
        splitVal, minGini = getSplitValue(trainIn,trainOut, i)
        ginis.append(minGini)
        splitVals.append(splitVal)
        # print(splitVal, minGini)
    # print("Root is input data attribute", ginis.index(min(ginis)))
    ltX,lty,gtX,gty = branchData(testIn,testOut,ginis.index(min(ginis)),splitVals[ginis.index(min(ginis))]) #generate root


In [43]:
def branchData(X, y, splitAttributeIndex, splitValue):
    lessThan_X = []
    lessThan_y = []
    greaterThan_X = []
    greaterThan_y = []
    
    for i in range(len(X)):
        if X[i,splitAttributeIndex] < splitValue:
            lessThan_X.append(X[i,:])
            lessThan_y.append(y[i])
        else:
            greaterThan_X.append(X[i,:])
            greaterThan_y.append(y[i])
    print(lessThan_X)
    return lessThan_X,lessThan_y,greaterThan_X,greaterThan_y
        

In [44]:
#Reading in data
dataframe = read_csv("beer.txt",delim_whitespace=True,header=None)

# Preparing Data
dataset = dataframe.values
trainSet, testSet = splitData(dataset)

testIn, testOut = separateInputOutput(testSet)
trainIn, trainOut = separateInputOutput(trainSet)

N =trainIn.shape[0]
Wi = 1/N

#running ml alg.
train(trainIn,trainOut) #ML ALG RAN HERE 


[array([40.88053097, 0.264555015, 1.06, 3.7169230769999997, 17.08,
       7.4112105260000005, 15.24, 47.32857143], dtype=object), array([41.94247788, 0.352781138, 0.772727273, 4.104615385, 15.94,
       5.924736842000001, 12.24, 65.94857143], dtype=object), array([38.13716814, 0.32871495100000003, 3.7536363639999997,
       4.1384615380000005, 18.7, 7.7993684210000005, 10.2, 71.17142857],
      dtype=object), array([43.04867257, 0.180269393, 4.047272727, 3.935384615, 17.67,
       5.900210526, 5.76, 67.10285714], dtype=object), array([41.76548673, 0.143658705, 1.118181818, 3.652307692, 19.31,
       9.866684211, 9.0, 66.27285714], dtype=object), array([42.03097345, 0.282453802, 1.7081818180000001, 3.913846154, 16.81,
       12.10615789, 13.92, 61.05714286], dtype=object), array([43.97787611, 0.271177799, 1.4190909090000001, 3.8492307689999996,
       16.73, 9.530947368, 12.48, 53.57428571], dtype=object), array([41.19026549, 0.283402606, 2.620909091, 4.123076923, 19.6,
       7.5549473

In [46]:
class Node:
    def __init__(self,index,value,left,right):
        self.index = index
        self.value = value
        self.left = left
        self.right = right

    def addLeftChild(self,child):
        self.left = Node(child)
    
    def addRightChild(self,child):
        self.right = Node(child)
    
    def printTree(self):
        if self.left:
            self.left.printTree()
        print(self.index,self.value)
        if self.right:
            self.right.printTree()