In [45]:
from numpy import *

def loadDataSet(fileName):
    """Parse a tab-delimited text file of floats into a list of rows.

    Args:
        fileName: path of the file to read. Every non-blank line must hold
            tab-separated fields that ``float()`` accepts.

    Returns:
        A list containing one list of floats per non-blank line, in file
        order. The rest of this notebook treats the last column as the
        target value.

    Raises:
        OSError: if the file cannot be opened.
        ValueError: if a field cannot be converted to a float.
    """
    dataMat = []
    # The context manager closes the handle even when a line fails to parse.
    with open(fileName) as fr:
        for line in fr:
            stripped = line.strip()
            if not stripped:
                # Skip blank lines (e.g. a trailing empty line) instead of
                # failing on float('').
                continue
            dataMat.append([float(field) for field in stripped.split('\t')])
    return dataMat

def binSplitDataSet(dataSet, feature, value):
    """Partition the rows of ``dataSet`` by thresholding one column.

    Returns a pair ``(above, rest)``: first the rows whose ``feature`` column
    is strictly greater than ``value``, then the rows where it is less than
    or equal to ``value``.
    """
    column = dataSet[:, feature]
    # nonzero(...)[0] holds the row indices where the comparison is true
    rowsAbove = nonzero(column > value)[0]
    rowsRest = nonzero(column <= value)[0]
    return dataSet[rowsAbove, :], dataSet[rowsRest, :]

In [46]:
# Small fixture: a 4x4 identity matrix wrapped as a NumPy matrix, used by the
# next cells to try out the row selection done in binSplitDataSet.
testMat = mat(eye(4))
testMat

matrix([[ 1.,  0.,  0.,  0.],
        [ 0.,  1.,  0.,  0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

In [54]:
# The same nonzero()-based row selection binSplitDataSet uses, here keeping
# the rows whose column 1 is below 0.5.
testMat[nonzero(testMat[:, 1] < 0.5)[0],:]

matrix([[ 1.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

In [48]:
# Split on feature 1 at 0.5: mat0 receives the rows with value > 0.5,
# mat1 the rows with value <= 0.5.
mat0,mat1 = binSplitDataSet(testMat, 1, 0.5)
mat0

matrix([[ 0.,  1.,  0.,  0.]])

In [49]:
# The other half of the split: rows of testMat whose feature 1 is <= 0.5.
mat1

matrix([[ 1.,  0.,  0.,  0.],
        [ 0.,  0.,  1.,  0.],
        [ 0.,  0.,  0.,  1.]])

In [50]:
# Load ex00.txt with loadDataSet and wrap the parsed rows in a NumPy matrix.
# NOTE(review): the last line echoes the whole matrix; displaying a slice
# such as myMat[:5] would keep the notebook output short.
myDat = loadDataSet('ex00.txt')
myMat = mat(myDat)
myMat

matrix([[  3.60980000e-02,   1.55096000e-01],
        [  9.93349000e-01,   1.07755300e+00],
        [  5.30897000e-01,   8.93462000e-01],
        [  7.12386000e-01,   5.64858000e-01],
        [  3.43554000e-01,  -3.71700000e-01],
        [  9.80160000e-02,  -3.32760000e-01],
        [  6.91115000e-01,   8.34391000e-01],
        [  9.13580000e-02,   9.99350000e-02],
        [  7.27098000e-01,   1.00056700e+00],
        [  9.51949000e-01,   9.45255000e-01],
        [  7.68596000e-01,   7.60219000e-01],
        [  5.41314000e-01,   8.93748000e-01],
        [  1.46366000e-01,   3.42830000e-02],
        [  6.73195000e-01,   9.15077000e-01],
        [  1.83510000e-01,   1.84843000e-01],
        [  3.39563000e-01,   2.06783000e-01],
        [  5.17921000e-01,   1.49358600e+00],
        [  7.03755000e-01,   1.10167800e+00],
        [  8.30700000e-03,   6.99760000e-02],
        [  2.43909000e-01,  -2.94670000e-02],
        [  3.06964000e-01,  -1.77321000e-01],
        [  3.64920000e-02,   4.081

In [51]:
# Dimensions of the loaded data as (rows, columns).
shape(myMat)

(200, 2)

In [55]:
def regLeaf(dataSet):
    """Leaf value of a regression tree: the mean of the last column of ``dataSet``."""
    targets = dataSet[:, -1]
    return mean(targets)

def regErr(dataSet):
    """Total squared error of the last column around its mean.

    Computed as the variance of the last column times the number of rows.
    """
    rowCount = shape(dataSet)[0]
    return var(dataSet[:, -1]) * rowCount

def linearSolve(dataSet):
    """Fit an ordinary-least-squares linear model to ``dataSet``.

    Helper shared by modelLeaf and modelErr.

    Args:
        dataSet: 2-D data (NumPy matrix or anything ``asmatrix`` accepts)
            with m rows and n columns; the first n-1 columns are the
            features and the last column is the target.

    Returns:
        A tuple ``(ws, X, Y)`` where ``X`` is the m-by-n design matrix (a
        column of ones followed by the features), ``Y`` is the m-by-1 target
        column and ``ws`` is the n-by-1 coefficient matrix (``ws[0]`` is the
        intercept).

    Raises:
        NameError: if ``X.T * X`` is (numerically) singular and therefore
            cannot be inverted.
    """
    m, n = shape(dataSet)
    # asmatrix does not copy an existing matrix and lets plain arrays take
    # part in the matrix products below (`*` is a matrix product here).
    data = asmatrix(dataSet)
    # Design matrix: a constant 1 in column 0, then the feature columns
    X = asmatrix(ones((m, n)))
    X[:, 1:n] = data[:, 0:n-1]
    Y = data[:, -1]
    xTx = X.T * X
    # A rank test catches numerically singular matrices whose computed
    # determinant is tiny but not exactly 0.0.
    if linalg.matrix_rank(xTx) < n:
        raise NameError('This matrix is singular, cannot do inverse,\n'
                        '        try increasing the second value of ops')
    ws = xTx.I * (X.T * Y)
    return ws, X, Y

def modelLeaf(dataSet):
    """Leaf of a model tree: the linear-model coefficients fitted to ``dataSet``."""
    coefficients, _design, _targets = linearSolve(dataSet)
    return coefficients

def modelErr(dataSet):
    """Sum of squared residuals of the linear model fitted to ``dataSet``."""
    coefficients, design, targets = linearSolve(dataSet)
    residuals = targets - design * coefficients
    return sum(power(residuals, 2))

def chooseBestSplit(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Find the feature/value pair whose binary split lowers the error most.

    Args:
        dataSet: matrix whose last column is the target.
        leafType: callable producing the leaf returned when no split is made.
        errType: callable measuring the error of a data set.
        ops: ``(tolS, tolN)`` - the minimum error reduction a split must
            achieve and the minimum number of rows allowed on each side.

    Returns:
        ``(featureIndex, splitValue)`` for the best split, or
        ``(None, leaf)`` when one of the three exit conditions is met.
    """
    minGain, minRows = ops[0], ops[1]
    # Exit condition 1: all target values are identical, nothing to split
    distinctTargets = set(dataSet[:,-1].T.tolist()[0])
    if len(distinctTargets) == 1:
        return None, leafType(dataSet)
    _rowCount, colCount = shape(dataSet)
    # Splits are ranked by the summed error of the two halves
    baseErr = errType(dataSet)
    lowestErr, chosenFeat, chosenVal = inf, 0, 0
    for feat in range(colCount - 1):
        for candidate in set(dataSet[:,feat].T.tolist()[0]):
            above, rest = binSplitDataSet(dataSet, feat, candidate)
            tooSmall = (shape(above)[0] < minRows) or (shape(rest)[0] < minRows)
            if tooSmall:
                continue
            splitErr = errType(above) + errType(rest)
            if splitErr < lowestErr:
                lowestErr, chosenFeat, chosenVal = splitErr, feat, candidate
    # Exit condition 2: the best split does not reduce the error enough
    if (baseErr - lowestErr) < minGain:
        return None, leafType(dataSet)
    # Exit condition 3: the chosen split leaves too few rows on one side
    above, rest = binSplitDataSet(dataSet, chosenFeat, chosenVal)
    if (shape(above)[0] < minRows) or (shape(rest)[0] < minRows):
        return None, leafType(dataSet)
    return chosenFeat, chosenVal

def createTree(dataSet, leafType=regLeaf, errType=regErr, ops=(1,4)):
    """Recursively build a regression or model tree.

    Args:
        dataSet: NumPy matrix (needed for the array filtering done while
            splitting); the last column is the target.
        leafType: callable that builds a leaf from a data set.
        errType: callable that measures the error of a data set.
        ops: ``(tolS, tolN)`` tolerances forwarded to chooseBestSplit.

    Returns:
        A leaf (whatever ``leafType`` returns) when splitting stops,
        otherwise a dict with the keys ``'spInd'`` (split feature index),
        ``'spVal'`` (split value), ``'left'`` (subtree for rows with feature
        value > spVal) and ``'right'`` (subtree for the remaining rows).
    """
    feat, val = chooseBestSplit(dataSet, leafType, errType, ops)
    # A stop condition was hit: `val` is already the leaf for this node.
    # Identity comparison is the idiomatic (and safest) test for None.
    if feat is None:
        return val
    retTree = {}
    retTree['spInd'] = feat
    retTree['spVal'] = val
    lSet, rSet = binSplitDataSet(dataSet, feat, val)
    retTree['left'] = createTree(lSet, leafType, errType, ops)
    retTree['right'] = createTree(rSet, leafType, errType, ops)
    return retTree

In [56]:
# Regression tree on the ex00 data with the default ops=(1,4).
createTree(myMat)

{'left': 1.0180967672413792,
 'right': -0.044650285714285719,
 'spInd': 0,
 'spVal': 0.48813}

In [57]:
# Load ex0.txt; the echoed rows show three columns with a constant 1.0 first.
myDat1 = loadDataSet('ex0.txt')
myMat1 = mat(myDat1)
myMat1

matrix([[  1.00000000e+00,   4.09175000e-01,   1.88318000e+00],
        [  1.00000000e+00,   1.82603000e-01,   6.39080000e-02],
        [  1.00000000e+00,   6.63687000e-01,   3.04225700e+00],
        [  1.00000000e+00,   5.17395000e-01,   2.30500400e+00],
        [  1.00000000e+00,   1.36430000e-02,  -6.76980000e-02],
        [  1.00000000e+00,   4.69643000e-01,   1.66280900e+00],
        [  1.00000000e+00,   7.25426000e-01,   3.27574900e+00],
        [  1.00000000e+00,   3.94350000e-01,   1.11807700e+00],
        [  1.00000000e+00,   5.07760000e-01,   2.09505900e+00],
        [  1.00000000e+00,   2.37395000e-01,   1.18191200e+00],
        [  1.00000000e+00,   5.75340000e-02,   2.21663000e-01],
        [  1.00000000e+00,   3.69820000e-01,   9.38453000e-01],
        [  1.00000000e+00,   9.76819000e-01,   4.14940900e+00],
        [  1.00000000e+00,   6.16051000e-01,   3.10544400e+00],
        [  1.00000000e+00,   4.13700000e-01,   1.89627800e+00],
        [  1.00000000e+00,   1.05279000e

In [58]:
# Regression tree on the ex0 data with the default ops=(1,4).
createTree(myMat1)

{'left': {'left': {'left': 3.9871631999999999,
   'right': 2.9836209534883724,
   'spInd': 1,
   'spVal': 0.797583},
  'right': 1.980035071428571,
  'spInd': 1,
  'spVal': 0.582002},
 'right': {'left': 1.0289583666666666,
  'right': -0.023838155555555553,
  'spInd': 1,
  'spVal': 0.197834},
 'spInd': 1,
 'spVal': 0.39435}

In [59]:
# ops=(0,1): no minimum error reduction and a one-row minimum per branch,
# i.e. the stopping conditions in chooseBestSplit are as permissive as possible.
createTree(myMat, ops=(0,1))

{'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': 1.035533,
                'right': 1.077553,
                'spInd': 0,
                'spVal': 0.993349},
               'right': {'left': 0.74420699999999995,
                'right': 1.069062,
                'spInd': 0,
                'spVal': 0.988852},
               'spInd': 0,
               'spVal': 0.989888},
              'right': 1.227946,
              'spInd': 0,
              'spVal': 0.985425},
             'right': {'left': {'left': 0.86291099999999998,
               'right': 0.67357900000000004,
               'spInd': 0,
               'spVal': 0.975022},
              'right': {'left': {'left': 1.0646899999999999,
                'right': {'left': 0.94525499999999996,
                 'right': 1.0229060000000001,
                 'spInd': 0,
                 'spVal': 0.950153},
                'spInd': 0,
               

In [60]:
# Load ex2.txt; the echoed target column has much larger values than ex00.
myDat2 = loadDataSet('ex2.txt')
myMat2 = mat(myDat2)
myMat2

matrix([[  2.28628000e-01,  -2.26627300e+00],
        [  9.65969000e-01,   1.12386764e+02],
        [  3.42761000e-01,  -3.15848550e+01],
        [  9.01444000e-01,   8.73006250e+01],
        [  5.85413000e-01,   1.25295113e+02],
        [  3.34900000e-01,   1.89766500e+01],
        [  7.69043000e-01,   6.40419410e+01],
        [  2.97107000e-01,  -1.79837700e+00],
        [  9.01421000e-01,   1.00133819e+02],
        [  1.76523000e-01,   9.46348000e-01],
        [  7.10234000e-01,   1.08553919e+02],
        [  9.81980000e-01,   8.63996370e+01],
        [  8.58730000e-02,  -1.01371040e+01],
        [  5.37834000e-01,   9.09955360e+01],
        [  8.06158000e-01,   6.28776980e+01],
        [  7.08890000e-01,   1.35416767e+02],
        [  7.87755000e-01,   1.18642009e+02],
        [  4.63241000e-01,   1.71710570e+01],
        [  3.00318000e-01,  -1.80513180e+01],
        [  8.15215000e-01,   1.18319942e+02],
        [  1.39880000e-01,   7.33678400e+00],
        [  6.83730000e-02,  -1.516

In [61]:
# Default ops=(1,4) on the ex2 data.
createTree(myMat2)

{'left': {'left': {'left': {'left': 105.24862350000001,
    'right': 112.42895575000001,
    'spInd': 0,
    'spVal': 0.958512},
   'right': {'left': {'left': {'left': {'left': 87.310387500000004,
       'right': {'left': {'left': 96.452866999999998,
         'right': {'left': 104.82540899999999,
          'right': {'left': 95.181792999999999,
           'right': 102.25234449999999,
           'spInd': 0,
           'spVal': 0.872883},
          'spInd': 0,
          'spVal': 0.892999},
         'spInd': 0,
         'spVal': 0.910975},
        'right': 95.275843166666661,
        'spInd': 0,
        'spVal': 0.85497},
       'spInd': 0,
       'spVal': 0.944221},
      'right': {'left': 81.110151999999999,
       'right': 88.784498800000009,
       'spInd': 0,
       'spVal': 0.811602},
      'spInd': 0,
      'spVal': 0.833026},
     'right': 102.35780185714285,
     'spInd': 0,
     'spVal': 0.790312},
    'right': 78.085643250000004,
    'spInd': 0,
    'spVal': 0.759504},
   'spInd

In [62]:
# Raise tolS to 10000 so a split is kept only when it reduces the error by at
# least that much.
createTree(myMat2, ops=(10000, 4))

{'left': 101.35815937735848,
 'right': -2.6377193297872341,
 'spInd': 0,
 'spVal': 0.499171}

In [63]:
def isTree(obj):
    """Return True when ``obj`` is a tree node (a dict) rather than a leaf.

    Uses isinstance instead of comparing the type's name as a string, so dict
    subclasses are recognised and unrelated classes that merely happen to be
    named 'dict' are not.
    """
    return isinstance(obj, dict)

def getMean(tree):
    """Collapse ``tree`` to a single value: the average of its two branches.

    Sub-trees are collapsed first (right, then left) and replaced in place by
    their averaged value, so ``tree`` is modified by this call.
    """
    for side in ('right', 'left'):
        if isTree(tree[side]):
            tree[side] = getMean(tree[side])
    return (tree['left'] + tree['right']) / 2.0
    
def prune(tree, testData):
    """Post-prune ``tree`` using held-out ``testData``.

    Sub-trees are pruned recursively (the tree is modified in place). When
    both children of a node are leaves, they are merged into their average
    if that lowers the squared error on the test rows reaching this node;
    "merging" is printed for every such merge.

    Returns the (possibly modified) tree, or a single value when the node was
    collapsed.
    """
    # No test rows reach this node: collapse the whole subtree
    if shape(testData)[0] == 0:
        return getMean(tree)
    # The test data only has to be split when at least one child is a subtree
    if isTree(tree['right']) or isTree(tree['left']):
        lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    if isTree(tree['left']):
        tree['left'] = prune(tree['left'], lSet)
    if isTree(tree['right']):
        tree['right'] = prune(tree['right'], rSet)
    # A child is still a subtree after pruning: nothing to merge here
    if isTree(tree['left']) or isTree(tree['right']):
        return tree
    # Both children are leaves now: compare the error with and without merging
    lSet, rSet = binSplitDataSet(testData, tree['spInd'], tree['spVal'])
    leftErr = sum(power(lSet[:,-1] - tree['left'], 2))
    rightErr = sum(power(rSet[:,-1] - tree['right'], 2))
    errorNoMerge = leftErr + rightErr
    treeMean = (tree['left'] + tree['right']) / 2.0
    errorMerge = sum(power(testData[:,-1] - treeMean, 2))
    if errorMerge < errorNoMerge:
        print("merging")
        return treeMean
    return tree

In [64]:
# Grow a tree with the most permissive stopping settings; it is the input to
# prune() below.
myTree = createTree(myMat2, ops=(0,1))

In [65]:
# Load ex2test.txt, the data handed to prune() as test set in the next cell.
myDatTest = loadDataSet('ex2test.txt')
myMat2Test = mat(myDatTest)
myMat2Test

matrix([[  4.21862000e-01,   1.08302410e+01],
        [  1.05349000e-01,  -2.24161100e+00],
        [  1.55196000e-01,   2.18729760e+01],
        [  1.61152000e-01,   2.01541800e+00],
        [  3.82632000e-01,  -3.87789790e+01],
        [  1.77100000e-02,   2.01091130e+01],
        [  1.29656000e-01,   1.52668870e+01],
        [  6.13926000e-01,   1.11900063e+02],
        [  4.09277000e-01,   1.87473100e+00],
        [  8.07556000e-01,   1.11223754e+02],
        [  5.93722000e-01,   1.33835486e+02],
        [  9.53239000e-01,   1.10465070e+02],
        [  2.57402000e-01,   1.53328990e+01],
        [  6.45385000e-01,   9.39830540e+01],
        [  5.63460000e-01,   9.36452770e+01],
        [  4.08338000e-01,  -3.07198780e+01],
        [  8.74394000e-01,   9.18735050e+01],
        [  2.63805000e-01,  -1.92752000e-01],
        [  4.11198000e-01,   1.07511180e+01],
        [  4.49884000e-01,   9.21190100e+00],
        [  6.46315000e-01,   1.13533660e+02],
        [  6.73718000e-01,   1.251

In [66]:
# Post-prune against the test data. prune() modifies myTree in place and
# prints "merging" each time two leaves are merged.
prune(myTree, myMat2Test)

merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging
merging


{'left': {'left': {'left': {'left': 92.523991499999994,
    'right': {'left': {'left': {'left': 112.386764,
       'right': 123.559747,
       'spInd': 0,
       'spVal': 0.960398},
      'right': 135.83701300000001,
      'spInd': 0,
      'spVal': 0.958512},
     'right': 111.2013225,
     'spInd': 0,
     'spVal': 0.956951},
    'spInd': 0,
    'spVal': 0.965969},
   'right': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': {'left': 96.41885225,
              'right': 69.318648999999994,
              'spInd': 0,
              'spVal': 0.948822},
             'right': {'left': {'left': 110.03503850000001,
               'right': {'left': 65.548417999999998,
                'right': {'left': 115.75399400000001,
                 'right': {'left': {'left': 94.396114499999996,
                   'right': 85.005351000000005,
                   'spInd': 0,
                   'spVal': 0.912161},
                  'right': {'left': {'left': 106.81466

In [68]:
# Model tree: leaves hold linear-model coefficients (modelLeaf) and splits are
# scored by the residual error of those models (modelErr).
# NOTE(review): this rebinds myMat2, which earlier cells bound to the ex2.txt
# data; re-running those cells afterwards would use exp2.txt instead.
myMat2 = mat(loadDataSet('exp2.txt'))
createTree(myMat2, modelLeaf, modelErr, (1, 10))

{'left': matrix([[  1.69855694e-03],
         [  1.19647739e+01]]), 'right': matrix([[ 3.46877936],
         [ 1.18521743]]), 'spInd': 0, 'spVal': 0.285477}

In [69]:
def regTreeEval(model, inDat):
    """Evaluate a regression-tree leaf: the stored leaf value as a float.

    ``inDat`` is not used; it is accepted so the signature matches
    modelTreeEval.
    """
    prediction = float(model)
    return prediction

def modelTreeEval(model, inDat):
    """Evaluate a model-tree leaf: apply the leaf's linear model to ``inDat``.

    Args:
        model: (n+1)-by-1 coefficient matrix; the first entry is the
            intercept.
        inDat: 1-by-n matrix holding the feature values of one sample.

    Returns:
        The predicted value as a Python float.
    """
    n = shape(inDat)[1]
    # Prepend the constant 1 that multiplies the intercept.
    # asmatrix is used because the `mat` alias no longer exists in NumPy 2.
    X = asmatrix(ones((1, n+1)))
    X[:, 1:n+1] = inDat
    # Take the single element explicitly; float() on a 1x1 matrix relies on a
    # deprecated array-to-scalar conversion.
    return float((X * model)[0, 0])

def treeForeCast(tree, inData, modelEval=regTreeEval):
    """Predict the target for one sample by walking ``tree``.

    Args:
        tree: a tree dict as built by createTree, or a bare leaf.
        inData: the feature values of one sample (a 1-by-n matrix as passed
            by createForeCast, or a flat sequence).
        modelEval: callable ``(leaf, inData) -> float`` used at the leaf.

    Returns:
        The value ``modelEval`` returns for the leaf the sample ends up in.
    """
    if not isTree(tree):
        return modelEval(tree, inData)
    # Address the split feature by column position. Indexing a 1-by-n matrix
    # with a single integer selects a row, which fails for any feature other
    # than 0 and compares the whole row otherwise.
    featVal = asarray(inData).ravel()[tree['spInd']]
    # Same convention as binSplitDataSet: values > spVal go left
    branch = tree['left'] if featVal > tree['spVal'] else tree['right']
    # The recursive call handles both sub-trees and leaves
    return treeForeCast(branch, inData, modelEval)
        
def createForeCast(tree, testData, modelEval=regTreeEval):
    """Predict the target for every sample in ``testData``.

    Args:
        tree: tree (or leaf) to evaluate.
        testData: indexable collection of samples; ``testData[i]`` holds the
            feature values of sample i.
        modelEval: leaf evaluator forwarded to treeForeCast.

    Returns:
        An m-by-1 matrix of predictions, one row per sample.
    """
    m = len(testData)
    # asmatrix replaces the `mat` alias, which no longer exists in NumPy 2
    yHat = asmatrix(zeros((m, 1)))
    for i in range(m):
        # Each sample is handed on as a 1-by-n matrix
        yHat[i, 0] = treeForeCast(tree, asmatrix(testData[i]), modelEval)
    return yHat

In [74]:
# Fit a regression tree on the training file, predict from the first column of
# the test file and correlate the predictions with its second column.
# NOTE(review): this rebinds testMat and myTree, which earlier cells bound to
# other objects.
trainMat = mat(loadDataSet('bikeSpeedVsIq_train.txt'))
testMat = mat(loadDataSet('bikeSpeedVsIq_test.txt'))
myTree = createTree(trainMat, ops=(1,20))
yHat = createForeCast(myTree, testMat[:, 0])
corrcoef(yHat, testMat[:, 1], rowvar=0)[0,1]

0.96408523182221462

In [75]:
# Fit one linear model to the whole training set for comparison with the tree.
# linearSolve puts the constant column first, so ws[0] is the intercept and
# ws[1] the slope.
ws, X, Y = linearSolve(trainMat)
ws

matrix([[ 37.58916794],
        [  6.18978355]])

In [76]:
# Predict with the fitted line: ws[0,0] is the intercept (linearSolve puts the
# constant column first) and ws[1,0] the slope, so the prediction is
# x * slope + intercept. Building a new matrix also avoids overwriting the
# tree predictions element by element.
# The slope echoed for ws above is positive, so the correlation below is the
# same as for the unscaled x + intercept; only the predictions differ.
yHat = testMat[:, 0] * ws[1, 0] + ws[0, 0]

corrcoef(yHat, testMat[:, 1], rowvar=0)[0,1]

0.94346842356747618

In [2]:
from tkinter import *
# Create the top-level Tk window.
root = Tk()

In [3]:
# Add a text label to the window and place it with the grid geometry manager.
myLabel = Label(root, text="Hello World")
myLabel.grid()

In [4]:
# Start the Tk event loop; the cell keeps running until the window is closed.
root.mainloop()

In [None]:
from numpy import *

from tkinter import *
import regTrees

import matplotlib
matplotlib.use('TkAgg')
from matplotlib.backends.backend_tkagg import FigureCanvasTkAgg
from matplotlib.figure import Figure

def reDraw(tolS,tolN):
    """Rebuild the tree with the given tolerances and redraw the plot.

    The figure, canvas and data are kept as attributes of this function
    (``reDraw.f``, ``reDraw.canvas``, ``reDraw.rawDat``, ``reDraw.testDat``),
    which the module-level code assigns before the first call.

    Args:
        tolS: minimum error reduction required for a split.
        tolN: minimum number of rows on each side of a split.
    """
    reDraw.f.clf()        # clear the figure
    reDraw.a = reDraw.f.add_subplot(111)
    if chkBtnVar.get():
        # "Model Tree" is ticked: fit linear models in the leaves.
        # tolN is raised to at least 2 here, presumably because a linear fit
        # needs more than one point - TODO confirm.
        if tolN < 2: tolN = 2
        myTree=regTrees.createTree(reDraw.rawDat, regTrees.modelLeaf,\
                                   regTrees.modelErr, (tolS,tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat, \
                                       regTrees.modelTreeEval)
    else:
        myTree=regTrees.createTree(reDraw.rawDat, ops=(tolS,tolN))
        yHat = regTrees.createForeCast(myTree, reDraw.testDat)
    reDraw.a.scatter(reDraw.rawDat[:,0].flatten().A[0], reDraw.rawDat[:,1].flatten().A[0], s=5) #use scatter for data set
    reDraw.a.plot(reDraw.testDat, yHat, linewidth=2.0) #use plot for yHat
    # draw() is the supported way to refresh the canvas; the old show() alias
    # is no longer available in current Matplotlib releases.
    reDraw.canvas.draw()
    
def getInputs():
    """Read tolN and tolS from the two Entry widgets.

    When a field does not parse, a hint is printed, the default (10 for tolN,
    1.0 for tolS) is used and the Entry is reset to show that default.

    Returns:
        ``(tolN, tolS)`` - an int and a float, in that order.
    """
    try:
        tolN = int(tolNentry.get())
    except ValueError:
        # Only a failed conversion is handled; other errors should surface
        tolN = 10
        print("enter Integer for tolN")
        tolNentry.delete(0, END)
        tolNentry.insert(0,'10')
    try:
        tolS = float(tolSentry.get())
    except ValueError:
        tolS = 1.0
        print("enter Float for tolS")
        tolSentry.delete(0, END)
        tolSentry.insert(0,'1.0')
    return tolN,tolS

def drawNewTree():
    """Button callback: read the Entry values and redraw with them."""
    # getInputs returns (tolN, tolS) while reDraw expects (tolS, tolN)
    minRows, minGain = getInputs()
    reDraw(minGain, minRows)
    
root=Tk()

# The figure and its Tk canvas are stored as attributes of reDraw so the
# callback can reach them without extra globals.
reDraw.f = Figure(figsize=(5,4), dpi=100) #create canvas
reDraw.canvas = FigureCanvasTkAgg(reDraw.f, master=root)
# draw() is the supported way to render the canvas; the old show() alias is
# no longer available in current Matplotlib releases.
reDraw.canvas.draw()
reDraw.canvas.get_tk_widget().grid(row=0, columnspan=3)

# Entry fields for the two tolerances, pre-filled with their defaults
Label(root, text="tolN").grid(row=1, column=0)
tolNentry = Entry(root)
tolNentry.grid(row=1, column=1)
tolNentry.insert(0,'10')
Label(root, text="tolS").grid(row=2, column=0)
tolSentry = Entry(root)
tolSentry.grid(row=2, column=1)
tolSentry.insert(0,'1.0')
Button(root, text="ReDraw", command=drawNewTree).grid(row=1, column=2, rowspan=3)
# Checkbox that switches reDraw between regression tree and model tree
chkBtnVar = IntVar()
chkBtn = Checkbutton(root, text="Model Tree", variable = chkBtnVar)
chkBtn.grid(row=3, column=0, columnspan=2)

# asmatrix replaces the `mat` alias, which no longer exists in NumPy 2
reDraw.rawDat = asmatrix(regTrees.loadDataSet('sine.txt'))
# Evaluate the tree on an evenly spaced grid over the observed x range.
# The .min()/.max() methods return plain scalars whether or not the star
# import replaced the built-in min/max with NumPy's versions.
reDraw.testDat = arange(reDraw.rawDat[:,0].min(), reDraw.rawDat[:,0].max(), 0.01)
reDraw(1.0, 10)

root.mainloop()