In [1]:
import numpy as np
import matplotlib.pyplot as plt

from sklearn import tree
from sklearn.metrics import accuracy_score

In [2]:
#Load Data
filename = 'data/SPECTF.dat'
data = np.loadtxt(filename, delimiter=',')      #Column 1: Label
X = data[:, 1:]                   #shape=(267, 44)
y = np.array([data[:, 0]]).T      #shape=(267, 1)
n, d = X.shape 

In [3]:
#Shuffle the data
idx = np.arange(n)      #Shuffle array[0,1, 2,...266]
np.random.seed(13)
np.random.shuffle(idx)
X = X[idx]
y = y[idx]

In [4]:
#Split the data
Xtrain = X[1:101, :]     #Train on first 100 instances  (shape=(100, 44))
Xtest = X[101:, :]       #shape=(166, 44)
ytrain = y[1:101, :]    #Test on remaining instances    shape=(100, 1)
ytest = y[101:, :]       #shape=(166, 1)

In [5]:
#Train the decision tree
clf = tree.DecisionTreeClassifier()
clf = clf.fit(Xtrain, ytrain)

In [6]:
#Output predictions on the remaining data
y_pred = clf.predict(Xtest)        #shape=(166,)

In [7]:
y_pred

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 0., 1., 0., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 1., 1., 1., 0.,
       0., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 0., 1., 0.,
       1., 1., 1., 0., 1., 1., 1., 1., 1., 1., 1., 0., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 0., 0., 1., 1., 1., 0., 0., 0., 0., 1., 0., 1., 0.,
       1., 1., 0., 0., 1., 1., 0., 1., 1., 0., 0., 0., 1., 1., 1., 1., 0.,
       1., 1., 1., 0., 0., 1., 1., 0., 1., 1., 1., 1., 0., 1., 1., 0., 0.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 0.,
       1., 1., 1., 1., 1., 1., 0., 1., 0., 1., 0., 1., 1., 1., 1., 1., 1.,
       1., 0., 0., 1., 1., 0., 1., 1., 1., 0., 1., 1., 1.])

In [8]:
#Compute the training accuracy of the model
meanDecisionTreeAccuracy = accuracy_score(ytest, y_pred)

In [9]:
meanDecisionTreeAccuracy

0.7409638554216867

In [10]:
print((ytest - y_pred)**2)

[[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 0.]
 [1. 1. 1. ... 1. 1. 1.]
 [0. 0. 0. ... 0. 0. 0.]]


In [11]:
mean = (np.sum(np.array((ytest - y_pred)**2)))/n
mean

38.29962546816479

In [12]:
#TODO: update these statistics based on the results of your experiment
stddevDecisionTreeAccuracy = 0
meanDecisionStumpAccuracy = 0
stddevDecisionStumpAccuracy = 0
meanDT3Accuracy = 0
stddevDT3Accuracy = 0

In [13]:
#Make certain that the return value matches the API specification
stats = np.zeros((3, 2))
stats[0, 0] = meanDecisionTreeAccuracy
stats[0, 1] = stddevDecisionTreeAccuracy
stats[1, 0] = meanDecisionStumpAccuracy
stats[1, 1] = stddevDecisionStumpAccuracy
stats[2, 0] = meanDT3Accuracy
stats[2, 1] = stddevDT3Accuracy

In [14]:
print("Decision Tree Accuracy = ", stats[0,0], "(", stats[0, 1], ")")
print("Decision Stump Accuracy = ", stats[1, 0], "(", stats[1, 1], ")")
print("3-level Decision Tree = ", stats[2, 0], "(", stats[2, 1], ")")

Decision Tree Accuracy =  0.7409638554216867 ( 0.0 )
Decision Stump Accuracy =  0.0 ( 0.0 )
3-level Decision Tree =  0.0 ( 0.0 )
