In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.tree import DecisionTreeClassifier

In [None]:
def calculateError(trainingData, trainingLabels, testData, testLabels, maxDepth, randomState=0):
    """Sweep decision-tree depths and report train/test misclassification error.

    For every depth ``d`` in ``1..maxDepth`` a fresh
    ``DecisionTreeClassifier(max_depth=d)`` is fitted on the training set and
    scored on both the training and the test set. The error recorded for each
    set is ``1 - clf.score(...)``.

    Parameters
    ----------
    trainingData, trainingLabels
        Features and labels handed to ``fit`` (and to ``score`` for the
        training error).
    testData, testLabels
        Features and labels handed to ``score`` for the test error.
    maxDepth : int
        Deepest tree to try (inclusive). Must be at least 1.
    randomState : int or None, optional
        Seed forwarded to every tree so that repeated runs give the same
        curves. Pass ``None`` to get scikit-learn's unseeded behaviour.

    Returns
    -------
    trainingErrors : list of float
        Training error per depth, as fractions (not percent).
    testErrors : list of float
        Test error per depth, as fractions (not percent).
    minError : float
        Lowest test error expressed in percent, rounded to one decimal.
    minErrorDepth : int
        Shallowest depth that reaches ``minError``.

    Raises
    ------
    ValueError
        If ``maxDepth`` is smaller than 1, because no tree would be trained
        and there is no meaningful minimum to report.
    """
    if maxDepth < 1:
        raise ValueError("maxDepth must be at least 1, got {!r}".format(maxDepth))

    trainingErrors = []
    testErrors = []
    minError = None
    minErrorDepth = 0
    for depth in range(1, maxDepth + 1):
        clf = DecisionTreeClassifier(max_depth=depth, random_state=randomState)
        clf = clf.fit(trainingData, trainingLabels)
        trainingErrors.append(1 - clf.score(trainingData, trainingLabels))
        testError = 1 - clf.score(testData, testLabels)
        testErrors.append(testError)

        # Compare at 0.1 % resolution; the strict '<' keeps the shallowest
        # depth when several depths tie at that resolution.
        roundedError = round(testError, 3)
        if minError is None or roundedError < minError:
            minError = roundedError
            minErrorDepth = depth

    # Round again after scaling: 0.224 * 100 is 22.400000000000002 in floating
    # point, which would otherwise end up verbatim in the summary table.
    return trainingErrors, testErrors, round(minError * 100, 1), minErrorDepth

In [None]:
def graphResults(trainErrors, testErrors, depth):
    """Plot train and test misclassification error (in percent) against tree depth.

    Parameters
    ----------
    trainErrors, testErrors
        Per-depth error fractions; each is multiplied by 100 before plotting.
    depth : int
        Largest depth on the x axis. The axis runs over ``1..depth``, so both
        error sequences are expected to hold ``depth`` values.

    Returns
    -------
    None
        The figure is drawn on the current pyplot figure and shown.
    """
    trainPercent, testPercent = (
        np.array(fractions) * 100 for fractions in (trainErrors, testErrors)
    )
    depths = np.arange(1, depth + 1)

    # (y values, line colour, legend entry) for each curve, in drawing order.
    curves = (
        (trainPercent, 'r', 'Train'),
        (testPercent, 'b', 'Test'),
    )
    for percentages, colour, legendName in curves:
        plt.plot(depths, percentages, linestyle='-', marker='o', color=colour, label=legendName)

    plt.grid(True)
    plt.xticks(depths)  # one tick per evaluated depth
    plt.title('Misclassification Error vs Decision Tree Depth')
    plt.xlabel('Decision Tree Depths')
    plt.ylabel('Misclassification Error (%)')
    plt.legend()
    plt.show()

In [None]:
# MADELON: whitespace-separated files without a header row. The train split is
# used for fitting and the "valid" split serves as the test set.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True and parses the files the same way.
trainingDataMadelon = pd.read_csv('MADELON/madelon_train.data', sep=r'\s+', header=None)
trainingLabelsMadelon = pd.read_csv('MADELON/madelon_train.labels', sep=r'\s+', header=None)
testDataMadelon = pd.read_csv('MADELON/madelon_valid.data', sep=r'\s+', header=None)
testLabelsMadelon = pd.read_csv('MADELON/madelon_valid.labels', sep=r'\s+', header=None)

In [None]:
# wilt: comma-separated files (read_csv's default separator) without a header
# row. The four files are read in the order train data, train labels,
# test data, test labels.
trainingDataWilt, trainingLabelsWilt, testDataWilt, testLabelsWilt = (
    pd.read_csv(csvPath, header=None)
    for csvPath in (
        'wilt/wilt_train.csv',
        'wilt/wilt_train.labels',
        'wilt/wilt_test.csv',
        'wilt/wilt_test.labels',
    )
)

In [None]:
# Gisette: whitespace-separated files without a header row. The train split is
# used for fitting and the "valid" split serves as the test set.
# Labels are read from the *.labels files, mirroring the MADELON cell; reading
# them from *.data would hand the feature matrix to the classifier as targets.
# sep=r'\s+' is the documented replacement for the deprecated
# delim_whitespace=True.
trainingDataGisette = pd.read_csv('Gisette/gisette_train.data', sep=r'\s+', header=None)
trainingLabelsGisette = pd.read_csv('Gisette/gisette_train.labels', sep=r'\s+', header=None)
testDataGisette = pd.read_csv('Gisette/gisette_valid.data', sep=r'\s+', header=None)
testLabelsGisette = pd.read_csv('Gisette/gisette_valid.labels', sep=r'\s+', header=None)

In [None]:
# Summary table, one row per dataset, filled in by later cells.
# "Min_Error (%)" starts as float because it later receives fractional
# percentages; an int column would have to be upcast on assignment, which
# recent pandas versions warn about. "Depth" stays integer.
errorTable = pd.DataFrame(
    {"Min_Error (%)": [0.0, 0.0, 0.0], "Depth": [0, 0, 0]},
    index=["MADELON", "wilt", "Gisette"],
)

In [None]:
# Depth sweep on MADELON: one tree per depth from 1 to 12.
(
    trainingErrorsMadelon,
    testErrorsMadelon,
    minErrorMadelon,
    errorDepthMadelon,
) = calculateError(
    trainingData=trainingDataMadelon,
    trainingLabels=trainingLabelsMadelon,
    testData=testDataMadelon,
    testLabels=testLabelsMadelon,
    maxDepth=12,
)

In [None]:
# Store MADELON's lowest test error (already in percent) and the depth that
# reached it in the summary table. This writes into errorTable in place.
errorTable.loc['MADELON', 'Min_Error (%)'] = minErrorMadelon
errorTable.loc['MADELON', 'Depth'] = errorDepthMadelon

In [None]:
# Error curves for MADELON; depth must equal the maxDepth used for the sweep
# so the x axis has one point per recorded error.
graphResults(
    trainErrors=trainingErrorsMadelon,
    testErrors=testErrorsMadelon,
    depth=12,
)

In [None]:
# Depth sweep on wilt: one tree per depth from 1 to 10.
(
    trainingErrorsWilt,
    testErrorsWilt,
    minErrorWilt,
    errorDepthWilt,
) = calculateError(
    trainingData=trainingDataWilt,
    trainingLabels=trainingLabelsWilt,
    testData=testDataWilt,
    testLabels=testLabelsWilt,
    maxDepth=10,
)

In [None]:
# Store wilt's lowest test error (already in percent) and the depth that
# reached it in the summary table. This writes into errorTable in place.
errorTable.loc['wilt', 'Min_Error (%)'] = minErrorWilt
errorTable.loc['wilt', 'Depth'] = errorDepthWilt

In [None]:
# Error curves for wilt; depth must equal the maxDepth used for the sweep
# so the x axis has one point per recorded error.
graphResults(
    trainErrors=trainingErrorsWilt,
    testErrors=testErrorsWilt,
    depth=10,
)

In [None]:
# Depth sweep on Gisette: one tree per depth from 1 to 6.
(
    trainingErrorsGisette,
    testErrorsGisette,
    minErrorGisette,
    errorDepthGisette,
) = calculateError(
    trainingData=trainingDataGisette,
    trainingLabels=trainingLabelsGisette,
    testData=testDataGisette,
    testLabels=testLabelsGisette,
    maxDepth=6,
)

In [None]:
# Store Gisette's lowest test error (already in percent) and the depth that
# reached it in the summary table. This writes into errorTable in place.
errorTable.loc['Gisette', 'Min_Error (%)'] = minErrorGisette
errorTable.loc['Gisette', 'Depth'] = errorDepthGisette

In [None]:
# Error curves for Gisette; depth must equal the maxDepth used for the sweep
# so the x axis has one point per recorded error.
graphResults(
    trainErrors=trainingErrorsGisette,
    testErrors=testErrorsGisette,
    depth=6,
)

In [None]:
# Display the summary table: lowest test error (%) and its tree depth per dataset.
errorTable