In [1]:
%config InlineBackend.figure_format = 'retina'
import scipy.io as sio
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
from sklearn import tree
from sklearn import datasets
from sklearn.metrics import accuracy_score
from sklearn.model_selection import GridSearchCV

In [2]:
# Ionosphere dataset: the last column holds the label, every other column is a feature.
X_and_Y = np.load('ionosphere.npy').astype(np.float32)  # Read the array and cast it to float32.

# Seed NumPy's global RNG before shuffling so the row order is reproducible.
np.random.seed(1)
np.random.shuffle(X_and_Y)      # In-place shuffle along the first axis (rows).

# Basic slicing returns views, so X and Y share memory with X_and_Y.
X, Y = X_and_Y[:, :-1], X_and_Y[:, -1]

# Relabel 0 -> -1 (labels are expected to be in {0, 1}, giving {-1, 1}).
# Because Y is a view, the last column of X_and_Y is rewritten as well.
Y[Y == 0] = -1

# Expected output per the original notes: (351, 34), then (351,), then the first shuffled row.
print(X.shape, Y.shape, X_and_Y[0], sep='\n')

FileNotFoundError: [Errno 2] No such file or directory: 'ionosphere.npy'

In [None]:
# Divide the data points into training set and test set.
# The split is a plain head/tail slice: the first N_TRAIN rows train the model,
# the remaining rows are held out for testing.
N_TRAIN = 200                       # Number of rows used for training.

X_shuffled = X
Y_shuffled = Y

# Fail early if the slice would leave nothing to test on; an empty test set
# would otherwise only surface later as a division by zero.
assert len(X_shuffled) > N_TRAIN, "need more than N_TRAIN rows to leave a non-empty test set"

X_train = X_shuffled[:N_TRAIN]      # Expected shape: (200, 34)
Y_train = Y_shuffled[:N_TRAIN]      # Expected shape: (200,)
X_test = X_shuffled[N_TRAIN:]       # Expected shape: (151, 34)
Y_test = Y_shuffled[N_TRAIN:]       # Expected shape: (151,)
print(X_train.shape)
print(Y_train.shape)
print(X_test.shape)
print(Y_test.shape)

### Decision Tree Using Scikit-Learn

In [None]:
# Grid search over the tree's maximum depth, evaluated with cross-validation.

# 1. Candidate depths and the parameter grid built from them.
D_list = [1, 2, 3, 4, 5]
param_grid = dict(max_depth=D_list)

# 2. Base model: a decision tree that splits on entropy, with a fixed random_state.
estimator = tree.DecisionTreeClassifier(criterion="entropy", random_state=1)

# 3. Grid searcher that scores every candidate depth with cv=10.
grid_search = GridSearchCV(estimator, param_grid, cv=10)

# 4. Run the search on the training set only; the test set stays untouched.
grid_search.fit(X_train, Y_train)

In [None]:
# Draw heatmaps for result of grid search.
def draw_heatmap(errors, D_list, title):
    """Show an annotated heatmap of error values, one row per candidate depth.

    Parameters
    ----------
    errors : 2-D array-like
        Values to plot; each cell is annotated with three decimals.
    D_list : sequence
        Used as the y tick labels (one label per row of ``errors``).
    title : str
        Title placed above the heatmap.
    """
    plt.figure(figsize=(2, 4))
    ax = sns.heatmap(errors, annot=True, fmt='.3f', yticklabels=D_list, xticklabels=[])
    ax.collections[0].colorbar.set_label('error')
    ax.set(ylabel='max depth D')
    ax.set_title(title)
    # Pin the y-limits to the full extent of the grid (first row on top).
    # The former "get_ylim() +/- 0.5" adjustment only compensated for the
    # Matplotlib 3.1.1 regression that cropped the outer rows; on versions
    # without that bug it pads the plot with half a blank row on each side.
    # Explicit limits are correct in both cases.
    ax.set_ylim(len(errors), 0)
    plt.show()

# Turn the mean cross-validation score of each candidate depth into an error
# (1 - score) and shape it as a single column so the heatmap shows one row per depth.
mean_cv_scores = grid_search.cv_results_['mean_test_score']
cross_val_errors = (1 - mean_cv_scores).reshape(-1, 1)
draw_heatmap(cross_val_errors, D_list, title='cross-validation error w.r.t D')

In [3]:
# Report the depth selected by the grid search.
best_max_depth = grid_search.best_params_['max_depth']
print("Best max depth D: {}".format(best_max_depth))

# Test error: the fraction of held-out points that the best estimator misclassifies.
test_predictions = grid_search.best_estimator_.predict(X_test)
n_correct = np.count_nonzero(test_predictions == Y_test)
test_error = 1 - n_correct / len(X_test)
print("Test error: {}".format(test_error))

NameError: name 'grid_search' is not defined