In [1]:
# Import libraries

# First, let’s import all of the modules, functions and objects we are going to use in this tutorial.

# Pandas for data handling
import pandas # https://pandas.pydata.org/

# NumPy for numerical computing
import numpy as np # https://numpy.org/

# MatPlotLib for visualization
import matplotlib.pyplot as pl  # https://matplotlib.org/

# assessment
from sklearn import model_selection # for model comparisons
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import classification_report

# algorithms
from sklearn import tree
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Let's set up our standard example problem: 
# Can we predict 'callSign' using these three features:  'Depth', 'Temperature', 'Salinity' ?

# Load the data
print('Loading data from file ...')  
dataset = pandas.read_csv('floats.csv')
print('Removing rows with missing data ...')  
dataset = dataset.dropna()
print('Reading list of problem variables X and y...')
X_name = [ 'Depth', 'Temperature', 'Salinity' ] 
y_name = 'callSign'
X = dataset[X_name]   
y = dataset[y_name]   

# setting the seed allows for repeatability
seed = 42 

print('Partitioning data into parts: formative (for development) and summative (for testing) ...')
test_size = 0.20   # means 20 percent
X_train, X_test, y_train, y_test = model_selection.train_test_split(X, y, test_size=test_size, random_state=seed)

print('done \n')

Loading data from file ...
Removing rows with missing data ...
Reading list of problem variables X and y...
Partitioning data into parts: formative (for development) and summative (for testing) ...
done 



In [3]:
# Chose the formative scoring method
scoring = 'f1_micro'

In [4]:
# Chose the Algorithm and tune some hyperparameters

selected_model = DecisionTreeClassifier()
hyperparameters = {'max_depth':[4, 5, 6, 7], 'criterion':['gini', 'entropy'] }

print("Now tuning hyperparameters...")
clf = GridSearchCV(selected_model, hyperparameters, cv=5, scoring=scoring)
clf.fit(X_train, y_train)

print("Best hyperparameters found on development set:")
print(clf.best_params_)
print("Grid scores on development set:")
means = clf.cv_results_['mean_test_score']
stds = clf.cv_results_['std_test_score']
for mean, std, params in zip(means, stds, clf.cv_results_['params']):
    print("%0.3f (+/-%0.03f) for %r"
        % (mean, std * 2, params))
print('done \n')

tuned_model = clf.best_estimator_

Now tuning hyperparameters...
Best hyperparameters found on development set:
{'criterion': 'entropy', 'max_depth': 7}
Grid scores on development set:
0.371 (+/-0.021) for {'criterion': 'gini', 'max_depth': 4}
0.385 (+/-0.016) for {'criterion': 'gini', 'max_depth': 5}
0.410 (+/-0.013) for {'criterion': 'gini', 'max_depth': 6}
0.425 (+/-0.012) for {'criterion': 'gini', 'max_depth': 7}
0.364 (+/-0.021) for {'criterion': 'entropy', 'max_depth': 4}
0.392 (+/-0.015) for {'criterion': 'entropy', 'max_depth': 5}
0.409 (+/-0.019) for {'criterion': 'entropy', 'max_depth': 6}
0.425 (+/-0.012) for {'criterion': 'entropy', 'max_depth': 7}
done 



In [5]:
print("Now testing the tuned model on the separate test set...")
print("Detailed classification report:")
print('\n')
y_true, y_pred = y_test, tuned_model.predict(X_test)
print(classification_report(y_true, y_pred))
print('done \n')

print(f'Tuned decision tree has {tuned_model.tree_.node_count} nodes with maximum depth {tuned_model.tree_.max_depth}.')

Now testing the tuned model on the separate test set...
Detailed classification report:


              precision    recall  f1-score   support

    Q4901043       0.59      0.68      0.63       179
    Q4901044       0.52      0.93      0.66       573
    Q4901265       0.24      0.41      0.30       172
    Q4901266       0.37      0.08      0.13       195
    Q4901267       0.32      0.31      0.32       167
    Q4901268       0.22      0.20      0.21       132
    Q4901269       0.24      0.08      0.12       191
    Q4901270       0.22      0.06      0.10       190
    Q4901271       0.31      0.05      0.09        92
    Q4901272       0.23      0.03      0.05       103
    Q4901273       0.00      0.00      0.00        11

    accuracy                           0.42      2005
   macro avg       0.29      0.26      0.24      2005
weighted avg       0.37      0.42      0.35      2005

done 

Tuned decision tree has 249 nodes with maximum depth 7.


  _warn_prf(average, modifier, msg_start, len(result))


In [6]:
# Compare to the default, unrestricted tree

estimator = DecisionTreeClassifier()
estimator.fit(X_train, y_train)
print(f'Default Decision tree has {estimator.tree_.node_count} nodes with maximum depth {estimator.tree_.max_depth}.')
print("Detailed classification report:")
print('\n')
y_true, y_pred = y_test, estimator.predict(X_test)
print(classification_report(y_true, y_pred))
print('done \n')

Default Decision tree has 7563 nodes with maximum depth 30.
Detailed classification report:


              precision    recall  f1-score   support

    Q4901043       0.87      0.84      0.85       179
    Q4901044       0.87      0.85      0.86       573
    Q4901265       0.33      0.28      0.31       172
    Q4901266       0.21      0.23      0.22       195
    Q4901267       0.28      0.29      0.28       167
    Q4901268       0.27      0.31      0.29       132
    Q4901269       0.17      0.16      0.16       191
    Q4901270       0.25      0.26      0.26       190
    Q4901271       0.15      0.16      0.16        92
    Q4901272       0.19      0.22      0.21       103
    Q4901273       0.00      0.00      0.00        11

    accuracy                           0.47      2005
   macro avg       0.33      0.33      0.33      2005
weighted avg       0.47      0.47      0.47      2005

done 

