In [2]:
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix

iris = pd.read_csv('https://gist.githubusercontent.com/curran/a08a1080b88344b0c8a7/raw/0e7a9b0a5d22642a06d3d5b9bcbad9890c8ee534/iris.csv')
iris.head()

Unnamed: 0,sepal_length,sepal_width,petal_length,petal_width,species
0,5.1,3.5,1.4,0.2,setosa
1,4.9,3.0,1.4,0.2,setosa
2,4.7,3.2,1.3,0.2,setosa
3,4.6,3.1,1.5,0.2,setosa
4,5.0,3.6,1.4,0.2,setosa


In [3]:
X = iris.drop('species', axis=1)
y = iris['species']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)

In [4]:
model = DecisionTreeClassifier()
model.fit(X_train, y_train)
preds = model.predict(X_test)
print(f'Accuracy = {round(accuracy_score(y_test, preds), 2)}')
print()
print(confusion_matrix(y_test, preds))

Accuracy = 0.84

[[12  0  0]
 [ 0 11  0]
 [ 0  6  9]]


In [5]:
# 3 sets of hyperparameters
params_1 = {'criterion': 'gini', 'splitter': 'best', 'max_depth': 10}
params_2 = {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 1000}
params_3 = {'criterion': 'gini', 'splitter': 'random', 'max_depth': 100}
# 3 separate models
model_1 = DecisionTreeClassifier(**params_1)
model_2 = DecisionTreeClassifier(**params_2)
model_3 = DecisionTreeClassifier(**params_3)
model_1.fit(X_train, y_train)
model_2.fit(X_train, y_train)
model_3.fit(X_train, y_train)
# 3 separate prediction sets
preds_1 = model_1.predict(X_test)
preds_2 = model_3.predict(X_test)
preds_3 = model_2.predict(X_test)
print(f'Accuracy on Model 1 = {round(accuracy_score(y_test, preds_1), 5)}')
print(f'Accuracy on Model 2 = {round(accuracy_score(y_test, preds_2), 5)}')
print(f'Accuracy on Model 3 = {round(accuracy_score(y_test, preds_3), 5)}')


Accuracy on Model 1 = 0.84211
Accuracy on Model 2 = 0.86842
Accuracy on Model 3 = 0.81579


In [6]:
# Define parameter possibilities as lists
p_criterion = ['gini', 'entropy']
p_splitter = ['best', 'random']

p_max_depth = [1, 10, 100, 1000]
# The scores will go here
results = []

# Nested loops - we need to test for all combinations
for criterion in p_criterion:
    for splitter in p_splitter:
        for max_depth in p_max_depth:
            # Train the model
            model = DecisionTreeClassifier(
                criterion=criterion,
                splitter=splitter,
                max_depth=max_depth
            )
            model.fit(X_train, y_train)
            preds = model.predict(X_test)
            # Append current results
            results.append({
                'Accuracy': round(accuracy_score(y_test, preds), 5),
                'P_Criterion': criterion,
                'P_Splitter': splitter,
                'P_MaxDepth': max_depth
            })
            
# Convert to Pandas DataFrame and sort descendingly by accuracy
results = pd.DataFrame(results)
results = results.sort_values(by='Accuracy', ascending=False)
results


Unnamed: 0,Accuracy,P_Criterion,P_Splitter,P_MaxDepth
1,0.89474,gini,best,10
3,0.89474,gini,best,1000
5,0.89474,gini,random,10
7,0.89474,gini,random,1000
11,0.89474,entropy,best,1000
13,0.89474,entropy,random,10
6,0.86842,gini,random,100
14,0.86842,entropy,random,100
15,0.86842,entropy,random,1000
2,0.84211,gini,best,100


In [7]:
model = DecisionTreeClassifier()
params = {
    'criterion': ['gini', 'entropy'],
    'splitter': ['best', 'random'],
    'max_depth': [1, 10, 100, 1000]
}

clf = GridSearchCV(
    estimator=model,
    param_grid=params,
    cv=10, # 10-fold cross validation
    n_jobs=-1 # run in parallel
)

clf.fit(X_train, y_train)

GridSearchCV(cv=10, estimator=DecisionTreeClassifier(), n_jobs=-1,
             param_grid={'criterion': ['gini', 'entropy'],
                         'max_depth': [1, 10, 100, 1000],
                         'splitter': ['best', 'random']})

In [8]:
cv_results = pd.DataFrame(clf.cv_results_)
cv_results.head()

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_criterion,param_max_depth,param_splitter,params,split0_test_score,split1_test_score,...,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
0,0.010502,0.001687,0.006398,0.000801,gini,1,best,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.666667,0.666667,...,0.727273,0.727273,0.727273,0.727273,0.636364,0.636364,0.636364,0.687879,0.040769,13
1,0.011498,0.003414,0.006903,0.00164,gini,1,random,"{'criterion': 'gini', 'max_depth': 1, 'splitte...",0.666667,0.666667,...,0.727273,0.727273,0.636364,0.636364,0.727273,0.636364,0.636364,0.669697,0.039394,15
2,0.010701,0.003434,0.007299,0.002935,gini,10,best,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",1.0,0.833333,...,0.818182,1.0,1.0,0.909091,0.909091,1.0,1.0,0.94697,0.070173,12
3,0.010601,0.004078,0.0077,0.004585,gini,10,random,"{'criterion': 'gini', 'max_depth': 10, 'splitt...",1.0,0.916667,...,1.0,1.0,1.0,0.909091,0.818182,1.0,1.0,0.964394,0.059656,3
4,0.008299,0.000458,0.0056,0.000663,gini,100,best,"{'criterion': 'gini', 'max_depth': 100, 'split...",1.0,0.833333,...,0.818182,1.0,1.0,0.909091,1.0,1.0,1.0,0.956061,0.070564,6


In [9]:
cv_results = cv_results[['mean_test_score', 'param_criterion', 'param_splitter', 'param_max_depth']]
cv_results.sort_values(by='mean_test_score', ascending=False)


Unnamed: 0,mean_test_score,param_criterion,param_splitter,param_max_depth
11,0.965909,entropy,random,10
15,0.965909,entropy,random,1000
3,0.964394,gini,random,10
7,0.956818,gini,random,1000
12,0.956818,entropy,best,100
4,0.956061,gini,best,100
5,0.956061,gini,random,100
6,0.956061,gini,best,1000
10,0.956061,entropy,best,10
14,0.956061,entropy,best,1000


In [10]:
clf.best_params_

{'criterion': 'entropy', 'max_depth': 10, 'splitter': 'random'}