In [1]:
"""
University of Liege
ELEN0062 - Introduction to machine learning
Project 1 - Classification algorithms
"""
#! /usr/bin/env python
# -*- coding: utf-8 -*-

import numpy as np
from matplotlib import pyplot as plt

from data import make_dataset1, make_dataset2
from plot import plot_boundary
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

In [2]:
dataset = make_dataset2
datasets = [make_dataset1, make_dataset2]
number_of_samples = 1500
training_sets = 1200
max_depth = [1, 2, 4, 8, None]
number_generations = 5

In [3]:
# 1.1 Plots to see how decision boudary is affected by complexity
X, y = dataset(number_of_samples, random_state = 0)
X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = training_sets,shuffle = False)

for j in range(len(max_depth)):
    # Decision Tree classifier
    dtc = DecisionTreeClassifier(max_depth = max_depth[j])
    dtc.fit(X_train, y_train)
    # printing the Plot
    plot_boundary( "plot\make_dataset2" + "_max_depth" + str(max_depth[j]),dtc, X_test[0:training_sets],y_test[0:training_sets],title = "max_depth : " + str(max_depth[j]))

In [4]:
# 1.2 
print("make_data", "max_depth", "mean", "std")
for i in range(len(max_depth)):
    accr = np.empty(number_generations)
    for k in range(number_generations):
        # Data set
        X, y = dataset(number_of_samples, random_state = k)
        X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = training_sets,shuffle = False)

        dtc = DecisionTreeClassifier(max_depth = max_depth[i])
        dtc.fit(X_train, y_train)
        accr[k] = dtc.score(X_test, y_test)
    print(max_depth[i],"{:.5f}".format(np.mean(accr)), "{:.5f}".format(np.std(accr)))

max_depth mean std
1 0.67533 0.03856
2 0.81600 0.02551
4 0.83400 0.00998
8 0.80600 0.02719
None 0.77267 0.01104


In [9]:
#tuning the hyperparameter of max_depth:
print("dataset","max_depth", "mean", "std")
for i in range(len(datasets)):
    for j in range(len(max_depth)):
        accr = np.empty(number_generations)
        for k in range(number_generations):
            # Data set
            X, y = datasets[i](number_of_samples, random_state = k)
            X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = training_sets,shuffle = False)
            dtc = DecisionTreeClassifier(max_depth = max_depth[j])
            dtc.fit(X_train, y_train)
            accr[k] = dtc.score(X_test, y_test)
        print(str(i+1),max_depth[j],"{:.5f}".format(np.mean(accr)), "{:.5f}".format(np.std(accr)))

dataset max_depth mean std
1 1 0.69000 0.00000
1 2 0.76333 0.00000
1 4 0.85000 0.00000
1 8 0.89333 0.00596
1 None 0.88000 0.00471
2 1 0.73667 0.00000
2 2 0.83333 0.00000
2 4 0.84000 0.00000
2 8 0.83000 0.00298
2 None 0.78933 0.00772


In [32]:
for i in range(len(datasets)):
    for k in range(number_generations):
        X, y = datasets[i](number_of_samples, random_state = k)
        X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = training_sets,shuffle = False)
        grid_params = { 'max_depth' : [1, 2, 4, 8, None]}
        gs = GridSearchCV(DecisionTreeClassifier(), grid_params, cv=10)
        g_res = gs.fit(X_train, y_train)
        print("k=", k,"n= ",i)
        print("Best parameter: max_depth =", g_res.best_params_["max_depth"])
        print("Best parameter accuracy:", g_res.best_score_)

k= 0 n=  0
Best parameter: max_depth = 8
Best parameter accuracy: 0.9
k= 1 n=  0
Best parameter: max_depth = 8
Best parameter accuracy: 0.8658333333333333
k= 2 n=  0
Best parameter: max_depth = 4
Best parameter accuracy: 0.8708333333333333
k= 3 n=  0
Best parameter: max_depth = 8
Best parameter accuracy: 0.8674999999999999
k= 4 n=  0
Best parameter: max_depth = 8
Best parameter accuracy: 0.8841666666666667
k= 0 n=  1
Best parameter: max_depth = 4
Best parameter accuracy: 0.8300000000000001
k= 1 n=  1
Best parameter: max_depth = 4
Best parameter accuracy: 0.8283333333333334
k= 2 n=  1
Best parameter: max_depth = 8
Best parameter accuracy: 0.8441666666666666
k= 3 n=  1
Best parameter: max_depth = 4
Best parameter accuracy: 0.8241666666666667
k= 4 n=  1
Best parameter: max_depth = 2
Best parameter accuracy: 0.8166666666666667


In [33]:
from sklearn.metrics import accuracy_score
score=[[],[]]
max_depth=[8,4]
for i in range(len(datasets)):
    
    for k in range(number_generations):
        X, y = datasets[i](number_of_samples, random_state = k)
        X_train, X_test, y_train, y_test = train_test_split(X, y,train_size = training_sets,shuffle = False)
        dtc = DecisionTreeClassifier(max_depth = max_depth[i])
        dtc.fit(X_train,y_train)
        y_pred = dtc.predict(X_test)
        accuracy= accuracy_score(y_pred, y_test)
        score[i].append(accuracy)
print("__________________________________________________________")
print(score[0])
print(score[1])
print("__________________________________________________________")
print("dataset", "average accuracy", "average accuracy standard deviation")
print("1","{:.4f}".format(np.mean(score[0])),"{:.4f}".format(np.std(score[0])))
print("2","{:.4f}".format(np.mean(score[1])),"{:.4f}".format(np.std(score[0])))

__________________________________________________________
[0.8866666666666667, 0.8766666666666667, 0.8833333333333333, 0.8466666666666667, 0.8433333333333334]
[0.84, 0.8333333333333334, 0.8466666666666667, 0.8333333333333334, 0.8166666666666667]
__________________________________________________________
dataset average accuracy average accuracy standard deviation
1 0.8673 0.0185
2 0.8340 0.0185
