In [1]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, confusion_matrix
import matplotlib.pylab as plt

In [2]:
# Load the workbook and index the rows by the 'Date' column.
Data = pd.read_excel('Data.xlsx').set_index('Date')

# Separate the features (every column except 'Class') from the 'Class' target.
X = Data.drop(columns='Class')
Y = Data['Class']

### Parameters

In [3]:
# step: increment between successive training-subsample sizes in the size grids.
step = 10
# ts_prop / va_prop: complements of the train_size fractions used for the
# test split and the validation split, respectively.
ts_prop, va_prop = 0.1, 0.2

https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html

https://www.oreilly.com/library/view/machine-learning-with/9781789343700/e3b1b1fd-ddff-435e-aff8-44eb01608b91.xhtml

In [4]:
# Candidate values per hyper-parameter; the grid is their Cartesian product,
# stored as a list of (max_depth, min_samples_leaf) tuples.
Hpyer_Parameters = [
    list(range(1, 5)),   # max_depth: 1..4
    list(range(1, 10)),  # min_samples_leaf: 1..9
]
set_parameters = list(it.product(*Hpyer_Parameters))

# k is the exponent in the (2**k - 1) term of the n_opt sample-size bound.
k = 5

# Epsilon and delta feed both sample-size bounds (n_est and n_opt);
# presumably the error tolerance and failure probability - TODO confirm.
Epsilon = 0.15
delta = 0.15

### $\eta$ mínimo estimado

##### Model exploration

In [5]:
# Two-stage hold-out: split off the test set first, then split the remainder
# into training and validation sets. Both calls use random_state=19.
X1, X1_ts, Y1, Y1_ts = train_test_split(
    X, Y,
    train_size=1 - ts_prop,
    random_state=19,
)
X1_tr, X1_va, Y1_tr, Y1_va = train_test_split(
    X1, Y1,
    train_size=1 - va_prop,
    random_state=19,
)

# Estimated minimum sample size, computed from the number of grid
# configurations, Epsilon and delta, and rounded up to an integer.
n_est = int(np.ceil(
    (1 / Epsilon) * (np.log(len(set_parameters)) + np.log(1 / delta))
))

# Training-subsample sizes to explore: n_est, n_est + step, ... up to (but not
# including) round((1 - va_prop) * number of rows in X1).
N = range(n_est, round(X1.shape[0] * (1 - va_prop)), step)

In [6]:
# Result container for the n_est experiment:
#   'Model'    - one slot per hyper-parameter configuration (filled by the grid loop)
#   'Acc_Test' - accuracy matrix, rows = sizes in N, columns = configurations
#   'Acc_Val'  - accuracy matrix with the same layout
n_est_mod_results = {
    'Model': [None] * len(set_parameters),
    'Acc_Test': np.zeros(shape=(len(N), len(set_parameters))),
    'Acc_Val': np.zeros(shape=(len(N), len(set_parameters))),
}

In [7]:
# Grid search for the n_est experiment: for every (max_depth, min_samples_leaf)
# configuration, fit a tree on resampled training subsets of each size in N and
# record its accuracy on a resampled validation subset.

# Size of the validation resample (30/70 of n_est); constant across iterations.
n_va_sample = int(30*n_est/70)

for j, (depth, leaf) in enumerate(set_parameters):
    dtc = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaf, random_state=19)
    for i, n_tr_sample in enumerate(N):
        # Re-seed on every iteration so each (configuration, size) pair draws
        # from the same random index stream.
        np.random.seed(seed=19)
        # Row positions drawn with replacement from the training split.
        ind1 = np.random.randint(0, X1_tr.shape[0], n_tr_sample)
        # Index X and Y positionally. Y1_tr carries the 'Date' index, so the
        # former Y1_tr[ind1] depended on pandas' deprecated fallback that
        # treats integer keys as positions on a non-integer index.
        dtc.fit(X1_tr.iloc[ind1], Y1_tr.iloc[ind1])
        ind2 = np.random.randint(0, X1_va.shape[0], n_va_sample)
        y_pred = dtc.predict(X1_va.iloc[ind2])
        n_est_mod_results['Acc_Val'][i, j] = accuracy_score(Y1_va.iloc[ind2], y_pred)
    # The stored estimator is the one left by the last fit of the inner loop.
    n_est_mod_results['Model'][j] = dtc

# Column (configuration index) of the first entry, in row-major order, that
# attains the maximum validation accuracy.
ind_bst = np.where(n_est_mod_results['Acc_Val'] == np.amax(n_est_mod_results['Acc_Val']))[1][0]
n_est_mod_results['Best'] = set_parameters[ind_bst]

### $\eta$ mínimo óptimo

In [8]:
# Two-stage hold-out for the n_opt experiment (test split, then validation
# split), using the same proportions and random_state=19 as the X1 splits.
X2, X2_ts, Y2, Y2_ts = train_test_split(X, Y, train_size=1 - ts_prop, random_state=19)
X2_tr, X2_va, Y2_tr, Y2_va = train_test_split(X2, Y2, train_size=1 - va_prop, random_state=19)

# Sample-size bound computed from k, the number of feature columns, Epsilon
# and delta. Rounded up (like n_est) because it is a minimum sample size.
n_opt = int(np.ceil(
    (np.log(2)/(2*Epsilon**2))*((2**k - 1)*(1 + np.log2(X.shape[1])) + 1 + np.log(delta**(-1)))
))

# Training-subsample sizes for this experiment start at n_opt. The grid was
# previously built from n_est, which left n_opt unused and made N_opt an exact
# copy of N.
N_opt = range(n_opt, round(X2.shape[0]*(1 - va_prop)), step)
if len(N_opt) == 0:
    # Fail here with a clear message instead of later on an empty accuracy matrix.
    raise ValueError(
        "n_opt = {} is not below the available training size {}; "
        "N_opt is empty. Relax Epsilon/delta, reduce k, or use more data.".format(
            n_opt, round(X2.shape[0]*(1 - va_prop))
        )
    )

In [9]:
# Result container for the n_opt experiment:
#   'Model'    - one slot per hyper-parameter configuration (filled by the grid loop)
#   'Acc_Test' - accuracy matrix, rows = sizes in N_opt, columns = configurations
#   'Acc_Val'  - accuracy matrix with the same layout
n_opt_mod_results = {
    'Model': [None] * len(set_parameters),
    'Acc_Test': np.zeros(shape=(len(N_opt), len(set_parameters))),
    'Acc_Val': np.zeros(shape=(len(N_opt), len(set_parameters))),
}

In [10]:
# Grid search for the n_opt experiment: for every (max_depth, min_samples_leaf)
# configuration, fit a tree on resampled training subsets of each size in N_opt
# and record its accuracy on a resampled validation subset.

# Size of the validation resample; constant across iterations.
# NOTE(review): still derived from n_est, as in the first experiment - confirm
# whether n_opt was intended here.
n_va_sample = int(30*n_est/70)

for j, (depth, leaf) in enumerate(set_parameters):
    dtc = DecisionTreeClassifier(max_depth=depth, min_samples_leaf=leaf, random_state=19)
    # Take the sizes from N_opt itself. The loop used to run over
    # range(len(N_opt)) while reading N[i] from the first experiment's grid.
    for i, n_tr_sample in enumerate(N_opt):
        # Re-seed on every iteration so each (configuration, size) pair draws
        # from the same random index stream.
        np.random.seed(seed=19)
        # Use this experiment's own splits (X2_*/Y2_*) rather than the X1_*/Y1_*
        # splits of the first experiment.
        ind1 = np.random.randint(0, X2_tr.shape[0], n_tr_sample)
        # Index X and Y positionally. The target Series carries the 'Date'
        # index, so plain [] with integer keys depended on pandas' deprecated
        # positional fallback.
        dtc.fit(X2_tr.iloc[ind1], Y2_tr.iloc[ind1])
        ind2 = np.random.randint(0, X2_va.shape[0], n_va_sample)
        y_pred = dtc.predict(X2_va.iloc[ind2])
        n_opt_mod_results['Acc_Val'][i, j] = accuracy_score(Y2_va.iloc[ind2], y_pred)
    # The stored estimator is the one left by the last fit of the inner loop.
    n_opt_mod_results['Model'][j] = dtc

# Column (configuration index) of the first entry, in row-major order, that
# attains the maximum validation accuracy.
ind_bst = np.where(n_opt_mod_results['Acc_Val'] == np.amax(n_opt_mod_results['Acc_Val']))[1][0]
n_opt_mod_results['Best'] = set_parameters[ind_bst]