In [10]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.model_selection import train_test_split
from Orange.classification import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

## Carga de Datos

In [12]:
# Read the spreadsheet and use its 'Date' column as the row index.
DataIn = pd.read_excel('Data.xlsx').set_index('Date')
# Keep only the rows that have no missing value in any column.
Data = DataIn.dropna(axis='index', how='any')
# Features are every column except 'Class'; 'Class' is the label column.
X = Data.drop('Class', axis='columns')
Y = Data['Class']

# Definición de funciones para la optimización de hiperparámetros

## Parametrización 

In [13]:
# Hyper-parameter grid: the first list holds tree depths, the second holds leaf counts.
Hpyer_Parameters = [list(range(1, 10)), [4, 5, 6]]
# Every (depth, leaves) combination of the grid, as a list of tuples.
set_parameters = list(it.product(*Hpyer_Parameters))
# Epsilon and delta feed the two sample-size formulas defined further down.
Epsilon, delta = 0.2, 0.1
# Passed as `features` to the sample-size formula.
# NOTE(review): presumably the number of columns in X -- confirm against the data.
Features = 67

*Calcula el m para validación y pruebas, basándose en el n mínimo óptimo/estimado.*

In [14]:
def getting_data_for_validation_and_test(n_training_data):
    """Return the hold-out size that goes with a training size.

    The training size is treated as 60% of a sample and the hold-out
    (validation plus test) as the remaining 40%, i.e. the result is
    ``int((0.4 * n_training_data) / 0.6)``, truncated toward zero.

    Parameters
    ----------
    n_training_data : number of rows used for training.

    Returns
    -------
    int
        Number of rows to reserve for validation and test together.
    """
    holdout_share = 0.4
    training_share = 0.6
    holdout_rows = (holdout_share * n_training_data) / training_share
    return int(holdout_rows)

## División de datos en: Entrenamiento, Pruebas, Validación

Para cada clase de hipótesis, con su n estimado/óptimo, dividiremos la muestra de la siguiente manera:

- n: Número óptimo de elementos para que el modelo aprenda. Serán los datos que utilizaremos para entrenar el modelo.
- validation_data: Datos para la validación de la muestra.
- test_data: Datos para probar el modelo entrenado.

In [15]:
def split_data_for_model_training(X, Y, n, getting_data_for_validation_and_test):
    """Split (X, Y) into training, validation and test partitions.

    Two calls to ``train_test_split`` are made, both with ``random_state=19``:
    the first draws ``n`` training rows and a hold-out of
    ``getting_data_for_validation_and_test(n)`` rows; the second sends half of
    that hold-out (``train_size=0.5``) to validation and what is left to test.

    Parameters
    ----------
    X, Y : features and labels, forwarded unchanged to ``train_test_split``.
    n : number of training rows.
    getting_data_for_validation_and_test : callable mapping ``n`` to the
        hold-out size.

    Returns
    -------
    tuple
        ``(X_tr, Y_tr, X2_va, Y2_va, X2_ts, Y2_ts)``.
    """
    seed = 19
    holdout_size = getting_data_for_validation_and_test(n)

    # Stage 1: n rows for training, holdout_size rows set aside.
    X_tr, X_hold, Y_tr, Y_hold = train_test_split(
        X, Y, train_size=n, test_size=holdout_size, random_state=seed)

    # Stage 2: cut the hold-out in two -- validation first, test second.
    X2_va, X2_ts, Y2_va, Y2_ts = train_test_split(
        X_hold, Y_hold, train_size=0.5, random_state=seed)

    return X_tr, Y_tr, X2_va, Y2_va, X2_ts, Y2_ts

## Utilitario para construir un modelo con los parametros dados:

- X features
- Y Labels
- n muestra min optima o estimada
- max_depth hiperparámetro de Profundidad
- max_leaf_nodes hiperparámetro de hojas

In [16]:
def building_a_tree_model(split_data_for_model_training,
                          getting_data_for_validation_and_test,
                          X,
                          Y,
                          n,
                          max_depth, max_leaf_nodes):
    """Fit one decision tree and score it on the validation partition.

    Parameters
    ----------
    split_data_for_model_training : callable invoked as
        ``split(X, Y, n, getting_data_for_validation_and_test)`` and expected
        to return ``(X_tr, Y_tr, X_va, Y_va, X_ts, Y_ts)``.
    getting_data_for_validation_and_test : callable handed to the splitter.
    X, Y : features and labels.
    n : number of training rows requested from the splitter.
    max_depth, max_leaf_nodes : forwarded to ``DecisionTreeClassifier``.

    Returns
    -------
    tuple
        ``(clf, y_pred, a_score)``: the fitted classifier, its predictions on
        the validation features, and the accuracy of those predictions.
    """
    # The splitter also returns a test partition; it is not used here.
    X_train, Y_train, X_valid, Y_valid, _X_test, _Y_test = split_data_for_model_training(
        X, Y, n, getting_data_for_validation_and_test)

    tree_options = dict(random_state=19,
                        max_depth=max_depth,
                        max_leaf_nodes=max_leaf_nodes)
    clf = DecisionTreeClassifier(**tree_options)

    # Train on the training partition, evaluate on the validation partition.
    clf.fit(X_train, Y_train)
    y_pred = clf.predict(X_valid)
    return clf, y_pred, accuracy_score(Y_valid, y_pred)
    

## Cálculo del n mínimo óptimo para un árbol con:

- profundidad "k"
- error de experto "delta"
- error de entrenamiento "epsilon"
- y con un número de características igual a "features"

In [17]:
def optim_n_for_tree(k, delta, epsilon, features):
    """Sample size for a tree of depth ``k``, truncated to an int.

    Evaluates

        ln(2) / (2 * epsilon**2) * ((2**k - 1) * (1 + log2(features)) + 1 + ln(1 / delta))

    and truncates the result toward zero.

    Parameters
    ----------
    k : tree depth.
    delta, epsilon : the two tolerances that enter the formula above.
    features : number of features.

    Returns
    -------
    int
    """
    size_term = ((2 ** k) - 1) * (1 + np.log2(features))
    log_inv_delta = np.log(delta ** (-1))
    bracket = size_term + 1 + log_inv_delta
    scale = np.log(2) / (2 * epsilon ** 2)
    return int(scale * bracket)

# Modelos - N mínimo Optimo

Aquí encontramos el árbol con el score más alto y más bajo para todos los n óptimos de las clases de hipótesis.

In [18]:
def main_fun(data, epsilon, delta, features, X, Y, _set_parameters):
    """Train one tree per hyper-parameter pair, each on its own computed sample size.

    For every pair ``p`` the training size is
    ``optim_n_for_tree(p[0], delta, epsilon, features)``. A pair is evaluated
    only when that size plus the matching hold-out size
    (``getting_data_for_validation_and_test``) fits in ``X.shape[0]`` rows;
    otherwise that pair alone is skipped.

    Parameters
    ----------
    data : unused; kept so existing calls keep working.
    epsilon, delta, features : forwarded to ``optim_n_for_tree``.
    X, Y : features and labels; ``X`` must expose ``shape``.
    _set_parameters : iterable of pairs ``(max_depth, max_leaf_nodes)``.

    Returns
    -------
    list of tuple
        ``(clf, a_score)`` for every pair that fit, in input order.
    """
    results = []
    available_rows = X.shape[0]
    # TODO: add try/except around model building
    for p in _set_parameters:
        n = optim_n_for_tree(p[0], delta, epsilon, features)
        # Check that training + hold-out rows fit in the sample.
        if (n + getting_data_for_validation_and_test(n)) > available_rows:
            # Skip just this pair. Returning here would also discard every
            # later pair, including ones whose sample size does fit (which
            # happens whenever the grid is not ordered by increasing depth).
            continue
        clf, _y_pred, a_score = building_a_tree_model(split_data_for_model_training,
                                                      getting_data_for_validation_and_test,
                                                      X,
                                                      Y,
                                                      n,
                                                      p[0],
                                                      p[1])
        results.append((clf, a_score))
    return results

In [19]:
# One (classifier, validation accuracy) entry per hyper-parameter pair that fits the sample.
w = main_fun(data=Data,
             epsilon=Epsilon,
             delta=delta,
             features=Features,
             X=X,
             Y=Y,
             _set_parameters=set_parameters)



## Arbol con scoring mas alto

In [20]:
# Entry of w with the largest score (second tuple element); the first one wins on ties.
max(w, key=lambda entry: entry[1])

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=19, splitter='best'),
 0.5155763239875389)

## Arbol con scoring mas bajo

In [21]:
# Entry of w with the smallest score (second tuple element); the first one wins on ties.
min(w, key=lambda entry: entry[1])

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
             max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=19, splitter='best'),
 0.20689655172413793)

## Accuracy con arbol scoring alto: jugando con varios N

In [43]:
# Candidate sample sizes: 1, 51, 101, ... below the number of rows in X.
so = range(1, len(X), 50)

In [52]:
def show_me_the_money():
    """Print 1, 21, 41, ... (step 20) for every value below the row count of the global ``X``."""
    row_count = X.shape[0]
    value = 1
    while value < row_count:
        print(value)
        value += 20

## Cálculo del n mínimo estimado; aplica para todas las clases de hipótesis:

- error de experto "delta"
- error de entrenamiento "epsilon"
- conjunto de hiperparámetros "h_aprox"

In [22]:
def estim_n_for_tree(epsilon, h_aprox, delta):
    """Estimated sample size, rounded up to an int.

    Evaluates ``ceil((1 / epsilon) * (ln(h_aprox) + ln(1 / delta)))``.

    Parameters
    ----------
    epsilon, delta : the two tolerances that enter the formula above.
    h_aprox : positive number whose natural log enters the formula; the
        notebook passes the number of hyper-parameter combinations.

    Returns
    -------
    int
    """
    log_terms = np.log(h_aprox) + np.log(1 / delta)
    bound = (1 / epsilon) * log_terms
    return int(np.ceil(bound))

# Modelos - N mínimo estimado

Aquí encontramos el árbol con el *score* más alto y más bajo, utilizando el n mínimo estimado,
para todas las clases de hipótesis.

In [23]:
def main_fun_min_estimado(data, epsilon, delta, X, Y, n, _set_parameters):
    """Train one tree per hyper-parameter pair, all on the same training size ``n``.

    Parameters
    ----------
    data, epsilon, delta : unused in the body; kept in the signature.
    X, Y : features and labels; ``X`` must expose ``shape``.
    n : training size shared by every pair.
    _set_parameters : iterable of pairs ``(max_depth, max_leaf_nodes)``.

    Returns
    -------
    list of tuple
        ``(clf, a_score)`` per pair, in input order. The list is empty when
        ``n`` plus its hold-out size exceeds ``X.shape[0]``.
    """
    scored_models = []
    # TODO: add try/except around model building
    for pair in _set_parameters:
        # Check that training + hold-out rows fit in the sample. n does not
        # change inside the loop, so this either stops on the first pair or never.
        rows_needed = n + getting_data_for_validation_and_test(n)
        if rows_needed > X.shape[0]:
            break
        model, _predictions, score = building_a_tree_model(
            split_data_for_model_training,
            getting_data_for_validation_and_test,
            X,
            Y,
            n,
            pair[0],
            pair[1])
        scored_models.append((model, score))
    return scored_models

In [33]:
# Estimated training size, using the number of hyper-parameter pairs as h_aprox.
n_min_estimado = estim_n_for_tree(epsilon=Epsilon,
                                  h_aprox=len(set_parameters),
                                  delta=delta)
print("n min estimado:", n_min_estimado, sep="\n")
# One (classifier, validation accuracy) entry per pair, all trained on n_min_estimado rows.
w_s = main_fun_min_estimado(data=Data,
                            epsilon=Epsilon,
                            delta=delta,
                            X=X,
                            Y=Y,
                            n=n_min_estimado,
                            _set_parameters=set_parameters)

n min estimado:
28




## Arbol con scoring mas alto

In [34]:
# Entry of w_s with the largest score (second tuple element); the first one wins on ties.
max(w_s, key=lambda entry: entry[1])

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
             max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=19, splitter='best'),
 0.2222222222222222)

## Arbol con scoring mas bajo

In [35]:
# Entry of w_s with the smallest score (second tuple element); the first one wins on ties.
min(w_s, key=lambda entry: entry[1])

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=2,
             max_features=None, max_leaf_nodes=5, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=19, splitter='best'),
 0.1111111111111111)