In [260]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.model_selection import train_test_split
from Orange.classification import tree
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix

## Carga de Datos

In [261]:
# Load the raw dataset and index it by its 'Date' column.
DataIn = pd.read_excel('Data.xlsx')
DataIn = DataIn.set_index('Date')

# Build the cleaned frame here, before it is split into labels and features:
# `Data` must exist in this cell for a top-to-bottom run on a fresh kernel.
Data = DataIn.dropna(axis=0, how='any')

# Labels are the 'Class' column; features are every remaining column.
Y = Data['Class']
X = Data.drop('Class', axis = 1)

In [262]:
# Keep only the rows of DataIn that contain no missing value in any column.
Data = DataIn.dropna(axis='index', how='any')

*Calcula el tamaño m para validación y pruebas, basándose en el n mínimo óptimo/estimado.*

In [268]:
def getting_data_for_validation_and_test(n_training_data):
    """Return the hold-out size that goes with a training set of the given size.

    The training rows are treated as 60% of the sample that is used and the
    held-out rows as the other 40%, so the result is
    ``0.4 * n_training_data / 0.6`` truncated to an integer.

    Parameters
    ----------
    n_training_data : int or float
        Number of rows used for training.

    Returns
    -------
    int
        Number of rows to hold out.
    """
    holdout_share = 0.4
    training_share = 0.6
    holdout_size = (holdout_share * n_training_data) / training_share
    return int(holdout_size)

## División de datos en: Entrenamiento, Pruebas, Validación

Para cada clase de hipótesis, con su n estimado/óptimo dividiremos la muestra de la siguiente manera:

- n: Número óptimo de elementos para que el modelo aprenda. Serán los datos que utilizaremos para entrenar el modelo.
- validation_data: Datos para la validación del modelo.
- test_data: Datos para probar el modelo entrenado.

In [263]:
def split_data_for_model_training(X, Y, n, getting_data_for_validation_and_test):
    """Split features and labels into training, validation and test subsets.

    Parameters
    ----------
    X : features accepted by ``train_test_split``.
    Y : labels aligned with ``X``.
    n : number of rows to put in the training subset.
    getting_data_for_validation_and_test : callable
        Called with ``n``; its result is the number of rows held out for
        validation and test together.

    Returns
    -------
    tuple
        ``(X_train, Y_train, X_validation, Y_validation, X_test, Y_test)``.
    """
    holdout_size = getting_data_for_validation_and_test(n)

    # First cut: n rows for training, holdout_size rows set aside.
    train_X, holdout_X, train_Y, holdout_Y = train_test_split(
        X,
        Y,
        train_size=n,
        test_size=holdout_size,
        random_state=19,
    )

    # Second cut: half of the held-out rows go to validation, the rest to test.
    valid_X, test_X, valid_Y, test_Y = train_test_split(
        holdout_X,
        holdout_Y,
        train_size=0.5,
        random_state=19,
    )

    return train_X, train_Y, valid_X, valid_Y, test_X, test_Y

## Utilitario para construir un modelo con los parámetros dados:

- X features
- Y Labels
- n tamaño de muestra mínimo, óptimo o estimado
- max_depth hiperparámetro de Profundidad
- max_leaf_nodes hiperparámetro de hojas

In [264]:
def building_a_tree_model(split_data_for_model_training,
                          getting_data_for_validation_and_test,
                          X,
                          Y,
                          n,
                          max_depth, max_leaf_nodes):
    """Fit a decision tree on ``n`` training rows and score it on validation data.

    Parameters
    ----------
    split_data_for_model_training : callable
        Called as ``split(X, Y, n, getting_data_for_validation_and_test)``;
        must return ``(X_train, Y_train, X_val, Y_val, X_test, Y_test)``.
    getting_data_for_validation_and_test : callable
        Forwarded unchanged to ``split_data_for_model_training``.
    X, Y : features and labels.
    n : number of training rows.
    max_depth, max_leaf_nodes : hyperparameters passed to
        ``DecisionTreeClassifier``.

    Returns
    -------
    tuple
        ``(fitted classifier, predictions on the validation features,
        accuracy of those predictions)``.
    """
    model = DecisionTreeClassifier(
        max_depth=max_depth,
        max_leaf_nodes=max_leaf_nodes,
        random_state=19,
    )

    # The test subset is produced by the split but is not used here.
    train_X, train_Y, valid_X, valid_Y, _test_X, _test_Y = (
        split_data_for_model_training(
            X, Y, n, getting_data_for_validation_and_test
        )
    )

    model.fit(train_X, train_Y)
    validation_predictions = model.predict(valid_X)
    validation_accuracy = accuracy_score(valid_Y, validation_predictions)
    return model, validation_predictions, validation_accuracy
    

## Cálculo de n mínimo óptimo para un árbol con:

- profundidad "k"
- error de experto "delta"
- error de entrenamiento "epsilon"
- y con un número de características igual a "features"

In [265]:
def optim_n_for_tree( k, delta, epsilon, features ):
    re = ((2**k) - 1)*(1 + np.log2(features))+1+np.log(delta**(-1))
    return int((np.log(2)/(2*epsilon**2))*re)

## Configuración

In [2]:
# Hyperparameter grid: candidate tree depths (1 to 9) and candidate leaf counts.
Hpyer_Parameters = [list(range(1, 10)), [4, 5, 6]]

# Every (depth, leaf count) pair from the grid, in product order.
set_parameters = list(it.product(*Hpyer_Parameters))

# Values passed to the sample-size formulas as epsilon, delta and features.
Epsilon, delta, Features = 0.2, 0.1, 67

In [267]:
# Print a label, then display the hyperparameter pairs as the cell's output.
print("Tree parameters: ")
set_parameters

Tree parameters: 


[(1, 4),
 (1, 5),
 (1, 6),
 (2, 4),
 (2, 5),
 (2, 6),
 (3, 4),
 (3, 5),
 (3, 6),
 (4, 4),
 (4, 5),
 (4, 6),
 (5, 4),
 (5, 5),
 (5, 6),
 (6, 4),
 (6, 5),
 (6, 6),
 (7, 4),
 (7, 5),
 (7, 6),
 (8, 4),
 (8, 5),
 (8, 6),
 (9, 4),
 (9, 5),
 (9, 6)]

# Modelos - N mínimo Optimo

Aquí encontramos el árbol con el score más alto y más bajo para todos los n óptimos de las clases de hipótesis.

In [273]:
def main_fun(data, epsilon, delta, features, X, Y):
    """Train one decision tree per hyperparameter pair in ``set_parameters``.

    For each ``(depth, leaf count)`` pair, the training size ``n`` is computed
    with ``optim_n_for_tree`` from the depth, and a tree is fitted and scored
    on validation data via ``building_a_tree_model``.

    Pairs whose training plus hold-out rows would exceed the rows in ``X`` are
    skipped individually, so the outcome does not depend on the order of
    ``set_parameters``.

    Parameters
    ----------
    data : unused; kept so existing callers keep working.
    epsilon, delta, features : forwarded to ``optim_n_for_tree``.
    X, Y : features and labels.

    Returns
    -------
    list of tuple
        ``(fitted classifier, validation accuracy)`` for every pair that fits
        in the sample; empty if none does.
    """
    results = []
    available_rows = X.shape[0]
    # TODO - Add Try/Catch
    for p in set_parameters:
        depth, leaf_nodes = p[0], p[1]
        n = optim_n_for_tree(depth, delta, epsilon, features)

        # Skip this pair (instead of abandoning the whole search) when the
        # sample cannot supply the training and hold-out rows.
        if (n + getting_data_for_validation_and_test(n)) > available_rows:
            continue

        clf, y_pred, a_score = building_a_tree_model(
            split_data_for_model_training,
            getting_data_for_validation_and_test,
            X,
            Y,
            n,
            depth,
            leaf_nodes,
        )
        results.append((clf, a_score))
    return results

In [298]:
# Run the search over all hyperparameter pairs; w holds (model, score) tuples.
w = main_fun(
    data=Data,
    epsilon=Epsilon,
    delta=delta,
    features=Features,
    X=X,
    Y=Y,
)



In [297]:
# Entry of w with the highest score (second element of each tuple).
max(w, key=lambda model_and_score: model_and_score[1])

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=5,
             max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=19, splitter='best'),
 0.5155763239875389)

In [296]:
# Entry of w with the lowest score (second element of each tuple).
min(w, key=lambda model_and_score: model_and_score[1])

(DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=1,
             max_features=None, max_leaf_nodes=4, min_impurity_decrease=0.0,
             min_impurity_split=None, min_samples_leaf=1,
             min_samples_split=2, min_weight_fraction_leaf=0.0,
             presort=False, random_state=19, splitter='best'),
 0.20689655172413793)

## Cálculo de n mínimo estimado para todas las clases de hipótesis:

- error de experto "delta"
- error de entrenamiento "epsilon"
- tamaño del conjunto de hiperparámetros "h_aprox"

In [6]:
def estim_n_for_tree(epsilon, h_aprox, delta):
    """Return ``ceil((1 / epsilon) * (ln(h_aprox) + ln(1 / delta)))`` as an int.

    Parameters
    ----------
    epsilon : value that enters the formula as ``1 / epsilon``.
    h_aprox : value whose natural log is taken (must be positive).
    delta : value whose inverse enters the formula through a natural log.

    Returns
    -------
    int
    """
    log_terms = np.log(h_aprox) + np.log(1/delta)
    estimate = (1/epsilon) * log_terms
    return int(np.ceil(estimate))

# Modelos - N mínimo estimado

Aquí encontramos el árbol con el *score* más alto y más bajo, utilizando el n mínimo estimado,
para todas las clases de hipótesis.

In [4]:
def main_fun_min_estimado(data, epsilon, delta, features, X, Y, n):
    """Train one tree per pair in ``set_parameters`` using a fixed training size.

    Every tree is fitted on ``n`` training rows and scored on validation data
    via ``building_a_tree_model``. If ``n`` plus its hold-out size exceeds the
    number of rows in ``X``, no model is trained and an empty list is returned.

    Parameters
    ----------
    data, epsilon, delta, features : accepted but not used in the body.
    X, Y : features and labels.
    n : number of training rows used for every tree.

    Returns
    -------
    list of tuple
        ``(fitted classifier, validation accuracy)`` per hyperparameter pair.
    """
    scored_models = []
    # TODO - Add Try/Catch
    for combo in set_parameters:
        rows_needed = n + getting_data_for_validation_and_test(n)
        # Stop as soon as the sample cannot supply training + hold-out rows.
        if rows_needed > X.shape[0]:
            break

        model, _predictions, accuracy = building_a_tree_model(
            split_data_for_model_training,
            getting_data_for_validation_and_test,
            X,
            Y,
            n,
            combo[0],
            combo[1],
        )
        scored_models.append((model, accuracy))
    return scored_models

## Utilidades

In [None]:
def get_best_decision_tree(set_parameters, epsilon, features, delta):
    """Placeholder for selecting the best decision tree; not implemented yet.

    The original body was an unfinished ``map(lambda x: )`` expression, which
    is a syntax error. The intended selection logic is not shown anywhere in
    the notebook, so this stub fails loudly instead of guessing.

    Parameters
    ----------
    set_parameters, epsilon, features, delta : currently unused.

    Raises
    ------
    NotImplementedError
        Always.
    """
    # TODO: implement the selection once its inputs (data, scoring) are decided.
    raise NotImplementedError("get_best_decision_tree is not implemented yet")

In [158]:
def get_optim_ms_from_H( set_parameters, epsilon, features, delta ):
    """Return the ``optim_n_for_tree`` sample size for every hyperparameter entry.

    Parameters
    ----------
    set_parameters : iterable of sequences whose first element is the tree depth.
    epsilon, features, delta : forwarded to ``optim_n_for_tree``.

    Returns
    -------
    list of int
        One sample size per entry, in the same order as ``set_parameters``.
    """
    return [
        optim_n_for_tree(combo[0], delta, epsilon, features)
        for combo in set_parameters
    ]

In [15]:
# Same formula as estim_n_for_tree, so reuse the helper instead of repeating it.
# NOTE(review): `dimVC` is not defined in any visible cell of this notebook;
# it must be set before this cell is run - TODO confirm where it comes from.
n_opt = estim_n_for_tree(Epsilon, dimVC, delta)

# Pass the row count as an integer. A float fraction n_opt / X.shape[0] is
# multiplied back by the sample size and floored inside train_test_split,
# which can leave the training set one row short of n_opt.
X2_tr, X2_ts, Y2_tr, Y2_ts = train_test_split(X, Y, train_size = n_opt, random_state=123)

# AQUÍ EL ALGORITMO USANDO X2

### $\eta$ mínimo estimado

In [209]:
# Number of hyperparameter combinations in set_parameters.
h_aprox = len(set_parameters)

In [63]:
# Estimated minimum sample size for h_aprox hypotheses, via the shared helper.
n_est = estim_n_for_tree(Epsilon, h_aprox, delta)
