In [15]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score

## Carga de datos

In [4]:
DataIn = pd.read_excel('Data.xlsx')
DataIn = DataIn.set_index('Date')
Data = DataIn.dropna(axis=0,how='any')
Y = Data['Class']
X = Data.drop('Class', axis = 1)

## Configuración

In [7]:
sample_size = X.shape[0]
training_sample_size = int(sample_size * 0.6)
validation_sample_size = int(sample_size * 0.2)
test_sample_size = validation_sample_size

## Construcción de clases de hipótesis

Vamos utilizar en este caso el objeto llamado *Random Hyperparameter Grid* el cual permite generar un grid de hiperparámetros, de los cuales seleccionaremos un rango de hiperparametros y luego mediante la evaluación de estos modelos bajo un n optimo, escogeremos el mejor.

### Descripción de parámetros

- Número de arboles.
- Número de carácterísticas a considerar para cada división.
- Número máximo de niveles por árbol.
- Mínimo número de muestras requeridas para dividir un nodo.
- Mínimo número de muestras requeridas para cada nodo hoja.
- Método de selección de muestras para entrenar cada arbol

In [5]:
from sklearn.model_selection import RandomizedSearchCV

n_arboles = [int(x) for x in np.linspace(start = 200, stop = 2000, num = 10)]
max_caracteristicas = ['auto', 'sqrt']
max_profundidad_arbol = [int(x) for x in np.linspace(10, 110, num = 11)]
max_profundidad_arbol.append(None)
min_muestras_division = [2, 5, 10]
min_muestras_hojas = [1, 2, 4]
bootstrap = [True, False]
random_grid = {'n_estimators': n_arboles,
               'max_features': max_caracteristicas,
               'max_depth': max_profundidad_arbol,
               'min_samples_split': min_muestras_division,
               'min_samples_leaf': min_muestras_hojas,
               'bootstrap': bootstrap}
pprint(random_grid)

{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}


In [20]:
from sklearn.model_selection import RandomizedSearchCV

n_arboles = [int(x) for x in np.linspace(start = 50, stop = 300, num = 40)]
max_profundidad_arbol = [int(x) for x in np.linspace(10, 110, num = 11)]
max_profundidad_arbol.append(None)
min_muestras_hojas = [1, 2, 4]
random_grid = {'n_estimators': n_arboles,
               'max_depth': max_profundidad_arbol,
               'min_samples_leaf': min_muestras_hojas}
pprint(random_grid)

{'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None],
 'min_samples_leaf': [1, 2, 4],
 'n_estimators': [50,
                  56,
                  62,
                  69,
                  75,
                  82,
                  88,
                  94,
                  101,
                  107,
                  114,
                  120,
                  126,
                  133,
                  139,
                  146,
                  152,
                  158,
                  165,
                  171,
                  178,
                  184,
                  191,
                  197,
                  203,
                  210,
                  216,
                  223,
                  229,
                  235,
                  242,
                  248,
                  255,
                  261,
                  267,
                  274,
                  280,
                  287,
                  293,
           

In [9]:
def split_data_for_model_training(X, Y, training_sample_size, validation_sample_size):
    X_tr, X_ts, Y_tr, Y_ts = train_test_split(X,
                                              Y, 
                                              train_size = training_sample_size, 
                                              test_size = validation_sample_size*2, 
                                              random_state=19)
    
    X2_va, X2_ts, Y2_va, Y2_ts = train_test_split(X_ts, 
                                                  Y_ts, 
                                                  train_size = 0.5, 
                                                  random_state=19)
    return X_tr, Y_tr, X2_va, Y2_va, X2_ts, Y2_ts

In [17]:
# Realizaremos la búsqueda aleatoria de hiperparametros
# Utilizaremos 5 fold - cross validation.

def best_parameters(training_data_x, training_data_y):
    randForestClassi = RandomForestClassifier()
    clf = GridSearchCV(RandomForestClassifier(), random_grid, cv = 5)
    clf.fit(training_data_x, training_data_y)
    clf.best_params_

In [18]:
X_tr, Y_tr, X2_va, Y2_va, X2_ts, Y2_ts  = split_data_for_model_training(X, Y, training_sample_size, validation_sample_size)

In [21]:
best_parameters(X_tr, Y_tr)

KeyboardInterrupt: 

In [51]:
Hpyer_Parameters = [[1,2,3,4,5,6,7,8,9],[4,5,6]] # número de arboles del arbol y nro de hojas.
set_parameters = [i for i in it.product(*Hpyer_Parameters)]

Epsilon = 0.2
delta = 0.1
Features = 67
h_aprox = len(set_parameters)


In [59]:
def get_optim_ms_from_H( set_parameters, epsilon, features, delta ):
    result = map(lambda x:optim_n_for_tree(x[0], delta, epsilon, features ), set_parameters)  
    print(list(result)) 

In [60]:
def optim_n_for_tree( k, delta, epsilon, features ):
    re = ((2**k) - 1)*(1 + np.log2(features))+1+np.log(delta**(-1))
    return int((np.log(2)/(2*epsilon**2))*re)

# AQUI EL ALGORITMO USANDO X1

In [None]:
def getting_ready( data, n):
    
    

In [None]:
##
#
# (self, criterion=’gini’, splitter=’best’,
# max_depth=None, min_samples_split=2, min_samples_leaf=1, 
# min_weight_fraction_leaf=0.0, max_features=None, random_state=None, 
# max_leaf_nodes=None, min_impurity_decrease=0.0, 
# min_impurity_split=None, class_weight=None, 
# presort=False)
#

ns = get_optim_ms_from_H(set_parameters , epsilon=Epsilon, features=Features, delta = delta)

def decision_tree( deep, leafs, X, Y, n ):
    # Getting ready the data.
    if(X.shape[0] < n):
        new_n = 
    X1_tr, X1_ts, Y1_tr, Y1_ts = train_test_split(X, Y, train_size = n_est/X.shape[0], random_state=123)
   
    # tree instantiation
    clf = DecisionTreeClassifier(random_state=0, max_depth = deep, max_leaf_nodes = leafs)
    


    
    
    