In [1]:
import numpy as np
import pandas as pd
import itertools as it
from sklearn.linear_model import LogisticRegression
from sklearn.multiclass import OneVsOneClassifier
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import precision_score,confusion_matrix,recall_score,accuracy_score
from sklearn.metrics import roc_curve, auc,precision_recall_curve, roc_curve
import matplotlib.pylab as plt

In [2]:
Data = pd.read_excel('Data.xlsx')
Data = Data.set_index('Date')
Y = Data['Class']
X = Data.drop('Class', axis = 1)

### Parameters

In [10]:
step = 100
ts_prop = 0.1
va_prop = 0.2

Hyper_Parameters = [['l1','l2']] # Regresores
set_parameters = [i for i in it.product(*Hyper_Parameters)]

#### Error real y de entrenamiento deseados

In [4]:
Epsilon = 0.15
delta = 0.15

#### Garantia probable de aprendizaje ($\eta_{estimado}$) y Tamaño optimo de la muestra $\eta_{optimo}$

#### $\eta_{estimado}$

In [5]:
n_est = int(np.ceil((1/Epsilon)*(np.log(len(set_parameters)) + np.log(1/delta))))

#### $\eta_{optimo}$

In [6]:
n_opt = X.shape[1]+1

#### Division del conjunto: entrenamiento (70%), validacion (20%) y prueba (10%)

In [7]:
X1, X1_ts, Y1,Y1_ts = train_test_split(X, Y, train_size = 1-ts_prop, random_state=19)
X1_tr, X1_va, Y1_tr, Y1_va = train_test_split(X1, Y1, train_size = 1-va_prop, random_state=19)

#### Discretizacion para explorar el aumento de $\eta$

In [8]:
N = range(n_est, round(X1_tr.shape[0]*(1-va_prop)), step)

### Exploracion de modelos partiendo de $\eta_{estimado}$, es claro que $\eta_{estimado}$ es tenido en cuenta ya que $\eta_{estimado} < \eta_{optimo}$

In [9]:
n_est_mod_results = {
    'Model': [None for i in range(len(set_parameters))],
    'Acc_Test': np.zeros((len(N),len(set_parameters))),
    'Acc_Val': np.zeros((len(N),len(set_parameters)))
}

In [34]:
for j in range(len(set_parameters)):
    lrc = LogisticRegression(penalty= set_parameters[j][0])
    regresors = np.random.randint(0, X1_tr.shape[1], np.random.randint(0, X1_tr.shape[1]))
    for i in range(len(N)):
        np.random.seed(seed = 19)
        ind1 = np.random.randint(0, X1_tr.shape[0], N[i])
        lrc.fit(X1_tr.iloc[ind1, regresors], Y1_tr[ind1])
        y_pred = lrc.predict(X1_va.iloc[:,regresors])
        n_est_mod_results['Acc_Val'][i,j] = accuracy_score(Y1_va, y_pred)
    n_est_mod_results['Model'][j] = (lrc,regresors)
    
ind_bst = np.where(n_est_mod_results['Acc_Val'] == np.amax(n_est_mod_results['Acc_Val']))[1][0]
n_est_mod_results['Best'] = set_parameters[ind_bst]





IndexError: index 1 is out of bounds for axis 1 with size 1

#### Grid search

In [13]:
Hyper_Parameters = {
    'penalty': ['l1', 'l2']
}

clf = GridSearchCV(LogisticRegression(), Hyper_Parameters, cv = 5)
clf.fit(X1_tr.values, Y1_tr)
clf.best_params_



{'penalty': 'l1'}