In [2]:
# Load libraries
import numpy as np;
import pandas as pd
from timeit import default_timer

from sklearn.model_selection import train_test_split

# Logistic Regression

- Se aplica **sólo a problemas de aprendizaje supervisado de clasificación**.


- Es la familia más simple de modelos ML para este tipo de problemas.


- Mide la relación entre la variable dependiente categórica y las variables independientes numéricas estimando probabilidades mediante una **función logística aplicada al resultado de una regresión lineal**.

![imagen.png](attachment:imagen.png)

## Load Data

Utilizaremos el conjunto de datos del iris. Sí, OTRA VEZ 😛.

In [1]:
from sklearn.datasets import load_iris;

In [2]:
iris = load_iris();
X = iris.data;
y = iris.target;

In [5]:
perc_values = [0.7, 0.15, 0.15];
X_train, X_valtest, y_train, y_valtest = train_test_split(X, y, stratify = y, test_size=perc_values[1] + perc_values[2], random_state=1);
X_val, X_test, y_val, y_test = train_test_split(X_valtest, y_valtest, stratify = y_valtest, test_size= perc_values[2] / (perc_values[1] + perc_values[2]), random_state=1)

## Modelo Individual

### Lasso

![imagen-2.png](attachment:imagen-2.png)

Los coeficientes van 0.

![imagen.png](attachment:imagen.png)

### Imports (Pasos 1 & 2)

In [10]:
# 1) Import model
from sklearn.linear_model import LogisticRegression as model_constructor
?model_constructor

In [11]:
# 2) Import metric
from sklearn.metrics import roc_auc_score as metric

### Entrenar modelo (Pasos 3-5)

In [13]:
# [3] Define model
model = model_constructor(penalty = 'l1', 
                          solver = 'liblinear', 
                          C = 1, 
                          random_state = 0)

# [4] Train model
model.fit(X_train, y_train) ## Only train!!! Requires target y for being supervised learning.

# [5] Predict
pred_train = model.predict_proba(X_train) 
pred_val = model.predict_proba(X_val) 
pred_test = model.predict_proba(X_test) 

In [14]:
print(pred_test[0:10])

[[1.29532402e-03 5.10636439e-01 4.88068237e-01]
 [7.66471133e-03 7.11479133e-01 2.80856156e-01]
 [1.33741808e-02 5.03368264e-01 4.83257555e-01]
 [5.82315412e-04 3.07701322e-01 6.91716363e-01]
 [2.42023501e-01 7.22209708e-01 3.57667908e-02]
 [2.05985290e-03 1.69097671e-01 8.28842476e-01]
 [4.49629524e-03 7.98743653e-01 1.96760052e-01]
 [9.18946524e-01 8.10529131e-02 5.63187188e-07]
 [4.87467820e-05 3.90951800e-01 6.08999453e-01]
 [2.14459568e-03 3.17623409e-01 6.80231995e-01]]


Cada fila es una flor distinta, y cada columna la probabilidad de pertenecer a cada una de las tres clases.

Veamos los coeficientes.

In [15]:
model.coef_

array([[ 0.        ,  2.28546447, -2.56395648,  0.        ],
       [ 0.        , -1.23286654,  0.67258461, -1.16749638],
       [-1.90058644, -2.60300185,  2.73085971,  3.6420569 ]])

- Tres filas, una por clase, ya que construimos un clasificador binarios para clase de flor).


- Algunos coeficientes van a 0 (Lasso).

### Evaluación (Pasp 6)

Utilizaremos **AUC** como métrica de evaluación. Este es un problema multiclase por lo que tenemos que utilizar el argumento *multi_class*.

In [18]:
# [6] Compute metric
metric_train = metric(y_train, pred_train, multi_class = 'ovo')
metric_val = metric(y_val, pred_val, multi_class = 'ovo')
metric_test = metric(y_test, pred_test, multi_class = 'ovo')

In [19]:
print('AUC train = %.2f - AUC validation = %.2f - AUC test = %.2f ' 
      % (metric_train, metric_val, metric_test))

AUC train = 0.99 - AUC validation = 1.00 - AUC test = 1.00 


### Ridge Regression

![imagen-3.png](attachment:imagen-3.png)

Los coeficientes tienden a 0, pero nunca llegan exactamente a 0.

![imagen.png](attachment:imagen.png)

### Entrenamiento (Pasos 3-5)

In [20]:
# [3] Define model
model = model_constructor(penalty = 'l2', 
                          solver = 'lbfgs', 
                          C = 1, 
                          random_state = 0) 

# [4] Train model
model.fit(X_train, y_train) ## Only train!!! Requires target y for being supervised learning.

# [5] Predict
pred_train = model.predict_proba(X_train) 
pred_val = model.predict_proba(X_val) 
pred_test = model.predict_proba(X_test) 

Veamos los coeficientes.

In [21]:
model.coef_

array([[-0.47927525,  0.78036998, -2.29258292, -0.91834138],
       [ 0.15205962, -0.21849006, -0.07826914, -0.69417198],
       [ 0.32721563, -0.56187993,  2.37085206,  1.61251336]])

- Ningún coeficiente es 0 (Ridge).

### Evaluación (Paso 6)

In [22]:
# [6] Compute metric
metric_train = metric(y_train, pred_train, multi_class = 'ovo')
metric_val = metric(y_val, pred_val, multi_class = 'ovo')
metric_test = metric(y_test, pred_test, multi_class = 'ovo')

In [23]:
print('AUC train = %.2f - AUC validation = %.2f - AUC test = %.2f ' 
      % (metric_train, metric_val, metric_test))

AUC train = 1.00 - AUC validation = 1.00 - AUC test = 0.99 


### Elastic Net (Lasso + Ridge)

![imagen-5.png](attachment:imagen-5.png)

In [25]:
?model_constructor

In [29]:
# [3] Define model
model = model_constructor(penalty = 'elasticnet', 
                          solver = 'saga', 
                          C = 1, 
                          l1_ratio = 0.5,
                          random_state = 0,
                          max_iter  = 10000)# l1_ratio!!!

# [4] Train model
model.fit(X_train, y_train) ## Only train!!! Requires target y for being supervised learning.

# [5] Predict
pred_train = model.predict_proba(X_train) 
pred_val = model.predict_proba(X_val) 
pred_test = model.predict_proba(X_test) 

Veamos los coeficientes.

In [30]:
model.coef_

array([[ 0.        ,  0.79296797, -2.59105016, -0.50115734],
       [ 0.        ,  0.        ,  0.        , -0.38016573],
       [ 0.        , -0.42001488,  2.6889738 ,  1.88132308]])

- Algunos coeficientes van 0 (Elastic Net incluye Lasso).

### Evaluación (Step 6)

In [31]:
# [6] Compute metric
metric_train = metric(y_train, pred_train, multi_class = 'ovo')
metric_val = metric(y_val, pred_val, multi_class = 'ovo')
metric_test = metric(y_test, pred_test, multi_class = 'ovo')

In [32]:
print('AUC train = %.2f - AUC validation = %.2f - AUC test = %.2f ' 
      % (metric_train, metric_val, metric_test))

AUC train = 1.00 - AUC validation = 1.00 - AUC test = 1.00 


## Grid Search

<img src="figures/grid.jpg">

<img src="figures/grid.bmp">

Definamos ahora la rejilla que vamos a emplear.

In [3]:
# Regresion Logística
penalty_values = ['l1', 'l2', 'elasticnet'];
C_values = [1, 10, 100];

params_grid = {'penalty': penalty_values,
              'C': C_values}
 

Calculamos el número total de combinaciones.

In [4]:
n = len(params_grid['penalty'])*len(params_grid['C'])
print(str(n)+ ' iteraciones de logistic regression')

9 iteraciones de logistic regression


### Empleando Conjunto de Validación Fija

Utilizaremos timeit para medir el tiempo de cálculo de cada iteración de la búsqueda en rejilla.

In [35]:
num_iter = 1;
grid_results = pd.DataFrame(columns = ('penalty',
                                       'C',
                                       'auc_train', 
                                       'auc_val',
                                       'time'))
for penalty in params_grid['penalty']:
    for C in params_grid['C']:
        
            # Start time
            start_time = default_timer()
        
            # Print trace
            print('Iteracion = ' + str(num_iter))
            
            # [3] Define model
            if penalty == 'l1':
                model = model_constructor(penalty = penalty, 
                                          solver = 'liblinear', 
                                          C = C, 
                                          random_state = 0)
            elif penalty == 'l2':
                model = model_constructor(penalty = penalty,
                                          solver = 'lbfgs', 
                                          C = C, 
                                          random_state = 0)
            elif penalty == 'elasticnet':
                model = model_constructor(penalty = penalty,
                                  solver = 'saga', 
                                  C = C, 
                                  l1_ratio = 0.5,
                                  random_state = 0)
            
            # [4] Train model
            model.fit(X_train, y_train)
            
            # [5] Predict
            pred_train = model.predict_proba(X_train)
            pred_val = model.predict_proba(X_val)
            
            # [6] Compute metric
            metric_train = metric(y_train, pred_train, multi_class = 'ovo')
            metric_val = metric(y_val, pred_val, multi_class = 'ovo')
            
            # Computational time
            time = default_timer() - start_time
            
            # print error
            print('AUC train = %.2f - AUC validation = %.2f. Time spend = %.2f.' 
                  % (metric_train, metric_val, time))         
                       
            # Save iteration results
            grid_results.loc[num_iter]=[penalty,
                                        C,
                                        metric_train,
                                        metric_val,
                                        time] 
            num_iter += 1

print('Grid Search Total Computational Time: ', np.sum(grid_results.time.values)) 

Iteracion = 1
AUC train = 0.99 - AUC validation = 1.00. Time spend = 0.01.
Iteracion = 2
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.02.
Iteracion = 3
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.02.
Iteracion = 4
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.03.
Iteracion = 5
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.03.
Iteracion = 6
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.02.
Iteracion = 7
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.01.
Iteracion = 8
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.01.
Iteracion = 9
AUC train = 1.00 - AUC validation = 1.00. Time spend = 0.01.
Grid Search Total Computational Time:  0.15733219999856374


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Veamos los resultados obtenidos

In [36]:
grid_results

Unnamed: 0,penalty,C,auc_train,auc_val,time
1,l1,1,0.993469,1.0,0.009611
2,l1,10,0.996871,1.0,0.015803
3,l1,100,0.996871,1.0,0.018438
4,l2,1,0.997823,1.0,0.028334
5,l2,10,0.998639,1.0,0.029612
6,l2,100,0.998639,1.0,0.024904
7,elasticnet,1,0.997823,1.0,0.011492
8,elasticnet,10,0.997551,1.0,0.009761
9,elasticnet,100,0.997551,1.0,0.009377


**¿Con cuál nos quedamos**

In [37]:
grid_results = grid_results.sort_values(by = ['auc_val', 'auc_train', 'time'], ascending = [False, False, True])
grid_results

Unnamed: 0,penalty,C,auc_train,auc_val,time
6,l2,100,0.998639,1.0,0.024904
5,l2,10,0.998639,1.0,0.029612
7,elasticnet,1,0.997823,1.0,0.011492
4,l2,1,0.997823,1.0,0.028334
9,elasticnet,100,0.997551,1.0,0.009377
8,elasticnet,10,0.997551,1.0,0.009761
2,l1,10,0.996871,1.0,0.015803
3,l1,100,0.996871,1.0,0.018438
1,l1,1,0.993469,1.0,0.009611


In [38]:
best_model = grid_results.iloc[0]
best_model

penalty             l2
C                  100
auc_train     0.998639
auc_val              1
time         0.0249043
Name: 6, dtype: object

### Usando Cross-Validation

In [39]:
from sklearn.model_selection import GridSearchCV
?GridSearchCV

In [40]:
# Define grid
grid_cv = GridSearchCV(model_constructor(),
                     param_grid=params_grid,
                     n_jobs=2, ## Parallelization!
                     cv = 5) # Number of folds

En este caso, no necesitamos un conjunto de validación fijo, por lo que combinaremos la validación con el conjunto de train.

In [41]:
# Run grid
start_time = default_timer()

grid_cv.fit(np.concatenate((X_train, X_val), axis = 0), 
            np.concatenate((y_train, y_val), axis = 0))

stop_time = default_timer()
print('CV Grid Search Total Computational Time: : ', stop_time - start_time) 

CV Grid Search Total Computational Time: :  1.3181134000001293


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [42]:
grid_cv.best_params_

{'C': 1, 'penalty': 'l2'}

In [43]:
grid_cv.best_score_

0.9683076923076923

## Modelo Final

La validación ha cumplido su propósito, combinémosla con el conjunto de train para obtener más datos de entrenamiento.

In [44]:
print('Old train data size = ' + str(X_train.shape))
print('Old train target size = ' + str(y_train.shape))

# Combine train and validación
X_train = np.concatenate((X_train, X_val), axis = 0)
y_train = np.concatenate((y_train, y_val), axis = 0)

print('New train data size = ' + str(X_train.shape))
print('New train target size = ' + str(y_train.shape))

Old train data size = (105, 4)
Old train target size = (105,)
New train data size = (127, 4)
New train target size = (127,)


Utilicemos en este caso los mejores hiperparámetros ganadores de la búsqueda en rejilla que empleaba el conjunto de validación fijo .

In [45]:
# [3] Define model
if best_model.penalty == 'l1':
    model = model_constructor(penalty = best_model.penalty, 
                              solver = 'liblinear', 
                              C = best_model.C, 
                              random_state = 0)
elif best_model.penalty == 'l2':
    model = model_constructor(penalty = best_model.penalty,
                              solver = 'lbfgs', 
                              C = best_model.C, 
                              random_state = 0)
elif best_model.penalty == 'elasticnet':
    model = model_constructor(penalty = best_model.penalty,
                      solver = 'saga', 
                      C = best_model.C, 
                      l1_ratio = 0.5,
                      random_state = 0)
            
# [4] Train model
model.fit(X_train, y_train)
            
# [5] Predict
pred_train = model.predict_proba(X_train)
pred_test = model.predict_proba(X_test)
            
# [6] Compute metric
metric_train = metric(y_train, pred_train, multi_class = 'ovo')
metric_test = metric(y_test, pred_test, multi_class = 'ovo')

    

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [46]:
# print error
print('Accuracy train = %.2f - Accuracy test = %.2f' 
      % (metric_train, metric_test))

Accuracy train = 1.00 - Accuracy test = 1.00
