# MODELIZACIÓN PARA CLASIFICACIÓN

## IMPORTAR PAQUETES

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style(style='darkgrid')

from sklearn.model_selection import train_test_split

from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.ensemble import HistGradientBoostingClassifier
from sklearn.pipeline import Pipeline

from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV

from sklearn.metrics import roc_auc_score
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay

from sklearn.metrics import classification_report

#from sklearn.metrics import plot_precision_recall_curve
from sklearn.metrics import PrecisionRecallDisplay
import scikitplot as skplt
#from yellowbrick.classifier import discrimination_threshold

#Autocompletar rápido
%config IPCompleter.greedy=True

#Desactivar la notación científica
pd.options.display.float_format = '{:.2f}'.format

#Desactivar los warnings
import warnings
warnings.filterwarnings("ignore")

## IMPORTAR LOS DATOS

Sustituir la ruta del proyecto.

In [2]:
ruta_proyecto = 'C:/Users/pelop/OneDrive/Desktop/Curso Data Science Pedro/2 CURSO DATA SCIENCE/03_MACHINE_LEARNING/07_CASOS/01_LEADSCORING'

Nombres de los ficheros de datos.

In [3]:
nombre_x = 'x_preseleccionado.pickle'
nombre_y = 'y_preseleccionado.pickle'

Cargar los datos.

In [4]:
x = pd.read_pickle(ruta_proyecto + '/02_Datos/03_Trabajo/' + nombre_x)
y = pd.read_pickle(ruta_proyecto + '/02_Datos/03_Trabajo/' + nombre_y)

## MODELIZAR

### Reservar el dataset de validacion

In [5]:
train_x,val_x,train_y,val_y = train_test_split(x,y,test_size=0.3)

### Crear el pipe y el diccionario de algorimos, parámetros y valores a testar

Modificar para dejar solo los algoritmos que se quieran testar.

Modificar los parámetros.

In [8]:
pipe = Pipeline([('algoritmo',RandomForestClassifier())])

grid = [{'algoritmo': [LogisticRegression()],
         'algoritmo__n_jobs': [-1],
         'algoritmo__solver': ['saga'],
         'algoritmo__penalty': ['elasticnet', 'l1', 'l2', 'none'],
         'algoritmo__C': [0,0.25,0.5,0.75,1]},
        
        {'algoritmo': [RandomForestClassifier()],
         'algoritmo__n_jobs': [-1],
         'algoritmo__max_depth': [5,10,15],
         'algoritmo__n_estimators': [50,100,200]},
        
        {'algoritmo': [XGBClassifier()],
         'algoritmo__n_jobs': [-1],
         'algoritmo__verbosity': [0],#para que no salgan warnings
         'algoritmo__learning_rate': [0.01,0.025,0.05,0.1],
         'algoritmo__max_depth': [5,10,20],
         'algoritmo__reg_alpha': [0,0.1,0.5,1],
         'algoritmo__reg_lambda': [0.01,0.1,1],
         'algoritmo__n_estimators': [100,500,1000]},
        
        {'algoritmo': [HistGradientBoostingClassifier()],
         'algoritmo__learning_rate': [0.01,0.025,0.05,0.1],
         'algoritmo__max_iter': [50,100,200],
         'algoritmo__max_depth': [5,10,20],
         'algoritmo__min_samples_leaf': [500],
         'algoritmo__scoring': ['roc_auc'],
         'algoritmo__l2_regularization': [0,0.25,0.5,0.75,1]}   
       ]

# Best results:
# [LogisticRegression(C=1, n_jobs=-1, penalty='l1', solver='saga')] 0.8711
# [RandomForestClassifier(max_depth=10, n_estimators=200, n_jobs=-1)] 0.8879
# [XGBClassifier(learning_rate=0.01, max_depth=5, n_estimators=1000, n_jobs=-1, algoritmo__min_samples_leaf=500, reg_alpha=0.5, reg_lambda=1] 0.8968
# [HistGradientBoostingClassifier(l2_regularization=0.75, max_depth=5, max_iter=200, min_samples_leaf=500, scoring='roc_auc')] 0.8568

### Optimizar los hiper parámetros

####  Con grid search

In [9]:
grid_search = GridSearchCV(estimator= pipe, 
                           param_grid = grid, 
                           cv = 3, 
                           scoring = 'roc_auc',
                           verbose = 0,
                           n_jobs = -1)

modelo1 = grid_search.fit(train_x,train_y)
# modelo2 = grid_search.fit(train_x,train_y)
# modelo3 = grid_search.fit(train_x,train_y)
# modelo4 = grid_search.fit(train_x,train_y)

pd.DataFrame(grid_search.cv_results_).sort_values(by = 'rank_test_score')

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_algoritmo,param_algoritmo__C,param_algoritmo__n_jobs,param_algoritmo__penalty,param_algoritmo__solver,params,split0_test_score,split1_test_score,split2_test_score,mean_test_score,std_test_score,rank_test_score
17,0.08,0.01,0.01,0.0,LogisticRegression(),1.0,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,1
13,0.08,0.01,0.01,0.0,LogisticRegression(),0.75,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,2
9,0.08,0.01,0.01,0.0,LogisticRegression(),0.5,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,3
18,0.06,0.0,0.01,0.0,LogisticRegression(),1.0,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,4
5,0.08,0.01,0.01,0.0,LogisticRegression(),0.25,-1,l1,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,5
14,0.07,0.0,0.01,0.0,LogisticRegression(),0.75,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,6
10,0.06,0.01,0.01,0.0,LogisticRegression(),0.5,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.87,0.84,0.86,0.86,0.01,7
6,0.06,0.0,0.01,0.0,LogisticRegression(),0.25,-1,l2,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",0.86,0.83,0.86,0.85,0.01,8
16,0.0,0.0,0.0,0.0,LogisticRegression(),1.0,-1,elasticnet,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",,,,,,9
15,0.0,0.0,0.0,0.0,LogisticRegression(),0.75,-1,none,saga,"{'algoritmo': LogisticRegression(), 'algoritmo...",,,,,,9


## EVALUAR

### Predecir sobre validación

In [None]:
pred = modelo.best_estimator_.predict_proba(val_x)[:, 1]

### Evaluar sobre validación

In [None]:
roc_auc_score(val_y, pred)

### Examinar el mejor modelo

In [None]:
list(modelo.best_estimator_)

In [None]:
modelo.best_estimator_.named_steps.algoritmo.coef_

In [None]:
modelo.best_estimator_.named_steps.algoritmo.feature_names_in_

In [None]:
pd.DataFrame(data = modelo.best_estimator_.named_steps.algoritmo.coef_,
             columns = modelo.best_estimator_.named_steps.algoritmo.feature_names_in_).unstack().sort_values(ascending = False)

**Nota.-** Variables como *paginas_vistas_visita_mms* es extraño que salgan negativas pues indica que cuantas menos paginas vea por visita la conversión es mayor. Esto seguramente se debe a que esta variable está aún muy correlacionada con otras y habría que analizarla con más detalle.               

## REPORTING DEL MODELO

In [None]:
fig, ax = plt.subplots(1,3, figsize = (10,3.5), dpi = 100)

skplt.metrics.plot_cumulative_gain(val_y, modelo1.best_estimator_.predict_proba(val_x), ax=ax[0], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_cumulative_gain(val_y, modelo2.best_estimator_.predict_proba(val_x), ax=ax[0], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_cumulative_gain(val_y, modelo3.best_estimator_.predict_proba(val_x), ax=ax[0], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_cumulative_gain(val_y, modelo4.best_estimator_.predict_proba(val_x), ax=ax[0], text_fontsize='small', title_fontsize = 'medium')
lines = ax[0].get_lines()
for line in lines:
    if line.get_label() == 'Class 0':
        line.remove()
ax[0].get_legend().remove()

ax[0].properties()['children'][0].set_color('blue');   ax[0].properties()['children'][0].set_linewidth(1)
ax[0].properties()['children'][2].set_color('orange'); ax[0].properties()['children'][2].set_linewidth(1)
ax[0].properties()['children'][4].set_color('green');  ax[0].properties()['children'][4].set_linewidth(1)
ax[0].properties()['children'][6].set_color('red');    ax[0].properties()['children'][6].set_linewidth(1)
ax[0].lines[1].remove(); ax[0].lines[2].remove(); ax[0].lines[3].remove(); ax[0].properties()['children'][4].set_linewidth(1); 
ax[0].legend(['Logistic Regression model','Random Forest model','XGBoost model','LightGBM model','Random'], prop={'size': 6})

skplt.metrics.plot_lift_curve(val_y, modelo1.best_estimator_.predict_proba(val_x), ax=ax[1], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_lift_curve(val_y, modelo2.best_estimator_.predict_proba(val_x), ax=ax[1], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_lift_curve(val_y, modelo3.best_estimator_.predict_proba(val_x), ax=ax[1], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_lift_curve(val_y, modelo4.best_estimator_.predict_proba(val_x), ax=ax[1], text_fontsize='small', title_fontsize = 'medium')
lines = ax[1].get_lines()
for line in lines:
    if line.get_label() == 'Class 0':
        line.remove()
ax[1].get_legend().remove()

ax[1].properties()['children'][0].set_color('blue');   ax[1].properties()['children'][0].set_linewidth(1)
ax[1].properties()['children'][2].set_color('orange'); ax[1].properties()['children'][2].set_linewidth(1)
ax[1].properties()['children'][4].set_color('green');  ax[1].properties()['children'][4].set_linewidth(1)
ax[1].properties()['children'][6].set_color('red');    ax[1].properties()['children'][6].set_linewidth(1)
ax[1].properties()['children'][1].set_linewidth(1);    ax[1].properties()['children'][3].set_linewidth(1);
ax[1].properties()['children'][5].set_linewidth(1);    ax[1].properties()['children'][7].set_linewidth(1)

skplt.metrics.plot_roc(val_y, modelo1.best_estimator_.predict_proba(val_x), ax=ax[2], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_roc(val_y, modelo2.best_estimator_.predict_proba(val_x), ax=ax[2], text_fontsize='small', title_fontsize = 'medium')
skplt.metrics.plot_roc(val_y, modelo3.best_estimator_.predict_proba(val_x), ax=ax[2], text_fontsize='small', title_fontsize = 'medium')
myfig = skplt.metrics.plot_roc(val_y, modelo4.best_estimator_.predict_proba(val_x), ax=ax[2], text_fontsize='small', title_fontsize = 'medium')
lines = ax[2].get_lines()
for line in lines:
    if 'ROC curve of class 0' in line.get_label():
        line.set_label('Modelo')
    elif line.get_linestyle() != '--':  # Mantenemos la línea punteada pero sin leyenda
        line.set_visible(False)
        
ax[2].get_legend().remove()
ax[2].properties()['children'][0].set_color('blue');   ax[2].properties()['children'][0].set_linewidth(1)
ax[2].properties()['children'][5].set_color('orange'); ax[2].properties()['children'][5].set_linewidth(1)
ax[2].properties()['children'][10].set_color('green'); ax[2].properties()['children'][10].set_linewidth(1)
ax[2].properties()['children'][15].set_color('red');   ax[2].properties()['children'][15].set_linewidth(1)
ax[2].properties()['children'][4].set_linewidth(1);    ax[2].properties()['children'][9].set_linewidth(1)
ax[2].properties()['children'][14].set_linewidth(1);   ax[2].properties()['children'][19].set_linewidth(1)
ax[2].grid(visible=True)

# Ahora puedes mostrar el gráfico
plt.tight_layout()
plt.show()
fig = myfig.get_figure()
fig.savefig("exhibit_6.png")

In [None]:
ax[2].grid(visible=True)

### Gain Chart

In [None]:
fig, ax = plt.subplots()

skplt.metrics.plot_cumulative_gain(val_y, modelo1.best_estimator_.predict_proba(val_x), ax=ax) 

# Eliminamos la línea de los ceros, que internamente tiene etiqueta 'Class 0'
lines = ax.get_lines()
# Buscar por la etiqueta y eliminar la línea correspondiente
for line in lines:
    if line.get_label() == 'Class 0':
        line.remove()

# Personalizamos la leyenda sin incluir la línea de los ceros
plt.legend(labels=['Modelo'])

# Ahora puedes mostrar el gráfico
plt.show()

### Lift Chart

In [None]:
fig, ax = plt.subplots()

skplt.metrics.plot_lift_curve(val_y, modelo.best_estimator_.predict_proba(val_x), ax=ax) 

# Eliminamos la línea de los ceros, que internamente tiene etiqueta 'Class 0'
lines = ax.get_lines()
# Buscar por la etiqueta y eliminar la línea correspondiente
for line in lines:
    if line.get_label() == 'Class 0':
        line.remove()

# Personalizamos la leyenda sin incluir la línea de los ceros
plt.legend(labels=['Modelo'])

# Ahora puedes mostrar el gráfico
plt.show()

### ROC Chart

In [None]:
fig, ax = plt.subplots()

# Generamos la gráfica ROC
skplt.metrics.plot_roc(val_y, modelo.best_estimator_.predict_proba(val_x), ax=ax)

# Obtenemos todas las líneas y las leyendas
lines = ax.get_lines()

# Recorremos las líneas para eliminar las no deseadas, excepto la línea de la clase 0
for line in lines:
    if 'ROC curve of class 0' in line.get_label():
        line.set_label('Modelo')
    elif line.get_linestyle() != '--':  # Mantenemos la línea punteada pero sin leyenda
        line.set_visible(False)

# Solo añadimos al leyenda la línea de la clase 0 renombrada a 'Modelo'
handles, labels = ax.get_legend_handles_labels()
new_handles = [h for h, l in zip(handles, labels) if l == 'Modelo']
new_labels = ['Modelo']

ax.legend(new_handles, new_labels, loc='best')

# Ahora puedes mostrar el gráfico
plt.show();
