# <u>Métodos Supervisados para Clasificación - Parte 2</u>

## Caso de uso

<img src = 'https://datahack-prod.s3.ap-south-1.amazonaws.com/__sized__/contest_cover/loanpre-thumbnail-1200x1200.png'>

## Librerías

In [None]:
import warnings
warnings.filterwarnings("ignore")

import random
import pandas as pd
import numpy as np
import sklearn

import seaborn as sns
import matplotlib.pyplot as plt

Usamos una semilla a lo largo de todo el notebook para los procesos aleatorios

In [None]:
# Single seed reused for the randomized steps of the notebook.
seed = 2021

In [None]:
# Name of the label column that is split off from the predictors below.
target = 'Loan_Status'

## Bases

In [None]:
# Load the train and test tables from CSV files under data/
# (the file names suggest they were preprocessed upstream — confirm).
train = pd.read_csv('data/train_preprocesed.csv')
test = pd.read_csv('data/test_preprocesed.csv')

In [None]:
train.head(2)

In [None]:
test.head(2)

In [None]:
# Training split: pull out the label column, keep every other column as predictors.
y_train = train[target]
X_train = train.drop(columns = target)

In [None]:
# Test split: pull out the label column, keep every other column as predictors.
y_test = test[target]
X_test = test.drop(columns = target)

In [None]:
X_train.shape, y_train.shape

In [None]:
X_test.shape, y_test.shape

## Funciones auxiliares

In [None]:
from sklearn.metrics import roc_auc_score, roc_curve

In [None]:
def plot_roc_curve(y, y_proba, label = ''):
    """
    Draw the ROC curve for the given true labels and model scores.

    A new 8x6 figure is opened on every call. The red line plots FPR against
    itself and serves as the diagonal reference; the second line is the ROC
    curve, whose legend entry reports the ROC AUC with four decimals.
    Side effect: the global matplotlib font size is set to 12.

    params:
    y: original (true) labels
    y_proba: probabilities/scores produced by the model
    label: optional text (e.g. the model name) added to the title and legend
    """
    auc_value = roc_auc_score(y, y_proba)
    false_pos_rate, true_pos_rate, _ = roc_curve(y, y_proba)
    legend_text = f"Curva ROC {label} (AUC = {auc_value:.4f})"

    plt.figure(figsize=(8, 6))
    plt.rcParams.update({'font.size': 12})

    # Diagonal reference first, then the actual curve on top of it.
    plt.plot(false_pos_rate, false_pos_rate, c='red')
    plt.plot(false_pos_rate, true_pos_rate, label=legend_text)

    plt.xlabel("FPR")
    plt.ylabel("TPR")
    plt.title(f"Curva ROC {label}")
    # loc=4 places the legend in the lower-right corner.
    plt.legend(loc=4, numpoints=1)

## Random Forest

Encontrar los parámetros óptimos para el entrenamiento con Grid Search

RandomForestClassifier: https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html

In [None]:
from sklearn.ensemble import RandomForestClassifier

In [None]:
# Hyperparameter grid for the random forest: 2 * 3 * 4 * 4 * 2 = 192 candidates.
# 'random_state' holds a single value so every candidate forest is built with
# the notebook seed.
params = {'criterion': ['gini', 'entropy'],
          'max_depth': [5, 10, 20],
          'min_samples_split': [10, 25, 50, 100],
          'n_estimators': [25, 50, 100, 150],
          'class_weight': ['balanced', 'balanced_subsample'],
          'random_state': [seed]}

In [None]:
from sklearn.model_selection import GridSearchCV

In [None]:
# Base estimator with default settings; the hyperparameters listed in `params`
# (including random_state) are supplied per candidate by the grid search.
rf = RandomForestClassifier()

In [None]:
# Exhaustive search over `params`, ranking candidates by cross-validated ROC AUC.
# The candidate fits are independent of each other, so n_jobs=-1 spreads them
# over all available CPU cores; the scores are unaffected because random_state
# is fixed inside the grid.
gs = GridSearchCV(rf, param_grid = params, scoring = 'roc_auc', n_jobs = -1)

In [None]:
# Run the search on the training split only; the test split is kept for the
# final evaluation further down.
gs.fit(X_train, y_train)

In [None]:
gs.best_estimator_

In [None]:
gs.best_params_

In [None]:
gs.best_score_

In [None]:
# Keep the best model found by the grid search under a shorter name.
best_rf = gs.best_estimator_

In [None]:
# Column 1 of predict_proba is the probability of the second entry of
# best_rf.classes_ (presumably the positive Loan_Status label — confirm).
y_test_pred_proba = best_rf.predict_proba(X_test)[:,1]

In [None]:
from sklearn.metrics import roc_auc_score

In [None]:
roc_auc_score(y_test, y_test_pred_proba)

In [None]:
plot_roc_curve(y_test, y_test_pred_proba, 'Random Forest')

### Importancia de variables

In [None]:
best_rf.feature_importances_

In [None]:
# One row per predictor column, paired with its importance in the tuned forest.
importances = pd.DataFrame({'columna': X_train.columns, 'importance_rf': best_rf.feature_importances_})

In [None]:
importances.sort_values(by = 'importance_rf', ascending = False)

## Random Forest - Boruta

Documentación: https://github.com/scikit-learn-contrib/boruta_py

In [None]:
#!pip install boruta

In [None]:
from boruta import BorutaPy

In [None]:
# BorutaPy reconfigures and refits the estimator object it is given (it resets
# n_estimators / random_state and trains on the shadow-augmented matrix), so it
# gets an unfitted copy; `best_rf` stays the model tuned by the grid search.
from sklearn.base import clone

# random_state=seed makes the shadow-feature shuffling reproducible, in line
# with the seed used for the other random steps of the notebook.
boruta = BorutaPy(estimator = clone(best_rf), n_estimators = 'auto',
                  max_iter = 100, random_state = seed)
# Data is passed as NumPy arrays, as in the BorutaPy documentation examples.
boruta.fit(X_train.values, y_train.values)

In [None]:
# check selected features
boruta.support_

In [None]:
# check ranking of features
boruta.ranking_

In [None]:
# call transform() on X to filter it down to selected features
# NOTE(review): X_filtered is not used again in the visible part of the notebook.
X_filtered = boruta.transform(X_train.values)

In [None]:
# Pair every column name with its Boruta rank and its keep/reject decision.
feature_ranks = list(zip(X_train.columns, boruta.ranking_, boruta.support_))

# Report one line per feature.
for name, rank, keep in feature_ranks:
    print(f'Feature: {name:<25} Rank: {rank},  Keep: {keep}')

In [None]:
# Column names selected by the two Boruta masks: support_ (confirmed features)
# and support_weak_ (tentative features, per the BorutaPy documentation).
green_area = X_train.columns[boruta.support_].to_list()
blue_area = X_train.columns[boruta.support_weak_].to_list()

In [None]:
print('features in the green area:', green_area)
print('features in the blue area:', blue_area)

## Gráficas comerciales para modelos supervisados

In [None]:
#!pip install scikit-plot

In [None]:
from sklearn.linear_model import LogisticRegression

# Settings noted as the best logistic regression
# (presumably tuned in an earlier part — confirm).
lr = LogisticRegression(solver='liblinear', C=0.05)

# Train on the training split.
lr.fit(X_train, y_train)

In [None]:
# Cumulative gains curve.

# What share of the leads (% of the base) has to be worked to capture a given % of the loans?
import scikitplot as skplt
# The full per-class probability matrix is passed on; no column is selected here.
y_probas = lr.predict_proba(X_test)
skplt.metrics.plot_cumulative_gain(y_test, y_probas)
plt.show()

In [None]:
# Lift curve.
import scikitplot as skplt
# Recomputed so this cell can run without the cumulative gains cell.
y_probas = lr.predict_proba(X_test)
skplt.metrics.plot_lift_curve(y_test, y_probas)
plt.show()
