<a href="https://colab.research.google.com/github/lmencisoe/ADL/blob/main/taller%204/Taller4_pipeline.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importa librerías

In [74]:
import pandas as pd
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
from pandas_profiling import ProfileReport
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn import metrics
from sklearn.metrics import ConfusionMatrixDisplay, precision_score, recall_score, f1_score, auc
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.impute import SimpleImputer
from sklearn.compose import make_column_transformer
from sklearn.model_selection import train_test_split
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
import tensorflow as tf
import xgboost as xgb
from xgboost import XGBClassifier
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score
import seaborn as sns
import warnings
import re
warnings.filterwarnings('ignore')

# Lee la data

In [13]:
df_1 = pd.read_json('https://raw.githubusercontent.com/lmencisoe/CDA/main/taller%204/DataSet_Entrenamiento_v1.json')
df_2 = pd.read_json('https://raw.githubusercontent.com/lmencisoe/CDA/main/taller%204/DataSet_Entrenamiento_v2.json')

# Convierte a numérica la variable de Total Charges

In [78]:
df_1['TotalCharges'] = df_1['TotalCharges'].replace("", 0)
df_1['TotalCharges'] = df_1['TotalCharges'].astype(float)
df_2['TotalCharges'] = df_2['TotalCharges'].replace("", 0)
df_2['TotalCharges'] = df_2['TotalCharges'].astype(float)

# Extrae la data entrenamiento 1

In [15]:
X_total = df_1.drop(['Churn', 'customerID'], axis=1)
df_1['target'] = np.where(df_1['Churn']!= 'Yes', 1, 0)
Y_total = df_1['target'].astype(float)

# Train y test

In [17]:
X_train, X_test, y_train, y_test = train_test_split(X_total, Y_total, test_size=0.2, random_state=2022)

# Pipeline limpieza de datos

In [37]:
cat_features = X_total.select_dtypes(exclude=["float64", "int64"]).columns.to_list()
numeric_features = X_total.select_dtypes(["float64", "int64"]).columns.to_list()

categorical_transformer = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='constant', fill_value='missing')),
                ('encoder', OneHotEncoder())
            ]
        )    

numeric_transformer = Pipeline(
            steps=[
                ('imputer', SimpleImputer(strategy='most_frequent')),
                ('scaler', StandardScaler())
            ]
        )

preprocessor = ColumnTransformer(
            transformers=[
                ('numerical', numeric_transformer, numeric_features),
                ('categorical', categorical_transformer, cat_features)
            ]
        )

# Pipeline XGBoost

In [27]:
xgb_pipeline = Pipeline(
            [
                ("preprocessor", preprocessor),
                (
                    "classifier", XGBClassifier(),
                ),
            ]
        )

 # Hiperparámetros de XGBoost

In [76]:
xgb_param_grid = {
    'classifier__learning_rate': np.arange(0.05, 1, 0.05),
    'classifier__max_depth': np.arange(3, 10, 1),
    'classifier__n_estimators': np.arange(50, 200, 50)
}

randomized_roc_auc_xgb = RandomizedSearchCV(estimator=xgb_pipeline, 
                                            param_distributions=xgb_param_grid,
                                            n_iter=50,
                                            scoring='roc_auc',
                                            cv=3, 
                                            verbose=1)

with tf.device('/device:GPU:0'): 
  randomized_roc_auc_xgb.fit(X_train, y_train)
  
print('Best AUC: ', randomized_roc_auc.best_score_)
#print('Estimator: ', randomized_roc_auc.best_estimator_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Score:  0.840700712356648


**AUC Test**

In [77]:
predict_xgb = pd.Series(randomized_roc_auc_xgb.predict_proba(X_test)[:,1])
metrics.roc_auc_score(y_test, predict_xgb)

0.841886705996907

# Pipeline Decision Tree

In [39]:
dt_pipeline = Pipeline(
            [
                ("preprocessor", preprocessor),
                (
                    "classifier", DecisionTreeClassifier(),
                ),
            ]
        )

# Hiperparámetros de Decision Tree

In [41]:
dt_param_grid = {
    'classifier__max_depth': np.arange(3, 10, 1),
    'classifier__max_features': ['auto', 'sqrt', 'log2'],
    'classifier__min_samples_split': np.arange(5, 50, 5)
}

# Perform RandomizedSearchCV
randomized_roc_auc_dt = RandomizedSearchCV(estimator= dt_pipeline, 
                                            param_distributions=dt_param_grid,
                                            n_iter=50,
                                            scoring='roc_auc',
                                            cv=3, 
                                            verbose=1)

# Fit the estimator
with tf.device('/device:GPU:0'): 
  randomized_roc_auc_dt.fit(X_train, y_train)

# Compute metrics
print('Best AUC: ', randomized_roc_auc.best_score_)
#print('Estimator: ', randomized_roc_auc.best_estimator_)

Fitting 3 folds for each of 50 candidates, totalling 150 fits
Score:  0.840700712356648


**AUC Test**

In [75]:
predict = pd.Series(randomized_roc_auc_dt.predict_proba(X_test)[:,1])
metrics.roc_auc_score(y_test, predict)

0.8107297812016725