<a href="https://colab.research.google.com/github/omanofx/entregable_1/blob/Proyecto_final_Omar_Fernandez/cross_validation_Omar_Fernandez.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Aplicamos validación cruzada al dataset de hoteles ya procesado y depurado. Contiene todas las nuevas características creadas en el desafio de la clase 45.

## Librerías

In [53]:
import numpy as np
import pandas as pd

# memory management
import gc

import warnings
warnings.filterwarnings('ignore')

from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA

import xgboost as xgb

from sklearn.metrics import mean_squared_error

from sklearn.experimental import enable_halving_search_cv
from sklearn.model_selection import HalvingGridSearchCV, HalvingRandomSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, mean_squared_error, r2_score

from sklearn.ensemble import RandomForestClassifier, RandomForestRegressor, GradientBoostingClassifier
from sklearn.tree import DecisionTreeClassifier

## Funciones propias

In [54]:
def tiene_espacios_en_blanco(columna: str):
    return any(x.isspace() for x in columna)

In [55]:
# Definir la función analizar_dataframe
def analizar_dataframe(df1: pd.DataFrame, porcentaje_tolerancia: float):
    '''
    ## Esta función se utiliza para comprobar la consistencia de los datos de un DataFrame
    ### evita invocar a .info(), .isnull()
    #### Porcentaje de tolerancia: valor porcentual que se tolera para los valores nulos de
    #### cada caracteristica del dataset. Si el valor se encuentra por sobre la tolerancia,
    #### se indica como 'BORRAR' la caracteristica.
    '''
    # Validar que porcentaje_tolerancia esté entre 0 y 100
    if not (0 <= porcentaje_tolerancia <= 100):
        print("Error: El porcentaje de tolerancia debe estar entre 0 y 100.")
        return

    porcentaje_perdidos = df1.isnull().sum() * 100 / len(df1)
    total_nulos = df1.isnull().sum()
    tipo_dato = df1.dtypes
    valores_no_nulos_por_columna = df1.count()
    descripcion = df1.describe().transpose()

    resultado_analisis = pd.DataFrame({
        'Tipo de dato': tipo_dato,
        'Total No nulos': valores_no_nulos_por_columna,
        'Total nulos': total_nulos,
        'Porcentaje Nulos': round(porcentaje_perdidos, 2),
        'Borrar': np.where(porcentaje_perdidos > porcentaje_tolerancia, 'BORRAR', ''),
        'Columna con Espacios': df1.columns.to_series().apply(lambda x: tiene_espacios_en_blanco(x))
    })

    # Agregar las columnas de la descripción al resultado_analisis
    resultado_analisis = pd.concat([resultado_analisis, round(descripcion, 2)], axis=1)

    estilo_resultado = (
        resultado_analisis.style
        .applymap(lambda x: 'background-color: red',
                  subset=pd.IndexSlice[resultado_analisis['Porcentaje Nulos'] > porcentaje_tolerancia, 'Porcentaje Nulos'])
        .background_gradient(cmap='Reds', subset=['Porcentaje Nulos'])
        .applymap(lambda x: 'background-color: red' if x else '',
                  subset=pd.IndexSlice[resultado_analisis['Columna con Espacios'], 'Columna con Espacios'])
        .background_gradient(cmap='Reds', subset=['Columna con Espacios'])
    )


    return estilo_resultado

In [56]:
def calcular_precision_modelo(y_test, y_pred):
  '''
  ## Calcula la precisión del modelo
  '''
  accuracy_puntaje = accuracy_score(y_test, y_pred)

  print("Precisión del modelo: {:.2f}%".format(accuracy_puntaje * 100))

  if accuracy_puntaje > 0.9:
      print("El modelo tiene un alto rendimiento en la clasificación.")
  else:
      print("El modelo podría necesitar mejoras para lograr un rendimiento más alto.")

## Carga del Data set

El dataset ya fue procesado, por lo cual no contiene valores perdidos. Además, contiene las nuevas caraterísticas que fueron añadidas en previos desafios.

In [57]:
#dataset procesado: última version 12-2-2024
df_reservas = pd.read_csv("https://raw.githubusercontent.com/omanofx/entregable_1/Proyecto_final_Omar_Fernandez/df_reservas_ya_procesado.csv", sep = ",")

In [58]:
# Deja unicamente las características númericas ya que las categoricas fueron convertirdas con Label Enconder.
df_reservas_numerico = df_reservas.select_dtypes(include='number').sample(5000) # limito el DF

In [59]:
analizar_dataframe(df_reservas_numerico, porcentaje_tolerancia=0)

Unnamed: 0,Tipo de dato,Total No nulos,Total nulos,Porcentaje Nulos,Borrar,Columna con Espacios,count,mean,std,min,25%,50%,75%,max
is_canceled,int64,5000,0,0.0,,False,5000.0,0.28,0.45,0.0,0.0,0.0,1.0,1.0
lead_time,int64,5000,0,0.0,,False,5000.0,80.84,86.16,0.0,12.0,49.0,128.0,542.0
arrival_date_year,int64,5000,0,0.0,,False,5000.0,2016.21,0.67,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,int64,5000,0,0.0,,False,5000.0,26.76,13.71,1.0,16.0,27.0,37.0,53.0
arrival_date_day_of_month,int64,5000,0,0.0,,False,5000.0,15.66,8.98,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,int64,5000,0,0.0,,False,5000.0,1.0,1.0,0.0,0.0,1.0,2.0,8.0
stays_in_week_nights,int64,5000,0,0.0,,False,5000.0,2.64,1.92,0.0,1.0,2.0,4.0,20.0
adults,int64,5000,0,0.0,,False,5000.0,1.9,0.49,0.0,2.0,2.0,2.0,4.0
children,int64,5000,0,0.0,,False,5000.0,0.15,0.47,0.0,0.0,0.0,0.0,3.0
babies,int64,5000,0,0.0,,False,5000.0,0.01,0.1,0.0,0.0,0.0,0.0,2.0


In [60]:
df_pca = df_reservas_numerico.copy()

# Separar las características (X) y la variable objetivo si es aplicable
X = df_pca.drop('is_canceled', axis=1)
y = df_pca['is_canceled']

# Escalar las características para asegurar que tengan la misma escala
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Aplicar PCA con dos componentes principales para la visualización
pca = PCA(n_components = 2)
principal_components = pca.fit_transform(X_scaled)

# Crear un DataFrame
df_principal_components = pd.DataFrame(data = principal_components, columns=['PCA_1', 'PCA_2'])
df_principal_components['is_canceled'] = y.values

df_principal_components

Unnamed: 0,PCA_1,PCA_2,is_canceled
0,0.876196,-0.005652,0
1,-2.987561,-1.483295,1
2,-1.722205,-2.318029,1
3,1.984305,0.671160,0
4,1.235012,2.201453,0
...,...,...,...
4995,0.403777,0.597419,0
4996,-1.577007,0.692544,0
4997,2.856251,-1.012150,0
4998,-0.000111,-0.798567,0


In [61]:
df_principal_components.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5000 entries, 0 to 4999
Data columns (total 3 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   PCA_1        5000 non-null   float64
 1   PCA_2        5000 non-null   float64
 2   is_canceled  5000 non-null   int64  
dtypes: float64(2), int64(1)
memory usage: 117.3 KB


In [62]:
analizar_dataframe(df_principal_components, 0)

Unnamed: 0,Tipo de dato,Total No nulos,Total nulos,Porcentaje Nulos,Borrar,Columna con Espacios,count,mean,std,min,25%,50%,75%,max
PCA_1,float64,5000,0,0.0,,False,5000.0,0.0,2.01,-8.27,-1.05,0.3,1.17,10.88
PCA_2,float64,5000,0,0.0,,False,5000.0,0.0,1.69,-7.38,-0.87,0.04,0.92,11.34
is_canceled,int64,5000,0,0.0,,False,5000.0,0.28,0.45,0.0,0.0,0.0,1.0,1.0


In [63]:
# Restablecer los índices de ambos DataFrames
df_reservas_numerico_reset = df_reservas_numerico.reset_index(drop=True)
df_principal_components_reset = df_principal_components[['PCA_1', 'PCA_2']].reset_index(drop=True)

# Concatenar los DataFrames restablecidos
df_regresion = pd.concat([df_reservas_numerico_reset, df_principal_components_reset], axis=1)
df_regresion

Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,market_segment_le,reserved_room_type_le,assigned_room_type_le,customer_type_le,mes_arribo_numero,total_pasajeros,total_estadia,es_grupo_familiar,PCA_1,PCA_2
0,0,5,2016,36,28,2,2,2,0,0,...,3,0,0,2,8,2,4,0,0.876196,-0.005652
1,1,85,2015,53,31,0,1,3,0,0,...,6,7,7,2,12,3,1,0,-2.987561,-1.483295
2,1,31,2016,29,13,0,2,2,1,0,...,6,0,0,2,7,3,2,1,-1.722205,-2.318029
3,0,89,2016,36,29,1,1,1,0,0,...,6,0,0,2,8,1,2,0,1.984305,0.671160
4,0,108,2016,18,24,2,3,1,0,0,...,6,0,0,2,4,1,5,0,1.235012,2.201453
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4995,0,9,2016,25,12,2,2,2,0,0,...,6,0,0,2,6,2,4,0,0.403777,0.597419
4996,0,77,2016,31,25,1,4,2,0,0,...,6,3,3,2,7,2,5,0,-1.577007,0.692544
4997,0,1,2016,48,24,0,1,1,0,0,...,5,0,3,2,11,1,1,0,2.856251,-1.012150
4998,0,87,2016,23,29,2,2,0,2,0,...,6,1,1,3,5,2,4,0,-0.000111,-0.798567


In [64]:
# Junta el DF numerico con el PCA
#df_regresion = pd.concat([df_reservas_numerico, df_principal_components[['PCA_1','PCA_2']]], axis=1)
#df_regresion

In [65]:
analizar_dataframe(df_regresion, 0)

Unnamed: 0,Tipo de dato,Total No nulos,Total nulos,Porcentaje Nulos,Borrar,Columna con Espacios,count,mean,std,min,25%,50%,75%,max
is_canceled,int64,5000,0,0.0,,False,5000.0,0.28,0.45,0.0,0.0,0.0,1.0,1.0
lead_time,int64,5000,0,0.0,,False,5000.0,80.84,86.16,0.0,12.0,49.0,128.0,542.0
arrival_date_year,int64,5000,0,0.0,,False,5000.0,2016.21,0.67,2015.0,2016.0,2016.0,2017.0,2017.0
arrival_date_week_number,int64,5000,0,0.0,,False,5000.0,26.76,13.71,1.0,16.0,27.0,37.0,53.0
arrival_date_day_of_month,int64,5000,0,0.0,,False,5000.0,15.66,8.98,1.0,8.0,16.0,23.0,31.0
stays_in_weekend_nights,int64,5000,0,0.0,,False,5000.0,1.0,1.0,0.0,0.0,1.0,2.0,8.0
stays_in_week_nights,int64,5000,0,0.0,,False,5000.0,2.64,1.92,0.0,1.0,2.0,4.0,20.0
adults,int64,5000,0,0.0,,False,5000.0,1.9,0.49,0.0,2.0,2.0,2.0,4.0
children,int64,5000,0,0.0,,False,5000.0,0.15,0.47,0.0,0.0,0.0,0.0,3.0
babies,int64,5000,0,0.0,,False,5000.0,0.01,0.1,0.0,0.0,0.0,0.0,2.0


In [66]:
# probar con los features calculados con PCA
X = df_regresion.drop('is_canceled', axis=1)
y = df_regresion['is_canceled']

# Separación train/test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)

#model = LogisticRegression(max_iter=100, n_jobs=-1)
model= xgb.XGBClassifier(learning_rate=0.001)

# Ajustar modelo
model.fit(X_train, y_train)

# Predicciones
predicciones = model.predict(X_test)

# Llama a la función para calcular la precisión del modelo
calcular_precision_modelo(y_test, predicciones )


Precisión del modelo: 72.07%
El modelo podría necesitar mejoras para lograr un rendimiento más alto.


In [67]:
# Dividir los datos en características (X) y etiquetas (y)
X = df_regresion.drop(columns=['is_canceled'])
y = df_regresion['is_canceled']
print(X.shape, y.shape)

(5000, 32) (5000,)


In [68]:
# Separar en train y test
X_train,X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=42)
print(X_train.shape, X_test.shape)

(3500, 32) (1500, 32)


In [69]:
# Lista de hiperparametros
params_1 = {'criterion': 'gini', 'splitter': 'best', 'max_depth': 5}
params_2 = {'criterion': 'entropy', 'splitter': 'random', 'max_depth': 8}
params_3 = {'criterion': 'gini', 'splitter': 'random', 'max_depth': 10}

In [70]:
# Modelo 1
model.set_params(**params_1).fit(X_train, y_train)
print(f'Accuracy para Modelo 1 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 2
model.set_params(**params_2).fit(X_train, y_train)
print(f'Accuracy para Modelo 2 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')
# Modelo 3
model.set_params(**params_3).fit(X_train, y_train)
print(f'Accuracy para Modelo 3 = {round(accuracy_score(y_test, model.predict(X_test)), 5)}')

Accuracy para Modelo 1 = 0.71667
Accuracy para Modelo 2 = 0.71667
Accuracy para Modelo 3 = 0.71667


In [71]:
params_grid = {
        'min_child_weight': [1, 5, 10],
        'gamma': [0.5, 1, 1.5, 2, 5],
        'subsample': [0.6, 0.8, 1.0],
        'colsample_bytree': [0.6, 0.8, 1.0],
        'max_depth': [5,6,7],
        'criterion':['entropy','gini']
        }

In [72]:
# Función para buscar los hiperparametros
def apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method, cv_method, scoring="accuracy", **kwargs):
    grid_cv = search_method(model, params_grid, scoring=scoring, cv=cv_method, **kwargs)
    grid_cv.fit(X_train, y_train)

    print("Mejores parametros:", grid_cv.best_params_)
    print("Mejor score de CV:", grid_cv.best_score_)
    print(f'Accuracy del modelo: {round(accuracy_score(y_test, grid_cv.predict(X_test)), 5)}')


In [77]:
# Grid Search
apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method=GridSearchCV, cv_method=3)
print()
apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method=GridSearchCV, cv_method=5)

Mejores parametros: {'colsample_bytree': 0.6, 'criterion': 'entropy', 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6}
Mejor score de CV: 0.7231428119287653
Accuracy del modelo: 0.71667

Mejores parametros: {'colsample_bytree': 0.6, 'criterion': 'entropy', 'gamma': 0.5, 'max_depth': 5, 'min_child_weight': 1, 'subsample': 0.6}
Mejor score de CV: 0.7231428571428571
Accuracy del modelo: 0.71667


In [79]:
# Randomized Search CV
apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method=RandomizedSearchCV, cv_method=5)
print()
apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method=RandomizedSearchCV, cv_method=10)

Mejores parametros: {'subsample': 0.8, 'min_child_weight': 1, 'max_depth': 6, 'gamma': 5, 'criterion': 'gini', 'colsample_bytree': 0.6}
Mejor score de CV: 0.7231428571428571
Accuracy del modelo: 0.71667

Mejores parametros: {'subsample': 0.8, 'min_child_weight': 1, 'max_depth': 6, 'gamma': 1, 'criterion': 'entropy', 'colsample_bytree': 1.0}
Mejor score de CV: 0.7231428571428571
Accuracy del modelo: 0.71667


In [80]:
# Halving GridSearch CV
apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method=HalvingGridSearchCV, cv_method=5 ,factor=3)

Mejores parametros: {'colsample_bytree': 0.6, 'criterion': 'gini', 'gamma': 1.5, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 1.0}
Mejor score de CV: 0.7117283950617285
Accuracy del modelo: 0.71667

Mejores parametros: {'colsample_bytree': 0.6, 'criterion': 'gini', 'gamma': 1.5, 'max_depth': 6, 'min_child_weight': 5, 'subsample': 1.0}
Mejor score de CV: 0.7198632133165157
Accuracy del modelo: 0.71667


In [81]:
# Halving Randomized Search
apply_search_cv(model, params_grid, X_train, y_train, X_test, y_test, search_method=HalvingRandomSearchCV, cv_method=8, factor=3)

Mejores parametros: {'subsample': 0.6, 'min_child_weight': 5, 'max_depth': 6, 'gamma': 1, 'criterion': 'entropy', 'colsample_bytree': 1.0}
Mejor score de CV: 0.7171637904674539
Accuracy del modelo: 0.71667
