<a href="https://colab.research.google.com/github/matteeussPei/reservas_de_hotel/blob/main/ML_Previsao_Reserva.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install category_encoders



In [None]:
#importando as bibliotecas necessárias
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
from datetime import datetime

from pandas.plotting import scatter_matrix
import warnings
warnings.filterwarnings("ignore")

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score, StratifiedKFold, RandomizedSearchCV, train_test_split
from sklearn.metrics import make_scorer, roc_auc_score, accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
from category_encoders import TargetEncoder

from imblearn.under_sampling import RandomUnderSampler

from xgboost import XGBClassifier

# Disable pandas' display truncation: show every column and every row when a
# DataFrame is rendered.
# NOTE(review): with max_rows=None, displaying a large frame prints all of its
# rows — use .head()/.sample() when inspecting the full dataset.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [None]:
# Load the hotel bookings CSV directly from GitHub and preview the first five rows
DATA_URL = 'https://raw.githubusercontent.com/matteeussPei/Hotel-Booking-Demand/main/hotel_bookings.csv'
df = pd.read_csv(DATA_URL)
df.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [None]:
# Report the dataset dimensions: number of columns (variables) and rows (entries)
n_rows, n_cols = df.shape
print('Total de variáveis:', n_cols)
print('Total de entradas:', n_rows)

Total de variáveis: 32
Total de entradas: 119390


In [None]:
# Per-column overview: data type, percentage of missing values and number of
# distinct values, ordered so the columns with the most missing data come first
missing_pct = (df.isnull().sum() / df.shape[0]) * 100
column_overview = pd.DataFrame({'tipo de dados': df.dtypes,
                                'dados_ausentes(%)': missing_pct,
                                'valores unicos': df.nunique()})
column_overview.sort_values(by='dados_ausentes(%)', ascending=False)

Unnamed: 0,tipo de dados,dados_ausentes(%),valores unicos
company,float64,94.306893,352
agent,float64,13.686238,333
country,object,0.408744,177
children,float64,0.00335,5
reserved_room_type,object,0.0,10
assigned_room_type,object,0.0,12
booking_changes,int64,0.0,21
deposit_type,object,0.0,3
hotel,object,0.0,2
previous_cancellations,int64,0.0,15


In [None]:
# Remove two identifier columns:
#   'company' - ID of the company/entity that made (or paid for) the booking;
#               the overview above shows it is ~94% missing
#   'agent'   - ID of the travel agency that made the booking
id_columns = ['agent', 'company']
df = df.drop(columns=id_columns)

In [None]:
# New feature 'equal_room': 1 when the room type that was reserved is the same
# as the room type that was assigned, 0 otherwise.
# The comparison is vectorised: it replaces a Python loop that visited every row
# through df.at[i, ...] (slow, and only valid while the frame still has its
# default 0..n-1 index, because df.at looks rows up by label).
df['equal_room'] = (df['reserved_room_type'] == df['assigned_room_type']).astype('int64')


In [None]:
# Missing 'children' values are treated as 0 children, then the column is
# stored as int64 (errors='raise' makes an impossible conversion fail loudly)
df['children'] = df['children'].fillna(0).astype('int64', errors='raise')


In [None]:
# Discard bookings that have no guests at all (0 adults, 0 children and
# 0 babies): such a reservation is not considered possible.
# 'filtro' is True for the rows to be removed.
filtro = df[['babies', 'children', 'adults']].eq(0).all(axis=1)
df = df.loc[~filtro]


In [None]:
# Columns removed before modelling (reasons as given by the original notes):
#   'reservation_status'       - carries the same information as the target:
#                                'Check-Out' corresponds to is_canceled == 0,
#                                'Canceled' and 'No-Show' to is_canceled == 1
#   'reservation_status_date'  - only records the date of the last status change
#   'days_in_waiting_list'     - reported as ~99% zeros, so almost no variability
#   'arrival_date_week_number' - week number within the year
#   'country'                  - NOTE(review): also dropped, but the original
#                                notes give no reason for it
columns_to_drop = [
    'country',
    'reservation_status',
    'reservation_status_date',
    'days_in_waiting_list',
    'arrival_date_week_number',
]
df = df.drop(columns=columns_to_drop)

In [None]:
# Remove fully duplicated rows, keeping the first occurrence of each
df = df.drop_duplicates()


In [None]:
# Bucket the arrival day of the month into a week-of-month number:
# days 1-7 -> 1, 8-14 -> 2, 15-21 -> 3, 22-28 -> 4, 29-31 -> 5.
bins = [0, 7, 14, 21, 28, 32]  # right-closed bin edges; 32 so that day 31 falls in the last bin
labels = [1, 2, 3, 4, 5]       # five bins, therefore five labels

# pd.cut yields a categorical column with one label per bin
week_of_month = pd.cut(
    df['arrival_date_day_of_month'],
    bins=bins,
    labels=labels,
    right=True,
    include_lowest=True,
)

# Append the new column and drop the day-of-month column it was derived from
df = df.assign(week_of_month=week_of_month).drop(columns=['arrival_date_day_of_month'])


In [None]:
# Collect the names of the categorical columns, i.e. those stored with object dtype
cat_cols = []
for col_name in df.columns:
    if df[col_name].dtype == 'O':
        cat_cols.append(col_name)


In [None]:
# Display the list of categorical column names
cat_cols

['hotel',
 'arrival_date_month',
 'meal',
 'market_segment',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type']

In [None]:
# Subset of the data holding only the categorical columns; preview its first rows
cat_var = df.loc[:, cat_cols]
cat_var.head()

Unnamed: 0,hotel,arrival_date_month,meal,market_segment,distribution_channel,reserved_room_type,assigned_room_type,deposit_type,customer_type
0,Resort Hotel,July,BB,Direct,Direct,C,C,No Deposit,Transient
1,Resort Hotel,July,BB,Direct,Direct,C,C,No Deposit,Transient
2,Resort Hotel,July,BB,Direct,Direct,A,C,No Deposit,Transient
3,Resort Hotel,July,BB,Corporate,Corporate,A,A,No Deposit,Transient
4,Resort Hotel,July,BB,Online TA,TA/TO,A,A,No Deposit,Transient


# Preparando dados para o modelo

In [None]:
# Separate the target ('is_canceled') from the features, then make a 70/30
# train/test split that is stratified on the target and uses a fixed seed
y = df['is_canceled']
X = df.drop(columns=['is_canceled'])
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    stratify=y,
    random_state=0,
)

## Variáveis numéricas



### MinMaxScaler

#### Variáveis de treino

Escalonando as colunas numéricas, devido à grande amplitude de seus valores.


In [None]:
# Keep only the columns that will be min-max scaled: everything except the
# categorical columns and 'week_of_month' / 'arrival_date_year', which are
# numeric-looking but treated as categories
train_non_scaled_cols = list(cat_cols) + ['week_of_month', 'arrival_date_year']
X_train_num_cols = X_train.drop(columns=train_non_scaled_cols)

In [None]:
# Fit the MinMaxScaler on the training numeric columns only, then scale them.
# The scaled values are wrapped back into a DataFrame with the original column
# names; no index is passed, so the frame gets a fresh 0..n-1 index (rows stay
# in the same order as X_train).
sca = MinMaxScaler()
sca.fit(X_train_num_cols)
X_train_num_cols_temp = pd.DataFrame(
    sca.transform(X_train_num_cols),
    columns=X_train_num_cols.columns,
)

#### Variáveis de teste

In [None]:
# Scale the test numeric columns with the scaler already fitted on the training
# data (it is not refitted here). Same column selection as for training:
# categorical columns, 'week_of_month' and 'arrival_date_year' are left out.
test_non_scaled_cols = list(cat_cols) + ['week_of_month', 'arrival_date_year']
X_test_num_cols = X_test.drop(columns=test_non_scaled_cols)
X_test_num_cols_temp = pd.DataFrame(
    sca.transform(X_test_num_cols),
    columns=X_test_num_cols.columns,
)

## Variáveis categóricas

### Target Encoder

#### Variáveis de treino

In [None]:
# Instantiate the target encoder, fit it on the training features and target,
# and get back the encoded training features.
# NOTE(review): no `cols` argument is passed, so the encoder decides by itself
# which columns to encode; the preview below shows the string columns and
# 'week_of_month' replaced by floats — confirm this selection is intended.
encoder = TargetEncoder()
X_train_encoded = encoder.fit_transform(X_train, y_train)

In [None]:
# Preview the first rows after applying the target encoder
X_train_encoded.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,equal_room,week_of_month
116530,0.306126,157,2017,0.31707,1,4,2,0,0,0.26803,0.356109,0.313439,0,0,0,0.264136,0.31111,2,0.270558,0.301906,107.1,0,1,1,0.276074
8795,0.2354,342,2016,0.246984,2,5,1,0,0,0.26803,0.283278,0.148569,0,0,0,0.264136,0.231747,1,0.270558,0.160668,44.36,0,0,0,0.27339
58017,0.306126,167,2016,0.246984,0,2,1,0,0,0.26803,0.151389,0.313439,0,0,0,0.264136,0.31111,0,0.270558,0.301906,78.2,0,0,1,0.277622
118274,0.306126,105,2017,0.332128,2,3,2,0,0,0.359812,0.356109,0.313439,0,0,0,0.264136,0.31111,0,0.270558,0.301906,95.0,0,1,1,0.27339
14173,0.2354,7,2016,0.257702,0,1,1,0,0,0.26803,0.122665,0.133174,0,0,6,0.264136,0.24429,0,0.270558,0.301906,35.0,0,0,0,0.27339


#### Variáveis de teste

In [None]:
# Encode the test features with the encoder already fitted on the training
# data (transform only, no refit)
X_test_encoded= encoder.transform(X_test)

## Unindo após transformações as variáveis numéricas e categóricas

## Dados de treino

In [None]:
# Columns present in both the encoded training frame and the scaled numeric
# training frame (i.e. the numeric columns that were min-max scaled)
common_cols = X_train_encoded.columns.intersection(X_train_num_cols_temp.columns)
common_cols

Index(['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
       'adults', 'children', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests', 'equal_room'],
      dtype='object')

In [None]:
# Build the final training matrix: the target-encoded frame with its numeric
# columns replaced by their min-max-scaled versions.
#
# X_train_encoded and X_train_num_cols_temp are both derived from X_train
# without reordering rows, so they correspond row-for-row by position. The
# scaled values are therefore written in positionally. (A pd.merge on the
# numeric columns would compare raw values against scaled values: it is not a
# row-aligned join and can duplicate rows whenever values happen to coincide.)
scaled_train_cols = list(X_train_num_cols_temp.columns)
assert len(X_train_num_cols_temp) == len(X_train_encoded), \
    "scaled and encoded training frames must have the same number of rows"

# reset_index(drop=True) returns a new frame with a 0..n-1 index, leaving
# X_train_encoded itself untouched
X_train_new = X_train_encoded.reset_index(drop=True)
X_train_new[scaled_train_cols] = X_train_num_cols_temp.to_numpy()

# Preview the first rows to confirm the numeric columns are now scaled
X_train_new.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,equal_room,week_of_month
0,0.306126,0.213026,2017,0.31707,0.052632,0.08,0.04,0.0,0.0,0.26803,0.356109,0.313439,0,0.0,0.0,0.264136,0.31111,0.111111,0.270558,0.301906,0.019833,0.0,0.2,1,0.276074
1,0.2354,0.464043,2016,0.246984,0.105263,0.1,0.02,0.0,0.0,0.26803,0.283278,0.148569,0,0.0,0.0,0.264136,0.231747,0.055556,0.270558,0.160668,0.008215,0.0,0.0,0,0.27339
2,0.306126,0.226594,2016,0.246984,0.0,0.04,0.02,0.0,0.0,0.26803,0.151389,0.313439,0,0.0,0.0,0.264136,0.31111,0.0,0.270558,0.301906,0.014481,0.0,0.0,1,0.277622
3,0.306126,0.142469,2017,0.332128,0.105263,0.06,0.04,0.0,0.0,0.359812,0.356109,0.313439,0,0.0,0.0,0.264136,0.31111,0.0,0.270558,0.301906,0.017593,0.0,0.2,1,0.27339
4,0.2354,0.009498,2016,0.257702,0.0,0.02,0.02,0.0,0.0,0.26803,0.122665,0.133174,0,0.0,0.083333,0.264136,0.24429,0.0,0.270558,0.301906,0.006481,0.0,0.0,0,0.27339


## Dados de teste

In [None]:
# Columns present in both the encoded test frame and the scaled numeric test
# frame (i.e. the numeric columns that were min-max scaled)
common_cols = X_test_encoded.columns.intersection(X_test_num_cols_temp.columns)
common_cols

Index(['lead_time', 'stays_in_weekend_nights', 'stays_in_week_nights',
       'adults', 'children', 'babies', 'is_repeated_guest',
       'previous_cancellations', 'previous_bookings_not_canceled',
       'booking_changes', 'adr', 'required_car_parking_spaces',
       'total_of_special_requests', 'equal_room'],
      dtype='object')

In [None]:
# Build the final test matrix: the target-encoded frame with its numeric
# columns replaced by their min-max-scaled versions.
#
# X_test_encoded and X_test_num_cols_temp are both derived from X_test without
# reordering rows, so they correspond row-for-row by position. The scaled
# values are therefore written in positionally. (A pd.merge on the numeric
# columns would compare raw values against scaled values: it is not a
# row-aligned join and can duplicate rows whenever values happen to coincide.)
scaled_test_cols = list(X_test_num_cols_temp.columns)
assert len(X_test_num_cols_temp) == len(X_test_encoded), \
    "scaled and encoded test frames must have the same number of rows"

# reset_index(drop=True) returns a new frame with a 0..n-1 index, leaving
# X_test_encoded itself untouched
X_test_new = X_test_encoded.reset_index(drop=True)
X_test_new[scaled_test_cols] = X_test_num_cols_temp.to_numpy()

# Preview the first rows to confirm the numeric columns are now scaled
X_test_new.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,customer_type,adr,required_car_parking_spaces,total_of_special_requests,equal_room,week_of_month
0,0.2354,0.0,2016,0.309977,0.052632,0.0,0.02,0.0,0.0,0.26803,0.356109,0.313439,0,0.0,0.0,0.264136,0.229184,0.0,0.270558,0.301906,0.020185,0.0,0.0,0,0.27339
1,0.2354,0.126187,2015,0.251388,0.0,0.06,0.04,0.0,0.0,0.26803,0.356109,0.313439,0,0.0,0.0,0.302983,0.24429,0.0,0.270558,0.301906,0.016481,0.0,0.2,1,0.276074
2,0.306126,0.0,2016,0.301265,0.0,0.02,0.02,0.0,0.0,0.26803,0.147421,0.148569,0,0.0,0.0,0.264136,0.31111,0.055556,0.270558,0.301906,0.022407,0.0,0.0,1,0.284082
3,0.2354,0.07327,2016,0.246984,0.105263,0.12,0.04,0.0,0.0,0.26803,0.147421,0.148569,0,0.0,0.0,0.302983,0.24429,0.0,0.270558,0.160668,0.011613,0.0,0.0,1,0.284082
4,0.306126,0.024423,2016,0.295403,0.105263,0.02,0.02,0.0,0.0,0.26803,0.147421,0.148569,0,0.0,0.0,0.264136,0.31111,0.0,0.270558,0.301906,0.020802,0.0,0.2,1,0.284129


# Aplicando o modelo

In [None]:
# Cross-validation scheme for the hyper-parameter search:
# 5 shuffled, class-stratified folds with a fixed seed
skf = StratifiedKFold(
    n_splits=5,
    random_state=42,
    shuffle=True,
)

In [None]:
# Class-imbalance weight passed to XGBoost: number of training labels equal
# to 0 divided by the number of training labels equal to 1.
scale_pos_weight = len(y_train[y_train == 0]) / len(y_train[y_train == 1])

# Candidate hyper-parameter values sampled by the randomized search.
param_grid = {
    'max_depth': np.arange(3, 10),  # tree depths 3 to 9 (np.arange excludes the stop value 10)
    'min_child_weight': [1, 2, 5, 10],  # minimum child weights
    'gamma': [0.1, 0.5, 1, 1.5, 2],  # gamma values (regularization)
    'subsample': [0.6, 0.8, 1.0],  # row subsample fractions
    'colsample_bytree': [0.6, 0.8, 1.0],  # column subsample fractions per tree
    'learning_rate': [0.001, 0.01, 0.1, 0.2],  # learning rates
    'n_estimators': [100, 200, 500, 600],  # number of trees
    'reg_alpha': [0, 0.1, 0.5],  # reg_alpha values (regularization)
    'reg_lambda': [1, 1.5, 2]  # reg_lambda values (regularization)
}

In [None]:
# Scorer for the hyper-parameter search: scikit-learn's built-in 'roc_auc'
# scorer, which ranks samples by the model's continuous scores.
# (make_scorer(roc_auc_score, ...) without a score-based response option feeds
# the hard 0/1 output of predict() to roc_auc_score, so the search would
# optimise a thresholded AUC instead of the real ROC AUC.)
from sklearn.metrics import get_scorer

auc_scorer = get_scorer('roc_auc')

# Base model: binary-logistic XGBoost with class re-weighting for the imbalance
xgb = XGBClassifier(use_label_encoder=False, eval_metric='auc', scale_pos_weight=scale_pos_weight, objective= 'binary:logistic')

# Randomized search over param_grid
random_search = RandomizedSearchCV(
    estimator=xgb,  # base model
    param_distributions=param_grid,  # hyper-parameter values to sample from
    n_iter=100,  # number of sampled hyper-parameter combinations
    scoring = auc_scorer,  # metric used to rank the combinations
    verbose=1,  # print progress
    random_state=42,  # seed for the sampling of combinations
    n_jobs=-1,  # use all available cores
    cv = skf  # stratified 5-fold cross-validation
)


# Run the search on the training data
random_search.fit(X_train_new, y_train)

Fitting 5 folds for each of 100 candidates, totalling 500 fits


In [None]:
# Print the winning hyper-parameter combination and keep the corresponding
# best estimator found by the search
print("Melhores hiperparâmetros encontrados:", random_search.best_params_)

best_model = random_search.best_estimator_

Melhores hiperparâmetros encontrados: {'subsample': 1.0, 'reg_lambda': 2, 'reg_alpha': 0.1, 'n_estimators': 200, 'min_child_weight': 2, 'max_depth': 9, 'learning_rate': 0.1, 'gamma': 0.1, 'colsample_bytree': 0.6}


In [None]:
# Score the test set with the tuned model: predicted probability of class 1
# (second column of predict_proba) and the hard class labels
y_pred_proba = best_model.predict_proba(X_test_new)[:, 1]
y_pred = best_model.predict(X_test_new)

In [None]:
# Evaluate the tuned model on the test set.
# AUC uses the predicted probabilities; the remaining metrics use the hard
# labels, with precision/recall/F1 averaged over both classes weighted by support.
weighted_avg = {'average': 'weighted'}

cm = confusion_matrix(y_test, y_pred)
f1 = f1_score(y_test, y_pred, **weighted_avg)
recall = recall_score(y_test, y_pred, **weighted_avg)
precision = precision_score(y_test, y_pred, **weighted_avg)
accuracy = accuracy_score(y_test, y_pred)
auc = roc_auc_score(y_test, y_pred_proba)

print(f"AUC-ROC: {auc}")
print(f"Acurácia: {accuracy}")
print(f"Precisão: {precision}")
print(f"Recall: {recall}")
print(f"F1-Score: {f1}")
print("Matriz de Confusão:")
print(cm)

AUC-ROC: 0.867024231117551
Acurácia: 0.7887401143215097
Precisão: 0.8125874324918204
Recall: 0.7887401143215097
F1-Score: 0.7957341628364898
Matriz de Confusão:
[[14731  3703]
 [ 1693  5415]]


In [None]:
from sklearn.metrics import accuracy_score, classification_report, roc_auc_score
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, ExtraTreesClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from xgboost import XGBClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.neural_network import MLPClassifier
import pandas as pd

# Baseline comparison: fit several off-the-shelf classifiers with default
# hyper-parameters on the training data and evaluate each on the test set.
# Every estimator that uses randomness gets a fixed seed so the reported
# numbers are reproducible from one run to the next.
MODEL_SEED = 42

models = [
    ('Logistic Regression', LogisticRegression()),
    ('Decision Tree', DecisionTreeClassifier(random_state=MODEL_SEED)),
    ('Random Forest', RandomForestClassifier(random_state=MODEL_SEED)),
    ('K-Nearest Neighbors', KNeighborsClassifier()),
    # probability=True is required so predict_proba is available for roc_auc_score
    ('Support Vector Machine', SVC(probability=True, random_state=MODEL_SEED)),
    ('Naive Bayes', GaussianNB()),
    ('Neural Network', MLPClassifier(random_state=MODEL_SEED)),
    ('AdaBoost', AdaBoostClassifier(random_state=MODEL_SEED)),
    ('Gradient Boosting', GradientBoostingClassifier(random_state=MODEL_SEED)),
    ('Extra Trees', ExtraTreesClassifier(random_state=MODEL_SEED)),
    ('XGBClassifier', XGBClassifier(random_state=MODEL_SEED))
]

# Train and evaluate each model; one (name, accuracy, auc, report) tuple per model
results = []

for name, model in models:
    model.fit(X_train_new, y_train)
    y_pred = model.predict(X_test_new)
    y_pred_proba = model.predict_proba(X_test_new)[:, 1]  # probability of class 1
    accuracy = accuracy_score(y_test, y_pred)
    auc = roc_auc_score(y_test, y_pred_proba)
    report = classification_report(y_test, y_pred, output_dict=True)
    results.append((name, accuracy, auc, report))

# Print the results for every model
for name, accuracy, auc, report in results:
    print(f"Model: {name}")
    print(f"Accuracy: {accuracy:.2f}")
    print(f"AUC-ROC: {auc:.2f}")
    print(f"Classification Report:\n{pd.DataFrame(report)}\n")
    print('--'*30)

Model: Logistic Regression
Accuracy: 0.73
AUC-ROC: 0.76
Classification Report:
                      0            1  accuracy     macro avg  weighted avg
precision      0.725716     0.979310  0.727155      0.852513      0.796288
recall         0.999837     0.019977  0.727155      0.509907      0.727155
f1-score       0.841003     0.039156  0.727155      0.440080      0.617860
support    18434.000000  7108.000000  0.727155  25542.000000  25542.000000

------------------------------------------------------------
Model: Decision Tree
Accuracy: 0.74
AUC-ROC: 0.69
Classification Report:
                      0            1  accuracy     macro avg  weighted avg
precision      0.825746     0.538905  0.743912      0.682326      0.745922
recall         0.817728     0.552476  0.743912      0.685102      0.743912
f1-score       0.821718     0.545606  0.743912      0.683662      0.744879
support    18434.000000  7108.000000  0.743912  25542.000000  25542.000000

-----------------------------------