# 0. IMPORTS

In [16]:
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from category_encoders.count import CountEncoder
from sklearn.ensemble        import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing   import MinMaxScaler, StandardScaler, RobustScaler
import xgboost as xgb
from sklearn.model_selection import cross_val_score
import xgboost as xgb

# Load Data

In [17]:
# Read the labelled training set and the unlabelled test set (paths are relative to the notebook).
train, test = pd.read_csv('../inputs/train.csv'), pd.read_csv('../inputs/test.csv')

In [18]:
# Display (rows, columns) of both frames as a quick sanity check on the load.
train.shape, test.shape

((72159, 15), (48106, 14))

## Feature Engineering

## Feature Class Split (Separar dados em classe y e feature X)

In [19]:
# Separate the label from the predictors; `train` itself is not modified.
target_col = 'Reserva Cancelada'
y_train = train[target_col]
X_train = train.drop(columns=[target_col])

## Pre processing (transformar as colunas qualitativas em quantitativas)

In [20]:
# Names of the predictor columns whose dtype is int64 or float64; these are the columns passed to the scaler.
# NOTE(review): the recorded preview of X_preproc shows `id` with scaled values, so the row identifier
# appears to be selected here and used as a feature — confirm that this is intended.
numeric_cols = X_train.select_dtypes(include=['int64', 'float64']).columns

In [21]:
# Scale the numeric predictors with a RobustScaler fitted on the training data.
# The raw values are read from `train`, which no cell modifies, instead of from X_train, which this
# cell overwrites. On a first run the result is identical, and re-running the cell no longer refits
# the scaler on already-scaled values (which would also corrupt the later `rs.transform` of the test set).
rs = RobustScaler()

rs.fit(train[numeric_cols])
X_train[numeric_cols] = rs.transform(train[numeric_cols])

In [22]:
# Names of the predictor columns stored with dtype `object`; these are handed to the count encoder.
cat_cols = X_train.select_dtypes(include=['object']).columns

In [23]:
# Turn the categorical features into numeric ones by replacing each category
# with its frequency count (CountEncoder), returning a DataFrame.
count = CountEncoder(
    return_df=True,
    cols=cat_cols,
)

X_preproc = count.fit_transform(X_train)

In [24]:
# Display the encoded training frame (scaled numeric columns plus count-encoded categorical columns).
X_preproc

Unnamed: 0,id,Classificação do hotel,Meses da reserva até o check-in,Número de pernoites reservadas,Número de hospedes,Regime de alimentação,Nacionalidade,Forma de Reserva,Já se hospedou anterioremente,Tipo do quarto reservado,Reserva feita por agência de turismo,Reserva feita por empresa,Reserva com Estacionamento,Reserva com Observações
0,-0.439710,24255,0.4,1.0,0.0,8777,6251,59170,69901,51778,62288,68065,4453,42391
1,0.371188,47904,32.8,0.0,0.0,55716,34212,59170,69901,51778,62288,68065,67706,42391
2,0.563649,47904,0.2,0.0,0.0,55716,1464,59170,69901,51778,62288,68065,67706,29542
3,0.250732,47904,2.0,-0.5,0.0,55716,34212,59170,69901,51778,62288,68065,67706,42391
4,0.379183,47904,0.2,-0.5,0.0,55716,34212,59170,69901,51778,62288,68065,67706,42391
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
72154,0.692863,47904,-0.2,0.0,0.0,55716,6251,59170,69901,11798,62288,68065,67706,29542
72155,-0.153929,47904,0.2,-0.5,0.0,7189,2230,59170,69901,51778,62288,68065,4453,29542
72156,-0.928534,24255,0.4,-0.5,0.0,8777,34212,59170,69901,51778,9871,68065,67706,42391
72157,-0.734016,24255,-0.4,2.5,0.0,8777,34212,8834,69901,3924,62288,68065,67706,42391


## Fill NA

In [25]:
# Number of missing values per column after scaling and encoding.
X_preproc.isna().sum()

id                                      0
Classificação do hotel                  0
Meses da reserva até o check-in         0
Número de pernoites reservadas          0
Número de hospedes                      3
Regime de alimentação                   0
Nacionalidade                           0
Forma de Reserva                        0
Já se hospedou anterioremente           0
Tipo do quarto reservado                0
Reserva feita por agência de turismo    0
Reserva feita por empresa               0
Reserva com Estacionamento              0
Reserva com Observações                 0
dtype: int64

In [26]:
# Replace every remaining missing value with 0.
# NOTE(review): if the affected column is one of the RobustScaler-scaled columns, 0 in scaled space
# corresponds to the training median (default centering) — confirm that this is the intended imputation.
X_preproc = X_preproc.fillna(0)

## Treinando Modelo - Random Forest e XGBoost

In [13]:
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import RandomizedSearchCV 

## Otimização de hiperparâmetros

In [14]:
# Disabled randomized hyper-parameter search for the random forest. The whole cell is commented out
# (presumably to avoid repeating the search on every run); the output recorded below it comes from
# an earlier execution.
# NOTE(review): the search scores with 'f1' while the evaluation cells use 'f1_macro', so the
# reported numbers are not directly comparable.
# NOTE(review): 'auto' for max_features is not accepted by recent scikit-learn releases — verify
# against the installed version before re-enabling.
# rf = RandomForestClassifier(random_state = 1)
# param_distributions =  {'n_estimators': [200, 400, 500],
#                'criterion':['gini','entropy'],
#                                   'bootstrap': [True],
#                                   'max_depth': [10, 20],
#                                   'max_features': ['auto','sqrt', 10],
#                                   'min_samples_leaf': [2,3],
#                                   'min_samples_split': [2,3]}
                                  
# # Instantiate the RandomizedSearchCV object
# random_search = RandomizedSearchCV(rf, param_distributions=param_distributions, n_iter=3, scoring='f1', cv=5)

# # Fit the search
# random_search.fit(X_preproc, y_train)

# # Print the best parameters and the model's F1-score
# print('Melhores parâmetros:', random_search.best_params_)
# print('F1-score:', random_search.best_score_)

Melhores parâmetros: {'n_estimators': 400, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_features': 10, 'max_depth': 20, 'criterion': 'gini', 'bootstrap': True}
F1-score: 0.9636777302219226

In [79]:
# Random forest configured with the values reported by the hyper-parameter search output above.
rf_params = {
    'n_estimators': 400,
    'min_samples_split': 2,
    'min_samples_leaf': 2,
    'max_features': 10,
    'max_depth': 20,
    'criterion': 'gini',
    'bootstrap': True,
}
rf = RandomForestClassifier(**rf_params, n_jobs=-1, random_state=1)

# Fit on the full training set. cross_val_score below fits its own clones of `rf`,
# so the scores do not depend on this fit.
rf.fit(X_preproc, y_train)

# 5-fold cross-validated macro-averaged F1.
scores = cross_val_score(rf, X_preproc, y_train, scoring='f1_macro', cv=5)

print("F1-score Médio:", scores.mean())

F1-score Médio: 0.9710898083488955


In [27]:
# Gradient-boosted trees (XGBoost) with fixed hyper-parameters.
#
# The class is imported by name instead of being reached through the `xgb` module alias: the
# assignment below rebinds `xgb` to the model, so the previous `xgb = xgb.XGBClassifier(...)`
# raised AttributeError as soon as this cell was executed a second time. The model keeps the
# name `xgb` because the prediction cell calls `xgb.predict(...)`.
from xgboost import XGBClassifier

xgb = XGBClassifier(objective='binary:logistic',
                    n_estimators=400,
                    eta=0.01,
                    max_depth=20,
                    subsample=0.7,
                    colsample_bytree=0.9,
                    random_state=1)

# Fit on the full training set; this fitted model is the one used for the test predictions.
xgb.fit(X_preproc, y_train)

# 5-fold cross-validated macro-averaged F1 (cross_val_score fits its own clones of the estimator).
scores = cross_val_score(xgb, X_preproc, y_train, cv=5, scoring='f1_macro')

print("F1-score Médio:", scores.mean())

F1-score Médio: 0.9715118007009957


## Aplicando transformações no teste

In [28]:
# (disabled) earlier variant that dropped `id` before transforming:
# X_test = test.drop(columns='id')

# Work on a copy so `test` keeps its raw values (its `id` column is needed for the submission).
X_test = test.copy()

# Re-use the transformers fitted on the training data, in the same order as for training:
# scale the numeric columns, count-encode the categorical ones, then fill gaps with 0.
X_test[numeric_cols] = rs.transform(X_test[numeric_cols])
X_test = count.transform(X_test).fillna(0)

## Predict test

In [29]:
# Predict labels for the transformed test set.
# Note: despite the `_rf` suffix, these predictions come from `xgb`, which the XGBoost cell binds
# to an XGBClassifier, not from the random forest `rf`.
predicao_rf = xgb.predict(X_test)

## Solucao Final

In [30]:
# Pair each test `id` with its predicted label. The two pieces are aligned on the index,
# so this relies on `test` having the default 0..n-1 index.
predicted_labels = pd.Series(predicao_rf, name='Reserva Cancelada')
df_final_rf = pd.concat([test['id'], predicted_labels], axis='columns')

df_final_rf.head()

Unnamed: 0,id,Reserva Cancelada
0,118345,0
1,9500,1
2,34558,0
3,70816,1
4,105321,0


In [31]:
# Write the submission file (without the index column), ready to be uploaded.
df_final_rf.to_csv(path_or_buf='submission_tuned_xgb.csv', index=False)