# Homework 4

In [21]:
import warnings
warnings.filterwarnings('ignore')

import dalex as dx
import pandas as pd

from sklearn.base import clone
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [2]:
# Read the hotel bookings table from the working directory and preview its first rows.
csv_path = 'hotel_bookings.csv'
data = pd.read_csv(csv_path)
data.head(n=5)

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [17]:
# Only a hand-picked subset of the booking attributes is modelled (NOT every column of
# the CSV): the target `is_canceled`, eight numeric features and three categorical ones.
categorical_features = ['arrival_date_month', 'deposit_type', 'customer_type']
numeric_features = ['lead_time', 'arrival_date_year', 'adults', 'children', 'babies', 'booking_changes',
            'previous_cancellations', 'is_repeated_guest']

# Build the selection from the two feature lists so the lists and the frame cannot drift apart.
data = data[['is_canceled'] + numeric_features + categorical_features]

# Drop every row that has a missing value in any of the selected columns.
data = data.dropna()

# Keep the target as a 1-D Series: scikit-learn estimators expect y of shape (n_samples,),
# and a one-column DataFrame triggers a DataConversionWarning (hidden here by the global
# warnings filter) and an extra conversion inside dalex.
X = data.drop(columns='is_canceled')
y = data['is_canceled']

# Hold out 10% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=42)

## Random forest

In [26]:
# Preprocessing shared by the models: one-hot encode the categorical columns and
# standardise the numeric ones.
# handle_unknown='ignore' encodes a category that was not present during fit as an
# all-zero row instead of raising, so a rare level that lands only in the held-out
# split cannot crash predict_proba.
categorical_transformer = OneHotEncoder(handle_unknown='ignore')
numeric_transformer = StandardScaler()
preprocessor = ColumnTransformer(
    transformers=[
        ('categorical', categorical_transformer, categorical_features),
        ('numeric', numeric_transformer, numeric_features),
    ]
)

# NOTE: the final step is named 'regressor' although it holds a classifier; the name is
# kept so that any code addressing the step by name keeps working.
forest = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('regressor', RandomForestClassifier(random_state=123)),
])

# Fit on the training split and report ROC AUC of the positive-class probability
# (column 1 of predict_proba) on the held-out split.
forest.fit(X_train, y_train)
print(f'ROC score: {roc_auc_score(y_test, forest.predict_proba(X_test)[:, 1])}')

ROC score: 0.8403758492684325


In [19]:
# Wrap the fitted random forest pipeline in a dalex explainer.
# NOTE(review): the explainer is built on the training split, so every explanation
# derived from it describes behaviour on training data - confirm this is intended.
exp_forest = dx.Explainer(
    model=forest,
    data=X_train,
    y=y_train,
    label='random_forest',
)

Preparation of a new explainer is initiated

  -> data              : 107447 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 107447 values
  -> model_class       : sklearn.ensemble._forest.RandomForestClassifier (default)
  -> label             : random_forest
  -> predict function  : <function yhat_proba_default at 0x000001E599616F70> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.0, mean = 0.371, max = 1.0
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.979, mean = -0.000555, max = 0.987
  -> model_info        : package sklearn

A new explainer has been created!


In [20]:
# Variable importance for the random forest, scored with the '1-auc' loss.
# The computation is randomised, so a fixed random_state keeps the plot repeatable
# across "Restart & Run All".
forest_importance = exp_forest.model_parts(loss_function='1-auc', random_state=42)
forest_importance.plot()

## Adaboost

In [27]:
# Give AdaBoost its own unfitted copy of the preprocessing step. Pipeline does not copy
# the steps it is given, so passing `preprocessor` directly would make this pipeline and
# the random forest pipeline share one transformer object, and fitting one pipeline
# would silently re-fit the transformer used by the other.
# NOTE: the final step is named 'regressor' although it holds a classifier; the name is
# kept so that any code addressing the step by name keeps working.
adaboost = Pipeline(steps=[
    ('preprocessor', clone(preprocessor)),
    ('regressor', AdaBoostClassifier(n_estimators=100, random_state=0)),
])

# Fit on the training split and report ROC AUC of the positive-class probability
# (column 1 of predict_proba) on the held-out split.
adaboost.fit(X_train, y_train)
print(f'ROC score: {roc_auc_score(y_test, adaboost.predict_proba(X_test)[:, 1])}')

ROC score: 0.8231847990679497


In [28]:
# Wrap the fitted AdaBoost pipeline in a dalex explainer.
# NOTE(review): the explainer is built on the training split, so every explanation
# derived from it describes behaviour on training data - confirm this is intended.
exp_adaboost = dx.Explainer(
    model=adaboost,
    data=X_train,
    y=y_train,
    label='adaboost',
)

Preparation of a new explainer is initiated

  -> data              : 107447 rows 11 cols
  -> target variable   : Parameter 'y' was a pandas.DataFrame. Converted to a numpy.ndarray.
  -> target variable   : 107447 values
  -> model_class       : sklearn.ensemble._weight_boosting.AdaBoostClassifier (default)
  -> label             : adaboost
  -> predict function  : <function yhat_proba_default at 0x000001E599616F70> will be used (default)
  -> predict function  : Accepts only pandas.DataFrame, numpy.ndarray causes problems.
  -> predicted values  : min = 0.484, mean = 0.5, max = 0.587
  -> model type        : classification will be used (default)
  -> residual function : difference between y and yhat (default)
  -> residuals         : min = -0.513, mean = -0.129, max = 0.514
  -> model_info        : package sklearn

A new explainer has been created!


In [30]:
# Variable importance for AdaBoost, scored with the '1-auc' loss.
# The computation is randomised, so a fixed random_state keeps the plot repeatable
# across "Restart & Run All".
adaboost_importance = exp_adaboost.model_parts(loss_function='1-auc', random_state=42)
adaboost_importance.plot()

## Summary
As one can notice, `lead_time` (as well as `deposit_type`) is among the most important variables in both cases, although in the random forest its influence is incomparably larger. `arrival_date_month` is quite useful in the random forest but is considered rather uninfluential by the AdaBoost model. The opposite holds for `previous_cancellations`, for instance, which is considered influential by AdaBoost but not by the random forest. There are also some variables, such as `booking_changes` or `arrival_date_year`, which rank in the middle for both models. All in all, there are some visible differences between the two models (also, variables in the random forest tend to produce bigger changes in the loss function than in AdaBoost).