In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import datetime
import pickle
from tqdm.notebook import tqdm

from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.inspection import permutation_importance
import xgboost as xgb

In [2]:
import pandas as pd
from itertools import product
import math

# Data

In [3]:
data = pd.read_csv('2023_DS2_HW1_data_train.csv', sep = ',', decimal = '.', index_col = 'Booking_ID')

In [4]:
data

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
Booking_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
INN10204,,,,2.0,Meal Plan 2,,Room_Type 6,,2018.0,9.0,,Online,0.0,0.0,,,1.0,0.0
INN20020,,,,2.0,Meal Plan 1,,,,,12.0,,Online,0.0,0.0,0.0,,,0.0
INN16435,1.0,,,2.0,,0.0,Room_Type 1,,2018.0,11.0,,,0.0,0.0,,,1.0,0.0
INN07143,3.0,,,3.0,,,,100.0,2018.0,5.0,,Online,0.0,0.0,,,2.0,0.0
INN20511,1.0,0.0,1.0,1.0,Meal Plan 1,0.0,,,2018.0,11.0,,,0.0,0.0,0.0,150.0,,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
INN16851,2.0,,,,Meal Plan 1,,,43.0,2017.0,12.0,26.0,Offline,,0.0,0.0,,2.0,0.0
INN06266,1.0,,,2.0,Meal Plan 2,0.0,Room_Type 1,102.0,2017.0,10.0,,Online,,0.0,,80.0,,1.0
INN11285,2.0,,,2.0,,,,,2018.0,5.0,,Corporate,0.0,0.0,0.0,,1.0,0.0
INN00861,2.0,,,3.0,Meal Plan 1,0.0,,,2018.0,6.0,,Online,,0.0,,,0.0,1.0


In [5]:
# define list of predictors
cols_pred = list(data.columns[0:-1])

# define list of numerical predictors
cols_pred_num = [col for col in cols_pred if data[col].dtype != 'O']
# define list of categorical predictors
cols_pred_cat = [col for col in cols_pred if data[col].dtype == 'O']


Booking_ID: unique identifier of each booking
no_of_adults: Number of adults
no_of_children: Number of Children
no_of_weekend_nights: Number of weekend nights (Saturday or Sunday) the guest stayed or booked to stay at the hotel
no_of_week_nights: Number of week nights (Monday to Friday) the guest stayed or booked to stay at the hotel
type_of_meal_plan: Type of meal plan booked by the customer:
required_car_parking_space: Does the customer require a car parking space? (0 - No, 1- Yes)
room_type_reserved: Type of room reserved by the customer. The values are ciphered (encoded) by INN Hotels.
lead_time: Number of days between the date of booking and the arrival date
arrival_year: Year of arrival date
arrival_month: Month of arrival date
arrival_date: Date of the month
market_segment_type: Market segment designation.
repeated_guest: Is the customer a repeated guest? (0 - No, 1- Yes)
no_of_previous_cancellations: Number of previous bookings that were canceled by the customer prior to the current booking
no_of_previous_bookings_not_canceled: Number of previous bookings not canceled by the customer prior to the current booking
avg_price_per_room: Average price per day of the reservation; prices of the rooms are dynamic. (in euros)
no_of_special_requests: Total number of special requests made by the customer (e.g. high floor, view from the room, etc)
booking_status: Flag indicating if the booking was canceled or not.

In [6]:
cols_pred_num

#arrival date je blbost kterou vyhodim, ale arrival month bych asi mohla udelat kategorickej
#ostatni promenny jsou bud skutecne numericky nebo 0-1

['no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'required_car_parking_space',
 'lead_time',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'no_of_special_requests']

In [7]:
cols_pred_num.remove('arrival_month')
cols_pred_cat.append('arrival_month')

In [8]:
data = data.dropna(subset=['booking_status'])
data.booking_status.value_counts(dropna=False)

0.0    21774
1.0    10521
Name: booking_status, dtype: int64

In [9]:

#kdybychom chteli stratifikovat podle neceho dalsiho, museli bychom zlikvidovat unavailable data, coz je skoda. zaroven stratifikace podle booking status asi trochu pomaha - soude z grafu nize
data_train, data_rest = train_test_split(data, test_size=0.4, random_state = 12,stratify = (data[["booking_status"]]))
data.loc[data_train.index, 'sample'] = 'train'
# use train_test_split to split the rest into valid and test (don't forget seed); use stratification
data_valid, data_test = train_test_split(data_rest, test_size=0.5, random_state = 12, stratify = (data_rest[["booking_status"]]))
data.loc[data_valid.index, 'sample'] = 'valid'
data.loc[data_test.index, 'sample'] = 'test'

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data.loc[data_train.index, 'sample'] = 'train'


In [10]:
train_mask = (data['sample'] == 'train')
valid_mask = (data['sample'] == 'valid')
test_mask = (data['sample'] == 'test')

In [11]:
# define function to plot default rate in time for different samples
def default_rate_in_time_per_sample(dt, col_target, col_month, col_sample):
    # group by over month and sample
    dt_grp = dt.groupby([col_month, col_sample]).agg(
        def_rt = (col_target, np.mean)
    ).reset_index()
    
    # pivot sample values to columns
    dt_grp_pivot = dt_grp.pivot(index = col_month, columns = col_sample, values = 'def_rt')

    # plot default rate in time
    lines = plt.plot(range(len(dt_grp_pivot)), dt_grp_pivot, marker = 'o')
    plt.xticks(range(len(dt_grp_pivot)), dt_grp_pivot.index, rotation = 90)
    # set legend
    plt.legend(iter(lines), tuple(dt_grp_pivot.columns), loc='best', bbox_to_anchor=(1.05, 1))
    
    plt.ylim([0, 1])
    plt.ylabel('default rate')
    plt.xlabel('month')
    plt.show()

# Data processing

In [12]:
# find columns with infinity values
cols_with_inf = []
for col in cols_pred_num:
    if np.any(np.isinf(data[col])):
        cols_with_inf.append(col)
        print(f'Column {col} includes infinity values.')

# find columns with negative infinity values
cols_with_neginf = []
for col in cols_pred_num:
    if np.any(np.isneginf(data[col])):
        cols_with_neginf.append(col)
        print(f'Column {col} includes negative infinity values.')
print("No other columns with infinity values")

No other columns with infinity values


In [13]:
col_target = "booking_status"

# categorical variables ecoding

In [14]:
def mean_target_encoding(dt, predictor, target, alpha = 0.01):
    total_cnt = len(dt)
    total_dr = np.mean(dt[target])
    dt_grp = dt.groupby(predictor).agg(
        categ_dr = (target, np.mean),
        categ_cnt = (target, len)
    )
    
    dt_grp['categ_freq'] = dt_grp['categ_cnt'] / total_cnt
    dt_grp['categ_encoding'] = (dt_grp['categ_freq'] * dt_grp['categ_dr'] + alpha * total_dr) / (dt_grp['categ_freq'] + alpha)
    
    return dt_grp[['categ_encoding']].to_dict()['categ_encoding']

In [15]:
total_dr = np.mean(data[train_mask][col_target])

# encode categorical predictors
for pred in tqdm(cols_pred_cat):
    if len(data[pred].unique()) < 5:
        dummies = pd.get_dummies(
            data[pred], 
            prefix = pred,
            prefix_sep = '_',
            dummy_na = True if data[pred].isnull().sum() > 0 else False,
            drop_first = False
        )
        
        for d in dummies.columns:
            if d in data.columns:
                del data[d]
                
        data = data.join(dummies)
        
        for col in dummies.columns:
            if col not in cols_pred:
                cols_pred.append(col)
        
        if pred in cols_pred:
            cols_pred.remove(pred)
    else:
        new_vals = mean_target_encoding(
            dt=data[train_mask], 
            predictor=pred, 
            target=col_target
        )

        additional_values = set(data[data[pred].notnull()][pred].unique()) - set(new_vals.keys())
        for p in additional_values:
            new_vals[p] = total_dr

        data['MTE_' + pred] = data[pred].replace(new_vals)
        
        if 'MTE_' + pred not in cols_pred:
            cols_pred.append('MTE_' + pred)
        
        if pred in cols_pred:
            cols_pred.remove(pred)

  0%|          | 0/4 [00:00<?, ?it/s]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['MTE_' + pred] = data[pred].replace(new_vals)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['MTE_' + pred] = data[pred].replace(new_vals)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data['MTE_' + pred] = data[pred].replace(new_vals)
A value is trying to be set on a copy of a slice fro

In [16]:
cols_pred
to_remove = ["arrival_date","no_of_children","repeated_guest","MTE_room_type_reserved"]
questionable = ["avg_price_per_room","MTE_type_of_meal_plan","required_car_parking_space","no_of_previous_bookings_not_canceled"]
rm_1 = ["avg_price_per_room","MTE_type_of_meal_plan"]
rm_2 = ["MTE_type_of_meal_plan","required_car_parking_space"]
rm_3 = ["MTE_type_of_meal_plan","required_car_parking_space","no_of_previous_bookings_not_canceled"]
rm_4 = ["avg_price_per_room","MTE_type_of_meal_plan","required_car_parking_space"]


aaa = cols_pred
bbb = [x for x in cols_pred if x not in to_remove]
ccc = [x for x in bbb if x not in questionable]
ddd = [x for x in bbb if x not in rm_1]
eee = [x for x in bbb if x not in rm_2]
fff = [x for x in bbb if x not in rm_3]
ggg = [x for x in bbb if x not in rm_4]

predictor_set_list = [aaa,bbb,ccc,ddd,eee,fff,ggg]

predictor_set = predictor_set_list[5]

# Model

In [19]:
params = {    
    'max_depth': 5,
    'eta': 0.10,
    'lambda': 0.1,
    "subsample":0.7,
    'objective': 'binary:logistic',
    'eval_metric': 'auc'
}


evals_result = {}
booster = xgb.train(
    verbose_eval=False,
    params = params,
    dtrain = xgb.DMatrix(data[train_mask][predictor_set], data[train_mask][col_target]),
    num_boost_round = 200,
    evals = (
        (xgb.DMatrix(data[train_mask][predictor_set], data[train_mask][col_target]), 'train'),
        (xgb.DMatrix(data[test_mask][predictor_set], data[test_mask][col_target]), 'test'),
        (xgb.DMatrix(data[valid_mask][predictor_set], data[valid_mask][col_target]), 'valid')
    ),
    evals_result = evals_result,
    early_stopping_rounds = 10
)


In [20]:
dtest = xgb.DMatrix(data[valid_mask][predictor_set])
y_pred = booster.predict(dtest)
y_true = data[valid_mask][col_target]
y_true = y_true.values
roc_auc = roc_auc_score( y_true,y_pred)

AUC = roc_auc
AUC


0.8099511945973554

# loading test set

In [21]:
data = pd.read_csv('/Users/pavelrezabek/Desktop/data-science-2-HA_1/data-science-2-HA_1/2023_DS2_HW1_data_test.csv', sep = ',', decimal = '.', index_col = 'Booking_ID')

In [26]:
# define list of predictors
cols_pred = list(data.columns)

# define list of numerical predictors
cols_pred_num = [col for col in cols_pred if data[col].dtype != 'O']
# define list of categorical predictors
cols_pred_cat = [col for col in cols_pred if data[col].dtype == 'O']


In [28]:
cols_pred_num

['no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'required_car_parking_space',
 'lead_time',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'no_of_special_requests']

In [29]:
cols_pred_num

#arrival date je blbost kterou vyhodim, ale arrival month bych asi mohla udelat kategorickej
#ostatni promenny jsou bud skutecne numericky nebo 0-1

['no_of_adults',
 'no_of_children',
 'no_of_weekend_nights',
 'no_of_week_nights',
 'required_car_parking_space',
 'lead_time',
 'arrival_year',
 'arrival_month',
 'arrival_date',
 'repeated_guest',
 'no_of_previous_cancellations',
 'no_of_previous_bookings_not_canceled',
 'avg_price_per_room',
 'no_of_special_requests']

In [30]:
cols_pred_num.remove('arrival_month')
cols_pred_cat.append('arrival_month')


In [39]:
  cols_pred_cat

['market_segment_type', 'arrival_month']

In [38]:
cols_pred_cat = cols_pred_cat[2:4]

In [44]:
data.market_segment_type.value_counts()

Online           1316
Offline           601
Corporate         104
Complementary      19
Aviation            7
Name: market_segment_type, dtype: int64

In [48]:
replace_dict1 = {'Online': 0.363841, 'Offline': 0.307843, 'Corporate': 0.167078, 'Complementary': 0.19804247576622647,'Aviation': 0.32545989}

# Use the replace() method to replace values in the "name" column
data['market_segment_type'] = data['market_segment_type'].replace(replace_dict1)
# Print the updated DataFrame
print(data)





            no_of_adults  no_of_children  no_of_weekend_nights  \
Booking_ID                                                       
INN04969             2.0             NaN                   1.0   
INN34541             2.0             0.0                   NaN   
INN36109             2.0             NaN                   NaN   
INN01554             2.0             NaN                   0.0   
INN24975             2.0             NaN                   NaN   
...                  ...             ...                   ...   
INN29523             1.0             NaN                   NaN   
INN25061             2.0             NaN                   1.0   
INN08475             2.0             NaN                   NaN   
INN13558             1.0             NaN                   NaN   
INN00626             2.0             NaN                   0.0   

            no_of_week_nights type_of_meal_plan  required_car_parking_space  \
Booking_ID                                                    

In [50]:
replace_dict2 = {1: 0.10975141226859428, 2: 0.2649120016467528, 3: 0.2876456095535222, 4: 0.36802913597831044, 5: 0.36072953496010357, 6: 0.3945629908265532, 7: 0.43479006606457415, 8: 0.3784152206481103, 9: 0.3209463849088696, 10: 0.35476825930811823, 11: 0.29137465229584697, 12: 0.15468658181050565}

# Use the replace() method to replace values in the "name" column
data['arrival_month'] = data['arrival_month'].replace(replace_dict2)
# Print the updated DataFrame
print(data)



            no_of_adults  no_of_children  no_of_weekend_nights  \
Booking_ID                                                       
INN04969             2.0             NaN                   1.0   
INN34541             2.0             0.0                   NaN   
INN36109             2.0             NaN                   NaN   
INN01554             2.0             NaN                   0.0   
INN24975             2.0             NaN                   NaN   
...                  ...             ...                   ...   
INN29523             1.0             NaN                   NaN   
INN25061             2.0             NaN                   1.0   
INN08475             2.0             NaN                   NaN   
INN13558             1.0             NaN                   NaN   
INN00626             2.0             NaN                   0.0   

            no_of_week_nights type_of_meal_plan  required_car_parking_space  \
Booking_ID                                                    

In [51]:
data = data.rename(columns={'arrival_month': 'MTE_arrival_month'})
data = data.rename(columns={'market_segment_type': 'MTE_market_segment_type'})

In [56]:
dtrain = xgb.DMatrix(data[predictor_set])
y_pred = booster.predict(dtrain)


In [59]:
xxx = pd.DataFrame(y_pred)

Unnamed: 0,0
0,0.002698
1,0.162917
2,0.187376
3,0.772983
4,0.186984
...,...
3588,0.039724
3589,0.490286
3590,0.948840
3591,0.065873


In [58]:
y_pred.to_csv("/Users/pavelrezabek/Desktop/predictions")

AttributeError: 'numpy.ndarray' object has no attribute 'to_csv'