In [33]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

import itertools
import joblib

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
columns = [
    # unique id
    u'reservation_id', 
    # dates
    u'booking_date', u'checkin_date', u'checkout_date',
    # traveller related info
    u'memberid', u'numberofadults', u'numberofchildren', u'total_pax', u'persontravellingid', u'member_age_buckets', 
    u'state_code_residence',
    # resort info
    u'resort_id', u'resort_region_code', u'resort_type_code', u'state_code_resort', u'cluster_code',
    # booking info
    u'channel_code', u'booking_type_code', u'room_type_booked_code', u'roomnights', u'season_holidayed_code', 
    u'reservationstatusid_code', 
    # product purchased
    u'main_product_code',
    # target 
    u'amount_spent_per_room_night_scaled'
]


In [3]:
train = pd.read_csv('train.csv')
print train.shape
test = pd.read_csv('test.csv')
print test.shape

(341424, 24)
(146765, 23)


In [4]:
df = pd.concat([train, test], axis=0, sort=False)
df.reset_index(drop=True, inplace=True)
df.shape

(488189, 24)

In [5]:
df.booking_date = pd.to_datetime(df.booking_date, format='%d/%m/%y')
df.checkin_date = pd.to_datetime(df.checkin_date, format='%d/%m/%y')
df.checkout_date = pd.to_datetime(df.checkout_date, format='%d/%m/%y')

In [6]:
df = df[columns]
df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346


In [7]:
df['booking_in_advance_days'] = (df.checkin_date - df.booking_date).dt.days
df['booking_roomnights'] = (df.checkout_date - df.checkin_date).dt.days
df['total_persons_travelling'] = df.numberofadults + df.numberofchildren

In [8]:
# roomnights, numberofadults, numberofchildren, total_pax

In [9]:
def create_date_variables(df, date_key, week=False, month=False, year=False, dayofweek=False, dayofmonth=False, dayofyear=False):
    if week:
        df['{}_{}'.format(date_key, 'week')] = df[date_key].dt.week
    if month:
        df['{}_{}'.format(date_key, 'month')] = df[date_key].dt.month
    if year:
        df['{}_{}'.format(date_key, 'year')] = df[date_key].dt.year
    if dayofweek:
        df['{}_{}'.format(date_key, 'dayofweek')] = df[date_key].dt.dayofweek
    if dayofmonth:
        df['{}_{}'.format(date_key, 'dayofmonth')] = df[date_key].dt.day
    if dayofyear:
        df['{}_{}'.format(date_key, 'dayofyear')] = df[date_key].dt.dayofyear
    return df 

def num_div_interactions(df, num_cols):
    for col1, col2 in itertools.combinations(num_cols,2):
        df["{}_div_{}".format(col1, col2)] = (df[col1]/df[col2]).replace([np.inf, -np.inf, np.nan], 0)
    return df

def num_dif_interactions(df, num_cols):
    for col1, col2 in itertools.combinations(num_cols,2):
        df["{}_dif_{}".format(col1, col2)] = df[col1] - df[col2]
    return df
    
def cat_interactions(df, cat_cols):
    for col1, col2 in itertools.combinations(cat_cols, 2):
        print '{}_{}'.format(col1, col2) 
        df['{}_{}'.format(col1, col2)] = df[col1].astype(str) + '_' + df[col2].astype(str)
    return df

    
def create_group_variables(df, group_col, num_col):
    #     count, mean, std, min, max, x-mean, median, x-median, iqr 
    if isinstance(group_col, str):
        group_col = [group_col]
    if isinstance(num_col, str):
        num_col = [num_col]
        
    df_group = df.groupby(group_col)
    
    functions = {
        'mean':'mean',
        'max':'max',
        'min':'min',
        'std':'std',
        'median':'median',
#         'min-max': lambda x: x.max()- x.min()
    }
    final_columns, apply_dict = [], {}
    prefix = 'GRP_' + '_'.join(map(str, group_col))
    for col in num_col:
        methods = []            
        for key, val in functions.iteritems():
            methods.append(val)
            final_columns.append("{}#{}#{}".format(prefix , col, key)) 
        apply_dict[col] = methods
        
    df_var = df_group.agg(apply_dict)
    df_var.columns = final_columns
    df_var.reset_index(inplace=True)
    
    df = df.merge(df_var, how='left', on=group_col)

    for col in num_col:
        df["{}#{}#{}".format(prefix , col, 'min_mean')] = df[col] - df["{}#{}#{}".format(prefix , col, 'mean')]
        
    return df


In [10]:
# df.head()
df = create_date_variables(df, 'checkin_date',  week=True, month=True, year=True, dayofweek=True, dayofmonth=True, dayofyear=True)
df = create_date_variables(df, 'checkout_date',  week=True, dayofweek=True)
df = create_date_variables(df, 'booking_date',  week=True, month=True, year=True, dayofyear=True)


# df = create_date_variables(df, 'checkout_date')
# df = create_date_variables(df, 'booking_date')

In [11]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear'],
      dtype='object')

In [12]:
df = cat_interactions(df, ['resort_id', 'checkin_date'])
df = cat_interactions(df, ['resort_id', 'checkout_date'])
df = cat_interactions(df, ['resort_id', 'booking_date'])

df = cat_interactions(df, ['resort_id', 'channel_code'])
df = cat_interactions(df, ['resort_id', 'booking_type_code'])
df = cat_interactions(df, ['resort_id', 'reservationstatusid_code'])
df = cat_interactions(df, ['resort_id', 'resort_type_code'])
df = cat_interactions(df, ['resort_id', 'cluster_code'])
df = cat_interactions(df, ['resort_id', 'room_type_booked_code'])


df = cat_interactions(df, ['persontravellingid', 'member_age_buckets', 'state_code_residence', 'state_code_resort'])
df = cat_interactions(df, ['resort_id', 'memberid'])

df = cat_interactions(df, ['memberid', 'channel_code'])
df = cat_interactions(df, ['memberid', 'booking_type_code'])
df = cat_interactions(df, ['memberid', 'reservationstatusid_code'])
df = cat_interactions(df, ['memberid', 'resort_type_code'])
df = cat_interactions(df, ['memberid', 'cluster_code'])
df = cat_interactions(df, ['memberid', 'room_type_booked_code'])

df = cat_interactions(df, ['memberid', 'checkin_date'])
df = cat_interactions(df, ['memberid', 'checkout_date'])
df = cat_interactions(df, ['memberid', 'booking_date'])

df = cat_interactions(df, ['memberid', 'checkin_date_month'])
df = cat_interactions(df, ['memberid', 'checkin_date_week'])
df = cat_interactions(df, ['memberid', 'checkin_date_dayofweek'])
df = cat_interactions(df, ['memberid', 'checkout_date_week'])
df = cat_interactions(df, ['memberid', 'checkout_date_dayofweek'])
df = cat_interactions(df, ['memberid', 'booking_date_week'])


df = cat_interactions(df, ['resort_id', 'booking_roomnights'])
df = cat_interactions(df, ['memberid', 'booking_roomnights'])


resort_id_checkin_date
resort_id_checkout_date
resort_id_booking_date
resort_id_channel_code
resort_id_booking_type_code
resort_id_reservationstatusid_code
resort_id_resort_type_code
resort_id_cluster_code
resort_id_room_type_booked_code
persontravellingid_member_age_buckets
persontravellingid_state_code_residence
persontravellingid_state_code_resort
member_age_buckets_state_code_residence
member_age_buckets_state_code_resort
state_code_residence_state_code_resort
resort_id_memberid
memberid_channel_code
memberid_booking_type_code
memberid_reservationstatusid_code
memberid_resort_type_code
memberid_cluster_code
memberid_room_type_booked_code
memberid_checkin_date
memberid_checkout_date
memberid_booking_date
memberid_checkin_date_month
memberid_checkin_date_week
memberid_checkin_date_dayofweek
memberid_checkout_date_week
memberid_checkout_date_dayofweek
memberid_booking_date_week
resort_id_booking_roomnights
memberid_booking_roomnights


In [13]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear',
       u'resort_id_checkin

In [14]:
df = num_div_interactions(df, num_cols=['numberofadults', 'numberofchildren','total_pax',  
                                        'roomnights', 'booking_in_advance_days', 'booking_roomnights', 
                                        'total_persons_travelling'])

df = num_dif_interactions(df, num_cols=[ 'roomnights', 'booking_roomnights'])
df = num_dif_interactions(df, num_cols=[ 'total_persons_travelling', 'total_pax'])
df = num_dif_interactions(df, num_cols=[ 'checkout_date_week', 'checkin_date_week', 'booking_date_week'])
df.shape
                                        

(488189, 98)

In [15]:
for c in [u'booking_date', u'checkin_date', u'checkout_date',u'memberid', u'resort_id',
        u'resort_id_checkin_date', u'resort_id_checkout_date',
         ]:
    print c
    df = create_group_variables(df, group_col=c, num_col=[
        u'numberofadults', u'numberofchildren', u'total_pax',u'roomnights', u'booking_in_advance_days'
    ])
    print df.shape

booking_date
(488189, 128)
checkin_date
(488189, 158)
checkout_date
(488189, 188)
memberid
(488189, 218)
resort_id
(488189, 248)
resort_id_checkin_date
(488189, 278)
resort_id_checkout_date
(488189, 308)


In [16]:
df.shape
df.head()

(488189, 308)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_month,booking_date_year,booking_date_dayofyear,resort_id_checkin_date,resort_id_checkout_date,resort_id_booking_date,resort_id_channel_code,resort_id_booking_type_code,resort_id_reservationstatusid_code,resort_id_resort_type_code,resort_id_cluster_code,resort_id_room_type_booked_code,persontravellingid_member_age_buckets,persontravellingid_state_code_residence,...,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#numberofadults#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#numberofchildren#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#min_mean
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,4,2018,95,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,46_F,46_7.0,...,13.758114,43,0,19.0,19.571429,0.95119,4,2,2.0,2.714286,0.377964,1,0,0.0,0.142857,-0.285714,-3.571429,-16.571429,-1.714286,-0.142857,1.095445,4,2,2.0,2.8,0.83666,3,1,2.0,2.2,37.19543,93,0,15.0,29.0,1.140175,5,2,4.0,3.6,0.0,0,0,0.0,0.0,-0.8,-2.2,-26.0,-2.6,0.0
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,1,2015,23,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,46_F,46_7.0,...,28.604445,87,14,51.5,49.25,0.46291,3,2,2.0,2.25,0.707107,2,0,0.0,0.25,-1.0,-4.125,-47.25,2.75,77.75,1.035098,4,2,2.0,2.75,1.85164,8,2,3.5,4.0,32.780656,102,15,92.0,77.5,0.755929,4,2,2.0,2.5,0.707107,2,0,0.0,0.25,-0.75,-4.0,-75.5,2.5,77.75
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,1,2015,28,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,47_F,47_7.0,...,47.807194,120,4,92.0,71.444444,0.866025,4,2,2.0,2.666667,0.666667,2,0,0.0,0.222222,-0.888889,-2.888889,-69.444444,1.333333,3.777778,0.95119,4,2,2.0,2.714286,1.112697,6,3,3.0,3.714286,47.088569,93,4,92.0,55.0,0.48795,3,2,2.0,2.285714,0.377964,1,0,0.0,0.142857,-0.714286,-3.714286,-53.0,1.714286,3.857143
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,5,2015,122,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,19.348299,85,15,37.0,38.76,0.988264,6,2,3.0,3.32,0.869866,2,0,0.0,0.56,-1.04,-2.52,-36.76,1.68,39.44,1.240347,6,2,3.0,3.0,2.970414,14,1,4.0,4.851852,30.394744,103,0,41.0,43.925926,1.217161,6,2,3.0,3.407407,0.733799,2,0,0.0,0.333333,-1.0,-2.851852,-41.925926,1.592593,39.666667
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,9,2015,245,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,39.8239,104,6,103.0,75.407407,1.206582,6,1,2.0,2.925926,0.802418,2,0,0.0,0.481481,-0.62963,-4.185185,-73.407407,2.074074,102.518519,1.258091,6,1,2.0,2.93617,2.611261,14,1,3.0,4.085106,41.120837,106,0,19.0,45.893617,1.062784,6,1,3.0,2.851064,0.649486,2,0,0.0,0.276596,-0.93617,-4.085106,-43.893617,2.148936,102.723404


In [17]:
df.shape
df.head()

(488189, 308)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_month,booking_date_year,booking_date_dayofyear,resort_id_checkin_date,resort_id_checkout_date,resort_id_booking_date,resort_id_channel_code,resort_id_booking_type_code,resort_id_reservationstatusid_code,resort_id_resort_type_code,resort_id_cluster_code,resort_id_room_type_booked_code,persontravellingid_member_age_buckets,persontravellingid_state_code_residence,...,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#numberofadults#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#numberofchildren#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#total_pax#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#roomnights#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_i_n___d_a_t_e#booking_in_advance_days#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#std,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#max,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#min,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#median,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofadults#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#numberofchildren#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#total_pax#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#roomnights#min_mean,GRP_r_e_s_o_r_t___i_d___c_h_e_c_k_o_u_t___d_a_t_e#booking_in_advance_days#min_mean
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,4,2018,95,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,46_F,46_7.0,...,13.758114,43,0,19.0,19.571429,0.95119,4,2,2.0,2.714286,0.377964,1,0,0.0,0.142857,-0.285714,-3.571429,-16.571429,-1.714286,-0.142857,1.095445,4,2,2.0,2.8,0.83666,3,1,2.0,2.2,37.19543,93,0,15.0,29.0,1.140175,5,2,4.0,3.6,0.0,0,0,0.0,0.0,-0.8,-2.2,-26.0,-2.6,0.0
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,1,2015,23,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,46_F,46_7.0,...,28.604445,87,14,51.5,49.25,0.46291,3,2,2.0,2.25,0.707107,2,0,0.0,0.25,-1.0,-4.125,-47.25,2.75,77.75,1.035098,4,2,2.0,2.75,1.85164,8,2,3.5,4.0,32.780656,102,15,92.0,77.5,0.755929,4,2,2.0,2.5,0.707107,2,0,0.0,0.25,-0.75,-4.0,-75.5,2.5,77.75
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,1,2015,28,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,47_F,47_7.0,...,47.807194,120,4,92.0,71.444444,0.866025,4,2,2.0,2.666667,0.666667,2,0,0.0,0.222222,-0.888889,-2.888889,-69.444444,1.333333,3.777778,0.95119,4,2,2.0,2.714286,1.112697,6,3,3.0,3.714286,47.088569,93,4,92.0,55.0,0.48795,3,2,2.0,2.285714,0.377964,1,0,0.0,0.142857,-0.714286,-3.714286,-53.0,1.714286,3.857143
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,5,2015,122,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,19.348299,85,15,37.0,38.76,0.988264,6,2,3.0,3.32,0.869866,2,0,0.0,0.56,-1.04,-2.52,-36.76,1.68,39.44,1.240347,6,2,3.0,3.0,2.970414,14,1,4.0,4.851852,30.394744,103,0,41.0,43.925926,1.217161,6,2,3.0,3.407407,0.733799,2,0,0.0,0.333333,-1.0,-2.851852,-41.925926,1.592593,39.666667
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,9,2015,245,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,39.8239,104,6,103.0,75.407407,1.206582,6,1,2.0,2.925926,0.802418,2,0,0.0,0.481481,-0.62963,-4.185185,-73.407407,2.074074,102.518519,1.258091,6,1,2.0,2.93617,2.611261,14,1,3.0,4.085106,41.120837,106,0,19.0,45.893617,1.062784,6,1,3.0,2.851064,0.649486,2,0,0.0,0.276596,-0.93617,-4.085106,-43.893617,2.148936,102.723404


In [18]:
# group resort date and checkin_date for number of bookings for that day

In [19]:
unique_id = 'reservation_id'

date_cols = []

target_col = 'amount_spent_per_room_night_scaled'

cat_cols = [
    'memberid',
    'persontravellingid', 'member_age_buckets', 'state_code_residence', 
    'resort_id', 'resort_region_code', 'resort_type_code', 'state_code_resort', 'cluster_code', 'channel_code',
    'booking_type_code', 'room_type_booked_code', 'season_holidayed_code', 'reservationstatusid_code', 'main_product_code',
    u'checkin_date', u'checkout_date', u'booking_date',
    
u'resort_id_checkin_date', u'resort_id_checkout_date',
u'resort_id_booking_date', u'resort_id_channel_code',
u'resort_id_booking_type_code', u'resort_id_reservationstatusid_code',
u'resort_id_resort_type_code', u'resort_id_cluster_code',
u'resort_id_room_type_booked_code',
u'persontravellingid_member_age_buckets',
u'persontravellingid_state_code_residence',
u'persontravellingid_state_code_resort',
u'member_age_buckets_state_code_residence',
u'member_age_buckets_state_code_resort',
u'state_code_residence_state_code_resort', u'resort_id_memberid',
u'memberid_channel_code', u'memberid_booking_type_code',
u'memberid_reservationstatusid_code', u'memberid_resort_type_code',
u'memberid_cluster_code', u'memberid_room_type_booked_code',
u'memberid_checkin_date', u'memberid_checkout_date',
u'memberid_booking_date', u'memberid_checkin_date_month',
u'memberid_checkin_date_week', u'memberid_checkin_date_dayofweek',
u'memberid_checkout_date_week', u'memberid_checkout_date_dayofweek',
u'memberid_booking_date_week', u'resort_id_booking_roomnights',
u'memberid_booking_roomnights'
]

columns_to_drop = []


In [20]:
from ml_modules.encoding import FreqeuncyEncoding

In [21]:
fE = FreqeuncyEncoding(categorical_columns=cat_cols, return_df=True)

In [22]:
%%time
df1 = fE.fit_transform(df)

CPU times: user 1min 49s, sys: 1min 44s, total: 3min 33s
Wall time: 51.1 s


In [23]:
df1.shape

(488189, 308)

In [24]:
df1.drop(columns=date_cols +[unique_id, target_col] + columns_to_drop, inplace=True)

In [25]:
x_train, y_train = df1[:341424].values, train[target_col].values
x_train.shape, y_train.shape

((341424, 306), (341424,))

In [26]:
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor

In [27]:
from ml_modules.custom_estimator import Estimator
from ml_modules.custom_fold_generator import FoldScheme

In [28]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

#### lgb leaves48 k5

In [29]:
est = Estimator(model=LGBMRegressor(**{
        'n_estimators': 20000, 
        'learning_rate': 0.01,
        'num_leaves': 48, 
        'colsample_bytree': 0.5000000000000001, 
        'subsample': 1.0, 
        'min_child_weight': 150, 
        'boosting_type': 'gbdt'
}), early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold, 
                eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=5)

In [30]:
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.01066	valid_0's l2: 1.02144	valid_1's rmse: 1.00534	valid_1's l2: 1.0107
[200]	valid_0's rmse: 0.988811	valid_0's l2: 0.977748	valid_1's rmse: 0.982048	valid_1's l2: 0.964417
[300]	valid_0's rmse: 0.980048	valid_0's l2: 0.960494	valid_1's rmse: 0.971907	valid_1's l2: 0.944603
[400]	valid_0's rmse: 0.975619	valid_0's l2: 0.951833	valid_1's rmse: 0.965877	valid_1's l2: 0.932918
[500]	valid_0's rmse: 0.973009	valid_0's l2: 0.946747	valid_1's rmse: 0.961467	valid_1's l2: 0.924419
[600]	valid_0's rmse: 0.971469	valid_0's l2: 0.943752	valid_1's rmse: 0.957997	valid_1's l2: 0.917758
[700]	valid_0's rmse: 0.970342	valid_0's l2: 0.941564	valid_1's rmse: 0.9549	valid_1's l2: 0.911834
[800]	valid_0's rmse: 0.969545	valid_0's l2: 0.940017	valid_1's rmse: 0.952111	valid_1's l2: 0.906515
[900]	valid_0's rmse: 0.968967	valid_0's l2: 0.938897	valid_1's rmse: 0.949505	valid_1's l2: 0.901559
[1000]	valid_0's rmse: 0.9

[1500]	valid_0's rmse: 0.96138	valid_0's l2: 0.924251	valid_1's rmse: 0.938064	valid_1's l2: 0.879964
[1600]	valid_0's rmse: 0.961318	valid_0's l2: 0.924132	valid_1's rmse: 0.93611	valid_1's l2: 0.876303
[1700]	valid_0's rmse: 0.961276	valid_0's l2: 0.924051	valid_1's rmse: 0.934198	valid_1's l2: 0.872725
[1800]	valid_0's rmse: 0.961264	valid_0's l2: 0.924028	valid_1's rmse: 0.932312	valid_1's l2: 0.869206
[1900]	valid_0's rmse: 0.961258	valid_0's l2: 0.924018	valid_1's rmse: 0.930488	valid_1's l2: 0.865808
[2000]	valid_0's rmse: 0.961248	valid_0's l2: 0.923998	valid_1's rmse: 0.92865	valid_1's l2: 0.862391
[2100]	valid_0's rmse: 0.961243	valid_0's l2: 0.923989	valid_1's rmse: 0.926814	valid_1's l2: 0.858984
[2200]	valid_0's rmse: 0.961229	valid_0's l2: 0.923962	valid_1's rmse: 0.925011	valid_1's l2: 0.855646
[2300]	valid_0's rmse: 0.961223	valid_0's l2: 0.92395	valid_1's rmse: 0.923239	valid_1's l2: 0.852371
[2400]	valid_0's rmse: 0.961215	valid_0's l2: 0.923934	valid_1's rmse: 0.9214

In [31]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9650834158758828,
 0.002480513290703905,
 [0.9668427733009985,
  0.9645632888724335,
  0.961205703495109,
  0.9642953584136521,
  0.9685099552972207])

In [34]:
joblib.dump(est, 'lgb-308-lve48-k5.pkl')

['lgb-308-lve48-k5.pkl']

In [35]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

  return np.mean(np.column_stack((est.predict(x) for est in self.fitted_models)), axis=1)


In [36]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-308-lve48-k5-test.csv', index=False)

pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-308-lve48-k5-train.csv', index=False)

#### lgb leaves100 k5

In [38]:
est = Estimator(model=LGBMRegressor(**{
        'n_estimators': 20000, 
        'learning_rate': 0.01,
        'num_leaves': 100, 
        'colsample_bytree': 0.5000000000000001, 
        'subsample': 1.0, 
        'min_child_weight': 150, 
        'boosting_type': 'gbdt'
}), early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold, 
                eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=5)

In [39]:
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00596	valid_0's l2: 1.01196	valid_1's rmse: 0.999059	valid_1's l2: 0.99812
[200]	valid_0's rmse: 0.98481	valid_0's l2: 0.96985	valid_1's rmse: 0.974327	valid_1's l2: 0.949314
[300]	valid_0's rmse: 0.977087	valid_0's l2: 0.954699	valid_1's rmse: 0.96279	valid_1's l2: 0.926964
[400]	valid_0's rmse: 0.973338	valid_0's l2: 0.947387	valid_1's rmse: 0.954965	valid_1's l2: 0.911958
[500]	valid_0's rmse: 0.971225	valid_0's l2: 0.943278	valid_1's rmse: 0.948637	valid_1's l2: 0.899913
[600]	valid_0's rmse: 0.969966	valid_0's l2: 0.940833	valid_1's rmse: 0.943122	valid_1's l2: 0.889478
[700]	valid_0's rmse: 0.969017	valid_0's l2: 0.938995	valid_1's rmse: 0.938	valid_1's l2: 0.879844
[800]	valid_0's rmse: 0.968401	valid_0's l2: 0.937801	valid_1's rmse: 0.933181	valid_1's l2: 0.870827
[900]	valid_0's rmse: 0.967992	valid_0's l2: 0.937009	valid_1's rmse: 0.928686	valid_1's l2: 0.862458
[1000]	valid_0's rmse: 0.967

[200]	valid_0's rmse: 0.985378	valid_0's l2: 0.970971	valid_1's rmse: 0.973712	valid_1's l2: 0.948116
[300]	valid_0's rmse: 0.978103	valid_0's l2: 0.956686	valid_1's rmse: 0.962199	valid_1's l2: 0.925827
[400]	valid_0's rmse: 0.974474	valid_0's l2: 0.949599	valid_1's rmse: 0.954336	valid_1's l2: 0.910756
[500]	valid_0's rmse: 0.972466	valid_0's l2: 0.945691	valid_1's rmse: 0.948038	valid_1's l2: 0.898776
[600]	valid_0's rmse: 0.971322	valid_0's l2: 0.943467	valid_1's rmse: 0.942506	valid_1's l2: 0.888317
[700]	valid_0's rmse: 0.970544	valid_0's l2: 0.941955	valid_1's rmse: 0.937373	valid_1's l2: 0.878668
[800]	valid_0's rmse: 0.969932	valid_0's l2: 0.940769	valid_1's rmse: 0.932538	valid_1's l2: 0.869627
[900]	valid_0's rmse: 0.969498	valid_0's l2: 0.939926	valid_1's rmse: 0.928034	valid_1's l2: 0.861248
[1000]	valid_0's rmse: 0.969271	valid_0's l2: 0.939486	valid_1's rmse: 0.923811	valid_1's l2: 0.853427
[1100]	valid_0's rmse: 0.969064	valid_0's l2: 0.939086	valid_1's rmse: 0.919827	v

In [40]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9651807002507475,
 0.0025756049725012805,
 [0.967073762232058,
  0.9647707435360413,
  0.9609773016047191,
  0.9645348062521448,
  0.9685468876287742])

In [41]:
joblib.dump(est, 'lgb-308-lve100-k5.pkl')

['lgb-308-lve100-k5.pkl']

In [42]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

In [43]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-308-lve100-k5-test.csv', index=False)

pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-308-lve100-k5-train.csv', index=False)

#### lgb leaves100 k10

In [45]:
est = Estimator(model=LGBMRegressor(**{
        'n_estimators': 20000, 
        'learning_rate': 0.01,
        'num_leaves': 100, 
        'colsample_bytree': 0.5000000000000001, 
        'subsample': 1.0, 
        'min_child_weight': 150, 
        'boosting_type': 'gbdt'
}), early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold, 
                eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=10)

In [46]:
train_preds = est.fit_transform(x_train, y_train)

Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00841	valid_0's l2: 1.01688	valid_1's rmse: 0.999387	valid_1's l2: 0.998774
[200]	valid_0's rmse: 0.986874	valid_0's l2: 0.97392	valid_1's rmse: 0.974988	valid_1's l2: 0.950602
[300]	valid_0's rmse: 0.978998	valid_0's l2: 0.958437	valid_1's rmse: 0.963888	valid_1's l2: 0.92908
[400]	valid_0's rmse: 0.975186	valid_0's l2: 0.950988	valid_1's rmse: 0.956546	valid_1's l2: 0.91498
[500]	valid_0's rmse: 0.973058	valid_0's l2: 0.946842	valid_1's rmse: 0.950681	valid_1's l2: 0.903794
[600]	valid_0's rmse: 0.971578	valid_0's l2: 0.943964	valid_1's rmse: 0.945526	valid_1's l2: 0.89402
[700]	valid_0's rmse: 0.970669	valid_0's l2: 0.942198	valid_1's rmse: 0.940849	valid_1's l2: 0.885197
[800]	valid_0's rmse: 0.969945	valid_0's l2: 0.940793	valid_1's rmse: 0.936434	valid_1's l2: 0.876908
[900]	valid_0's rmse: 0.9694	valid_0's l2: 0.939736	valid_1's rmse: 0.932321	valid_1's l2: 0.869222
[1000]	valid_0's rmse: 0.96

[2200]	valid_0's rmse: 0.963639	valid_0's l2: 0.9286	valid_1's rmse: 0.890181	valid_1's l2: 0.792423
Early stopping, best iteration is:
[2085]	valid_0's rmse: 0.963604	valid_0's l2: 0.928533	valid_1's rmse: 0.893524	valid_1's l2: 0.798386
Training until validation scores don't improve for 200 rounds.
[100]	valid_0's rmse: 1.00114	valid_0's l2: 1.00229	valid_1's rmse: 1.0001	valid_1's l2: 1.00019
[200]	valid_0's rmse: 0.98039	valid_0's l2: 0.961165	valid_1's rmse: 0.975585	valid_1's l2: 0.951767
[300]	valid_0's rmse: 0.972965	valid_0's l2: 0.946662	valid_1's rmse: 0.964491	valid_1's l2: 0.930243
[400]	valid_0's rmse: 0.969342	valid_0's l2: 0.939624	valid_1's rmse: 0.957054	valid_1's l2: 0.915952
[500]	valid_0's rmse: 0.967345	valid_0's l2: 0.935755	valid_1's rmse: 0.951173	valid_1's l2: 0.90473
[600]	valid_0's rmse: 0.966106	valid_0's l2: 0.933361	valid_1's rmse: 0.946009	valid_1's l2: 0.894934
[700]	valid_0's rmse: 0.965286	valid_0's l2: 0.931778	valid_1's rmse: 0.941338	valid_1's l2: 

[100]	valid_0's rmse: 0.996067	valid_0's l2: 0.992149	valid_1's rmse: 1.00073	valid_1's l2: 1.00147
[200]	valid_0's rmse: 0.974368	valid_0's l2: 0.949393	valid_1's rmse: 0.976373	valid_1's l2: 0.953305
[300]	valid_0's rmse: 0.966717	valid_0's l2: 0.934542	valid_1's rmse: 0.965204	valid_1's l2: 0.931619
[400]	valid_0's rmse: 0.96298	valid_0's l2: 0.927331	valid_1's rmse: 0.957747	valid_1's l2: 0.917279
[500]	valid_0's rmse: 0.960905	valid_0's l2: 0.923339	valid_1's rmse: 0.951863	valid_1's l2: 0.906042
[600]	valid_0's rmse: 0.959432	valid_0's l2: 0.920509	valid_1's rmse: 0.946657	valid_1's l2: 0.896159
[700]	valid_0's rmse: 0.958583	valid_0's l2: 0.918882	valid_1's rmse: 0.94195	valid_1's l2: 0.887269
[800]	valid_0's rmse: 0.957966	valid_0's l2: 0.917699	valid_1's rmse: 0.937571	valid_1's l2: 0.879039
[900]	valid_0's rmse: 0.957614	valid_0's l2: 0.917024	valid_1's rmse: 0.933431	valid_1's l2: 0.871294
[1000]	valid_0's rmse: 0.957304	valid_0's l2: 0.916431	valid_1's rmse: 0.929489	valid_

In [47]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9642702049227301,
 0.004806150043709987,
 [0.9680457024873663,
  0.963958049555288,
  0.9636042525041135,
  0.9632826841534914,
  0.9587984848042512,
  0.9620810164433214,
  0.9706649947823239,
  0.9564967719458812,
  0.9627355682031266,
  0.9730345243481376])

In [48]:
joblib.dump(est, 'lgb-308-lve100-k10.pkl')

['lgb-308-lve100-k10.pkl']

In [49]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

In [50]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-308-lve100-k10-test.csv', index=False)

pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('lgb-308-lve100-k10-train.csv', index=False)