In [1]:
%load_ext autoreload
%autoreload 2

from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

import pandas as pd
import numpy as np
pd.options.display.max_columns = 100
pd.options.display.max_rows = 100

import itertools
import joblib

In [2]:
columns = [
    # unique id
    u'reservation_id', 
    # dates
    u'booking_date', u'checkin_date', u'checkout_date',
    # traveller related info
    u'memberid', u'numberofadults', u'numberofchildren', u'total_pax', u'persontravellingid', u'member_age_buckets', 
    u'state_code_residence',
    # resort info
    u'resort_id', u'resort_region_code', u'resort_type_code', u'state_code_resort', u'cluster_code',
    # booking info
    u'channel_code', u'booking_type_code', u'room_type_booked_code', u'roomnights', u'season_holidayed_code', 
    u'reservationstatusid_code', 
    # product purchased
    u'main_product_code',
    # target 
    u'amount_spent_per_room_night_scaled'
]


In [3]:
train = pd.read_csv('train.csv')
print train.shape
test = pd.read_csv('test.csv')
print test.shape

(341424, 24)
(146765, 23)


In [4]:
df = pd.concat([train, test], axis=0, sort=False)
df.reset_index(drop=True, inplace=True)
df.shape

(488189, 24)

In [5]:
df.booking_date = pd.to_datetime(df.booking_date, format='%d/%m/%y')
df.checkin_date = pd.to_datetime(df.checkin_date, format='%d/%m/%y')
df.checkout_date = pd.to_datetime(df.checkout_date, format='%d/%m/%y')

In [6]:
df = df[columns]
df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346


In [7]:
df['booking_in_advance_days'] = (df.checkin_date - df.booking_date).dt.days
df['booking_roomnights'] = (df.checkout_date - df.checkin_date).dt.days
df['total_persons_travelling'] = df.numberofadults + df.numberofchildren

In [8]:
# roomnights, numberofadults, numberofchildren, total_pax

In [9]:
def create_date_variables(df, date_key, week=False, month=False, year=False, dayofweek=False, dayofmonth=False, dayofyear=False):
    if week:
        df['{}_{}'.format(date_key, 'week')] = df[date_key].dt.week
    if month:
        df['{}_{}'.format(date_key, 'month')] = df[date_key].dt.month
    if year:
        df['{}_{}'.format(date_key, 'year')] = df[date_key].dt.year
    if dayofweek:
        df['{}_{}'.format(date_key, 'dayofweek')] = df[date_key].dt.dayofweek
    if dayofmonth:
        df['{}_{}'.format(date_key, 'dayofmonth')] = df[date_key].dt.day
    if dayofyear:
        df['{}_{}'.format(date_key, 'dayofyear')] = df[date_key].dt.dayofyear
    return df 

def num_div_interactions(df, num_cols):
    for col1, col2 in itertools.combinations(num_cols,2):
        df["{}_div_{}".format(col1, col2)] = (df[col1]/df[col2]).replace([np.inf, -np.inf, np.nan], 0)
    return df

def num_dif_interactions(df, num_cols):
    for col1, col2 in itertools.combinations(num_cols,2):
        df["{}_dif_{}".format(col1, col2)] = df[col1] - df[col2]
    return df
    
def cat_interactions(df, cat_cols):
    for col1, col2 in itertools.combinations(cat_cols, 2):
        print '{}_{}'.format(col1, col2) 
        df['{}_{}'.format(col1, col2)] = df[col1].astype(str) + '_' + df[col2].astype(str)
    return df

    
def create_group_variables(df, group_col, num_col):
    #     count, mean, std, min, max, x-mean, median, x-median, iqr 
    if isinstance(group_col, str):
        group_col = [group_col]
    if isinstance(num_col, str):
        num_col = [num_col]
        
    df_group = df.groupby(group_col)
    
    functions = {
        'mean':'mean',
        'max':'max',
        'min':'min',
        'std':'std',
        'median':'median',
#         'min-max': lambda x: x.max()- x.min()
    }
    final_columns, apply_dict = [], {}
    prefix = 'GRP_' + '_'.join(map(str, group_col))
    for col in num_col:
        methods = []            
        for key, val in functions.iteritems():
            methods.append(val)
            final_columns.append("{}#{}#{}".format(prefix , col, key)) 
        apply_dict[col] = methods
        
    df_var = df_group.agg(apply_dict)
    df_var.columns = final_columns
    df_var.reset_index(inplace=True)
    
    df = df.merge(df_var, how='left', on=group_col)

    for col in num_col:
        df["{}#{}#{}".format(prefix , col, 'min_mean')] = df[col] - df["{}#{}#{}".format(prefix , col, 'mean')]
        
    return df


In [10]:
# df.head()
df = create_date_variables(df, 'checkin_date',  week=True, month=True, year=True, dayofweek=True, dayofmonth=True, dayofyear=True)
df = create_date_variables(df, 'checkout_date',  week=True, dayofweek=True)
df = create_date_variables(df, 'booking_date',  week=True, month=True, year=True, dayofyear=True)


# df = create_date_variables(df, 'checkout_date')
# df = create_date_variables(df, 'booking_date')

In [11]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear'],
      dtype='object')

In [12]:
df.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_month,booking_date_year,booking_date_dayofyear
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,4,2018,95
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,1,2015,23
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,1,2015,28
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,5,2015,122
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,9,2015,245


In [13]:
df = cat_interactions(df, ['resort_id', 'checkin_date'])
df = cat_interactions(df, ['resort_id', 'checkout_date'])
df = cat_interactions(df, ['resort_id', 'booking_date'])

df = cat_interactions(df, ['resort_id', 'channel_code'])
df = cat_interactions(df, ['resort_id', 'booking_type_code'])
df = cat_interactions(df, ['resort_id', 'reservationstatusid_code'])
df = cat_interactions(df, ['resort_id', 'resort_type_code'])
df = cat_interactions(df, ['resort_id', 'cluster_code'])
df = cat_interactions(df, ['resort_id', 'room_type_booked_code'])


df = cat_interactions(df, ['persontravellingid', 'member_age_buckets', 'state_code_residence', 'state_code_resort'])
df = cat_interactions(df, ['resort_id', 'memberid'])

df = cat_interactions(df, ['memberid', 'channel_code'])
df = cat_interactions(df, ['memberid', 'booking_type_code'])
df = cat_interactions(df, ['memberid', 'reservationstatusid_code'])
df = cat_interactions(df, ['memberid', 'resort_type_code'])
df = cat_interactions(df, ['memberid', 'cluster_code'])
df = cat_interactions(df, ['memberid', 'room_type_booked_code'])

df = cat_interactions(df, ['memberid', 'checkin_date'])
df = cat_interactions(df, ['memberid', 'checkout_date'])
df = cat_interactions(df, ['memberid', 'booking_date'])

df = cat_interactions(df, ['memberid', 'checkin_date_month'])
df = cat_interactions(df, ['memberid', 'checkin_date_week'])
df = cat_interactions(df, ['memberid', 'checkin_date_dayofweek'])
df = cat_interactions(df, ['memberid', 'checkout_date_week'])
df = cat_interactions(df, ['memberid', 'checkout_date_dayofweek'])
df = cat_interactions(df, ['memberid', 'booking_date_week'])


df = cat_interactions(df, ['resort_id', 'booking_roomnights'])
df = cat_interactions(df, ['memberid', 'booking_roomnights'])


resort_id_checkin_date
resort_id_checkout_date
resort_id_booking_date
resort_id_channel_code
resort_id_booking_type_code
resort_id_reservationstatusid_code
resort_id_resort_type_code
resort_id_cluster_code
resort_id_room_type_booked_code
persontravellingid_member_age_buckets
persontravellingid_state_code_residence
persontravellingid_state_code_resort
member_age_buckets_state_code_residence
member_age_buckets_state_code_resort
state_code_residence_state_code_resort
resort_id_memberid
memberid_channel_code
memberid_booking_type_code
memberid_reservationstatusid_code
memberid_resort_type_code
memberid_cluster_code
memberid_room_type_booked_code
memberid_checkin_date
memberid_checkout_date
memberid_booking_date
memberid_checkin_date_month
memberid_checkin_date_week
memberid_checkin_date_dayofweek
memberid_checkout_date_week
memberid_checkout_date_dayofweek
memberid_booking_date_week
resort_id_booking_roomnights
memberid_booking_roomnights


In [14]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear',
       u'resort_id_checkin

In [15]:
df = num_div_interactions(df, num_cols=['numberofadults', 'numberofchildren','total_pax',  
                                        'roomnights', 'booking_in_advance_days', 'booking_roomnights', 
                                        'total_persons_travelling'])

df = num_dif_interactions(df, num_cols=[ 'roomnights', 'booking_roomnights'])
df = num_dif_interactions(df, num_cols=[ 'total_persons_travelling', 'total_pax'])
df = num_dif_interactions(df, num_cols=[ 'checkout_date_week', 'checkin_date_week', 'booking_date_week'])
df.shape
                                        

(488189, 98)

In [16]:
df.columns

Index([u'reservation_id', u'booking_date', u'checkin_date', u'checkout_date',
       u'memberid', u'numberofadults', u'numberofchildren', u'total_pax',
       u'persontravellingid', u'member_age_buckets', u'state_code_residence',
       u'resort_id', u'resort_region_code', u'resort_type_code',
       u'state_code_resort', u'cluster_code', u'channel_code',
       u'booking_type_code', u'room_type_booked_code', u'roomnights',
       u'season_holidayed_code', u'reservationstatusid_code',
       u'main_product_code', u'amount_spent_per_room_night_scaled',
       u'booking_in_advance_days', u'booking_roomnights',
       u'total_persons_travelling', u'checkin_date_week',
       u'checkin_date_month', u'checkin_date_year', u'checkin_date_dayofweek',
       u'checkin_date_dayofmonth', u'checkin_date_dayofyear',
       u'checkout_date_week', u'checkout_date_dayofweek', u'booking_date_week',
       u'booking_date_month', u'booking_date_year', u'booking_date_dayofyear',
       u'resort_id_checkin

In [17]:
for c in [u'booking_date', u'checkin_date', u'checkout_date', u'memberid', u'resort_id',
        u'resort_id_checkin_date', u'resort_id_checkout_date', 
          
        u'resort_id_channel_code', u'resort_id_booking_type_code', u'resort_id_reservationstatusid_code',
        u'resort_id_room_type_booked_code',

        u'persontravellingid_member_age_buckets',
        u'persontravellingid_state_code_residence',
        u'persontravellingid_state_code_resort',
        u'member_age_buckets_state_code_residence',
        u'member_age_buckets_state_code_resort',

        u'memberid_channel_code', u'memberid_booking_type_code',u'memberid_reservationstatusid_code', 
        u'memberid_resort_type_code', u'memberid_cluster_code', u'memberid_room_type_booked_code',

#             u'memberid_checkin_date', u'memberid_checkout_date',
          
         ]:
    print c
    df = create_group_variables(df, group_col=[c], num_col=[
         u'total_pax',u'roomnights','total_persons_travelling', 'booking_roomnights', 
        u'booking_in_advance_days',
        u'total_persons_travelling_dif_total_pax', u'roomnights_dif_booking_roomnights'
    ])
    print df.shape

booking_date
(488189, 140)
checkin_date
(488189, 182)
checkout_date
(488189, 224)
memberid
(488189, 266)
resort_id
(488189, 308)
resort_id_checkin_date
(488189, 350)
resort_id_checkout_date
(488189, 392)
resort_id_channel_code
(488189, 434)
resort_id_booking_type_code
(488189, 476)
resort_id_reservationstatusid_code
(488189, 518)
resort_id_room_type_booked_code
(488189, 560)
persontravellingid_member_age_buckets
(488189, 602)
persontravellingid_state_code_residence
(488189, 644)
persontravellingid_state_code_resort
(488189, 686)
member_age_buckets_state_code_residence
(488189, 728)
member_age_buckets_state_code_resort
(488189, 770)
memberid_channel_code
(488189, 812)
memberid_booking_type_code
(488189, 854)
memberid_reservationstatusid_code
(488189, 896)
memberid_resort_type_code
(488189, 938)
memberid_cluster_code
(488189, 980)
memberid_room_type_booked_code
(488189, 1022)


In [18]:
df.shape
df.head()

(488189, 1022)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_month,booking_date_year,booking_date_dayofyear,resort_id_checkin_date,resort_id_checkout_date,resort_id_booking_date,resort_id_channel_code,resort_id_booking_type_code,resort_id_reservationstatusid_code,resort_id_resort_type_code,resort_id_cluster_code,resort_id_room_type_booked_code,persontravellingid_member_age_buckets,persontravellingid_state_code_residence,...,GRP_memberid_cluster_code#roomnights_dif_booking_roomnights#mean,GRP_memberid_cluster_code#total_pax#min_mean,GRP_memberid_cluster_code#roomnights#min_mean,GRP_memberid_cluster_code#total_persons_travelling#min_mean,GRP_memberid_cluster_code#booking_roomnights#min_mean,GRP_memberid_cluster_code#booking_in_advance_days#min_mean,GRP_memberid_cluster_code#total_persons_travelling_dif_total_pax#min_mean,GRP_memberid_cluster_code#roomnights_dif_booking_roomnights#min_mean,GRP_memberid_room_type_booked_code#total_pax#std,GRP_memberid_room_type_booked_code#total_pax#max,GRP_memberid_room_type_booked_code#total_pax#min,GRP_memberid_room_type_booked_code#total_pax#median,GRP_memberid_room_type_booked_code#total_pax#mean,GRP_memberid_room_type_booked_code#roomnights#std,GRP_memberid_room_type_booked_code#roomnights#max,GRP_memberid_room_type_booked_code#roomnights#min,GRP_memberid_room_type_booked_code#roomnights#median,GRP_memberid_room_type_booked_code#roomnights#mean,GRP_memberid_room_type_booked_code#total_persons_travelling#std,GRP_memberid_room_type_booked_code#total_persons_travelling#max,GRP_memberid_room_type_booked_code#total_persons_travelling#min,GRP_memberid_room_type_booked_code#total_persons_travelling#median,GRP_memberid_room_type_booked_code#total_persons_travelling#mean,GRP_memberid_room_type_booked_code#booking_roomnights#std,GRP_memberid_room_type_booked_code#booking_roomnights#max,GRP_memberid_room_type_booked_code#booking_roomnights#min,GRP_memberid_room_type_booked_code#booking_roomnights#median,GRP_memberid_room_type_booked_code#booking_roomnights#mean,GRP_memberid_room_type_booked_code#booking_in_advance_days#std,GRP_memberid_room_type_booked_code#booking_in_advance_days#max,GRP_memberid_room_type_booked_code#booking_in_advance_days#min,GRP_memberid_room_type_booked_code#booking_in_advance_days#median,GRP_memberid_room_type_booked_code#booking_in_advance_days#mean,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#std,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#max,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#min,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#median,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#mean,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#std,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#max,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#min,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#median,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#mean,GRP_memberid_room_type_booked_code#total_pax#min_mean,GRP_memberid_room_type_booked_code#roomnights#min_mean,GRP_memberid_room_type_booked_code#total_persons_travelling#min_mean,GRP_memberid_room_type_booked_code#booking_roomnights#min_mean,GRP_memberid_room_type_booked_code#booking_in_advance_days#min_mean,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#min_mean,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#min_mean
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,4,2018,95,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,46_F,46_7.0,...,2.4,3.4,-1.8,1.6,-48.4,-2.8,-4.2,-2.4,1.48324,2,-2,0.0,-0.2,0.894427,4,2,2.0,2.6,0.0,0,0,0.0,0.0,17.487138,40,0,8.0,14.6,2.345208,6,1,2.0,3.0,2.345208,6,1,2.0,3.0,0.894427,4,2,2.0,2.4,3.2,-1.6,2.0,-13.6,-3.0,-4.0,-2.4
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,1,2015,23,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,46_F,46_7.0,...,2.4,2.4,2.2,1.6,-44.4,75.2,-3.2,-2.4,0.534522,1,-1,0.0,0.0,0.534522,3,1,2.0,2.0,0.744024,2,0,0.5,0.625,42.610361,110,0,55.0,53.75,1.752549,5,1,2.5,2.75,1.407886,5,1,3.5,3.375,0.0,2,2,2.0,2.0,2.0,3.0,1.375,-48.75,75.25,-3.375,-2.0
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,1,2015,28,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,47_F,47_7.0,...,2.0,2.0,2.0,1.5,-53.0,0.5,-4.0,-2.0,0.534522,1,-1,0.0,0.0,0.534522,3,1,2.0,2.0,0.744024,2,0,0.5,0.625,42.610361,110,0,55.0,53.75,1.752549,5,1,2.5,2.75,1.407886,5,1,3.5,3.375,0.0,2,2,2.0,2.0,2.0,2.0,1.375,-49.75,1.25,-3.375,-2.0
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,5,2015,122,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,2.666667,2.0,2.333333,4.0,-51.0,36.0,-2.0,-2.666667,1.48324,2,-2,0.0,-0.2,0.894427,4,2,2.0,2.6,0.0,0,0,0.0,0.0,17.487138,40,0,8.0,14.6,2.345208,6,1,2.0,3.0,2.345208,6,1,2.0,3.0,0.894427,4,2,2.0,2.4,2.2,2.4,4.0,-9.6,37.0,-1.0,-2.4
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,9,2015,245,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,2.666667,2.0,2.333333,2.0,-51.0,99.0,-4.0,-2.666667,0.534522,1,-1,0.0,0.0,0.534522,3,1,2.0,2.0,0.744024,2,0,0.5,0.625,42.610361,110,0,55.0,53.75,1.752549,5,1,2.5,2.75,1.407886,5,1,3.5,3.375,0.0,2,2,2.0,2.0,2.0,3.0,1.375,-48.75,100.25,-3.375,-2.0


In [19]:
# df_resort_date = pd.read_csv('resort_date_vars.csv', header=None)
# df_resort_date.columns = ['resort_vars_{}'.format(c) for c in df_resort_date.columns]
# df_resort_date.shape

In [20]:
# df = pd.concat([df, df_resort_date[df_resort_date.columns[1:]]], axis=1)

In [21]:
df.shape
df.head()

(488189, 1022)

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,memberid,numberofadults,numberofchildren,total_pax,persontravellingid,member_age_buckets,state_code_residence,resort_id,resort_region_code,resort_type_code,state_code_resort,cluster_code,channel_code,booking_type_code,room_type_booked_code,roomnights,season_holidayed_code,reservationstatusid_code,main_product_code,amount_spent_per_room_night_scaled,booking_in_advance_days,booking_roomnights,total_persons_travelling,checkin_date_week,checkin_date_month,checkin_date_year,checkin_date_dayofweek,checkin_date_dayofmonth,checkin_date_dayofyear,checkout_date_week,checkout_date_dayofweek,booking_date_week,booking_date_month,booking_date_year,booking_date_dayofyear,resort_id_checkin_date,resort_id_checkout_date,resort_id_booking_date,resort_id_channel_code,resort_id_booking_type_code,resort_id_reservationstatusid_code,resort_id_resort_type_code,resort_id_cluster_code,resort_id_room_type_booked_code,persontravellingid_member_age_buckets,persontravellingid_state_code_residence,...,GRP_memberid_cluster_code#roomnights_dif_booking_roomnights#mean,GRP_memberid_cluster_code#total_pax#min_mean,GRP_memberid_cluster_code#roomnights#min_mean,GRP_memberid_cluster_code#total_persons_travelling#min_mean,GRP_memberid_cluster_code#booking_roomnights#min_mean,GRP_memberid_cluster_code#booking_in_advance_days#min_mean,GRP_memberid_cluster_code#total_persons_travelling_dif_total_pax#min_mean,GRP_memberid_cluster_code#roomnights_dif_booking_roomnights#min_mean,GRP_memberid_room_type_booked_code#total_pax#std,GRP_memberid_room_type_booked_code#total_pax#max,GRP_memberid_room_type_booked_code#total_pax#min,GRP_memberid_room_type_booked_code#total_pax#median,GRP_memberid_room_type_booked_code#total_pax#mean,GRP_memberid_room_type_booked_code#roomnights#std,GRP_memberid_room_type_booked_code#roomnights#max,GRP_memberid_room_type_booked_code#roomnights#min,GRP_memberid_room_type_booked_code#roomnights#median,GRP_memberid_room_type_booked_code#roomnights#mean,GRP_memberid_room_type_booked_code#total_persons_travelling#std,GRP_memberid_room_type_booked_code#total_persons_travelling#max,GRP_memberid_room_type_booked_code#total_persons_travelling#min,GRP_memberid_room_type_booked_code#total_persons_travelling#median,GRP_memberid_room_type_booked_code#total_persons_travelling#mean,GRP_memberid_room_type_booked_code#booking_roomnights#std,GRP_memberid_room_type_booked_code#booking_roomnights#max,GRP_memberid_room_type_booked_code#booking_roomnights#min,GRP_memberid_room_type_booked_code#booking_roomnights#median,GRP_memberid_room_type_booked_code#booking_roomnights#mean,GRP_memberid_room_type_booked_code#booking_in_advance_days#std,GRP_memberid_room_type_booked_code#booking_in_advance_days#max,GRP_memberid_room_type_booked_code#booking_in_advance_days#min,GRP_memberid_room_type_booked_code#booking_in_advance_days#median,GRP_memberid_room_type_booked_code#booking_in_advance_days#mean,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#std,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#max,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#min,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#median,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#mean,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#std,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#max,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#min,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#median,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#mean,GRP_memberid_room_type_booked_code#total_pax#min_mean,GRP_memberid_room_type_booked_code#roomnights#min_mean,GRP_memberid_room_type_booked_code#total_persons_travelling#min_mean,GRP_memberid_room_type_booked_code#booking_roomnights#min_mean,GRP_memberid_room_type_booked_code#booking_in_advance_days#min_mean,GRP_memberid_room_type_booked_code#total_persons_travelling_dif_total_pax#min_mean,GRP_memberid_room_type_booked_code#roomnights_dif_booking_roomnights#min_mean
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,3,46,F,7.0,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,3,3,3,F,3,1,3,1,2.0,C,1,7.706428,0,1,2,14,4,2018,3,5,95,14,4,14,4,2018,95,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,46_F,46_7.0,...,2.4,3.4,-1.8,1.6,-48.4,-2.8,-4.2,-2.4,1.48324,2,-2,0.0,-0.2,0.894427,4,2,2.0,2.6,0.0,0,0,0.0,0.0,17.487138,40,0,8.0,14.6,2.345208,6,1,2.0,3.0,2.345208,6,1,2.0,3.0,0.894427,4,2,2.0,2.4,3.2,-1.6,2.0,-13.6,-3.0,-4.0,-2.4
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,3,3,5,F,1,1,4,5,2.0,A,1,6.662563,78,5,2,15,4,2015,5,11,101,16,3,4,1,2015,23,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,46_F,46_7.0,...,2.4,2.4,2.2,1.6,-44.4,75.2,-3.2,-2.4,0.534522,1,-1,0.0,0.0,0.534522,3,1,2.0,2.0,0.744024,2,0,0.5,0.625,42.610361,110,0,55.0,53.75,1.752549,5,1,2.5,2.75,1.407886,5,1,3.5,3.375,0.0,2,2,2.0,2.0,2.0,3.0,1.375,-48.75,75.25,-3.375,-2.0
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,47,F,7.0,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,1,5,1,E,1,1,4,4,2.0,A,1,7.871602,4,4,2,5,2,2015,6,1,32,6,3,5,1,2015,28,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,47_F,47_7.0,...,2.0,2.0,2.0,1.5,-53.0,0.5,-4.0,-2.0,0.534522,1,-1,0.0,0.0,0.534522,3,1,2.0,2.0,0.744024,2,0,0.5,0.625,42.610361,110,0,55.0,53.75,1.752549,5,1,2.5,2.75,1.407886,5,1,3.5,3.375,0.0,2,2,2.0,2.0,2.0,2.0,1.375,-49.75,1.25,-3.375,-2.0
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,2,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,3,5,2.0,A,1,5.344943,40,5,4,24,6,2015,3,11,162,25,1,18,5,2015,122,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,2.666667,2.0,2.333333,4.0,-51.0,36.0,-2.0,-2.666667,1.48324,2,-2,0.0,-0.2,0.894427,4,2,2.0,2.6,0.0,0,0,0.0,0.0,17.487138,40,0,8.0,14.6,2.345208,6,1,2.0,3.0,2.345208,6,1,2.0,3.0,0.894427,4,2,2.0,2.4,2.2,2.4,4.0,-9.6,37.0,-1.0,-2.4
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,2,0,2,46,F,7.0,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,2,2,2,D,1,1,4,5,2.0,A,1,7.059346,103,5,2,51,12,2015,0,14,348,51,5,36,9,2015,245,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,46_F,46_7.0,...,2.666667,2.0,2.333333,2.0,-51.0,99.0,-4.0,-2.666667,0.534522,1,-1,0.0,0.0,0.534522,3,1,2.0,2.0,0.744024,2,0,0.5,0.625,42.610361,110,0,55.0,53.75,1.752549,5,1,2.5,2.75,1.407886,5,1,3.5,3.375,0.0,2,2,2.0,2.0,2.0,3.0,1.375,-48.75,100.25,-3.375,-2.0


In [22]:
# group resort date and checkin_date for number of bookings for that day

In [23]:
unique_id = 'reservation_id'

date_cols = []

target_col = 'amount_spent_per_room_night_scaled'

cat_cols = [
'memberid',
u'resort_id_memberid',
u'memberid_channel_code', u'memberid_booking_type_code',
u'memberid_reservationstatusid_code', u'memberid_resort_type_code',
u'memberid_cluster_code', u'memberid_room_type_booked_code',
u'memberid_checkin_date', u'memberid_checkout_date',
u'memberid_booking_date', u'memberid_checkin_date_month',
u'memberid_checkin_date_week', u'memberid_checkin_date_dayofweek',
u'memberid_checkout_date_week', u'memberid_checkout_date_dayofweek',
u'memberid_booking_date_week', 
u'memberid_booking_roomnights',
    
]

cat_boost_cols = [
u'resort_id_checkin_date', u'resort_id_checkout_date',
u'resort_id_booking_date', u'resort_id_channel_code',
u'resort_id_booking_type_code', u'resort_id_reservationstatusid_code',
u'resort_id_resort_type_code', u'resort_id_cluster_code',
u'resort_id_room_type_booked_code',
u'persontravellingid_member_age_buckets',
u'persontravellingid_state_code_residence',
u'persontravellingid_state_code_resort',
u'member_age_buckets_state_code_residence',
u'member_age_buckets_state_code_resort',
u'state_code_residence_state_code_resort',
u'resort_id_booking_roomnights',
'persontravellingid', 'member_age_buckets', 'state_code_residence', 
'resort_id', 'resort_region_code', 'resort_type_code', 'state_code_resort', 'cluster_code', 'channel_code',
'booking_type_code', 'room_type_booked_code', 'season_holidayed_code', 'reservationstatusid_code', 'main_product_code',
u'checkin_date', u'checkout_date', u'booking_date',
]

columns_to_drop = []


In [24]:
from ml_modules.encoding import FreqeuncyEncoding, LabelEncoding

In [25]:
fE = FreqeuncyEncoding(categorical_columns=cat_cols, return_df=True)

In [26]:
%%time
df1 = fE.fit_transform(df)

CPU times: user 58.2 s, sys: 52.7 s, total: 1min 50s
Wall time: 43.7 s


In [27]:
lE = LabelEncoding(categorical_columns=cat_boost_cols, return_df=True)

In [28]:
df1 = lE.fit_transform(df1)

In [29]:
df1.shape

(488189, 1022)

In [30]:
df1.drop(columns=date_cols +[unique_id, target_col] + columns_to_drop, inplace=True)

In [31]:
x_train, y_train = df1[:341424].values, train[target_col].values
x_train.shape, y_train.shape

((341424, 1020), (341424,))

In [32]:
cat_indices = [ df1.columns.tolist().index(c) for c in cat_boost_cols]

In [33]:
from xgboost import XGBClassifier, XGBRegressor
from lightgbm import LGBMClassifier, LGBMRegressor
from catboost import CatBoostClassifier, CatBoostRegressor

In [34]:
from ml_modules.custom_estimator import Estimator
from ml_modules.custom_fold_generator import FoldScheme

In [35]:
from sklearn.metrics import mean_squared_error
from math import sqrt

def rmse(y_true, y_pred):
    return sqrt(mean_squared_error(y_true, y_pred))

####  catboost

In [36]:
est = Estimator(model=CatBoostRegressor(eval_metric='RMSE', n_estimators=20000, od_type='Iter', 
                    od_wait=200, colsample_bylevel=0.7, max_depth=6, learning_rate=0.1),
              early_stopping_rounds=200, random_state=50, validation_scheme=FoldScheme.KFold,
              eval_metric='rmse', task_type='regression', scoring_metric=rmse, n_splits=5,
              categorical_features_indices=cat_indices)

In [None]:
train_preds = est.fit_transform(x_train, y_train)

0:	learn: 7.0331923	test: 7.0358099	best: 7.0358099 (0)	total: 639ms	remaining: 3h 32m 56s
100:	learn: 0.9696110	test: 0.9745590	best: 0.9745590 (100)	total: 29.9s	remaining: 1h 38m 5s
200:	learn: 0.9634880	test: 0.9713717	best: 0.9713717 (200)	total: 57.2s	remaining: 1h 33m 56s
300:	learn: 0.9595932	test: 0.9703135	best: 0.9703135 (300)	total: 1m 24s	remaining: 1h 31m 41s
400:	learn: 0.9559126	test: 0.9696199	best: 0.9696199 (400)	total: 1m 50s	remaining: 1h 30m 25s
500:	learn: 0.9529202	test: 0.9692354	best: 0.9692354 (500)	total: 2m 17s	remaining: 1h 29m 16s
600:	learn: 0.9501155	test: 0.9689265	best: 0.9689265 (600)	total: 2m 44s	remaining: 1h 28m 31s
700:	learn: 0.9474742	test: 0.9686731	best: 0.9686553 (689)	total: 3m 11s	remaining: 1h 27m 49s
800:	learn: 0.9446962	test: 0.9684537	best: 0.9684537 (800)	total: 3m 38s	remaining: 1h 27m 26s
900:	learn: 0.9419517	test: 0.9683607	best: 0.9682872 (883)	total: 4m 6s	remaining: 1h 27m 7s
1000:	learn: 0.9392740	test: 0.9680161	best: 0.968

In [40]:
np.mean(est.cv_scores), np.std(est.cv_scores), est.cv_scores

(0.9664974056279819,
 0.0026505073777112866,
 [0.9678618543746916,
  0.965892257903189,
  0.9623305132575748,
  0.9659956184745394,
  0.9704067841299145])

In [41]:
joblib.dump(est, 'cat-1020-k5.pkl')

['cat-1020-k5.pkl']

In [None]:
x_test = df1[341424:].values
test_preds = est.transform(x_test)

  return np.mean(np.column_stack((est.predict(x) for est in self.fitted_models)), axis=1)


In [None]:
pd.DataFrame(zip(test[unique_id], test_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('cat-1020-k5-test.csv', index=False)

In [None]:
pd.DataFrame(zip(train[unique_id], train_preds), columns=['reservation_id', 
        'amount_spent_per_room_night_scaled']).to_csv('cat-1020-k5-train.csv', index=False)

In [52]:
_df = pd.read_csv('../club_mahendra/cat_1020_9664_k5_train.csv')
_df.amount_spent_per_room_night_scaled.corr(pd.Series(train_preds))

1.0

In [51]:
_df = pd.read_csv('../club_mahendra/cat_1020_9664_k5_test.csv')
_df.amount_spent_per_room_night_scaled.corr(pd.Series(test_preds))

0.9999999999999998