## Data Preprocessing and Feature Engineering

In [1]:
import os
import re
import json
import time
import joblib

import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set_style("whitegrid", {'axes.grid' : False})
import scikitplot as skplt

from datetime import datetime, date, timedelta
from collections import Counter
from tqdm import tqdm_notebook

from tqdm.auto import tqdm
tqdm.pandas()

from category_encoders import OrdinalEncoder
from sklearn.preprocessing import FunctionTransformer

### Loading datasets

In [2]:
# Raw CSV columns that hold dates, stored as dd/mm/yy strings.
DATE_COLUMNS = ['booking_date', 'checkin_date', 'checkout_date']
DATE_FORMAT = '%d/%m/%y'

# to parse dates
# (`pd.datetime` was deprecated in pandas 1.0 and removed in 2.0, so use the stdlib
# `datetime` class imported above; kept as a named helper for any later cell that uses it)
lambda_date_parser = lambda x: datetime.strptime(x, DATE_FORMAT)

# to treat column names: lower-case and replace dots with underscores
lambda_column_parser = lambda c: re.sub(r"\.", "_", str(c).lower())


def load_reservations(path):
    """
    Read one reservations CSV, parse its date columns and normalise column names.

    Dates are parsed with an explicit format via `pd.to_datetime` instead of
    `read_csv(date_parser=...)`, which is deprecated in recent pandas releases.
    """
    df = pd.read_csv(path)
    for date_col in DATE_COLUMNS:
        df[date_col] = pd.to_datetime(df[date_col], format=DATE_FORMAT)
    # assign a concrete list rather than a lazy `map` object
    df.columns = [lambda_column_parser(c) for c in df.columns]
    return df


df_train = load_reservations('data/train.csv')
df_validation = load_reservations('data/test.csv')

In [3]:
df_train.shape, df_validation.shape

((341424, 24), (146765, 23))

In [4]:
df_train.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,...,state_code_residence,state_code_resort,total_pax,member_age_buckets,booking_type_code,memberid,cluster_code,reservationstatusid_code,resort_id,amount_spent_per_room_night_scaled
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3,1,2,0,46,3,...,7.0,3,3,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,C,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,7.706428
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,1,1,2,0,46,3,...,7.0,5,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,A,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,6.662563
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,1,1,2,0,47,1,...,7.0,1,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,E,A,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,7.871602
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,1,1,2,2,46,2,...,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,5.344943
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,1,1,2,0,46,2,...,7.0,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,7.059346


In [5]:
df_validation.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,...,season_holidayed_code,state_code_residence,state_code_resort,total_pax,member_age_buckets,booking_type_code,memberid,cluster_code,reservationstatusid_code,resort_id
0,7dae1ce6bc8f69481328f2be5c4943077dad5598b5f66d...,2018-04-05,2018-04-05,2018-04-06,3,2,2,0,45,3,...,2.0,2.0,3,3,H,1,2114944930dcc42ce5b9b50ae965cf8a9c04e46be63d84...,A,C,9f14025af0065b30e47e23ebb3b491d39ae8ed17d33739...
1,fe0d4e444e1818436c88f72f1cf800536c2f785e59baeb...,2015-05-24,2015-06-23,2015-06-27,1,2,4,0,45,1,...,2.0,2.0,4,2,H,1,2114944930dcc42ce5b9b50ae965cf8a9c04e46be63d84...,F,A,e7f6c011776e8db7cd330b54174fd76f7d0216b612387a...
2,540bd4285ad8168e8388f84ee74a82cd4f97dc0a404d3e...,2015-07-21,2015-08-07,2015-08-10,3,2,3,0,45,2,...,4.0,2.0,2,3,H,1,2114944930dcc42ce5b9b50ae965cf8a9c04e46be63d84...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...
3,09593c907ae262e46f655b4db9e14f54a19eadcfdd2679...,2015-07-31,2015-10-22,2015-10-25,1,2,3,0,45,1,...,2.0,2.0,6,3,H,1,2114944930dcc42ce5b9b50ae965cf8a9c04e46be63d84...,F,A,49d180ecf56132819571bf39d9b7b342522a2ac6d23c14...
4,f4c50caac68051faf37551d70bb17eebef2a20e2244cb1...,2016-07-01,2016-10-01,2016-10-05,1,2,4,0,45,1,...,2.0,2.0,11,2,H,1,2114944930dcc42ce5b9b50ae965cf8a9c04e46be63d84...,F,A,e29c9c180c6279b0b02abd6a1801c7c04082cf486ec027...


### Columns check

In [6]:
print("columns not in validation data", [c for c in df_train.columns if c not in df_validation.columns])

columns not in validation data ['amount_spent_per_room_night_scaled']


In [7]:
original_timestamp_features = ['booking_date','checkin_date','checkout_date']
original_continuous_features = ['numberofadults','numberofchildren','roomnights','total_pax']
original_categorical_features = ['channel_code','main_product_code','persontravellingid','resort_region_code','resort_type_code','room_type_booked_code','season_holidayed_code','state_code_residence','state_code_resort','member_age_buckets','booking_type_code','memberid','cluster_code','reservationstatusid_code','resort_id']

In [8]:
# Every df_train column except the row key and the target must belong to exactly one
# feature group. Comparing the sets (not just the counts) also catches a misspelt or
# wrongly listed column that would leave the total unchanged.
_declared_features = original_timestamp_features + original_continuous_features + original_categorical_features
_non_feature_columns = {'reservation_id', 'amount_spent_per_room_night_scaled'}
assert len(_declared_features) == len(set(_declared_features)), "a feature is listed in more than one group"
assert set(df_train.columns) == set(_declared_features) | _non_feature_columns, "feature lists do not match df_train columns"

### Dropping complete duplicate records

In [9]:
# Rows are treated as duplicates when every feature column matches; the reservation id
# and the target are not part of the comparison. The first occurrence is kept.
dedup_key_columns = original_timestamp_features + original_continuous_features + original_categorical_features
df_train = df_train.drop_duplicates(subset=dedup_key_columns, keep='first').reset_index(drop=True)
df_train.shape

(338192, 24)

In [10]:
# Tag each frame with its origin so the combined frame can be split again later.
for frame, origin in ((df_train, 'train'), (df_validation, 'validation')):
    frame['source'] = origin

# The validation file has no target column; add it as NaN so both frames share a schema.
df_validation['amount_spent_per_room_night_scaled'] = np.nan

# Stack train on top of validation with a fresh 0..n-1 index.
df_all = pd.concat([df_train, df_validation], axis=0, sort=False, ignore_index=True)
df_all.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,...,state_code_resort,total_pax,member_age_buckets,booking_type_code,memberid,cluster_code,reservationstatusid_code,resort_id,amount_spent_per_room_night_scaled,source
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3,1,2,0,46,3,...,3,3,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,C,4e07408562bedb8b60ce05c1decfe3ad16b72230967de0...,7.706428,train
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,1,1,2,0,46,3,...,5,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,F,A,39fa9ec190eee7b6f4dff1100d6343e10918d044c75eac...,6.662563,train
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,1,1,2,0,47,1,...,1,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,E,A,535fa30d7e25dd8a49f1536779734ec8286108d115da50...,7.871602,train
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,1,1,2,2,46,2,...,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,5.344943,train
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,1,1,2,0,46,2,...,2,2,F,1,3d1539e56495b6991f0a3ef5a61ca3d03ce4fff7380e9a...,D,A,d4735e3a265e16eee03f59718b9b5d03019c07d8b6c51f...,7.059346,train


In [11]:
df_all.shape

(484957, 25)

### Unique key assert

In [12]:
# reservation_id is used as the join key further down, so it must identify rows uniquely.
assert df_all['reservation_id'].is_unique, "duplicate records against reservation_id found!"

In [13]:
df_all.isnull().sum()

reservation_id                             0
booking_date                               0
checkin_date                               0
checkout_date                              0
channel_code                               0
main_product_code                          0
numberofadults                             0
numberofchildren                           0
persontravellingid                         0
resort_region_code                         0
resort_type_code                           0
room_type_booked_code                      0
roomnights                                 0
season_holidayed_code                    148
state_code_residence                    6983
state_code_resort                          0
total_pax                                  0
member_age_buckets                         0
booking_type_code                          0
memberid                                   0
cluster_code                               0
reservationstatusid_code                   0
resort_id 

In [14]:
# Impute the two columns that contain nulls with their most frequent value.
# Assign the result back instead of calling `fillna(..., inplace=True)` on a column
# selection: that is chained assignment, which warns on recent pandas and does not
# modify `df_all` at all once Copy-on-Write is enabled.
# NOTE: the mode is computed over train and validation rows together.
df_all['season_holidayed_code'] = df_all['season_holidayed_code'].fillna(df_all['season_holidayed_code'].mode()[0])
df_all['state_code_residence'] = df_all['state_code_residence'].fillna(df_all['state_code_residence'].mode()[0])

### Date features

In [15]:
def date_dissect(df, datetime_col):
    """
    Add calendar-component columns derived from a datetime column.

    Adds, in this order, `cat_<col>_day`, `cat_<col>_month`, `cat_<col>_year`,
    `cat_<col>_weekday` (Monday=0 ... Sunday=6), `cat_<col>_is_weekend`
    (1 for Saturday/Sunday, else 0) and `cat_<col>_week` (ISO week number).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to extend; it is modified in place.
    datetime_col : str
        Name of the column holding the dates.

    Returns
    -------
    pandas.DataFrame
        The same `df` object, with the six new columns.
    """
    # Coerce so object columns holding timestamps work too; a datetime64 column passes through.
    stamps = pd.to_datetime(df[datetime_col])
    prefix = "cat_{}".format(datetime_col)

    # Vectorised `.dt` accessors replace one Python-level lambda call per row.
    # Cast to int64 so the dtype does not depend on the pandas version.
    df[prefix + "_day"] = stamps.dt.day.astype("int64")
    df[prefix + "_month"] = stamps.dt.month.astype("int64")
    df[prefix + "_year"] = stamps.dt.year.astype("int64")

    weekday = stamps.dt.weekday.astype("int64")
    df[prefix + "_weekday"] = weekday
    df[prefix + "_is_weekend"] = weekday.isin([5, 6]).astype("int64")

    # ISO week stays a per-row call: the vectorised accessor for it differs between
    # pandas versions, while `isocalendar()` on the timestamp itself works everywhere.
    df[prefix + "_week"] = stamps.map(lambda x: x.isocalendar()[1])

    return df

In [16]:
# Expand each of the three timestamp columns into its calendar components.
for timestamp_col in ('booking_date', 'checkin_date', 'checkout_date'):
    df_all = date_dissect(df_all, timestamp_col)

In [17]:
df_all.head()

Unnamed: 0,reservation_id,booking_date,checkin_date,checkout_date,channel_code,main_product_code,numberofadults,numberofchildren,persontravellingid,resort_region_code,...,cat_checkin_date_year,cat_checkin_date_weekday,cat_checkin_date_is_weekend,cat_checkin_date_week,cat_checkout_date_day,cat_checkout_date_month,cat_checkout_date_year,cat_checkout_date_weekday,cat_checkout_date_is_weekend,cat_checkout_date_week
0,07659f3758d8aee27f5a7e2887adeacb67021cb95ada1b...,2018-04-05,2018-04-05,2018-04-06,3,1,2,0,46,3,...,2018,3,0,14,6,4,2018,4,0,14
1,03930f033646d073462b35d411616323597715ac4fc398...,2015-01-23,2015-04-11,2015-04-16,1,1,2,0,46,3,...,2015,5,1,15,16,4,2015,3,0,16
2,d145a32920e6587ad95bfe299d80c0affa268220535aaf...,2015-01-28,2015-02-01,2015-02-05,1,1,2,0,47,1,...,2015,6,1,5,5,2,2015,3,0,6
3,cfd77f44811ed62f25a220b53324cdbafc662a4c9e5f04...,2015-05-02,2015-06-11,2015-06-16,1,1,2,2,46,2,...,2015,3,0,24,16,6,2015,1,0,25
4,937cff9e4dcfc2459620153dfc8b9962ac22bea67dfb29...,2015-09-02,2015-12-14,2015-12-19,1,1,2,0,46,2,...,2015,0,0,51,19,12,2015,5,1,51


In [18]:
def weekday_counter(row):
    """
    Count weekday and weekend dates covered by one reservation.

    The dates examined run from the day after check-in up to and including the
    check-out day; the check-in date itself is not counted. Saturday and Sunday
    count as weekend.

    Adapted from
    https://stackoverflow.com/questions/43692340/how-to-find-number-of-mondays-or-any-other-weekday-between-two-dates-in-python

    Returns a pandas Series with the keys 'cont_weekdays' and 'cont_weekends'.
    """
    checkin = row.checkin_date
    n_days = (row.checkout_date - checkin).days

    # empty when check-out is not after check-in, giving 0 / 0
    offsets = range(1, n_days + 1)
    weekend_total = sum(
        1 for offset in offsets
        if (checkin + timedelta(days=offset)).weekday() >= 5
    )
    weekday_total = len(offsets) - weekend_total

    return pd.Series({'cont_weekdays': weekday_total, 'cont_weekends': weekend_total})

def stay_period(row):
    """
    Return the whole number of days between a row's check-in and check-out dates.
    """
    elapsed = row.checkout_date - row.checkin_date
    return elapsed.days

In [19]:
# Row-wise weekday/weekend counting is slow, so the result is cached on disk.
WEEK_INFO_CACHE = 'data/df/df_week_info.pkl'

df_week_info = None
if os.path.isfile(WEEK_INFO_CACHE):
    # NOTE: unpickling executes arbitrary code; only load caches written by this notebook.
    df_week_info = pd.read_pickle(WEEK_INFO_CACHE)
    if not df_week_info.index.equals(df_all.index):
        # Cache was built from a different df_all (other input files or dedup result);
        # using it would misalign or drop rows in the index-based merge that follows.
        df_week_info = None

if df_week_info is None:
    df_week_info = df_all.apply(weekday_counter, axis=1)
    # make sure the cache directory exists so the expensive result is not lost on save
    os.makedirs(os.path.dirname(WEEK_INFO_CACHE), exist_ok=True)
    df_week_info.to_pickle(WEEK_INFO_CACHE)

In [20]:
df_week_info.head()

Unnamed: 0,cont_weekdays,cont_weekends
0,1,0
1,4,1
2,4,0
3,3,2
4,4,1


In [21]:
# Attach the weekday/weekend counts by row index. An inner join silently drops rows
# when the two indexes differ (e.g. a stale cache), so check that nothing was lost.
_n_rows_before = df_all.shape[0]
df_all = df_all.merge(df_week_info, how='inner', left_index=True, right_index=True)
assert df_all.shape[0] == _n_rows_before, "df_week_info index does not line up with df_all"

In [22]:
# Days between check-in and check-out, computed column-wise rather than with a
# Python call per row; the result is the same as applying `stay_period` to each row.
df_all['cont_stay_period'] = (df_all['checkout_date'] - df_all['checkin_date']).dt.days

In [24]:
# Booked room nights minus calendar stay length, as plain column arithmetic
# instead of a row-wise apply.
df_all['cont_stay_nights_diff'] = df_all['roomnights'] - df_all['cont_stay_period']

In [25]:
# Lead time in days from booking to check-in, floored at 0 when the booking date is
# after the check-in date. Cast to float so the dtype matches the previous
# np.maximum(0.0, x) result.
df_all['cont_first_diff'] = (
    (df_all['checkin_date'] - df_all['booking_date']).dt.days.astype('float64').clip(lower=0.0)
)

In [26]:
# Days from booking to check-out, floored at 1. Cast to float so the dtype matches
# the previous np.maximum(1.0, x) result.
df_all['cont_second_diff'] = (
    (df_all['checkout_date'] - df_all['booking_date']).dt.days.astype('float64').clip(lower=1.0)
)

In [27]:
# Declared party size: adults plus children on the reservation.
df_all['cont_family_count'] = df_all['numberofadults'] + df_all['numberofchildren']

In [29]:
# Difference between the declared party size (adults + children) and total_pax;
# positive when more people are declared than total_pax, negative when fewer.
df_all['cont_family_pax_less'] = df_all['cont_family_count'] - df_all['total_pax']

In [30]:
# 1 when adults + children equals total_pax, else 0 (vectorised comparison instead of a
# row-wise apply).
df_all['cat_are_all_travelling'] = (df_all['cont_family_count'] == df_all['total_pax']).astype('int64')

In [31]:
def engineer_ratio_fet(df, c1, c2, cnew):
    """
    Store the element-wise ratio df[c1] / df[c2] in a new column `cnew`.

    Undefined results (NaN, e.g. 0/0 or missing inputs) and infinite results
    (division by zero) are both replaced with 0. `df` is modified in place and
    also returned.
    """
    ratio = df[c1].div(df[c2])
    ratio = ratio.fillna(0)
    df[cnew] = ratio.replace([np.inf, -np.inf], 0.0)
    return df

In [32]:
# Ratio features to build. Each entry is (new column, numerator column, denominator column).
ratio_fet = [
    ('cont_noc_noa', 'numberofchildren', 'numberofadults'),
    ('cont_fd_noc', 'cont_first_diff', 'numberofchildren'),
    ('cont_fd_noa', 'cont_first_diff', 'numberofadults'),
    ('cont_fd_fc', 'cont_first_diff', 'cont_family_count'),
    ('cont_wd_we', 'cont_weekdays', 'cont_weekends'),
    ('cont_we_wd', 'cont_weekends', 'cont_weekdays'),
    ('cont_sp_noc', 'cont_stay_period', 'numberofchildren'),
    ('cont_sp_noa', 'cont_stay_period', 'numberofadults'),
    ('cont_sp_fc', 'cont_stay_period', 'cont_family_count'),
    ('cont_sp_fd', 'cont_stay_period', 'cont_first_diff'),
    ('cont_snd_noc', 'cont_stay_nights_diff', 'numberofchildren'),
    ('cont_snd_noa', 'cont_stay_nights_diff', 'numberofadults'),
    ('cont_snd_fc', 'cont_stay_nights_diff', 'cont_family_count'),
    ('cont_snd_fd', 'cont_stay_nights_diff', 'cont_first_diff'),
]

# The tuple order differs from the helper's positional order, so pass by keyword.
for new_name, numerator, denominator in ratio_fet:
    df_all = engineer_ratio_fet(df_all, c1=numerator, c2=denominator, cnew=new_name)

In [33]:
# Start from the raw feature groups declared earlier (copied so those lists stay untouched).
continuous_features = list(original_continuous_features)
categorical_features = list(original_categorical_features)

target_features = ['amount_spent_per_room_night_scaled']
print(len(continuous_features), len(categorical_features), len(target_features))

# Engineered columns are identified by their name prefix. Match the prefix itself rather
# than any occurrence of the substring, so an unrelated column name cannot be picked up.
continuous_features.extend([c for c in df_all.columns if c.startswith('cont_')])
categorical_features.extend([c for c in df_all.columns if c.startswith('cat_')])
print(len(continuous_features), len(categorical_features), len(target_features))

4 15 1
26 34 1


In [34]:
# Replace every categorical column of df_all with integer codes, reusing a previously
# saved encoder when one exists so the codes stay the same between runs.
# NOTE(review): this cell overwrites df_all[categorical_features] in place, so running it
# twice on the same df_all encodes already-encoded values — confirm it is only run once
# per kernel session.
if os.path.isfile('models/preprocessing/oe.joblib'):
    # NOTE(review): joblib.load unpickles the file; only load an encoder written by this notebook.
    oe = joblib.load('models/preprocessing/oe.joblib')
    df_all[categorical_features] = oe.transform(df_all[categorical_features])
else:
    # see issue in ordinal encoder vs https://lightgbm.readthedocs.io/en/latest/Parameters.html#categorical_feature
    # http://contrib.scikit-learn.org/categorical-encoding/ordinal.html , hence mapping is required

    # Explicit mapping per column: each distinct value (taken from train and validation rows
    # together, in order of first appearance) gets a code starting at 0.
    # NOTE(review): the list-of-tuples mapping format and handle_unknown='ignore' match the
    # category_encoders release this was written against; newer releases appear to expect a
    # dict/Series mapping and different handle_unknown values — confirm against the installed version.
    oe_mapping = [{'col': c, 'mapping': [(map_, map_idx) for map_idx, map_ in enumerate(df_all[c].unique())]} for c in categorical_features]
    oe = OrdinalEncoder(cols=categorical_features, handle_unknown='ignore', mapping=oe_mapping, return_df=False)
    df_all[categorical_features] = oe.fit_transform(df_all[categorical_features])
    # NOTE(review): assumes the models/preprocessing/ directory already exists — TODO confirm.
    joblib.dump(oe, 'models/preprocessing/oe.joblib')

In [1]:
def groupper(df, p_col, agg_col, aggs=['mean','min', 'max', 'std']):
    """
    Attach group-level aggregates of one column to every row.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    p_col : list of str or str
        Column(s) to group by. A single column name is also accepted.
    agg_col : str
        Column to aggregate inside each group.
    aggs : list of str
        Aggregation names passed to pandas `agg`.

    Returns
    -------
    pandas.DataFrame
        `df` left-joined with one new column per aggregation, named
        `cont_gr_<group cols joined by '_'>_<agg_col>_<agg>`.

    Raises
    ------
    AssertionError
        If any key or aggregate in the result is null (for example 'std' of a
        single-row group, or a null grouping key).
    """
    keys = [p_col] if isinstance(p_col, str) else list(p_col)
    agg_names = list(aggs)  # work on a copy; the default list is shared between calls

    new_cols = ["cont_gr_{}_{}_{}".format("_".join(keys), agg_col, ag) for ag in agg_names]
    dfgb = df.groupby(keys)[agg_col].agg(agg_names).reset_index()
    dfgb.columns = keys + new_cols

    # Merge once and validate that result (previously the merge ran twice).
    merged = df.merge(dfgb, on=keys, how="left")
    assert not merged[keys + new_cols].isnull().any().any(), \
        "null values in group keys or aggregates for {}".format(new_cols)
    return merged

In [36]:
# for sake of small column names
# Short-named copy of the target column, to keep derived column names short.
# NOTE(review): validation rows hold NaN here because their target was set to NaN above.
df_all['asprns'] = df_all['amount_spent_per_room_night_scaled'].values

In [43]:
def pivot_counter(df, count_col, index_col, agg_col):
    """
    For calculating pivot aggregations.

    For every value of `index_col`, counts the non-null `count_col` entries per
    distinct value of `agg_col`, and joins those counts back onto `df`.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    count_col : str
        Column whose non-null entries are counted.
    index_col : str
        Column identifying the entity the counts are computed for.
    agg_col : str
        Column whose distinct values become the count columns.

    Returns
    -------
    pandas.DataFrame
        `df` inner-joined on `index_col` with columns `cont_pi_<agg_col>_<i>_count`,
        where `i` is the position of the category in the pivot's sorted column order.
    """
    df_p = pd.pivot_table(df, values=count_col, index=index_col, columns=agg_col, aggfunc="count", fill_value=0)

    # Size the labels from the pivot's own columns. `df[agg_col].unique()` also counts
    # NaN, which the pivot drops, so the two lengths could differ and the column
    # assignment would fail.
    df_p.columns = ["cont_pi_{}_{}_count".format(agg_col, i) for i in range(df_p.shape[1])]
    df_p = df_p.reset_index()

    return df.merge(df_p, on=index_col, how='inner')

In [49]:
def checkin_feat(df):
    """
    Build visit-to-visit features for one group of reservations, ordered by check-in date.

    The rows are sorted by `checkin_date`; each row is then compared with the row
    before it. Adds the gap in days since the previous check-in, a running visit
    number starting at 1, the change in several numeric columns, and 0/1 flags
    telling whether a code column differs from the previous row. The first row has
    nothing to compare against: its numeric differences are 0 and its flags are 1.

    Returns a new, sorted frame; the input frame is left as it was.
    """
    # (new column, source column) pairs for the row-to-row numeric differences
    numeric_deltas = (
        ('cont_cf_total_pax_diff', 'total_pax'),
        ('cont_cf_stay_period_diff', 'cont_stay_period'),
        ('cont_cf_family_count_diff', 'cont_family_count'),
        ('cont_cf_first_diff', 'cont_first_diff'),
        ('cont_cf_sp_noa_diff', 'cont_sp_noa'),
        ('cont_cf_snd_fd_diff', 'cont_snd_fd'),
        ('cont_cf_sp_fc_diff', 'cont_sp_fc'),
    )
    # (new column, source column) pairs for the "changed since previous row" flags
    change_flags = (
        ('cat_cf_is_resort_changed', 'resort_id'),
        ('cat_cf_is_room_type_booked_changed', 'room_type_booked_code'),
        ('cat_cf_is_resort_type_changed', 'resort_type_code'),
        ('cat_cf_is_cluster_changed', 'cluster_code'),
        ('cat_cf_is_state_resort_changed', 'state_code_resort'),
        ('cat_cf_is_season_holidayed_changed', 'season_holidayed_code'),
        ('cat_cf_is_resort_region_changed', 'resort_region_code'),
    )

    visits = df.sort_values(['checkin_date'])

    gap_days = visits['checkin_date'].diff().dt.days
    visits['cont_cf_days_since_last_checkin'] = gap_days.fillna(0.0)

    visits['cont_visit_count'] = np.arange(1, visits.shape[0] + 1)

    for delta_col, source_col in numeric_deltas:
        visits[delta_col] = visits[source_col].diff().fillna(0.0)

    for flag_col, source_col in change_flags:
        same_as_previous = visits[source_col].eq(visits[source_col].shift(1))
        visits[flag_col] = same_as_previous.map({True: 0, False: 1})

    return visits

In [50]:
def booking_feat(df):
    """
    Build booking-date features for one group of reservations.

    Sorts the rows by `booking_date` and adds `cont_bf_days_since_last_booking`, the
    number of days between each booking and the one before it (0 for the first row).
    Returns a new, sorted frame; the input frame is left as it was.
    """
    bookings = df.sort_values(['booking_date'])

    gap_days = bookings['booking_date'].diff().dt.days
    bookings['cont_bf_days_since_last_booking'] = gap_days.fillna(0.0)

    return bookings

In [51]:
# Per-member check-in features: computed once per member (slow), then cached on disk.
CHECKIN_FT_CACHE = 'data/df/df_checkin_ft.pkl'

cf_cols = ['reservation_id', 'memberid', 'checkin_date', 'total_pax', 'cont_stay_period', 'cont_first_diff', 
       'cont_family_count', 'resort_id', 'room_type_booked_code', 'resort_type_code', 'cluster_code', 
       'state_code_resort', 'season_holidayed_code', 'resort_region_code', 'cont_sp_fc', 'cont_sp_noa', 'cont_snd_fd']

if os.path.isfile(CHECKIN_FT_CACHE):
    # NOTE: unpickling executes arbitrary code; only load caches written by this notebook.
    df_checkin_ft = pd.read_pickle(CHECKIN_FT_CACHE)
else:
    df_checkin_ft = df_all[cf_cols].groupby(['memberid']).progress_apply(checkin_feat)
    # make sure the cache directory exists so the expensive result is not lost on save
    os.makedirs(os.path.dirname(CHECKIN_FT_CACHE), exist_ok=True)
    df_checkin_ft.to_pickle(CHECKIN_FT_CACHE)

# Flatten whatever index the groupby/apply produced into 0..n-1.
df_checkin_ft = df_checkin_ft.reset_index(drop=True)

# Keep only the join key and the newly engineered columns; the helper inputs already live
# in df_all. errors='ignore' because, depending on the pandas version, groupby.apply may
# not hand the grouping column ('memberid') back.
checkin_input_cols = [c for c in cf_cols if c != 'reservation_id']
df_checkin_ft = df_checkin_ft.drop(columns=checkin_input_cols, errors='ignore')

In [52]:
# Per-member booking features: computed once per member (slow), then cached on disk.
BOOKING_FT_CACHE = 'data/df/df_booking_ft.pkl'

cf_cols = ['reservation_id', 'memberid', 'booking_date']

if os.path.isfile(BOOKING_FT_CACHE):
    # NOTE: unpickling executes arbitrary code; only load caches written by this notebook.
    df_booking_ft = pd.read_pickle(BOOKING_FT_CACHE)
else:
    df_booking_ft = df_all[cf_cols].groupby(['memberid']).progress_apply(booking_feat)
    # make sure the cache directory exists so the expensive result is not lost on save
    os.makedirs(os.path.dirname(BOOKING_FT_CACHE), exist_ok=True)
    df_booking_ft.to_pickle(BOOKING_FT_CACHE)

# Flatten whatever index the groupby/apply produced into 0..n-1.
df_booking_ft = df_booking_ft.reset_index(drop=True)

# Keep only the join key and the newly engineered column. errors='ignore' because,
# depending on the pandas version, groupby.apply may not hand the grouping column back.
booking_input_cols = [c for c in cf_cols if c != 'reservation_id']
df_booking_ft = df_booking_ft.drop(columns=booking_input_cols, errors='ignore')

In [55]:
df_all.shape

(484957, 67)

In [56]:
# Join the per-member features back on the reservation key. `validate` makes pandas raise
# if the key is duplicated on either side, and the row-count check catches feature tables
# (e.g. stale caches) that are missing reservations, which an inner join would silently drop.
_n_rows_before = df_all.shape[0]
df_all = df_all.merge(df_checkin_ft, how='inner', on=['reservation_id'], validate='one_to_one')
df_all = df_all.merge(df_booking_ft, how='inner', on=['reservation_id'], validate='one_to_one')
assert df_all.shape[0] == _n_rows_before, "feature tables do not cover every reservation_id in df_all"

In [57]:
df_all.shape

(484957, 84)

In [58]:
# Pick up the features engineered since the lists were first built. Columns are identified
# by their name prefix; the set union removes the ones already listed, and sorting gives a
# stable order.
continuous_features = sorted(set(continuous_features) | {c for c in df_all.columns if c.startswith('cont_')})
categorical_features = sorted(set(categorical_features) | {c for c in df_all.columns if c.startswith('cat_')})
print(len(continuous_features), len(categorical_features), len(target_features))

36 41 1


In [59]:
# Persist the feature-name lists for later notebooks; create the target directory first
# so the dumps do not fail on a fresh checkout.
os.makedirs('data/iterables', exist_ok=True)
joblib.dump(continuous_features, 'data/iterables/continuous_features.joblib')
joblib.dump(categorical_features, 'data/iterables/categorical_features.joblib')
joblib.dump(target_features, 'data/iterables/target_features.joblib')

['data/iterables/target_features.joblib']

In [60]:
df_all.to_pickle('data/df/df_all.pkl')