# log(1 + p) Transformation of the Target and Binary Encoding (if required)

In [37]:
import os

import numpy as np
import pandas as pd

import joblib

from category_encoders import BinaryEncoder
from sklearn.preprocessing import FunctionTransformer

In [39]:
# Load the combined dataset. Its `source` column (read further down) marks
# each row as belonging to the 'train' or 'validation' split.
df_all = pd.read_pickle('data/df/df_all.pkl')

In [40]:
# Load the previously saved feature-name collections (presumably lists of
# column names -- `categorical_features` and `target_features` are used below
# to select columns of the data frames). `continuous_features` is loaded but
# not referenced in the cells visible here.
continuous_features = joblib.load('data/iterables/continuous_features.joblib')
categorical_features = joblib.load('data/iterables/categorical_features.joblib')
target_features = joblib.load('data/iterables/target_features.joblib')

### Optional: execute the cells between the two `===` separator lines only if you need data for algorithms that don't support categorical variables
### ============================================================================

In [23]:
# Fit a binary encoder on the categorical columns of the full dataset (train
# and validation rows together) and get the encoded columns back as a
# DataFrame (return_df=True).
# NOTE(review): the values accepted by `handle_unknown` differ between
# category_encoders releases -- confirm 'ignore' is valid for the installed
# version.
be = BinaryEncoder(cols=categorical_features, return_df=True, handle_unknown='ignore')
df_encoded = be.fit_transform(df_all[categorical_features])

In [24]:
# Persist the fitted encoder to disk.
# NOTE(review): assumes the models/preprocessing/ directory already exists.
joblib.dump(be, 'models/preprocessing/be.joblib')

['models/preprocessing/be.joblib']

In [25]:
# Preview the first rows of the encoded frame.
df_encoded.head()

Unnamed: 0,cat_cf_is_cluster_changed_0,cat_cf_is_cluster_changed_1,cat_cf_is_resort_region_changed_0,cat_cf_is_resort_region_changed_1,cat_checkin_date_is_weekend_0,cat_checkin_date_is_weekend_1,cat_cf_is_room_type_booked_changed_0,cat_cf_is_room_type_booked_changed_1,cat_booking_date_month_0,cat_booking_date_month_1,...,cat_checkin_date_day_0,cat_checkin_date_day_1,cat_checkin_date_day_2,cat_checkin_date_day_3,cat_checkin_date_day_4,cat_checkin_date_day_5,cluster_code_0,cluster_code_1,cluster_code_2,cluster_code_3
0,0,1,0,1,0,1,0,1,0,0,...,0,0,0,0,0,1,0,0,0,1
1,0,1,1,0,1,0,0,1,0,0,...,0,0,0,0,1,0,0,0,0,1
2,0,1,1,0,1,0,1,0,0,0,...,0,0,0,0,1,1,0,0,1,0
3,0,1,1,0,0,1,1,0,0,0,...,0,0,0,0,1,0,0,0,1,1
4,1,0,0,1,0,1,1,0,0,0,...,0,0,0,1,0,0,0,0,1,1


In [26]:
# Record the names of the binary-encoded columns and report how many there are.
categorical_features_encoded = df_encoded.columns.tolist()
# Use the builtin len() rather than calling the dunder method directly.
print(len(categorical_features_encoded))

180


In [None]:
# Save the encoded column names in data/iterables/, the same directory the
# other feature-name files are loaded from.
joblib.dump(categorical_features_encoded, 'data/iterables/categorical_features_encoded.joblib')

In [27]:
# Give both frames a fresh 0..n-1 index so that the index-based merge in the
# following cell pairs rows by position.
df_all = df_all.reset_index(drop=True)
df_encoded = df_encoded.reset_index(drop=True)

In [28]:
# Attach the encoded columns to df_all, pairing rows by index value.
df_all = pd.merge(
    df_all,
    df_encoded,
    how='inner',
    left_index=True,
    right_index=True,
)

### ============================================================================

In [41]:
# Split on the `source` column; each subset is an independent frame with its
# own fresh 0..n-1 index.
df_train_dataset = df_all.loc[df_all['source'] == 'train'].reset_index(drop=True)

df_validation_dataset = df_all.loc[df_all['source'] == 'validation'].reset_index(drop=True)

In [42]:
# Display the (rows, columns) shape of each split as a quick sanity check.
df_train_dataset.shape, df_validation_dataset.shape

((338192, 84), (146765, 84))

In [43]:
# Bundle log1p (forward) and expm1 (inverse) in one transformer object so the
# target transform and its inverse are kept together.
target_transformer = FunctionTransformer(func=np.log1p, inverse_func=np.expm1, validate=True)
# Only the training targets are transformed here; df_validation_dataset keeps
# its original target values.
# NOTE(review): confirm that downstream evaluation applies the inverse
# transform to predictions before comparing them with validation targets.
df_train_dataset[target_features] = target_transformer.transform(df_train_dataset[target_features].values)

In [44]:
# Persist both splits. At this point the training frame carries
# log1p-transformed targets, while the validation frame carries the
# untransformed target values.
df_train_dataset.to_pickle('data/df/df_train_dataset.pkl')
df_validation_dataset.to_pickle('data/df/df_validation_dataset.pkl')

In [45]:
# Persist the target transformer (log1p forward, expm1 inverse) to disk.
# NOTE(review): assumes the models/preprocessing/ directory already exists.
joblib.dump(target_transformer, 'models/preprocessing/target_transformer.joblib')

['models/preprocessing/target_transformer.joblib']