In [1]:
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score
from pprint import pprint

# Turn off pandas' chained-assignment check for the whole notebook (the option defaults to 'warn').
pd.set_option('mode.chained_assignment', None)

### Read in training data

In [57]:
# Load the training data from its parquet file into the working DataFrame `x`.
training_path = 'training_set_VU_DM.parquet.gzip'
x = pd.read_parquet(training_path)

### Re-weight booking_bool: a booking counts as 4, no booking as 0

In [3]:
# Re-weight the booking flag: rows flagged as booked get BOOKING_WEIGHT, all others 0.
# The already-weighted value is matched as well so that re-running this cell is a
# no-op; with a plain `== 1` test a second run would reset every 4 back to 0.
BOOKING_WEIGHT = 4
x['booking_bool'] = np.where(x['booking_bool'].isin([1, BOOKING_WEIGHT]), BOOKING_WEIGHT, 0)

### Create our target class by combining click_bool and booking_bool

In [4]:
# Target label: element-wise sum of the (weighted) booking flag and the click flag.
x['target'] = x['booking_bool'].add(x['click_bool'])

### Some feature engineering

In [14]:
# 1 if the hotel's country id equals the visitor's country id, else 0
x['same_location'] = np.where(x['prop_country_id'] == x['visitor_location_country_id'], 1, 0)

# fill nans for prop_location_score2 with prop_location_score1 (since only ~20% missing).
# The two scores are scaled differently, so score1 is first rescaled by the ratio of the column means.
multiplier = x['prop_location_score2'].mean() / x['prop_location_score1'].mean()
# The filled column is assigned back explicitly: `x[col].fillna(..., inplace=True)` works on an
# intermediate Series and does not update `x` when pandas Copy-on-Write is active.
x['prop_location_score2'] = x['prop_location_score2'].fillna(x['prop_location_score1'] * multiplier)

# fill nans for review score with the mean of all review scores
x['prop_review_score'] = x['prop_review_score'].fillna(x['prop_review_score'].mean())

# a missing visitor history value is encoded as 0
x['visitor_hist_starrating'] = x['visitor_hist_starrating'].fillna(0)
x['visitor_hist_adr_usd'] = x['visitor_hist_adr_usd'].fillna(0)

# if user has rated any hotels in the past and has a booking history
# (1 only when both history columns are strictly positive, i.e. neither was missing or zero)
x['has_rate_hist'] = np.where((x['visitor_hist_starrating'] > 0) & (x['visitor_hist_adr_usd'] > 0), 1, 0)

### Fill missing orig_destination_distance data with mean distance for that property id

In [15]:
# Within each property, replace missing distances by that property's mean distance.
# A property with no known distance at all has a NaN mean, so its rows stay NaN here.
distances_by_prop = x.groupby('prop_id')['orig_destination_distance']
x["distance"] = distances_by_prop.transform(lambda distances: distances.fillna(distances.mean()))

### Standardize price (z-score) within each prop_country_id

In [None]:
def _zscore(prices):
    """Subtract the group's mean from its prices and divide by the group's standard deviation."""
    return (prices - prices.mean()) / prices.std()


# Standardize price_usd separately inside every hotel country.
x['new_price'] = x.groupby('prop_country_id')['price_usd'].transform(_zscore)

### Use robust scaler on the distances

In [50]:
# RobustScaler expects a 2-D input, so the distances are reshaped into a single column first.
distance_column = np.array(x['distance'], dtype=float).reshape(-1, 1)
distance_scaler = RobustScaler()
x['distance'] = distance_scaler.fit_transform(distance_column)

### Delete some columns we do not need at this point

In [33]:
# Columns that are no longer needed from this point on; all are removed from `x` in place.
unused_columns = [
    'date_time',
    'site_id',
    'booking_bool',
    'click_bool',
    'gross_bookings_usd',
    'visitor_location_country_id',
    'prop_country_id',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    'orig_destination_distance',
    'srch_query_affinity_score',
    'position',
    'prop_id',
    'srch_id',
    'srch_destination_id',
    'price_usd',
]
x.drop(columns=unused_columns, inplace=True)

### New features for competitor rates

In [34]:
# Per row, count how many of the eight compN_rate columns equal -1, 0 and 1.
# Missing values match none of the comparisons and therefore count towards none of them.
# The counts are reduced across columns with sum(axis=1); the previous `.T.sum()` gave the
# same result but had to transpose the whole frame first.
x_copy = x[['comp1_rate','comp2_rate','comp3_rate','comp4_rate','comp5_rate','comp6_rate','comp7_rate','comp8_rate']].copy()
x['comp_rate_lower'] = (x_copy == -1).sum(axis=1)
x['comp_rate_equal'] = (x_copy == 0).sum(axis=1)
x['comp_rate_higher'] = (x_copy == 1).sum(axis=1)

### New features for competitor availability

In [35]:
# Per row, count how many of the eight compN_inv columns equal 0 and 1.
# Missing values match neither comparison and are not counted.
# sum(axis=1) reduces across columns directly instead of transposing the whole frame (`.T.sum()`).
x_copy = x[['comp1_inv','comp2_inv','comp3_inv','comp4_inv','comp5_inv','comp6_inv','comp7_inv','comp8_inv']].copy()
x['comp_inv_same'] = (x_copy == 0).sum(axis=1)
x['comp_inv_better'] = (x_copy == 1).sum(axis=1)

### New features for competitor rate percent difference

In [36]:
# Per row, the smallest and largest value among the eight compN_rate_percent_diff columns.
# NaNs are skipped; a row with no values at all yields NaN for both features.
# min/max(axis=1) reduce across columns directly instead of transposing the whole frame first.
x_copy = x[['comp1_rate_percent_diff','comp2_rate_percent_diff','comp3_rate_percent_diff','comp4_rate_percent_diff','comp5_rate_percent_diff','comp6_rate_percent_diff','comp7_rate_percent_diff','comp8_rate_percent_diff']].copy()
x['comp_rate_percent_diff_low'] = x_copy.min(axis=1)
x['comp_rate_percent_diff_high'] = x_copy.max(axis=1)

### Delete the old competitor data

In [37]:
# Remove the raw per-competitor columns (comp1..comp8), one column family at a time.
for suffix in ('rate', 'inv', 'rate_percent_diff'):
    family_columns = [f'comp{n}_{suffix}' for n in range(1, 9)]
    x.drop(columns=family_columns, inplace=True)

### Fill remaining NaN values

In [52]:
# Replace the distances that are still missing with the overall mean distance.
# The result is assigned back explicitly: `x['distance'].fillna(..., inplace=True)` acts on an
# intermediate Series and does not update `x` when pandas Copy-on-Write is active.
x['distance'] = x['distance'].fillna(x['distance'].mean())

### Make sure we have no missing values

In [53]:
# Percentage of missing entries in each column of `x`.
missing_fraction = x.isnull().mean()
missing_values = 100 * missing_fraction
missing_values

prop_starrating              0.0
prop_review_score            0.0
prop_brand_bool              0.0
prop_location_score1         0.0
prop_location_score2         0.0
prop_log_historical_price    0.0
promotion_flag               0.0
srch_length_of_stay          0.0
srch_booking_window          0.0
srch_adults_count            0.0
srch_children_count          0.0
srch_room_count              0.0
srch_saturday_night_bool     0.0
random_bool                  0.0
target                       0.0
same_location                0.0
has_rate_hist                0.0
distance                     0.0
new_price                    0.0
comp_rate_lower              0.0
comp_rate_equal              0.0
comp_rate_higher             0.0
comp_inv_same                0.0
comp_inv_better              0.0
dtype: float64

### Delete the competitor low/high rate percent differences for now, since ~68% of entries have no values at all

In [40]:
# Remove both percent-difference summary features from `x` in place.
x.drop(columns=['comp_rate_percent_diff_low', 'comp_rate_percent_diff_high'], inplace=True)

### Save the modified parquet up to this point

In [54]:
# Write the processed frame to a new parquet file using gzip compression.
output_path = 'training_set_v2.parquet.gzip'
x.to_parquet(output_path, compression='gzip')