In [1]:
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import RobustScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score
from pprint import pprint
# from sklearn.decomposition import PCA

pd.options.mode.chained_assignment = None  # default='warn'

### Read in training data

In [2]:
x = pd.read_parquet('train_test_data/training_set_VU_DM.parquet.gzip')

### Assign new weight to booking_bool

In [3]:
x['booking_bool'] = np.where(x['booking_bool'] == 1, 2, 0)

### Create our target class by combining click_bool and booking_bool

In [4]:
target = pd.DataFrame(x['booking_bool'] + x['click_bool'])

### Some feature engineering

In [5]:
# if countries are the same for hotel and user
x['same_location'] = np.where(x['prop_country_id'] == x['visitor_location_country_id'], 1, 0)

# fill nans for prop_location_score2 with prop_location_score1 (since only ~20% missing. note: scaling is different)
multiplier = x['prop_location_score2'].mean() / x['prop_location_score1'].mean()
x['prop_location_score2'].fillna(x['prop_location_score1'] * multiplier, inplace=True)

# fill nans for review score with the mean of all review scores
x['prop_review_score'].fillna(x['prop_review_score'].mean(), inplace=True)

##### THESE WILL BE DELETED LATER, STILL TOO MUCH MISSING DATA AFTERWARDS #####
x['visitor_hist_starrating'].fillna(0, inplace=True)
x['visitor_hist_adr_usd'].fillna(0, inplace=True)
x['has_rate_hist'] = np.where((x['visitor_hist_starrating'] > 0) & (x['visitor_hist_adr_usd'] > 0), 1, 0)

### Fill missing orig_destination_distance data with mean distance for that property id

In [6]:
x["orig_destination_distance"] = x.groupby('prop_id')['orig_destination_distance'].transform(lambda x: x.fillna(x.mean()))
x['orig_destination_distance'].fillna(x['orig_destination_distance'].mean(), inplace=True)

### New features for competitor rates

In [7]:
x_copy = x[['comp1_rate','comp2_rate','comp3_rate','comp4_rate','comp5_rate','comp6_rate','comp7_rate','comp8_rate']].copy()
x['comp_rate_lower'] = (x_copy == -1).T.sum()
x['comp_rate_equal'] = (x_copy == 0).T.sum()
x['comp_rate_higher'] = (x_copy == 1).T.sum()

### New features for competitor availability

In [8]:
x_copy = x[['comp1_inv','comp2_inv','comp3_inv','comp4_inv','comp5_inv','comp6_inv','comp7_inv','comp8_inv']].copy()
x['comp_inv_same'] = (x_copy == 0).T.sum()
x['comp_inv_better'] = (x_copy == 1).T.sum()

### New features for competitor rate percent difference

In [9]:
x_copy = x[['comp1_rate_percent_diff','comp2_rate_percent_diff','comp3_rate_percent_diff','comp4_rate_percent_diff','comp5_rate_percent_diff','comp6_rate_percent_diff','comp7_rate_percent_diff','comp8_rate_percent_diff']].copy()
x['comp_rate_percent_diff_low'] = x_copy.T.min()
x['comp_rate_percent_diff_high'] = x_copy.T.max()

### Delete the old competitor data

In [10]:
x.drop(['comp1_rate','comp2_rate','comp3_rate','comp4_rate','comp5_rate','comp6_rate','comp7_rate','comp8_rate'], axis=1,inplace=True)
x.drop(['comp1_inv','comp2_inv','comp3_inv','comp4_inv','comp5_inv','comp6_inv','comp7_inv','comp8_inv'], axis=1,inplace=True)
x.drop(['comp1_rate_percent_diff','comp2_rate_percent_diff','comp3_rate_percent_diff','comp4_rate_percent_diff','comp5_rate_percent_diff','comp6_rate_percent_diff','comp7_rate_percent_diff','comp8_rate_percent_diff'], axis=1,inplace=True)

### Combine some competitor features, since some of these columns were quite thin

In [11]:
x['rate_equal_or_better'] = x['comp_rate_equal'] + x['comp_rate_higher']
x['inv_equal_or_better'] = x['comp_inv_same'] + x['comp_inv_better']

### Normalize the columns we will use

In [12]:
# x['prop_starrating'] = MinMaxScaler().fit_transform(np.array(x['prop_starrating'], dtype = float).reshape(-1, 1))
# x['prop_review_score'] = MinMaxScaler().fit_transform(np.array(x['prop_review_score'], dtype = float).reshape(-1, 1))
# x['prop_brand_bool'] = MinMaxScaler().fit_transform(np.array(x['prop_brand_bool'], dtype = float).reshape(-1, 1))
# x['prop_location_score1'] = MinMaxScaler().fit_transform(np.array(x['prop_location_score1'], dtype = float).reshape(-1, 1))
# x['prop_location_score2'] = MinMaxScaler().fit_transform(np.array(x['prop_location_score2'], dtype = float).reshape(-1, 1))
# x['prop_log_historical_price'] = MinMaxScaler().fit_transform(np.array(x['prop_log_historical_price'], dtype = float).reshape(-1, 1))
# x['price_usd'] = x.groupby('prop_country_id')['price_usd'].transform(lambda x: (x - min(x)) / (max(x) - min(x)))
# x['promotion_flag'] = MinMaxScaler().fit_transform(np.array(x['promotion_flag'], dtype = float).reshape(-1, 1))
# x['orig_destination_distance'] = MinMaxScaler().fit_transform(np.array(x['orig_destination_distance'], dtype = float).reshape(-1, 1))
# x['random_bool'] = MinMaxScaler().fit_transform(np.array(x['random_bool'], dtype = float).reshape(-1, 1))
# x['same_location'] = MinMaxScaler().fit_transform(np.array(x['same_location'], dtype = float).reshape(-1, 1))
# x['rate_equal_or_better'] = MinMaxScaler().fit_transform(np.array(x['rate_equal_or_better'], dtype = float).reshape(-1, 1))
# x['inv_equal_or_better'] = MinMaxScaler().fit_transform(np.array(x['inv_equal_or_better'], dtype = float).reshape(-1, 1))

### mean, median and std for prop\_starrating, prop\_review\_score, prop\_location\_score1, and prop\_location\_score2 for each hotel (grouped by "prop\_id"

In [1]:
features = ["prop_starrating", "prop_review_score", "prop_location_score1", "prop_location_score2"]

for feat in features: 
    mean = x.groupby("prop_id")[feat].mean()
    median = x.groupby("prop_id")[feat].median()
    std = x.groupby("prop_id")[feat].std(ddof=0)
    
#     print (std.min())
#     print (std.mean())
#     print (std.max())

    x[feat+"_mean"] = mean[x.prop_id].values
    x[feat+"_median"] = median[x.prop_id].values
    x[feat+"_std"] = std[x.prop_id].values

NameError: name 'x' is not defined

### Delete columns we do not need

In [13]:
del x['date_time']
del x['site_id']
del x['booking_bool']
del x['click_bool']
del x['gross_bookings_usd']
del x['visitor_location_country_id']
del x['prop_country_id']
del x['visitor_hist_starrating']
del x['visitor_hist_adr_usd']
del x['srch_query_affinity_score']
del x['position']
del x['prop_id']
del x['srch_id']
del x['srch_destination_id']
del x['srch_booking_window']
del x['srch_adults_count']
del x['srch_children_count']
del x['srch_room_count']
del x['srch_saturday_night_bool']
del x['srch_length_of_stay']
del x['comp_rate_lower']
del x['comp_rate_equal']
del x['comp_rate_higher']
del x['comp_inv_same']
del x['comp_inv_better']

### Delete the competitor low/high rate percent differences for now, since ~68% of entries have no values at all. Also delete users that have booked & rated in the past, since ~95% of this is still empty

In [14]:
del x['comp_rate_percent_diff_low']
del x['comp_rate_percent_diff_high']
del x['has_rate_hist']

### Try deleting some columns to see what happens

In [15]:
# del x['prop_log_historical_price']
# del x['random_bool']

### Check final column list

In [16]:
x.columns

Index(['prop_starrating', 'prop_review_score', 'prop_brand_bool',
       'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'price_usd', 'promotion_flag',
       'orig_destination_distance', 'random_bool', 'same_location',
       'rate_equal_or_better', 'inv_equal_or_better'],
      dtype='object')

### PCA

In [17]:
# pca = PCA(n_components='mle', svd_solver='full')
# pca.fit(x)
# pca.explained_variance_ratio_
# pca.singular_values_

PCA(copy=True, iterated_power='auto', n_components='mle', random_state=None,
  svd_solver='full', tol=0.0, whiten=False)

In [20]:
# x2 = pd.DataFrame(pca.transform(x))

In [22]:
# x2.columns = ['f0','f1','f2','f3','f4','f5','f6','f7','f8','f9','f10','f11']
# x2['target'] = target

In [23]:
# x2.head

<bound method NDFrame.head of                f0        f1        f2        f3        f4        f5        f6  \
0        0.176052  0.591387  0.775496 -0.147247 -0.247457  0.341513  0.357872   
1        0.199128  0.603223  0.788989 -0.173055 -0.200469  0.183185  0.197283   
2        0.151533  0.587500  0.791846 -0.141833 -0.258191  0.332851  0.313542   
3        0.197911  0.634375  0.711323 -0.226685 -0.010863  0.179801  0.269697   
4        0.188803  0.580059  0.804845 -0.130122 -0.294459  0.351346  0.259126   
5        0.208504  0.591895  0.808628 -0.153476 -0.271196  0.182580  0.089333   
6        0.191107  0.620507  0.743674 -0.199217 -0.109791  0.198048  0.320490   
7        0.929516  0.751165 -0.181375 -0.374728  0.295956 -0.084579  0.522560   
8        0.209086  0.598645  0.794424 -0.163368 -0.240937  0.179274  0.151952   
9        0.182133  0.574063  0.818532 -0.117542 -0.343188  0.331869  0.216057   
10       0.234902  0.651737  0.667769 -0.244338 -0.009131  0.166545  0.642407  

### Make sure we have no missing values

In [24]:
missing_values = x2.isna().mean() * 100
missing_values

f0        0.0
f1        0.0
f2        0.0
f3        0.0
f4        0.0
f5        0.0
f6        0.0
f7        0.0
f8        0.0
f9        0.0
f10       0.0
f11       0.0
target    0.0
dtype: float64

### Save the modified parquet up to this point

In [25]:
x2.to_parquet('training_set_v5.parquet.gzip', compression = 'gzip')