In [1]:
import pandas as pd  
import numpy as np 
import matplotlib.pyplot as plt
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn.metrics import roc_auc_score
from sklearn.metrics import classification_report, confusion_matrix  
from sklearn import metrics
from sklearn.tree import export_graphviz
from sklearn.model_selection import cross_val_score
from pprint import pprint

# Turn off pandas' chained-assignment warning for the whole notebook.
# NOTE(review): this silences the warning globally, so chained writes that do
# not reach the original frame will go unnoticed -- confirm this is intended.
pd.options.mode.chained_assignment = None  # default='warn'

### Read in training data

In [2]:
# Load the raw training set into `x`; every cleaning step below operates on this frame.
x = pd.read_parquet('training_set_VU_DM.parquet.gzip')

### Build the target (the columns needed for the submission are saved later, when the cleaned file is reloaded)

In [3]:
# Target is the sum of the two outcome flags, so a booking that was also
# clicked scores higher than a click alone (assuming both flags are 0/1 -- TODO confirm).
booked = x['booking_bool']
clicked = x['click_bool']
x['target'] = booked + clicked

### Some feature engineering

In [4]:
# 1 if the hotel and the visitor are in the same country, else 0
x['same_location'] = np.where(x['prop_country_id'] == x['visitor_location_country_id'], 1, 0)

# 1 if the visitor has a positive historical star rating, else 0
# (a NaN rating compares False and therefore maps to 0)
x['has_rated'] = np.where(x['visitor_hist_starrating'] > 0, 1, 0)

# 1 if the visitor has a positive historical average daily rate, else 0
x['has_history'] = np.where(x['visitor_hist_adr_usd'] > 0, 1, 0)

# Scale affinity scores to [0, 1], then fill NaNs with zero.
# The scaler is fitted exactly once, on the raw values. Fitting it a second
# time on the already-scaled column (as was done before) leaves the values
# unchanged but refits `scaler` on [0, 1] data, so it could no longer be
# reused to transform other data consistently.
scaler = MinMaxScaler(feature_range=(0, 1))
affinity_raw = np.array(x['srch_query_affinity_score']).reshape(-1, 1)
x['srch_query_affinity_score'] = scaler.fit_transform(affinity_raw).ravel()

# Assign the result of fillna back to the column instead of calling
# `x[col].fillna(..., inplace=True)`: the in-place call operates on a
# temporary Series and is not guaranteed to write through to `x`.
x['srch_query_affinity_score'] = x['srch_query_affinity_score'].fillna(0)

# fill nans for prop_location_score2 with prop_location_score1 (since only ~20% missing)
x['prop_location_score2'] = x['prop_location_score2'].fillna(x['prop_location_score1'])

# fill nans for review score with zero
x['prop_review_score'] = x['prop_review_score'].fillna(0)

### Delete irrelevant columns

In [5]:
# Columns removed before modelling. booking_bool / click_bool were already
# combined into `target`, and the country / visitor-history columns were
# already used to derive same_location, has_rated and has_history.
columns_to_remove = [
    'date_time',
    'site_id',
    'booking_bool',
    'click_bool',
    'gross_bookings_usd',
    'visitor_location_country_id',
    'prop_country_id',
    'visitor_hist_starrating',
    'visitor_hist_adr_usd',
    'orig_destination_distance',
]
for column_name in columns_to_remove:
    del x[column_name]

In [6]:
# Sanity check after the column drop: print the remaining column names,
# then show the first rows as the cell output.
print(x.columns)
x.head()

Index(['srch_id', 'prop_id', 'prop_starrating', 'prop_review_score',
       'prop_brand_bool', 'prop_location_score1', 'prop_location_score2',
       'prop_log_historical_price', 'position', 'price_usd', 'promotion_flag',
       'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window',
       'srch_adults_count', 'srch_children_count', 'srch_room_count',
       'srch_saturday_night_bool', 'srch_query_affinity_score', 'random_bool',
       'comp1_rate', 'comp1_inv', 'comp1_rate_percent_diff', 'comp2_rate',
       'comp2_inv', 'comp2_rate_percent_diff', 'comp3_rate', 'comp3_inv',
       'comp3_rate_percent_diff', 'comp4_rate', 'comp4_inv',
       'comp4_rate_percent_diff', 'comp5_rate', 'comp5_inv',
       'comp5_rate_percent_diff', 'comp6_rate', 'comp6_inv',
       'comp6_rate_percent_diff', 'comp7_rate', 'comp7_inv',
       'comp7_rate_percent_diff', 'comp8_rate', 'comp8_inv',
       'comp8_rate_percent_diff', 'target', 'same_location', 'has_rated',
       'has_history'],
  

Unnamed: 0,srch_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_location_score2,prop_log_historical_price,position,price_usd,...,comp7_rate,comp7_inv,comp7_rate_percent_diff,comp8_rate,comp8_inv,comp8_rate_percent_diff,target,same_location,has_rated,has_history
0,1,893,3,3.5,1,2.83,0.0438,4.95,27,104.77,...,,,,0.0,0.0,,0,0,0,0
1,1,10404,4,4.0,1,2.2,0.0149,5.03,26,170.74,...,,,,0.0,0.0,,0,0,0,0
2,1,21315,3,4.5,1,2.2,0.0245,4.92,21,179.8,...,,,,0.0,0.0,,0,0,0,0
3,1,27348,2,4.0,1,2.83,0.0125,4.39,34,602.77,...,,,,-1.0,0.0,5.0,0,0,0,0
4,1,29604,4,3.5,1,2.64,0.1241,4.93,4,143.58,...,,,,0.0,0.0,,0,0,0,0


### Fill in missing competitor rates

In [8]:
# Impute every missing compN_rate with the mean of the competitor rates that
# are present in the same row. Rows where all eight rates are missing keep NaN
# here (the row mean itself is NaN).
rate_cols = ['comp{}_rate'.format(i) for i in range(1, 9)]

# The row mean is computed once: filling a gap with the row mean does not
# change that row's mean, so recomputing it per column is redundant.
rate_row_mean = x[rate_cols].mean(axis=1)

# Assign back explicitly; `x[col].fillna(..., inplace=True)` acts on a
# temporary Series and is not guaranteed to modify `x`.
for rate_col in rate_cols:
    x[rate_col] = x[rate_col].fillna(rate_row_mean)

### Fill in missing competitor availability

In [9]:
# Impute every missing compN_inv with the mean of the competitor availability
# values that are present in the same row. Rows where all eight values are
# missing keep NaN here (the row mean itself is NaN).
inv_cols = ['comp{}_inv'.format(i) for i in range(1, 9)]

# The row mean is computed once: filling a gap with the row mean does not
# change that row's mean, so recomputing it per column is redundant.
inv_row_mean = x[inv_cols].mean(axis=1)

# Assign back explicitly; `x[col].fillna(..., inplace=True)` acts on a
# temporary Series and is not guaranteed to modify `x`.
for inv_col in inv_cols:
    x[inv_col] = x[inv_col].fillna(inv_row_mean)

### Fill in missing competitor rate percent difference

In [10]:
# Impute every missing compN_rate_percent_diff with the mean of the percent
# differences that are present in the same row. Rows where all eight values
# are missing keep NaN here (the row mean itself is NaN).
diff_cols = ['comp{}_rate_percent_diff'.format(i) for i in range(1, 9)]

# The row mean is computed once: filling a gap with the row mean does not
# change that row's mean, so recomputing it per column is redundant.
diff_row_mean = x[diff_cols].mean(axis=1)

# Assign back explicitly; `x[col].fillna(..., inplace=True)` acts on a
# temporary Series and is not guaranteed to modify `x`.
for diff_col in diff_cols:
    x[diff_col] = x[diff_col].fillna(diff_row_mean)

### Fill in the competitor data that only contain NaN values with zeros

In [11]:
# Any competitor value that is still NaN at this point is replaced with 0.
# All 24 competitor columns (8 competitors x rate / inv / rate_percent_diff)
# are handled in one assignment instead of 24 chained in-place fillna calls,
# which act on temporary Series and are not guaranteed to modify `x`.
comp_cols = [
    'comp{}_{}'.format(i, suffix)
    for suffix in ('rate', 'inv', 'rate_percent_diff')
    for i in range(1, 9)
]
x[comp_cols] = x[comp_cols].fillna(0)

### Make sure we have no missing values

In [12]:
# Percentage of NaN entries per column; every value should be 0.0 after the imputation above.
nan_fraction = x.isna().mean()
missing_values = 100 * nan_fraction
missing_values

srch_id                      0.0
prop_id                      0.0
prop_starrating              0.0
prop_review_score            0.0
prop_brand_bool              0.0
prop_location_score1         0.0
prop_location_score2         0.0
prop_log_historical_price    0.0
position                     0.0
price_usd                    0.0
promotion_flag               0.0
srch_destination_id          0.0
srch_length_of_stay          0.0
srch_booking_window          0.0
srch_adults_count            0.0
srch_children_count          0.0
srch_room_count              0.0
srch_saturday_night_bool     0.0
srch_query_affinity_score    0.0
random_bool                  0.0
comp1_rate                   0.0
comp1_inv                    0.0
comp1_rate_percent_diff      0.0
comp2_rate                   0.0
comp2_inv                    0.0
comp2_rate_percent_diff      0.0
comp3_rate                   0.0
comp3_inv                    0.0
comp3_rate_percent_diff      0.0
comp4_rate                   0.0
comp4_inv 

### Save the modified parquet up to this point

In [13]:
# Persist the cleaned training frame as gzip-compressed parquet; the cells
# below reload this file instead of repeating the cleaning.
x.to_parquet('training_set_v1.parquet.gzip', compression = 'gzip')

# Using the new parquet file

In [None]:
# Reload the cleaned training data and separate the label and the identifier
# columns from the feature matrix.
train = pd.read_parquet('training_set_v1.parquet.gzip')
target = train['target']

# Take the ids from `train` itself rather than from the global `x`, so this
# cell works on a fresh kernel that starts from the saved parquet file.
out_col1 = train['srch_id']
out_col2 = train['prop_id']

# What remains in `train` after these deletions is the feature matrix.
del train['target']
del train['srch_id']
del train['prop_id']

### Double check that we do not have any empty cells

In [None]:
# Percentage of NaN entries per feature column of the reloaded frame; all should be 0.0.
nan_fraction = train.isna().mean()
missing_values = 100 * nan_fraction
missing_values

### Test/train

In [None]:
# Hold out 20% of the rows for evaluation (fixed seed for reproducibility).
# Split the feature matrix `train` and its label `target`: `y` was never
# defined, and `x` still contains the target and id columns, which must not
# be fed to the model as features.
x_train, x_test, y_train, y_test = train_test_split(train, target, test_size=0.2, random_state=0)

In [None]:
# Fit a 100-tree random forest regressor on the training split; the seed is
# fixed so repeated runs build the same forest.
forest_params = {'n_estimators': 100, 'random_state': 42}
rf = RandomForestRegressor(**forest_params)
rf.fit(x_train, y_train)

In [None]:
# Score the held-out split with the fitted forest.
# NOTE(review): in the cells shown, `predictions` is never compared with
# y_test -- add an evaluation metric if a quality estimate is wanted.
predictions = rf.predict(x_test)

In [None]:
# for p in predictions:
#     print(p)

In [None]:
# Load the test set. This rebinds `x`, so the training frame that `x` held
# until now is no longer reachable under that name.
x = pd.read_parquet('test_set_VU_DM.parquet.gzip')

In [None]:
# Score the test set with the fitted forest.
# NOTE(review): `x` is the test file exactly as read from disk -- none of the
# feature engineering, NaN imputation or column drops applied to the training
# frame is repeated here. The model presumably needs the same columns it was
# fitted on; confirm, and run the test frame through the same preprocessing
# before predicting.
predictions_real = rf.predict(x)

In [None]:
# Collect the ids and the predicted score for every test row.
out = x[['srch_id', 'prop_id']].copy()
out['pred'] = predictions_real

# Order the rows by search, best predicted score first. The previous
# `out.groupby([...])` call discarded its result and therefore did nothing;
# presumably a per-search ranking was intended -- confirm against the
# required submission format.
out = out.sort_values(['srch_id', 'pred'], ascending=[True, False])

# Call head() (the old code printed the bound method `out.head` itself) and
# leave it as the last expression so the notebook renders the frame.
out.head()