In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb


# 1. Import data

In [2]:
# Load the raw training data; the path is relative to the notebook's working directory.
training_set = pd.read_csv('data/training_set_VU_DM.csv')

# 2. Define relevance

In [3]:
# Graded relevance label: the click flag plus four times the booking flag.
training_set['relevance'] = training_set['booking_bool'].mul(4).add(training_set['click_bool'])

# 3. Train validation split

In [4]:
from sklearn.model_selection import train_test_split

# Split the data into training and validation sets at the query level, so that
# every row of a given srch_id ends up on the same side of the split.
unique_queries = training_set['srch_id'].unique()
train_query_ids, val_query_ids = train_test_split(unique_queries, test_size=0.2, random_state=42)

train_set = training_set[training_set['srch_id'].isin(train_query_ids)]
val_set = training_set[training_set['srch_id'].isin(val_query_ids)]

# Columns kept out of the feature matrices: the label itself, the two flags it
# is computed from, and position / random_bool.
non_feature_columns = ['position', 'random_bool', 'click_bool', 'booking_bool', 'relevance']

X_train, y_train = train_set.drop(columns=non_feature_columns), train_set['relevance']
X_val, y_val = val_set.drop(columns=non_feature_columns), val_set['relevance']




# 4. Missing values

## 4a. Find percentage of missing values

In [5]:
# Fraction of missing values per column of the training features, largest first.
X_train.isna().mean().sort_values(ascending=False)

comp1_rate_percent_diff        0.981110
comp6_rate_percent_diff        0.980588
comp1_rate                     0.976009
comp1_inv                      0.974136
comp4_rate_percent_diff        0.973472
comp7_rate_percent_diff        0.972089
gross_bookings_usd             0.972052
comp6_rate                     0.951568
visitor_hist_starrating        0.948906
visitor_hist_adr_usd           0.948682
comp6_inv                      0.947358
comp4_rate                     0.938002
comp7_rate                     0.936446
srch_query_affinity_score      0.936209
comp4_inv                      0.930640
comp7_inv                      0.928236
comp3_rate_percent_diff        0.904272
comp2_rate_percent_diff        0.887581
comp8_rate_percent_diff        0.875582
comp5_rate_percent_diff        0.829884
comp3_rate                     0.689640
comp3_inv                      0.665881
comp8_rate                     0.612545
comp8_inv                      0.598285
comp2_rate                     0.590776


In [6]:
# Fraction of missing values per column of the validation features, largest first.
X_val.isna().mean().sort_values(ascending=False)

comp6_rate_percent_diff        0.980668
comp1_rate_percent_diff        0.980326
comp1_rate                     0.975028
comp4_rate_percent_diff        0.973924
comp1_inv                      0.972807
gross_bookings_usd             0.972240
comp7_rate_percent_diff        0.971964
comp6_rate                     0.951555
visitor_hist_starrating        0.950396
visitor_hist_adr_usd           0.950160
comp6_inv                      0.947400
comp4_rate                     0.938034
comp7_rate                     0.936217
srch_query_affinity_score      0.935090
comp4_inv                      0.930891
comp7_inv                      0.927641
comp3_rate_percent_diff        0.906142
comp2_rate_percent_diff        0.888764
comp8_rate_percent_diff        0.877778
comp5_rate_percent_diff        0.832298
comp3_rate                     0.694264
comp3_inv                      0.671617
comp8_rate                     0.617064
comp8_inv                      0.602659
comp2_rate                     0.595216


## 4b. Missing values handling

### Missing competitor data

- if comp_inv = 0 (exp and comp have availability) and comp_rate = 0 (exp and comp have same rate) then comp_rate_percent_diff = 0
- if comp_inv = 0 and comp_rate = NaN then comp_rate_percent_diff = 0
- if comp_inv = 0 (exp and comp have availability) and comp_rate = 1 (exp has lower rate than comp) then comp_rate_percent_diff = value
- if comp_inv = 1 (exp has availability and comp not) and comp_rate = 0 (exp and comp have same rate) then comp_rate_percent_diff = 0?
- if comp_inv = 1 (exp has availability and comp not) and comp_rate = -1 (exp has higher rate than comp) then comp_rate_percent_diff = -value
- if comp_inv = 1 and comp_rate = NaN then comp_rate_percent_diff = 0
- if comp_inv = -1 then comp_inv = 0
- if comp_inv = NaN (no comp) and comp_rate = NaN (no comp) then comp_rate_percent_diff = comp_rate = comp_inv = 0

### Drop columns and set missing review score to 0

- prop_review_score : set missing values to 0 (alternative: encode whether a review exists as a binary flag)
- gross_bookings_usd : drop
- srch_query_affinity_score : drop
- prop_location_score2 : drop
- visitor_hist_adr_usd : drop
- orig_destination_distance : drop
- visitor_hist_starrating : drop

In [7]:
# Inspect the rows where comp2_inv equals -1, next to the matching comp2 rate columns.
training_set.loc[training_set.comp2_inv == -1,['srch_id', 'prop_id','comp2_inv','comp2_rate','comp2_rate_percent_diff']]

Unnamed: 0,srch_id,prop_id,comp2_inv,comp2_rate,comp2_rate_percent_diff
101,11,69993,-1.0,0.0,
1123,81,138112,-1.0,,
2227,145,90958,-1.0,,
2811,180,74861,-1.0,,
2825,180,134534,-1.0,,
...,...,...,...,...,...
4956893,332667,75374,-1.0,,
4956991,332672,58952,-1.0,,
4957238,332693,48629,-1.0,,
4957481,332718,109431,-1.0,,


In [8]:
def handle_missing_data(dataset):
    """Fill or drop the columns with missing values.

    For each competitor 1..8:
      * ``compN_inv``: missing values and -1 both become 0.
      * ``compN_rate_percent_diff``: missing values become 0, then the column
        is multiplied by ``compN_rate`` (missing treated as 0). The result is
        signed by ``compN_rate`` when that column holds +1 / -1 -- assumes
        ``compN_rate`` is in {-1, 0, 1}, TODO confirm.
      * ``compN_rate`` itself is dropped afterwards.

    Six sparsely populated columns are dropped and missing
    ``prop_review_score`` values are set to 0.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame containing all of the columns named above. It is modified in
        place; pass a copy if the original must be preserved.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object, after modification.

    Raises
    ------
    KeyError
        If one of the expected columns is absent.
    """
    # Handle missing competitor data
    for i in range(1, 9):
        comp_inv_name = 'comp' + str(i) + '_inv'
        comp_rate_name = 'comp' + str(i) + '_rate'
        comp_rate_percent_diff_name = 'comp' + str(i) + '_rate_percent_diff'

        # Assign the cleaned column back explicitly. Calling
        # ``replace(..., inplace=True)`` on ``dataset[col]`` is a chained
        # in-place update, which does not reach ``dataset`` when pandas
        # Copy-on-Write is active.
        dataset[comp_inv_name] = dataset[comp_inv_name].fillna(0).replace(-1, 0)

        comp_rate = dataset[comp_rate_name].fillna(0)
        dataset[comp_rate_percent_diff_name] = dataset[comp_rate_percent_diff_name].fillna(0) * comp_rate

        dataset.drop(columns=comp_rate_name, inplace=True)

    # Drop columns with too many missing values
    dataset.drop(columns=['gross_bookings_usd',
                          'srch_query_affinity_score',
                          'visitor_hist_adr_usd',
                          'orig_destination_distance',
                          'visitor_hist_starrating',
                          'prop_location_score2'], inplace=True)

    # Set missing review scores to 0
    dataset['prop_review_score'] = dataset['prop_review_score'].fillna(0)

    return dataset
        

## 4c. Recheck missing value percentage

In [9]:
# NOTE(review): no cell above this one applies handle_missing_data to X_train,
# so this reports the same raw percentages as section 4a.
X_train.isna().mean().sort_values(ascending=False)

comp1_rate_percent_diff        0.981110
comp6_rate_percent_diff        0.980588
comp1_rate                     0.976009
comp1_inv                      0.974136
comp4_rate_percent_diff        0.973472
comp7_rate_percent_diff        0.972089
gross_bookings_usd             0.972052
comp6_rate                     0.951568
visitor_hist_starrating        0.948906
visitor_hist_adr_usd           0.948682
comp6_inv                      0.947358
comp4_rate                     0.938002
comp7_rate                     0.936446
srch_query_affinity_score      0.936209
comp4_inv                      0.930640
comp7_inv                      0.928236
comp3_rate_percent_diff        0.904272
comp2_rate_percent_diff        0.887581
comp8_rate_percent_diff        0.875582
comp5_rate_percent_diff        0.829884
comp3_rate                     0.689640
comp3_inv                      0.665881
comp8_rate                     0.612545
comp8_inv                      0.598285
comp2_rate                     0.590776


In [10]:
# NOTE(review): no cell above this one applies handle_missing_data to X_val,
# so this reports the same raw percentages as section 4a.
X_val.isna().mean().sort_values(ascending=False)

comp6_rate_percent_diff        0.980668
comp1_rate_percent_diff        0.980326
comp1_rate                     0.975028
comp4_rate_percent_diff        0.973924
comp1_inv                      0.972807
gross_bookings_usd             0.972240
comp7_rate_percent_diff        0.971964
comp6_rate                     0.951555
visitor_hist_starrating        0.950396
visitor_hist_adr_usd           0.950160
comp6_inv                      0.947400
comp4_rate                     0.938034
comp7_rate                     0.936217
srch_query_affinity_score      0.935090
comp4_inv                      0.930891
comp7_inv                      0.927641
comp3_rate_percent_diff        0.906142
comp2_rate_percent_diff        0.888764
comp8_rate_percent_diff        0.877778
comp5_rate_percent_diff        0.832298
comp3_rate                     0.694264
comp3_inv                      0.671617
comp8_rate                     0.617064
comp8_inv                      0.602659
comp2_rate                     0.595216


# 5. Feature engineering

## 5a. Dates

- make month and day columns
- drop date_time

## 5b. Transform features

- price_usd : log

## 5c. Aggregate features

### Define Aggregator

In [11]:
def aggregator(df, groupby_columns, target_columns, aggregators):
    """Append group-level statistics of ``target_columns`` to every row of ``df``.

    The statistics are computed with ``df.groupby(groupby_columns)`` and joined
    back onto the rows of ``df``. Each new column is named
    ``"<target>_<aggregator> | <groupby columns joined by '_'>"``, for example
    ``"price_mean | srch_id"``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input frame; it is not modified.
    groupby_columns : list of str (a single column name is also accepted)
        Column(s) that define the groups.
    target_columns : list of str
        Columns to aggregate within each group.
    aggregators : list of str
        Aggregations forwarded to ``DataFrameGroupBy.agg``.

    Returns
    -------
    pandas.DataFrame
        A new frame: the original columns followed by the aggregate columns.
    """
    # A bare string would otherwise be split into characters by '_'.join below.
    if isinstance(groupby_columns, str):
        groupby_columns = [groupby_columns]

    df_grouped = df.groupby(groupby_columns)[target_columns].agg(aggregators)

    # Flatten the (target, aggregator) column MultiIndex *before* merging.
    # pandas refuses to merge frames whose columns have a different number of
    # levels, and naming the columns here avoids renaming by position later.
    group_label = '_'.join(groupby_columns)
    df_grouped.columns = [
        ('_'.join(map(str, col)) if isinstance(col, tuple) else str(col)).strip() + ' | ' + group_label
        for col in df_grouped.columns.values
    ]

    return df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)

### Competitor Variables

In [12]:
# Competitor columns other than the rate ones: names containing 'comp' but not 'rate'.
comp_vars = [column for column in training_set.columns if 'comp' in column and 'rate' not in column]

### By srch_id

In [13]:
# Columns to aggregate per search (srch_id): the property attributes listed
# here, followed by the competitor columns collected in comp_vars.
srch_id_column_target = [
    'log_price_usd',
    'prop_starrating',
    'prop_location_score1',
    'prop_review_score',
    'prop_brand_bool',
    *comp_vars,
]


### By prop_id

In [14]:
# Columns to aggregate per property (prop_id), followed by the competitor columns.
# NOTE(review): the Processor call in this notebook passes srch_id_column_target
# for every grouping, so this list is not used there -- confirm intent.
prop_id_column_target = [
    'log_price_usd',
    'prop_starrating',
    'prop_brand_bool',
    'promotion_flag',
    *comp_vars,
]

### By srch_destination_id

In [15]:
# Columns to aggregate per destination (srch_destination_id), followed by the
# competitor columns.
# NOTE(review): the Processor call in this notebook passes srch_id_column_target
# for every grouping, so this list is not used there -- confirm intent.
srch_destination_id_column_target = [
    'log_price_usd',
    'prop_starrating',
    'prop_location_score1',
    'prop_review_score',
    'prop_brand_bool',
    *comp_vars,
]

### Define preprocessor

## 5d. Scale features

Scaling or standardization approach per feature

- star rating : min max scaler
- review score : min max scaler
- location score : min max scaler
- log price and log hist price : standard scaler
- length of stay : standard scaler
- booking window : standard scaler
- adults, children, room count : standard scaler
- comp features : standard scaler

In [16]:
# Columns for StandardScaler, selected by name pattern from the raw columns:
# log-transformed values, the *_count columns (id columns excluded), competitor
# rate differences, length of stay and booking window.
features_to_standardize = (
    [col for col in training_set.columns if 'log_' in col]
    + [col for col in training_set.columns if 'count' in col and 'id' not in col]
    + [col for col in training_set.columns if 'comp' in col and 'rate_percent_diff' in col]
    + [col for col in training_set.columns if 'length_of_stay' in col]
    + [col for col in training_set.columns if 'booking_window' in col]
)

# 'log_price_usd' is derived from price_usd during preprocessing, so it is not
# among the raw columns and the 'log_' filter above cannot pick it up. Add it
# explicitly so the log price is standardized as planned in the scaling notes.
if 'log_price_usd' not in features_to_standardize:
    features_to_standardize.append('log_price_usd')

# Columns for MinMaxScaler: property star rating, review score and location score 1.
features_to_minmax = (
    [col for col in training_set.columns if 'prop' in col and 'starrating' in col]
    + [col for col in training_set.columns if 'review_score' in col]
    + [col for col in training_set.columns if 'location_score' in col and '1' in col]
)
                



NEXT STEP : 

- REMOVE TRAINING ONLY COLUMNS (AND MAYBE INCLUDE POSITION DATASET)
- SPLIT TRAINING AND VALIDATION SETS
- DEFINE NEEDS OF LIGHT GBM
    - CATEGORICAL FEATURES
    - GROUPS AND EVAL GROUPS
    - LABEL

# 7. Preprocess with steps defined above

### Define preprocessor

In [17]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler

class Processor():
    """Feature pipeline shared by the training and the validation/test data.

    Steps, in order: derive ``month`` and ``day`` from ``date_time``, handle
    missing values with ``handle_missing_data``, replace ``price_usd`` by
    ``log_price_usd``, append group statistics with ``aggregator`` for every
    column in ``groupby_columns``, fill the remaining NaN with 0, then scale.

    The two scalers are fitted in ``process_train`` and only applied in
    ``process_test``. The group statistics, by contrast, are always computed
    from the frame that is passed in.

    Parameters
    ----------
    groupby_columns : list of str
        Each entry is used on its own as a grouping key for the aggregates.
    target_columns : list of str
        Columns aggregated within every group.
    aggregators : list of str
        Aggregations forwarded to ``aggregator``.
    features_to_normalize : list of str
        Columns scaled with ``MinMaxScaler``.
    features_to_standardize : list of str
        Columns scaled with ``StandardScaler``.
    """

    def __init__(self, groupby_columns, target_columns, aggregators, features_to_normalize, features_to_standardize):
        self.groupby_columns = groupby_columns
        self.target_columns = target_columns
        self.aggregators = aggregators
        self.features_to_normalize = features_to_normalize
        self.features_to_standardize = features_to_standardize

        # Created by process_train; None means "not fitted yet".
        self.normalizer = None
        self.standardizer = None

    def _build_features(self, X):
        """Run every step that is identical for training and test data.

        Works on a copy of ``X`` and returns the unscaled feature frame.
        Raises ``ValueError`` if a grouping column is missing from the frame.
        """
        features = X.copy()

        # Convert date_time to datetime and keep only month and day of month
        features['date_time'] = pd.to_datetime(features['date_time'])
        features['month'] = features['date_time'].dt.month
        features['day'] = features['date_time'].dt.day
        features.drop(columns=['date_time'], inplace=True)

        # Missing data
        features = handle_missing_data(features)

        # Log-transform the price; epsilon keeps log() finite for a price of 0
        epsilon = 1e-5
        features['log_price_usd'] = np.log(features['price_usd'] + epsilon)
        features.drop(columns=['price_usd'], inplace=True)

        # Aggregations, one grouping key at a time
        for column in self.groupby_columns:
            if column not in features.columns:
                raise ValueError('Column ' + column + ' not in dataframe')
            features = aggregator(features, [column], self.target_columns, self.aggregators)

        # Any NaN left after the aggregations becomes 0
        features.fillna(0, inplace=True)

        return features

    def process_train(self, X):
        """Build the features for ``X`` and fit both scalers on them.

        Returns the processed copy; ``X`` itself is left untouched.
        """
        X_fit = self._build_features(X)

        # Normalize
        self.normalizer = MinMaxScaler()
        X_fit[self.features_to_normalize] = self.normalizer.fit_transform(X_fit[self.features_to_normalize])

        # Standardize
        self.standardizer = StandardScaler()
        X_fit[self.features_to_standardize] = self.standardizer.fit_transform(X_fit[self.features_to_standardize])

        return X_fit

    def process_test(self, X):
        """Build the features for ``X`` and apply the scalers fitted by ``process_train``.

        Returns the processed copy; ``X`` itself is left untouched.

        Raises
        ------
        sklearn.exceptions.NotFittedError
            If ``process_train`` has not been called yet. This is a subclass
            of both ``ValueError`` and ``AttributeError``.
        """
        if getattr(self, 'normalizer', None) is None or getattr(self, 'standardizer', None) is None:
            from sklearn.exceptions import NotFittedError
            raise NotFittedError('Call process_train before process_test: the scalers are not fitted yet')

        X_transform = self._build_features(X)

        # Normalize
        X_transform[self.features_to_normalize] = self.normalizer.transform(X_transform[self.features_to_normalize])

        # Standardize
        X_transform[self.features_to_standardize] = self.standardizer.transform(X_transform[self.features_to_standardize])

        return X_transform
            

### Preprocess training and validation set

In [18]:
# One processor for both splits: three grouping keys, a single shared list of
# target columns, and mean / median / std as the group statistics.
processor = Processor(
    groupby_columns=['srch_id', 'prop_id', 'srch_destination_id'],
    target_columns=srch_id_column_target,
    aggregators=['mean', 'median', 'std'],
    features_to_normalize=features_to_minmax,
    features_to_standardize=features_to_standardize,
)

# The scalers are fitted on the training features and reused for validation.
X_train = processor.process_train(X_train)
X_val = processor.process_test(X_val)

  df = df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)
  df = df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)
  df = df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)
  df = df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)
  df = df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)
  df = df.merge(df_grouped, how='left', left_on=groupby_columns, right_index=True)


In [19]:
# Display the processed training features.
X_train

Unnamed: 0,srch_id,site_id,visitor_location_country_id,prop_country_id,prop_id,prop_starrating,prop_review_score,prop_brand_bool,prop_location_score1,prop_log_historical_price,...,comp5_inv_std | srch_destination_id,comp6_inv_mean | srch_destination_id,comp6_inv_median | srch_destination_id,comp6_inv_std | srch_destination_id,comp7_inv_mean | srch_destination_id,comp7_inv_median | srch_destination_id,comp7_inv_std | srch_destination_id,comp8_inv_mean | srch_destination_id,comp8_inv_median | srch_destination_id,comp8_inv_std | srch_destination_id
0,1,12,187,219,893,0.6,0.7,1,0.405444,0.344792,...,0.175129,0.0,0.0,0.0,0.0,0.0,0.0,0.008128,0.0,0.089791
1,1,12,187,219,10404,0.8,0.8,1,0.315186,0.388379,...,0.175129,0.0,0.0,0.0,0.0,0.0,0.0,0.008128,0.0,0.089791
2,1,12,187,219,21315,0.6,0.9,1,0.315186,0.328446,...,0.175129,0.0,0.0,0.0,0.0,0.0,0.0,0.008128,0.0,0.089791
3,1,12,187,219,27348,0.4,0.8,1,0.405444,0.039681,...,0.175129,0.0,0.0,0.0,0.0,0.0,0.0,0.008128,0.0,0.089791
4,1,12,187,219,29604,0.8,0.7,1,0.378223,0.333895,...,0.175129,0.0,0.0,0.0,0.0,0.0,0.0,0.008128,0.0,0.089791
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4958342,332785,5,219,219,77700,0.6,0.8,1,0.230659,-2.352172,...,0.055104,0.0,0.0,0.0,0.0,0.0,0.0,0.009128,0.0,0.095151
4958343,332785,5,219,219,88083,0.6,0.8,1,0.279370,-2.352172,...,0.055104,0.0,0.0,0.0,0.0,0.0,0.0,0.009128,0.0,0.095151
4958344,332785,5,219,219,94508,0.6,0.7,1,0.157593,-2.352172,...,0.055104,0.0,0.0,0.0,0.0,0.0,0.0,0.009128,0.0,0.095151
4958345,332785,5,219,219,128360,0.6,1.0,1,0.279370,-2.352172,...,0.055104,0.0,0.0,0.0,0.0,0.0,0.0,0.009128,0.0,0.095151


In [20]:
def _query_group_sizes(features):
    """Return the number of rows per srch_id, in the order the queries appear.

    LightGBM interprets ``group`` as the sizes of consecutive row blocks, so
    the sizes must follow the row order of the frame rather than the sorted
    order of the ids, and the rows of one query must be adjacent.

    Raises
    ------
    ValueError
        If the rows of some srch_id are not contiguous.
    """
    srch_ids = features['srch_id']
    # Number of runs of equal consecutive ids; equals the number of distinct
    # ids exactly when every query occupies one contiguous block.
    n_runs = int((srch_ids != srch_ids.shift()).sum())
    if n_runs != srch_ids.nunique():
        raise ValueError('Rows of the same srch_id must be contiguous to build ranking groups')
    return features.groupby('srch_id', sort=False).size()


groups = _query_group_sizes(X_train)
X_train.drop(columns=['srch_id'], inplace=True)

groups_val = _query_group_sizes(X_val)
X_val.drop(columns=['srch_id'], inplace=True)

# Columns handed to LightGBM as categorical, referenced by position in X_train
# (computed after srch_id has been removed).
categorical_features = ['site_id', 'visitor_location_country_id', 'prop_country_id', 'prop_id', 'srch_destination_id', 'month', 'day']
categorical_features_indices = [X_train.columns.get_loc(name) for name in categorical_features]

# 8. Train LambdaMART

In [22]:
# NOTE(review): this rebinds `lgb`, replacing the plain `lightgbm` module
# imported in the first cell with optuna's LightGBM integration from here on.
from optuna.integration import lightgbm as lgb

# create lgb datasets
# Labels are the relevance scores; `group` carries the per-query row counts.
train_data = lgb.Dataset(X_train, label=y_train, group=groups)
val_data = lgb.Dataset(X_val, label=y_val, group=groups_val)

# define training parameters: LambdaRank objective, DART boosting, NDCG at 5
# NOTE(review): 'n_jobs' is given as the string '8' rather than an integer -- confirm.

params = {'objective': 'lambdarank', 
          'boosting': 'dart',
          'metric':'ndcg', 
          'ndcg_eval_at': 5,
          'device': 'cpu',
          'n_jobs': '8',
          'num_iterations': 500,
          }

# Both the training and the validation dataset are passed as validation sets.
# NOTE(review): presumably optuna's `train` runs a hyperparameter search rather
# than a single fit, which may explain the KeyboardInterrupt recorded in the
# cell output -- confirm.
model = lgb.train(params,
                  train_set = train_data,
                  valid_sets = [train_data, val_data],
                  verbosity = -1,
                  categorical_feature=categorical_features_indices
                  )



KeyboardInterrupt: 