# Training on the full dataset and final predictions

In [None]:
import numpy as np 
import pandas as pd 
import matplotlib.pyplot as plt
import pyltr
import utils
from operator import itemgetter
import csv
import pickle

## Training settings

In [None]:
# Whether the training split is downsampled via utils.balance_dataset before fitting
balance_flag = True
# Rate handed to utils.balance_dataset when balance_flag is set
downsampling_rate = 3
# Learning rate handed to the LambdaMART model
learning_rate = 0.03

## Load dataset and downsampling

In [None]:
# Index of the data split to load; choose from 0, 1, 2, 3
k=0

# Read the train / validation / test frames of that split. The pickle paths
# are relative, so they are resolved against the current working directory.
train, val, test = (
    pd.read_pickle(split_name + "_new_" + str(k))
    for split_name in ("train", "val", "test")
)

In [None]:
# Histogram of the relevance labels in the training split, before any downsampling
train['relevance'].hist()
plt.title('Relevance class proportions before downsampling')
plt.show()

# Optionally rebalance the training split with utils.balance_dataset;
# per the original author's note this makes training much faster.
if balance_flag:
    train = utils.balance_dataset(train,downsampling_rate)

In [None]:
# Efi's code for data cleaning
# NOTE(review): bare expression whose value is discarded here (a notebook only
# echoes the final expression of a cell, and this cell ends with an assignment).
train.shape
# Candidate feature names come from the project's utils module
selected_features = utils.define_features()

In [None]:
# Bookkeeping / target columns: they must never be used as model features.
features_to_keep = ['booking_bool', 'srch_id', 'click_bool', 'prop_id', 'relevance']

# Strip the bookkeeping columns (and duplicates) from the feature list.
# sorted() pins the feature order: a bare list(set(...)) can come out in a
# different order on every interpreter run, which would silently reshuffle the
# feature columns fed to the model and break reuse of a saved model.
selected_features = sorted(set(selected_features).difference(features_to_keep))
all_possible_features = features_to_keep + selected_features

# lambdaMART wants the target variable and the IDs in the first columns,
# followed by the features. 'booking_bool' and 'click_bool' are deliberately
# left out: 'relevance' already combines them. (The former DataFrame.drop calls
# discarded their result and therefore never removed anything.)
_model_columns = ['relevance', 'srch_id', 'prop_id'] + selected_features

# reindex(columns=...) replaces the deprecated/removed `.ix` indexer. A listed
# column that a split does not contain is created filled with NaN instead of
# raising.
# NOTE(review): that is presumably what lets a split without 'relevance' pass
# through this cell - confirm that this is intended for the test split.
df_to_train = train.reindex(columns=_model_columns)
df_to_val = val.reindex(columns=_model_columns)
df_to_test = test.reindex(columns=_model_columns)

## Prepare dataset for model

In [None]:
# From dataframes to the three arrays used for fitting and prediction
def _frame_to_arrays(frame, feature_columns):
    """Return (relevance, feature matrix, search ids) of `frame` as numpy arrays."""
    relevance = np.array(frame['relevance'])
    features = np.array(frame[feature_columns])
    search_ids = np.array(frame['srch_id'])
    return relevance, features, search_ids


# training split
trrelevance_arr, trfeature_arr, trid_arr = _frame_to_arrays(df_to_train, selected_features)

# validation split
vrelevance_arr, vfeature_arr, vid_arr = _frame_to_arrays(df_to_val, selected_features)

# test split
terelevance_arr, tefeature_arr, teid_arr = _frame_to_arrays(df_to_test, selected_features)

In [None]:
# Seed value set aside for random number generation in this notebook
rand_seed=0

## Run LambdaMART model

In [None]:
# Ranking metric: nDCG truncated at k (the cut-off is an arbitrary choice)
metric = pyltr.metrics.NDCG(k=31)

# Validation monitor (early stopping & trimming); stop_after is an arbitrary
# choice. Only needed when the model is fitted with `monitor=monitor` below.
monitor = pyltr.models.monitors.ValidationMonitor(
    vfeature_arr, vrelevance_arr, vid_arr, metric=metric, stop_after=300)

# Names of the feature columns, in the order used to build the feature arrays
feats = list(df_to_test[selected_features].columns)

# LambdaMART hyper-parameters - the best values still have to be found.
# NOTE(review): n_estimators = 1 looks like a leftover from a smoke test; with
# a single estimator the monitor's stop_after=300 presumably never triggers -
# confirm before a real training run.
n_estimators = 1
min_samples_leaf=64
model = pyltr.models.LambdaMART(
    metric=metric,
    n_estimators=n_estimators,
    learning_rate=learning_rate,
    #max_features=0.5,
    #query_subsample=0.5,
    #max_leaf_nodes=10,
    min_samples_leaf=min_samples_leaf,
    # Seed the model so repeated runs are reproducible; rand_seed was defined
    # earlier in the notebook but never handed to the model.
    random_state=rand_seed,
    verbose=1
)

# Fit with the validation monitor; drop `monitor=monitor` to train without validation.
model.fit(trfeature_arr, trrelevance_arr, trid_arr, monitor=monitor)

## Make predictions for test set

In [None]:
# Score every row of the test split with the fitted model
tepred = model.predict(tefeature_arr)

# Keep the two ID columns and attach the negated score: the submission writer
# sorts scores in ascending order, so negating puts the highest prediction first.
results_df = df_to_test.loc[:, ['srch_id', 'prop_id']].assign(score=-1 * tepred)

In [None]:
# Unlike the earlier write_submission, this variant has no relevance column to write
def write_submission(recommendations, submission_file):
    """
    Write the ranked submission file with the columns SearchId and PropertyId.

    Rows are ordered by search id and, within one search, by ascending score,
    so callers must pass scores for which a smaller value means a better rank
    (e.g. negated model predictions). The score itself is not written.

    Parameters:
        recommendations: iterable of (srch_id, prop_id, score) tuples.
        submission_file: path of the CSV file to create or overwrite.

    Side effect: the ordered (srch_id, prop_id) pairs are also stored in the
    module-level name `rows`.
    """
    # Kept so that code inspecting `rows` after the call keeps working.
    # NOTE(review): consider returning the rows instead of using a global.
    global rows
    ranked = sorted(recommendations, key=itemgetter(0, 2))
    rows = [(srch_id, prop_id) for srch_id, prop_id, _score in ranked]
    # The context manager guarantees the file is flushed and closed; the handle
    # was previously left for the garbage collector to close.
    with open(submission_file, "w") as submission:
        writer = csv.writer(submission, lineterminator="\n")
        writer.writerow(("SearchId", "PropertyId"))
        writer.writerows(rows)

# Stream (srch_id, prop_id, score) triples from the results frame into the writer
recommendations = zip(
    *(results_df[column] for column in ("srch_id", "prop_id", "score"))
)
write_submission(recommendations, "predictionfile.csv")

# Round-trip the fitted model through pickle in memory.
# NOTE(review): nothing is written to disk here - persist `model_save` to a
# file if the trained model is meant to outlive this session.
model_save = pickle.dumps(model)
new_model = pickle.loads(model_save)