In [None]:
from pathlib import Path

# import dask.dataframe as dd
import numpy as np
import pandas as pd
import polars as pl
import xgboost as xg
from sklearn.metrics import ndcg_score
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [None]:
# Read the engineered training set with polars and show summary statistics
# as the cell output.
training_csv = '../data/preprocessed/engineered_training_set.csv'
data = pl.read_csv(training_csv)
data.describe()

In [None]:
# Order rows by search id (ascending); inside each search, booked rows come
# first, then clicked rows. Later cells derive per-search group sizes with
# value_counts().sort_index(), which only lines up with the rows when the
# frame is ordered by srch_id like this.
sort_keys = ['srch_id', 'booking_bool', 'click_bool']
data = data.sort(sort_keys, descending=[False, True, True])

In [None]:
# polars -> pandas, turning the literal string 'NULL' into a real missing value
data_pd = data.to_pandas().replace('NULL', np.nan)

# (srch_id, prop_id) pairs, captured before the dtype coercion below
ranking_pd = data_pd[['srch_id', 'prop_id']]

# Force every column still typed as object to numeric; cells that cannot be
# parsed become NaN (errors='coerce')
object_columns = data_pd.select_dtypes(include=['object']).columns
data_pd[object_columns] = data_pd[object_columns].apply(pd.to_numeric, errors='coerce')

# Features (everything except the search id) and target
X = data_pd.drop(columns=['srch_id'])
# NOTE(review): y holds property ids, i.e. identifiers rather than relevance
# grades -- confirm this is really the intended ranking target.
y = ranking_pd['prop_id']

# Hold out 20% of the *searches* (not rows), so all rows of one search end up
# on the same side of the split
srch_ids = data_pd['srch_id'].unique()
train_srch_ids, test_srch_ids = train_test_split(srch_ids, test_size=0.2, random_state=42)

# Row masks for the full frame
rows_in_train = data_pd['srch_id'].isin(train_srch_ids)
rows_in_test = data_pd['srch_id'].isin(test_srch_ids)
train_data = data_pd[rows_in_train]
test_data = data_pd[rows_in_test]

# Same split applied to the (srch_id, prop_id) frame
ranking_in_train = ranking_pd['srch_id'].isin(train_srch_ids)
ranking_in_test = ranking_pd['srch_id'].isin(test_srch_ids)
train_ranking = ranking_pd[ranking_in_train]
test_ranking = ranking_pd[ranking_in_test]

In [None]:


# Columns that are never fed to the model as features. booking_bool and
# click_bool define the label below, so leaving them in would leak the target.
# NOTE(review): gross_bookings_usd and position are presumably outcome columns
# that the test set does not contain -- confirm.
# prop_id deliberately stays a feature so the feature set matches what the
# test-set prediction cell passes to the model (it only drops srch_id).
NON_FEATURE_COLUMNS = ['srch_id', 'booking_bool', 'gross_bookings_usd', 'position', 'click_bool']


def relevance_grade(frame):
    """Return one integer relevance grade per row of ``frame``.

    A booked row gets 5, a row that was clicked but not booked gets 1, and
    every other row gets 0. The previous code used ``prop_id`` as the label,
    which made the ranker learn to order hotels by their identifier.

    NOTE(review): the 5/1/0 grading is an assumption about the target metric;
    confirm it against the task definition.
    """
    return np.select(
        [frame['booking_bool'] == 1, frame['click_bool'] == 1],
        [5, 1],
        default=0,
    )


def make_ranking_dmatrix(frame):
    """Build an XGBoost ranking DMatrix (features, labels, group sizes) from ``frame``.

    ``frame`` must be ordered by ``srch_id``: the group sizes are the per-search
    row counts in ascending ``srch_id`` order, and XGBoost assigns them to
    consecutive runs of rows.
    """
    return xg.DMatrix(
        frame.drop(columns=NON_FEATURE_COLUMNS),
        label=relevance_grade(frame),
        group=frame['srch_id'].value_counts().sort_index().values,
    )


# Create XGBoost DMatrix objects for training and testing
train_dmatrix = make_ranking_dmatrix(train_data)
test_dmatrix = make_ranking_dmatrix(test_data)

# MAKE SURE TO USE FULL DATA SET IN THE END
full_dmatrix = make_ranking_dmatrix(data_pd)

# Set XGBoost parameters
params = {
    'objective': 'rank:pairwise',  # pairwise ranking loss; rank:ndcg is the alternative objective
    'lambdarank_pair_method': 'topk',  # build pairs around the top-ranked documents instead of sampling ('mean')
    'lambdarank_num_pair_per_sample': 6,  # truncation level for 'topk'; set slightly higher than the intended k
    'eval_metric': 'ndcg',  # only reported when an `evals` list is passed to xg.train
    'learning_rate': 0.1,
    'max_depth': 10,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train the XGBoost ranking model on the training searches only
model = xg.train(params, train_dmatrix, num_boost_round=100)

In [None]:
# Plot the feature importances of the current `model`
xg.plot_importance(booster=model)

In [None]:
# Evaluation on the held-out searches.
# Run this before the full-data retraining cell: that cell rebinds `model`
# to a booster that has also seen these rows.

EVAL_K = 5  # NDCG cutoff

# Score the held-out rows. The scores go into a new frame instead of
# test_data: test_data is a slice of data_pd (assigning to it triggers
# SettingWithCopyWarning) and an extra 'pred' column there would be picked up
# as a feature if the DMatrix cell were re-run.
test_pred = model.predict(test_dmatrix)
test_scored = test_data[['srch_id', 'prop_id']].assign(
    pred=test_pred,
    # Ground-truth relevance: 5 = booked, 1 = clicked only, 0 = neither.
    # NOTE(review): grading is an assumption about the target metric -- confirm.
    relevance=np.select(
        [test_data['booking_bool'] == 1, test_data['click_bool'] == 1],
        [5, 1],
        default=0,
    ),
)

# Predicted ordering of the properties inside every search (best first)
results = test_scored.sort_values(['srch_id', 'pred'], ascending=[True, False])[['srch_id', 'prop_id']]


def _ndcg_for_search(search_rows, k=EVAL_K):
    """NDCG@k of one search: true relevance grades vs. predicted scores.

    Returns NaN for searches with fewer than two rows, so they are skipped by
    the mean below.
    """
    if len(search_rows) < 2:
        return np.nan
    return ndcg_score(
        [search_rows['relevance'].to_numpy()],
        [search_rows['pred'].to_numpy()],
        k=k,
    )


# One NDCG value per search; Series.mean() ignores the NaN entries
grouped = (
    test_scored.groupby('srch_id')[['relevance', 'pred']]
    .apply(_ndcg_for_search)
    .rename('ndcg')
    .reset_index()
)
print(f'mean_ndcg: {grouped["ndcg"].mean()}')

In [None]:
# Retrain on every search (train + held-out) with the same parameters and
# number of rounds. This rebinds `model`, replacing the booster that was
# fitted on the training split only.
model = xg.train(params, dtrain=full_dmatrix, num_boost_round=100)

In [None]:
# Save the model. The target directory is created first so the cell also works
# on a fresh checkout where 'models/' does not exist yet.
model_path = 'models/model.json'
Path(model_path).parent.mkdir(parents=True, exist_ok=True)
model.save_model(model_path)

# Load the model back into a fresh Booster, so the following cells use exactly
# what was written to disk
model = xg.Booster()
model.load_model(model_path)

In [None]:
# Read the engineered test set and apply the same cleaning as for the training
# data: polars -> pandas, 'NULL' strings -> NaN
test_set = (
    pl.read_csv('../data/preprocessed/engineered_test_set.csv')
    .to_pandas()
    .replace('NULL', np.nan)
)

# Columns that are still object-typed are forced to numeric (unparseable -> NaN)
object_columns = test_set.select_dtypes(include=['object']).columns
test_set[object_columns] = test_set[object_columns].apply(pd.to_numeric, errors='coerce')

In [None]:
# Feature frame for prediction: everything except the search id. 'pred' is
# dropped as well (if present) so that re-running this cell does not feed the
# previous run's predictions back in as an extra feature.
test_features = test_set.drop(columns=['srch_id', 'pred'], errors='ignore')
test_set_dmatrix = xg.DMatrix(
    test_features,
    group=test_set['srch_id'].value_counts().sort_index().values
)
test_set['pred'] = model.predict(test_set_dmatrix)

# same as earlier, without need for calculating the ndcg, so less steps:
# per search, list the properties from highest to lowest predicted score
submission = test_set.sort_values(['srch_id', 'pred'], ascending=[True, False])[['srch_id', 'prop_id']]

# Create the output directory on demand so the cell works on a fresh checkout
Path('submit').mkdir(parents=True, exist_ok=True)
submission.to_csv('submit/submission.csv', index=False)

In [None]:
# srch_ids = test_set['srch_id'].unique()

# # Create an empty DataFrame to store the predictions
# submission_df = pd.DataFrame(columns=['srch_id', 'prop_id'])

# #  Unique srch_ids in the test set
# srch_ids = test_set['srch_id'].unique()

# # Pre-allocate a list to collect results
# results = []

# # Iterate over each srch_id and make predictions
# for srch_id in tqdm(srch_ids):
#     # Get the data for the current srch_id
#     srch_data = test_set[test_set['srch_id'] == srch_id]

#     # Create DMatrix for the current srch_id
#     srch_dmatrix = xg.DMatrix(srch_data.drop(['srch_id'], axis=1))

#     # Make predictions for the current srch_id
#     srch_pred = model.predict(srch_dmatrix)

#     # Get the corresponding prop_ids for the current srch_id
#     srch_prop_ids = srch_data['prop_id'].values

#     # Sort the prop_ids based on the predicted scores
#     sorted_indices = np.argsort(srch_pred)[::-1]
#     sorted_prop_ids = srch_prop_ids[sorted_indices]

#     # Collect the results for the current srch_id
#     results.append(pd.DataFrame({'srch_id': srch_id, 'prop_id': sorted_prop_ids}))

# # Concatenate all results into a single DataFrame
# submission_df = pd.concat(results, ignore_index=True)


# # Check final submission DataFrame size
# print(f"Expected number of entries: {len(test_set)}")
# print(f"Actual number of entries: {len(submission_df)}")

# # Save the submission DataFrame to a CSV file
# submission_df.to_csv('submit/submission.csv', index=False)