In [None]:
from pathlib import Path

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import polars as pl
import xgboost as xg
# import dask.dataframe as dd
from sklearn.model_selection import train_test_split
from tqdm.notebook import tqdm

In [None]:

# Global matplotlib styling for LaTeX-rendered figures, configured one rc group at a time.
plt.rc("font", family="serif", size=18)
plt.rc("axes", titlesize=18, labelsize=14, grid=True)
plt.rc("xtick", labelsize=12)
plt.rc("ytick", labelsize=12)
plt.rc("legend", fontsize=12)
plt.rc("figure", figsize=(8, 6), dpi=100)
# Saved figures: 200 dpi PNG with a transparent background
plt.rc("savefig", dpi=200, format="png", transparent=True)
# Thin, light, dashed grid lines
plt.rc("grid", linewidth=0.5, linestyle="--", color="0.8")
plt.rc("image", cmap="Blues")
plt.rc("lines", linewidth=1.5, markersize=6)
# Text is rendered through LaTeX; mathtext uses the Computer Modern font set
plt.rc("text", usetex=True)
plt.rc("mathtext", fontset="cm")
plt.rc("pgf", preamble=r"\usepackage[utf8]{inputenc}\usepackage[T1]{fontenc}\usepackage{cmbright}")

In [None]:
# Read the engineered training set into a polars DataFrame and show it
data = pl.read_csv(source='../data/preprocessed/engineered_training_set.csv')
display(data)

In [None]:
# Rows per prop_country_id; `count` is reused by the partitioning cell further down
count = data['prop_country_id'].value_counts()

# Bar chart of the per-country row counts
fig, ax = plt.subplots(figsize=(12, 6))
ax.bar(count['prop_country_id'], count['count'])
ax.set_xlabel('prop_country_id')
ax.set_ylabel('count')
ax.set_title('Histogram of prop_country_id')

The following code is rather messy, but essentially all you need to do is choose the number of partitions; afterwards, the variable `partitions_list` will be a list of `np.array`s, each holding the `prop_country_id`s that belong to one partition. You can then train a model on each partition.

In [None]:
# Build `partitions_list`: a list of np.arrays of prop_country_id values, one array per
# partition. Country 219 always gets a partition of its own; the remaining countries are
# cut into (at most) `partitions` consecutive chunks of roughly equal row count.
partitions = 12
partitions_list = [np.array([219])]
countries = count['prop_country_id'].to_numpy()
indx = countries != 219
countries = countries[indx]
count_array = count['count'].to_numpy()


# Take a cumulative sum of the per-country counts and find the indices at which to cut.
# Some chunks can come out empty; those are dropped further down.
count_array = count_array[indx]
cum_sum = np.cumsum(count_array)
total = cum_sum[-1]
partition_size = total // partitions
for i in range(partitions):
    # Both edges use the same strict comparison, so chunk i ends exactly where chunk i + 1
    # starts. (Mixing `>=` and `>` put a country into two chunks whenever the cumulative
    # sum landed exactly on a boundary.)
    idx_min = np.argmax(cum_sum > partition_size * i)
    idx_max = np.argmax(cum_sum > partition_size * (i + 1))
    if i == partitions - 1:
        # The last chunk takes everything that is left
        partitions_list.append(countries[idx_min:])
    else:
        partitions_list.append(countries[idx_min:idx_max])

# Print the row count of every partition and remember which ones are non-empty
counts = 0
non_empty = []
for i, part in enumerate(partitions_list):
    part_rows = count.filter(count['prop_country_id'].is_in(part))
    part_total = sum(part_rows['count'])
    counts += part_total
    # Right-align the partition number so single- and double-digit rows line up
    print(f'Count for partion {i+1:>2}: {part_total}')
    non_empty.append(part_total != 0)

# Remove empty partitions
partitions_list = [part for part, keep in zip(partitions_list, non_empty) if keep]

# Every row must be accounted for exactly once across the partitions
assert counts == sum(count['count']), f'Expected {sum(count["count"])} but got {counts}'

In [None]:
def split_data_set(data, partitions_list):
    """Split a pandas DataFrame into one sub-frame per country partition.

    Args:
        data: DataFrame with a 'prop_country_id' column.
        partitions_list: Iterable of collections of prop_country_id values,
            one collection per partition.

    Returns:
        A list with, for every entry of `partitions_list` (in order), the rows of
        `data` whose 'prop_country_id' is contained in that entry.
    """
    country = data['prop_country_id']
    return [data.loc[country.isin(ids)] for ids in partitions_list]


In [None]:
# Order rows by search id; within a search, booked rows come first, then clicked rows
data = data.sort(
    by=['srch_id', 'booking_bool', 'click_bool'],
    descending=[False, True, True],
)

In [None]:
# Switch from polars to pandas for the scikit-learn / XGBoost steps
data_pd = data.to_pandas()
# Relevance label: a click contributes 1 and a booking contributes 5
data_pd['weight'] = data_pd['click_bool'] + 5 * data_pd['booking_bool']
# Turn literal 'NULL' strings into real missing values
data_pd = data_pd.replace(to_replace='NULL', value=np.nan)
ranking_pd = data_pd.loc[:, ['srch_id', 'prop_id']]

# Coerce every object-typed column to numeric; unparsable entries become NaN
object_columns = data_pd.select_dtypes(include=['object']).columns
data_pd[object_columns] = data_pd[object_columns].apply(
    lambda col: pd.to_numeric(col, errors='coerce')
)

# Feature / target frames
# NOTE(review): X and y are not referenced by any other cell visible in this notebook
X = data_pd.drop(columns=['srch_id'])
y = ranking_pd['prop_id']


# Hold out 20% of the searches: the split is done on srch_id so that all rows of
# one search end up on the same side
srch_ids = data_pd['srch_id'].unique()
train_srch_ids, test_srch_ids = train_test_split(srch_ids, test_size=0.2, random_state=42)

train_data = data_pd.loc[data_pd['srch_id'].isin(train_srch_ids)]
test_data = data_pd.loc[data_pd['srch_id'].isin(test_srch_ids)]


# Per-partition views of the train split, the held-out split and the full data
split_train, split_test, split_full = (
    split_data_set(frame, partitions_list)
    for frame in (train_data, test_data, data_pd)
)

In [None]:
def train_model(data, params, num_boost, drop_cols):
    """Train one XGBoost booster on a single data partition.

    Args:
        data: pandas DataFrame holding the features plus 'weight' and 'srch_id' columns.
        params: XGBoost parameter dict handed to `xg.train`.
        num_boost: Number of boosting rounds.
        drop_cols: Columns removed from `data` before it is used as the feature matrix.

    Returns:
        The booster returned by `xg.train`.
    """
    features = data.drop(columns=drop_cols)
    # 'weight' is the relevance label; 'srch_id' identifies the query each row belongs to
    train_dmatrix = xg.DMatrix(features, label=data['weight'], qid=data['srch_id'])
    booster = xg.train(params, train_dmatrix, num_boost_round=num_boost)
    return booster


def dcg_at_k(r, k, method=1):
    """Discounted cumulative gain of the first `k` relevance scores.

    Args:
        r: Sequence of relevance scores in ranked order (best-ranked item first).
        k: Number of leading items to take into account.
        method: 1 leaves the first two positions undiscounted
            (r[0] + r[1]/log2(2) + r[2]/log2(3) + ...); any other value discounts
            position i (1-based) by log2(i + 1).

    Returns:
        The DCG as a float, or 0. when there are no scores to evaluate.
    """
    # np.asfarray was removed in NumPy 2.0; np.asarray with dtype=float is the replacement
    r = np.asarray(r, dtype=float)[:k]
    if r.size:
        if method == 1:
            return r[0] + np.sum(r[1:] / np.log2(np.arange(2, r.size + 1)))
        else:
            return np.sum(r / np.log2(np.arange(2, r.size + 2)))
    return 0.


def ndcg_at_k(r, k, method=1):
    """Normalised DCG at `k`: DCG of `r` divided by the DCG of `r` sorted descending.

    Args:
        r: Sequence of relevance scores in ranked order.
        k: Cut-off passed on to `dcg_at_k`.
        method: Discounting variant passed on to `dcg_at_k`.

    Returns:
        The NDCG, or 0. when the ideal DCG is zero.
    """
    ideal_dcg = dcg_at_k(sorted(r, reverse=True), k, method)
    return dcg_at_k(r, k, method) / ideal_dcg if ideal_dcg else 0.


def calculate_grades(data):
    """Add a 'grade' column: 5 for booked rows, 1 for clicked-only rows, 0 otherwise.

    The column is written onto `data` itself, which is also returned.
    """
    booked = data['booking_bool'] == 1
    clicked = data['click_bool'] == 1
    # Start from the click grade, then let a booking override it
    grade = np.where(clicked, 1, 0)
    grade = np.where(booked, 5, grade)
    data['grade'] = grade
    return data


def predict_and_evaluate(model, dmatrix, t_data, ndcg=True):
    """Score `t_data` with `model` and rank the rows within each search.

    Args:
        model: Object whose `predict(dmatrix)` yields one score per row of `t_data`.
        dmatrix: Input passed to `model.predict`.
        t_data: pandas DataFrame with 'srch_id' and 'prop_id' columns (plus
            'booking_bool' / 'click_bool' when `ndcg` is true). It is not modified.
        ndcg: When true, also compute, print and return the mean NDCG@5 over searches.

    Returns:
        A tuple of (DataFrame with columns 'srch_id', 'pred', 'prop_id' ordered by
        search id and descending score, mean NDCG@5 or None when `ndcg` is false).
    """
    scored = t_data.copy()
    scored['pred'] = model.predict(dmatrix)

    # Grades (5 = booked, 1 = clicked, 0 = neither) are only needed for the metric
    if ndcg:
        scored = calculate_grades(scored)

    # Within every search, the highest predicted score comes first
    ordered_results = scored.sort_values(by=['srch_id', 'pred'], ascending=[True, False])

    mean_ndcg = None
    if ndcg:
        per_search = ordered_results.groupby('srch_id')['grade'].apply(list).reset_index()
        ndcg_scores = per_search['grade'].apply(lambda grades: ndcg_at_k(grades, 5))
        mean_ndcg = ndcg_scores.mean()
        print(f'Mean NDCG: {mean_ndcg}')

    return ordered_results[['srch_id', 'pred', 'prop_id']], mean_ndcg


def eval_models_partions(models, split_test, drop_cols, ndcg=True):
    """Run every per-partition model on its matching data partition.

    Args:
        models: Trained models, in the same order as `split_test`.
        split_test: List of pandas DataFrames, one per partition.
        drop_cols: Columns removed from each partition before building the DMatrix.
        ndcg: Forwarded to `predict_and_evaluate`; when true the NDCG is collected too.

    Returns:
        A tuple of (all ranked predictions concatenated into one DataFrame, list with
        one `mean NDCG * number of rows` entry per partition; empty when `ndcg` is false).
    """
    ranked_frames = []
    mean_ndcgs = []
    for model, part in zip(models, split_test):
        # Rows per search, ordered by srch_id
        group_sizes = part['srch_id'].value_counts().sort_index().values
        dmat = xg.DMatrix(part.drop(drop_cols, axis=1), group=group_sizes)
        grouped, mean = predict_and_evaluate(model, dmat, part, ndcg=ndcg)

        ranked_frames.append(grouped)
        if ndcg:
            # NOTE(review): the per-search mean is weighted by row count, not by the
            # number of searches; the caller divides by the total row count. Confirm
            # this is the intended aggregate.
            mean_ndcgs.append(mean * len(part))

    # Concatenate once at the end instead of growing a DataFrame inside the loop
    df = pd.concat(ranked_frames) if ranked_frames else pd.DataFrame()
    return df, mean_ndcgs

# XGBoost parameter sets

# Pairwise variant; the original author notes that rank:ndcg could be used instead
# but found it buggy
config_ = dict(
    objective='rank:pairwise',
    lambdarank_pair_method='topk',  # instead of looking at the mean, look at the highest k
    lambdarank_num_pair_per_sample=6,  # set slightly higher than the intended k
    eval_metric='ndcg',
    eta=0.13963013806537555,
    max_depth=9,
    subsample=0.7038375178678972,
    colsample_bytree=0.7015452447331039,
    seed=42,
)

# NDCG objective, evaluated at a cut-off of 5
config = dict(
    objective='rank:ndcg',
    eval_metric='ndcg@5',
    eta=0.13963013806537555,
    max_depth=9,
    subsample=0.7038375178678972,
    colsample_bytree=0.7015452447331039,
    seed=42,
)

def train_models(split_train, params, num_boost, drop_cols):
    """Train one model per partition, showing a tqdm progress bar.

    Args:
        split_train: Iterable of per-partition training DataFrames.
        params: XGBoost parameter dict shared by all models.
        num_boost: Number of boosting rounds per model.
        drop_cols: Columns dropped from each partition before training.

    Returns:
        List of trained models, in the order of `split_train`.
    """
    return [
        train_model(part, params, num_boost, drop_cols)
        for part in tqdm(split_train)
    ]

In [None]:
# Parameter set actually used for training (replaces the `config` defined earlier)
config = {
    'objective': 'rank:ndcg',
    'eval_metric': 'ndcg',
    'learning_rate': 0.014,
    'ndcg_exp_gain': False,
    'max_depth': 6,  # shallow trees to limit overfitting
    'subsample': 0.92,  # row subsampling to limit overfitting
    'colsample_bytree': 0.78,
    'seed': 42,
}

# Identifiers, targets and the label itself must not be fed to the model as features
drop_cols = [
    'srch_id',
    'prop_id',
    'booking_bool',
    'gross_bookings_usd',
    'position',
    'click_bool',
    'weight',
]

# One model per partition, 120 boosting rounds each, on the training split
models = train_models(split_train, config, num_boost=120, drop_cols=drop_cols)


In [None]:
# Same non-feature columns as used for training
drop_cols = [
    'srch_id',
    'prop_id',
    'booking_bool',
    'gross_bookings_usd',
    'position',
    'click_bool',
    'weight',
]
# Evaluate every partition model on its held-out partition
grouped, means = eval_models_partions(models, split_test, drop_cols=drop_cols)

# `means` holds per-partition mean NDCG times row count; divide by the total held-out rows
print(f'Total Mean NDCG: {sum(means)/len(test_data)}')

In [None]:
# Retrain on all labelled data (train + held-out), one model per partition
models_full = train_models(split_full, config, num_boost=120, drop_cols=drop_cols)

In [None]:
# Save every per-partition model as models/model_<i>.json.
# The target directory is created first so the cell also works on a fresh checkout.
Path('models').mkdir(parents=True, exist_ok=True)
for i, model in enumerate(models_full):
    model.save_model(f'models/model_{i}.json')

# Load the models back from disk, one fresh Booster per partition.
# Plain loop rather than a list comprehension: the calls are made for their side effect
# only, and a comprehension would display a list of None as the cell output.
models_full = [xg.Booster() for _ in range(len(partitions_list))]
for i, booster in enumerate(models_full):
    booster.load_model(f'models/model_{i}.json')


In [None]:
# Load the engineered test set, move it to pandas and turn 'NULL' strings into NaN
test_set = (
    pl.read_csv('../data/preprocessed/engineered_test_set.csv')
    .to_pandas()
    .replace('NULL', np.nan)
)

# Coerce object-typed columns to numeric; unparsable entries become NaN
object_columns = test_set.select_dtypes(include=['object']).columns
test_set[object_columns] = test_set[object_columns].apply(
    lambda col: pd.to_numeric(col, errors='coerce')
)

# Same country partitions as used for training
split_eval = split_data_set(test_set, partitions_list)

# Every test row must fall into exactly one partition
assert sum(len(x) for x in split_eval) == len(test_set)

In [None]:
# Quick look at the first five rows of the cleaned test set
test_set.head(n=5)

In [None]:
# The test set only has the two identifier columns to drop
drop_cols = ['srch_id', 'prop_id']
# Rank the test set with the fully trained models; NDCG is skipped (ndcg=False)
grouped, mean = eval_models_partions(models_full, split_eval, drop_cols, ndcg=False)

In [None]:
# Keep only the two submission columns, in ranked order
submission = grouped.loc[:, ['srch_id', 'prop_id']]

# No test row may be lost or duplicated on the way
assert len(submission) == len(test_set)

submission.head()

In [None]:
# Write the ranking to disk without the pandas index column
submission.to_csv(path_or_buf='submit/submission.csv', index=False)

# Old code for the single-model approach


test_set_dmatrix = xg.DMatrix(test_set.drop(['srch_id'], axis=1), group=test_set['srch_id'].value_counts().sort_index().values)
test_set['pred'] = model.predict(test_set_dmatrix)

# same as earlier, but without calculating the NDCG, so fewer steps
submission = test_set.sort_values(['srch_id', 'pred'], ascending=[True, False])[['srch_id', 'prop_id']]
submission.to_csv('submit/submission.csv', index=False)

test_set = pl.read_csv('../data/preprocessed/engineered_test_set.csv')
shuffled_test_set = test_set.sort(['srch_id', 'price_per_person'], descending=[False, True])[['srch_id', 'prop_id']]


shuffled_test_set.write_csv('submit/submission.csv')