In [1]:
import xgboost as xg
import polars as pl
import numpy as np
from sklearn.model_selection import train_test_split
import pandas as pd
from tqdm.notebook import tqdm

In [2]:
def test_rankings(pred, order):
    for srch in pred.srch_id.unique():
        srch_pred = pred[pred.srch_id == srch]
        srch_order = order[order.srch_id == srch]
        assert len(srch_pred) == len(srch_order)
        assert (srch_pred.prop_id == srch_order.prop_id).all()

In [3]:
data = pl.read_csv('../data/preprocessed/engineered_training_set.csv')
for col_name in data.columns:
    if data[col_name].dtype == pl.Utf8:
        data = data.with_columns(
            data[col_name]
            .str.replace("NULL", "NaN") 
            .str.replace("N/A", "NaN")   
            .cast(pl.Float64)          
            .alias(col_name)
        )
data.describe()

statistic,srch_id,date_time,site_id,visitor_location_country_id,visitor_hist_starrating,visitor_hist_adr_usd,prop_country_id,prop_id,prop_brand_bool,position,srch_destination_id,srch_booking_window,srch_saturday_night_bool,orig_destination_distance,random_bool,click_bool,gross_bookings_usd,booking_bool,has_hist_starrating,has_hist_adr_usd,score1d2,normalized_prop_starrating,normalized_score1d2,normalized_prop_review_score,hotel_quality,normalized_hotel_quality,price_per_person,avg_price_per_day,ump,price_diff,starrating_diff,total_price,score2ma,promotion_count,promotion_any
str,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64,f64
"""count""",4953228.0,0.0,4953228.0,4953228.0,251792.0,252914.0,4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,4953228.0,3348388.0,4953228.0,4953228.0,138275.0,4953228.0,4953228.0,4953228.0,4953228.0,4953224.0,4953224.0,4953224.0,4953224.0,4953224.0,4953228.0,4953228.0,4953228.0,252914.0,251792.0,4953228.0,0.0,4953228.0,4953228.0
"""null_count""",0.0,4953228.0,0.0,0.0,4701436.0,4700314.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1604840.0,0.0,0.0,4814953.0,0.0,0.0,0.0,0.0,4.0,4.0,4.0,4.0,4.0,0.0,0.0,0.0,4700314.0,4701436.0,0.0,4953228.0,0.0,0.0
"""mean""",166367.108221,,9.953445,175.369853,3.374219,176.010139,173.999283,70077.122916,0.634832,16.857549,14042.200748,37.473272,0.502294,1300.442127,0.295835,0.044751,385.632155,0.027916,1.0,1.0,0.0,1.3289e-09,0.0,5.8107e-09,2.1778e-09,0.57336,85.293109,92.34218,-0.981179,23.608127,0.128882,170.66645,,6.277791,0.898359
"""std""",96110.062236,,7.646976,65.904102,0.692507,107.256968,68.334449,40609.937868,0.481477,10.426479,8111.447094,51.993395,,2023.15977,,,804.686432,,0.0,0.0,0.0,0.970969,0.0,0.979434,0.484224,0.088648,82.613894,82.644123,107.787853,139.708208,1.090254,152.173464,,5.896331,0.302176
"""min""",1.0,,1.0,1.0,1.41,0.0,1.0,1.0,0.0,1.0,2.0,0.0,0.0,0.01,0.0,0.0,0.0,0.0,1.0,1.0,0.0,-5.833333,0.0,-5.43742,-3.131872,0.0,0.0,0.0,-1998.16,-1903.93,-3.5,0.0,,0.0,0.0
"""25%""",82936.0,,5.0,100.0,2.92,109.81,100.0,35010.0,0.0,8.0,7101.0,4.0,,139.76,,,123.99,,1.0,1.0,0.0,-0.645497,0.0,-0.466997,-0.294509,0.5194433,40.0,40.25,-24.529985,-31.56,-0.52,89.0,,2.0,1.0
"""50%""",166503.0,,5.0,219.0,3.45,152.16,219.0,69630.0,1.0,16.0,13541.0,17.0,,386.46,,,218.36,,1.0,1.0,0.0,0.040506,0.0,0.222286,0.057451,0.583878,63.04,71.486667,17.834287,24.21,0.0,129.0,,5.0,1.0
"""75%""",249721.0,,14.0,219.0,3.93,213.41,219.0,105165.0,1.0,26.0,21084.0,48.0,,1499.46,,,429.61,,1.0,1.0,0.0,0.6561673,0.0,0.662652,0.345082,0.6365348,103.0,118.63,50.295562,82.16,0.74,199.64,,9.0,1.0
"""max""",332785.0,,34.0,231.0,5.0,1958.7,230.0,140821.0,1.0,40.0,28416.0,492.0,1.0,11666.64,1.0,1.0,159292.38,1.0,1.0,1.0,0.0,5.833334,0.0,5.294649,2.3304434,1.0,9150.0,1999.0,497.681251,1903.7,5.0,9150.0,,35.0,1.0


In [4]:
data = data.sort(['srch_id', 'booking_bool', 'click_bool'], descending=[False, True, True])

In [5]:
print(data.shape)

(4953228, 35)


> CHANGE THE DATA TO ENGINEERED DATA, EVERYTHING SHOULD WORK

In [38]:
# Convert polars DataFrames to pandas DataFrames
data_pd = data.to_pandas()
# replace all NULL values with np.nan
# data_pd = data_pd.replace('NULL', np.nan)
ranking_pd = data_pd[['srch_id', 'prop_id']]

# Convert object columns to appropriate data types
object_columns = data_pd.select_dtypes(include=['object']).columns
data_pd[object_columns] = data_pd[object_columns].apply(pd.to_numeric, errors='coerce')

# Split the data into features (X) and target (y)
X = data_pd.drop(['srch_id', 'prop_id', 'date_time', 'orig_destination_distance', 'score1d2', 'normalized_prop_starrating', 'normalized_score1d2', 'normalized_prop_review_score', 'hotel_quality', 'score2ma'], axis=1)
y = ranking_pd['prop_id']


# Split the data into training and testing sets based on srch_id
srch_ids = data_pd['srch_id'].unique()
train_srch_ids, test_srch_ids = train_test_split(srch_ids, test_size=0.2, random_state=42)

# Create training and testing DataFrames
columns_to_drop = [
    'date_time', 'orig_destination_distance', 'score1d2', 'normalized_prop_starrating', 'normalized_score1d2', 
    'normalized_prop_review_score', 'hotel_quality', 'score2ma'
]
train_data = data_pd[data_pd['srch_id'].isin(train_srch_ids)].drop(columns_to_drop, axis=1)
test_data = data_pd[data_pd['srch_id'].isin(test_srch_ids)].drop(columns_to_drop, axis=1)

# Create training and testing ranking DataFrames
train_ranking = ranking_pd[ranking_pd['srch_id'].isin(train_srch_ids)]
test_ranking = ranking_pd[ranking_pd['srch_id'].isin(test_srch_ids)]


In [39]:


# Create XGBoost DMatrix objects for training and testing
train_dmatrix = xg.DMatrix(
    train_data.drop(['srch_id', 'booking_bool', 'gross_bookings_usd', 'position', 'click_bool'], axis=1),
    label=train_data['prop_id'],
    group=train_data['srch_id'].value_counts().sort_index().values
)
test_dmatrix = xg.DMatrix(
    test_data.drop(['srch_id', 'booking_bool', 'gross_bookings_usd', 'position', 'click_bool'], axis=1),
    label=test_data['prop_id'],
    group=test_data['srch_id'].value_counts().sort_index().values
)

# MAKE SURE TO USE FULL DATA SET IN THE END
full_dmatrix = xg.DMatrix(
    X.drop(['booking_bool', 'gross_bookings_usd', 'position', 'click_bool'], axis=1),
    label=y,
    group=data_pd['srch_id'].value_counts().sort_index().values
)

# Set XGBoost parameters
params = {
    'objective': 'rank:pairwise',
    'eval_metric': 'ndcg',
    'learning_rate': 0.1,
    'max_depth': 7,
    'subsample': 0.8,
    'colsample_bytree': 0.8,
    'seed': 42
}

# Train the XGBoost ranking model
model = xg.train(params, train_dmatrix, num_boost_round=100)



In [40]:
from sklearn.metrics import ndcg_score
# Make predictions on the test set
test_pred = model.predict(test_dmatrix)

# Evaluate the model
ndcg_scores = []
for srch_id in test_srch_ids:
    # Get the data for the current srch_id
    srch_data = test_data[test_data['srch_id'] == srch_id]
    
    # Create DMatrix for the current srch_id
    srch_dmatrix = xg.DMatrix(srch_data.drop(['srch_id', 'booking_bool', 'gross_bookings_usd', 'position', 'click_bool'], axis=1))
    
    # Make predictions for the current srch_id
    srch_pred = model.predict(srch_dmatrix)
    
    # Get the corresponding prop_ids for the current srch_id
    srch_prop_ids = srch_data['prop_id']
    
    # Sort the prop_ids based on the predicted scores
    sorted_prop_ids = srch_prop_ids.iloc[np.argsort(srch_pred)[::-1]]
    
    # Get the ground truth rankings for the current srch_id
    test_ranking_query = test_ranking[test_ranking['srch_id'] == srch_id]['prop_id']
    
    # Calculate NDCG score for the current srch_id
    if len(test_ranking_query) > 1:
        ndcg = ndcg_score([test_ranking_query], [sorted_prop_ids], k=5)
    else:
        print("Number of documents in test_ranking_query:", len(test_ranking_query))
        print("Number of documents in sorted_prop_ids:", len(sorted_prop_ids))
        ndcg = 1  # means only 1 result
    ndcg_scores.append(ndcg)

# Calculate the mean NDCG score
mean_ndcg = sum(ndcg_scores) / len(ndcg_scores)
print(f"Mean NDCG: {mean_ndcg:.4f}")

Number of documents in test_ranking_query: 1
Number of documents in sorted_prop_ids: 1
Number of documents in test_ranking_query: 1
Number of documents in sorted_prop_ids: 1
Mean NDCG: 0.4207


In [17]:
model = xg.train(params, X, num_boost_round=100)

TypeError: ('invalid cache item: DataFrame', [         visitor_hist_starrating  visitor_hist_adr_usd  prop_country_id  \
0                            NaN                   NaN              219   
1                            NaN                   NaN              219   
2                            NaN                   NaN              219   
3                            NaN                   NaN              219   
4                            NaN                   NaN              219   
...                          ...                   ...              ...   
4953223                      NaN                   NaN              219   
4953224                      NaN                   NaN              219   
4953225                      NaN                   NaN              219   
4953226                      NaN                   NaN              219   
4953227                      NaN                   NaN              219   

         prop_brand_bool  position  srch_destination_id  \
0                      1        13                23246   
1                      1        21                23246   
2                      1        34                23246   
3                      1         4                23246   
4                      1         7                23246   
...                  ...       ...                  ...   
4953223                1         7                16974   
4953224                1         2                16974   
4953225                1         3                16974   
4953226                1         4                16974   
4953227                1         6                16974   

         srch_saturday_night_bool  random_bool  click_bool  \
0                            True         True        True   
1                            True         True       False   
2                            True         True       False   
3                            True         True       False   
4                            True         True       False   
...                           ...          ...         ...   
4953223                     False        False       False   
4953224                     False        False       False   
4953225                     False        False       False   
4953226                     False        False       False   
4953227                     False        False       False   

         gross_bookings_usd  ...  has_hist_adr_usd  normalized_hotel_quality  \
0                    114.29  ...                 1                  0.472710   
1                       NaN  ...                 1                  0.618623   
2                       NaN  ...                 1                  0.522208   
3                       NaN  ...                 1                  0.640791   
4                       NaN  ...                 1                  0.690289   
...                     ...  ...               ...                       ...   
4953223                 NaN  ...                 1                  0.481979   
4953224                 NaN  ...                 1                  0.599469   
4953225                 NaN  ...                 1                  0.599469   
4953226                 NaN  ...                 1                  0.579887   
4953227                 NaN  ...                 1                  0.540724   

         price_per_person  avg_price_per_day         ump  price_diff  \
0               25.222500             100.89  -16.115058         NaN   
1               44.950000             179.80  -42.797387         NaN   
2              150.692500             602.77 -522.129581         NaN   
3               35.895000             143.58   -5.200488         NaN   
4               48.830000             195.32  -14.047758         NaN   
...                   ...                ...         ...         ...   
4953223         36.333333             109.00 -108.000000         NaN   
4953224         39.333333             118.00 -117.000000         NaN   
4953225         29.666667              89.00  -88.000000         NaN   
4953226         33.000000              99.00  -98.000000         NaN   
4953227         20.333333              61.00  -60.000000         NaN   

         starrating_diff  total_price  promotion_count  promotion_any  
0                    NaN       100.89                1              1  
1                    NaN       179.80                1              1  
2                    NaN       602.77                1              1  
3                    NaN       143.58                1              1  
4                    NaN       195.32                1              1  
...                  ...          ...              ...            ...  
4953223              NaN       109.00                0              0  
4953224              NaN       118.00                0              0  
4953225              NaN        89.00                0              0  
4953226              NaN        99.00                0              0  
4953227              NaN        61.00                0              0  

[4953228 rows x 22 columns]])

In [3]:
# save the model

model.save_model('model.json')
# load the model
model = xg.Booster()
model.load_model('model.json')

In [4]:

test_set = pl.read_csv('../data/raw/test_set_VU_DM.csv')
test_set = test_set.to_pandas()
test_set = test_set.replace('NULL', np.nan)

object_columns = test_set.select_dtypes(include=['object']).columns
test_set[object_columns] = test_set[object_columns].apply(pd.to_numeric, errors='coerce')

In [14]:
srch_ids = test_set['srch_id'].unique()

# Create an empty DataFrame to store the predictions
submission_df = pd.DataFrame(columns=['srch_id', 'prop_id'])

#  Unique srch_ids in the test set
srch_ids = test_set['srch_id'].unique()

# Pre-allocate a list to collect results
results = []

# Iterate over each srch_id and make predictions
for srch_id in tqdm(srch_ids):
    # Get the data for the current srch_id
    srch_data = test_set[test_set['srch_id'] == srch_id]

    # Create DMatrix for the current srch_id
    srch_dmatrix = xg.DMatrix(srch_data.drop(['srch_id'], axis=1))

    # Make predictions for the current srch_id
    srch_pred = model.predict(srch_dmatrix)

    # Get the corresponding prop_ids for the current srch_id
    srch_prop_ids = srch_data['prop_id'].values

    # Sort the prop_ids based on the predicted scores
    sorted_indices = np.argsort(srch_pred)[::-1]
    sorted_prop_ids = srch_prop_ids[sorted_indices]

    # Collect the results for the current srch_id
    results.append(pd.DataFrame({'srch_id': srch_id, 'prop_id': sorted_prop_ids}))

# Concatenate all results into a single DataFrame
submission_df = pd.concat(results, ignore_index=True)


# Check final submission DataFrame size
print(f"Expected number of entries: {len(test_set)}")
print(f"Actual number of entries: {len(submission_df)}")

# Save the submission DataFrame to a CSV file
submission_df.to_csv('submit/submission.csv', index=False)

  0%|          | 0/199549 [00:00<?, ?it/s]

NameError: name 'test_data' is not defined