In [1]:
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import ndcg_score

In [2]:
# 加载CSV数据
print('Reading data...')
start_time = time.time()
data = pd.read_csv('./data/filled_training_set.csv')
print(f'Data read in {time.time() - start_time} s')

Reading data...
Data read in 8.052088260650635 s


In [3]:
# 将数据分割为训练集和验证集
# train, valid = train_test_split(data, test_size=0.2, random_state=2022)
train_size = int(len(data) * 0.8)
train = data[:train_size]
valid = data[train_size:]

# 构建XGBoost DMatrix
features = data.columns.tolist()
features.remove('score')
print(features)
X_train = train[features]
y_train = train['score']
group_train = train.groupby('srch_id')['score'].count().values
dtrain = xgb.DMatrix(X_train, y_train, group=group_train)
# dtrain = xgb.DMatrix(X_train, y_train, group=[len(y_train)//len(train['srch_id'].unique())] * len(train['srch_id'].unique()))

X_valid = valid[features]
y_valid = valid['score']
group_valid = valid.groupby('srch_id')['score'].count().values
dvalid = xgb.DMatrix(X_valid, y_valid, group=group_valid)
# dvalid = xgb.DMatrix(X_valid, y_valid, group=[len(y_valid)//len(valid['srch_id'].unique())] * len(valid['srch_id'].unique()))
print(group_valid)
# print(y_train)


['srch_id', 'site_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'random_bool', 'year', 'month', 'comp_avg_rate', 'comp_avg_inv', 'comp_avg_rate_percent_diff']
[27 13 32 ... 24 28  6]


In [5]:
# 设置排序任务的参数
params = {
    'objective': 'rank:ndcg',
    'eta': 0.01,
    'max_depth': 10,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'eval_metric': 'ndcg@5',  # 使用NDCG@5作为评估指标
    # 'seed': 2022,
    'tree_method': 'hist',
    # 'lambda': 1.2,
}
# from sklearn.metrics import make_scorer, ndcg_score


# 将函数转换为scorer
# ndcg_scorer = make_scorer(ndcg_score, k=5)

# params = {
#     'objective': 'rank:ndcg',
#     'tree_method': 'hist',
#     'seed': 2022,
# }

# param_grid = {
#     'eta': [0.01, 0.03, 0.1, 0.3],
#     'max_depth': [4, 6, 8, 10],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.5, 0.7, 0.9],
#     'colsample_bytree': [0.5, 0.7, 0.9],
#     'gamma': [0, 0.1, 0.2, 0.3],
# }
# xgb_model = xgb.XGBRegressor(**params)
# grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring=ndcg_scorer, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)

# 输出最佳参数
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best NDCG@5 score: {grid_search.best_score_}")

# 训练模型
bst = xgb.train(params, dtrain, num_boost_round=2000, evals=[(dtrain, 'train'), (dvalid, 'valid')], verbose_eval=50)


[0]	train-ndcg@5:0.30636	valid-ndcg@5:0.29565
[10]	train-ndcg@5:0.39291	valid-ndcg@5:0.34414
[20]	train-ndcg@5:0.40872	valid-ndcg@5:0.35067
[30]	train-ndcg@5:0.41809	valid-ndcg@5:0.35630
[40]	train-ndcg@5:0.42263	valid-ndcg@5:0.35676
[50]	train-ndcg@5:0.42870	valid-ndcg@5:0.35889
[60]	train-ndcg@5:0.43195	valid-ndcg@5:0.35933
[70]	train-ndcg@5:0.43394	valid-ndcg@5:0.36067
[80]	train-ndcg@5:0.43757	valid-ndcg@5:0.36133
[90]	train-ndcg@5:0.44059	valid-ndcg@5:0.36187
[100]	train-ndcg@5:0.44320	valid-ndcg@5:0.36294
[110]	train-ndcg@5:0.44677	valid-ndcg@5:0.36448
[120]	train-ndcg@5:0.44944	valid-ndcg@5:0.36520
[130]	train-ndcg@5:0.45132	valid-ndcg@5:0.36559
[140]	train-ndcg@5:0.45336	valid-ndcg@5:0.36555
[150]	train-ndcg@5:0.45478	valid-ndcg@5:0.36606
[160]	train-ndcg@5:0.45625	valid-ndcg@5:0.36626
[170]	train-ndcg@5:0.45754	valid-ndcg@5:0.36617
[180]	train-ndcg@5:0.45937	valid-ndcg@5:0.36633
[190]	train-ndcg@5:0.46083	valid-ndcg@5:0.36691
[200]	train-ndcg@5:0.46186	valid-ndcg@5:0.36654
[21

In [None]:

pred = bst.predict(dvalid)
print(pred)
print(group_valid)
print(len(pred), len(group_valid))

print(len(pred))
print(len(y_valid))
y_grouped = []
output_grouped = []
start = 0
for cnt in group_valid:
    end = start + cnt
    y_grouped.append(y_valid[start:end])
    output_grouped.append(pred[start:end])
    start = end

print(y_grouped)
print(output_grouped)
ndcg_results = []
for i in range(len(group_valid)):
    ndcg = ndcg_score([group_valid[i]], [output_grouped[i]], k=5)
    ndcg_results.append(ndcg)
    if i % 10000 == 0:
        print(f'Group {i}, ndcg:{ndcg}.')
# print(ndcg_results)
print(np.mean(ndcg_results))

bst.dump_model('dump.raw.txt')

# # 在测试集上做预测
# test_data = pd.read_csv('your_test_data.csv')
# X_test = test_data[features]
# group_test = [len(X_test)//len(test_data['srch_id'].unique())] * len(test_data['srch_id'].unique())
# dtest = xgb.DMatrix(X_test, group=group_test)
# preds = bst.predict(dtest)
#
# # 输出预测结果
# for srch_id, prop_ids, scores in zip(test_data['srch_id'], test_data['prop_id'], preds):
#     print(f'Query {srch_id} predictions:')
#     for prop_id, score in zip(prop_ids, scores):
#         print(f'  Product {prop_id}: {score}')