In [1]:
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import ndcg_score

In [2]:
# 加载CSV数据
print('Reading data...')
start_time = time.time()
data = pd.read_csv('./data/filled_training_set2.csv')
print(f'Data read in {time.time() - start_time} s')

Reading data...
Data read in 16.250774383544922 s


In [3]:
# 将数据分割为训练集和验证集
# train, valid = train_test_split(data, test_size=0.2, random_state=2022)
train_size = int(len(data) * 0.8)
train = data[:train_size]
valid = data[train_size:]

# 构建XGBoost DMatrix
features = data.columns.tolist()
# features.remove('score')
# features.remove('srch_id')
# features.remove('prop_id')
removed_features = ['score', 'srch_id', 'prop_id', 'lgbm_score', 'orig_destination_distance', 'srch_children_count']
for f in removed_features:
    # print(f)
    features.remove(f)
print(features)
X_train = train[features]
y_train = train['score']
group_train = train.groupby('srch_id')['srch_id'].count().values
dtrain = xgb.DMatrix(X_train, y_train, group=group_train)
# dtrain = xgb.DMatrix(X_train, y_train, group=[len(y_train)//len(train['srch_id'].unique())] * len(train['srch_id'].unique()))

X_valid = valid[features]
y_valid = valid['score']
group_valid = valid.groupby('srch_id')['srch_id'].count().values
dvalid = xgb.DMatrix(X_valid, y_valid, group=group_valid)
# dvalid = xgb.DMatrix(X_valid, y_valid, group=[len(y_valid)//len(valid['srch_id'].unique())] * len(valid['srch_id'].unique()))
print(group_valid)

['site_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'random_bool', 'days', 'ump', 'price_diff', 'starrating_diff', 'count_window', 'comp_avg_rate', 'comp_avg_inv', 'comp_avg_rate_percent_diff', 'prop_starrating_mean', 'prop_starrating_median', 'prop_review_score_mean', 'prop_review_score_median', 'prop_location_score2_mean', 'prop_location_score2_median', 'prop_brand_bool_mean', 'prop_brand_bool_median', 'prop_log_historical_price_mean', 'prop_log_historical_price_median', 'visitor_hist_starrating_mean', 'visitor_hist_starrating_median', 'visitor_hist_adr_usd_mean', 'visitor_hist_adr_usd_median', 'c

In [4]:
# 设置排序任务的参数
params = {
    'objective': 'rank:ndcg',
    'eta': 0.01,
    'max_depth': 12,
    'gamma': 1.2,
    'min_child_weight': 0.2,
    'subsample': 0.5,
    'colsample_bytree': 0.5,
    'eval_metric': 'ndcg@5',  # 使用NDCG@5作为评估指标
    # 'seed': 2022,
    'tree_method': 'approx',
    'lambda': 1.2,
}
# 1600 0.67677 0.38598

# 训练模型
# bst = xgb.train(params, dtrain, num_boost_round=1600, evals=[(dtrain, 'train'), (dvalid, 'valid')], verbose_eval=20)


In [5]:
X = data[features]
y = data['score']
group_data= data.groupby('srch_id')['srch_id'].count().values
dm = xgb.DMatrix(X, y, group=group_data)

In [6]:
bst = xgb.train(params, dm, num_boost_round=2400, evals=[(dm, 'train')], verbose_eval=10)

[0]	train-ndcg@5:0.30629
[10]	train-ndcg@5:0.41664
[20]	train-ndcg@5:0.44156
[30]	train-ndcg@5:0.45735
[40]	train-ndcg@5:0.46756
[50]	train-ndcg@5:0.47428
[60]	train-ndcg@5:0.47994
[70]	train-ndcg@5:0.48359
[80]	train-ndcg@5:0.48910
[90]	train-ndcg@5:0.49298
[100]	train-ndcg@5:0.49684
[110]	train-ndcg@5:0.50063
[120]	train-ndcg@5:0.50381
[130]	train-ndcg@5:0.50710
[140]	train-ndcg@5:0.51022
[150]	train-ndcg@5:0.51293
[160]	train-ndcg@5:0.51543
[170]	train-ndcg@5:0.51764
[180]	train-ndcg@5:0.51933
[190]	train-ndcg@5:0.52209
[200]	train-ndcg@5:0.52399
[210]	train-ndcg@5:0.52609
[220]	train-ndcg@5:0.52816
[230]	train-ndcg@5:0.53029
[240]	train-ndcg@5:0.53253
[250]	train-ndcg@5:0.53423
[260]	train-ndcg@5:0.53592
[270]	train-ndcg@5:0.53799
[280]	train-ndcg@5:0.53962
[290]	train-ndcg@5:0.54130
[300]	train-ndcg@5:0.54305
[310]	train-ndcg@5:0.54454
[320]	train-ndcg@5:0.54613
[330]	train-ndcg@5:0.54749
[340]	train-ndcg@5:0.54879
[350]	train-ndcg@5:0.55022
[360]	train-ndcg@5:0.55173
[370]	train-

In [None]:
pred = bst.predict(dvalid)
print(pred)
print(len(pred), len(group_valid))
y_grouped = []
output_grouped = []
start = 0
for cnt in group_valid:
    end = start + cnt
    y_grouped.append(y_valid[start:end].tolist())
    output_grouped.append(pred[start:end])
    start = end

print(y_grouped[0])
print(output_grouped[0])
print(len(y_grouped), len(output_grouped))

In [None]:
# ndcg = ndcg_score(output_grouped, y_grouped, k=5)
# print(ndcg)
ndcg_results = []
for i in range(len(y_grouped)):
    y1 = y_grouped[i]
    y2 = output_grouped[i]
    ndcg = ndcg_score(y_score=[y2], y_true=[y1], k=5)
    ndcg_results.append(ndcg)
    if i % 2000 == 0:
        print(f'Group {i}, ndcg:{ndcg}.')
# print(ndcg_results)
print(np.mean(ndcg_results))

In [7]:
bst.dump_model('models/bst.txt')
print('saving model in json...')
bst.save_model('models/bst.json')

saving model in json...


In [6]:
test = pd.read_csv('./data/filled_test_set.csv')

In [3]:
# 从文本文件加载模型
# with open('models/dump.raw.txt', 'r') as fd:
#     model_text = fd.read()
# 
# bst = xgb.Booster(model_file=model_text)
# 从 txt 文件加载模型
loaded_model = xgb.Booster()
loaded_model.load_model('models/bst.json')


In [9]:
from pandas import DataFrame
# test = test.drop(columns=['score'])
features = test.columns.tolist()
print(features)
removed_features = ['score', 'srch_id', 'prop_id', 'orig_destination_distance', 'srch_children_count']
for f in removed_features:
    features.remove(f)
group_test= test.groupby('srch_id')['srch_id'].count().values

dtest = xgb.DMatrix(test[features], group=group_test)


preds = loaded_model.predict(dtest)
print(preds)

['srch_id', 'site_id', 'visitor_location_country_id', 'visitor_hist_starrating', 'visitor_hist_adr_usd', 'prop_country_id', 'prop_id', 'prop_starrating', 'prop_review_score', 'prop_brand_bool', 'prop_location_score1', 'prop_location_score2', 'prop_log_historical_price', 'price_usd', 'promotion_flag', 'srch_destination_id', 'srch_length_of_stay', 'srch_booking_window', 'srch_adults_count', 'srch_children_count', 'srch_room_count', 'srch_saturday_night_bool', 'srch_query_affinity_score', 'orig_destination_distance', 'random_bool', 'days', 'ump', 'price_diff', 'starrating_diff', 'count_window', 'comp_avg_rate', 'comp_avg_inv', 'comp_avg_rate_percent_diff', 'score', 'prop_starrating_mean', 'prop_starrating_median', 'prop_review_score_mean', 'prop_review_score_median', 'prop_location_score2_mean', 'prop_location_score2_median', 'prop_brand_bool_mean', 'prop_brand_bool_median', 'prop_log_historical_price_mean', 'prop_log_historical_price_median', 'visitor_hist_starrating_mean', 'visitor_hist

In [10]:
result = test[['srch_id', 'prop_id']]
result['pred'] = preds
sorted_df = result.groupby('srch_id').apply(lambda x: x.sort_values(by='pred', ascending=False))
sorted_df.reset_index(drop=True, inplace=True)
print(sorted_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['pred'] = preds


         srch_id  prop_id      pred
0              1    28181  0.649910
1              1    54937  0.451010
2              1    99484  0.444609
3              1    61934  0.111719
4              1    72090 -0.285386
...          ...      ...       ...
4959178   332787    99509 -0.383107
4959179   332787    94437 -0.648346
4959180   332787    32019 -0.689581
4959181   332787    35240 -0.846197
4959182   332787    29018 -0.944422

[4959183 rows x 3 columns]


  sorted_df = result.groupby('srch_id').apply(lambda x: x.sort_values(by='pred', ascending=False))


In [11]:
final_res = sorted_df[['srch_id', 'prop_id']]
final_res = final_res.astype(int)
print(final_res)
final_res.to_csv('./data/bst_res.csv', index=False)

         srch_id  prop_id
0              1    28181
1              1    54937
2              1    99484
3              1    61934
4              1    72090
...          ...      ...
4959178   332787    99509
4959179   332787    94437
4959180   332787    32019
4959181   332787    35240
4959182   332787    29018

[4959183 rows x 2 columns]
