In [None]:
import time

import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.metrics import ndcg_score

In [None]:
# 加载CSV数据
print('Reading data...')
start_time = time.time()
data = pd.read_csv('./data/filled_training_set1.csv')
print(f'Data read in {time.time() - start_time} s')

In [None]:
# 将数据分割为训练集和验证集
# train, valid = train_test_split(data, test_size=0.2, random_state=2022)
train_size = int(len(data) * 0.8)
train = data[:train_size]
valid = data[train_size:]

# 构建XGBoost DMatrix
features = data.columns.tolist()
features.remove('score')
print(features)
X_train = train[features]
y_train = train['score']
group_train = train.groupby('srch_id')['srch_id'].count().values
dtrain = xgb.DMatrix(X_train, y_train, group=group_train)
# dtrain = xgb.DMatrix(X_train, y_train, group=[len(y_train)//len(train['srch_id'].unique())] * len(train['srch_id'].unique()))

X_valid = valid[features]
y_valid = valid['score']
group_valid = valid.groupby('srch_id')['srch_id'].count().values
dvalid = xgb.DMatrix(X_valid, y_valid, group=group_valid)
# dvalid = xgb.DMatrix(X_valid, y_valid, group=[len(y_valid)//len(valid['srch_id'].unique())] * len(valid['srch_id'].unique()))
print(group_valid)

In [None]:
# 设置排序任务的参数
params = {
    'objective': 'rank:ndcg',
    'eta': 0.02,
    'max_depth': 8,
    'gamma': 1.0,
    'min_child_weight': 0.1,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'eval_metric': 'ndcg@5',  # 使用NDCG@5作为评估指标
    # 'seed': 2022,
    'tree_method': 'hist',
    # 'lambda': 1.2,
}
# from sklearn.metrics import make_scorer, ndcg_score


# 将函数转换为scorer
# ndcg_scorer = make_scorer(ndcg_score, k=5)

# params = {
#     'objective': 'rank:ndcg',
#     'tree_method': 'hist',
#     'seed': 2022,
# }

# param_grid = {
#     'eta': [0.01, 0.03, 0.1, 0.3],
#     'max_depth': [4, 6, 8, 10],
#     'min_child_weight': [1, 3, 5],
#     'subsample': [0.5, 0.7, 0.9],
#     'colsample_bytree': [0.5, 0.7, 0.9],
#     'gamma': [0, 0.1, 0.2, 0.3],
# }
# xgb_model = xgb.XGBRegressor(**params)
# grid_search = GridSearchCV(xgb_model, param_grid, cv=5, scoring=ndcg_scorer, n_jobs=-1, verbose=2)
# grid_search.fit(X_train, y_train, eval_set=[(X_valid, y_valid)], early_stopping_rounds=50, verbose=False)

# 输出最佳参数
# print(f"Best parameters: {grid_search.best_params_}")
# print(f"Best NDCG@5 score: {grid_search.best_score_}")

# 训练模型
bst = xgb.train(params, dtrain, num_boost_round=1000, evals=[(dtrain, 'train'), (dvalid, 'valid')], verbose_eval=50)

In [None]:
X = data[features]
y = data['score']
group_data= X.groupby('srch_id')['srch_id'].count().values
dm = xgb.DMatrix(X, y, group=group_data)

In [None]:
bst = xgb.train(params, dm, num_boost_round=2500, evals=[(dm, 'train')], verbose_eval=50)

In [None]:
pred = bst.predict(dvalid)
print(pred)
print(len(pred), len(group_valid))
y_grouped = []
output_grouped = []
start = 0
for cnt in group_valid:
    end = start + cnt
    y_grouped.append(y_valid[start:end].tolist())
    output_grouped.append(pred[start:end])
    start = end

print(y_grouped[0])
print(output_grouped[0])
print(len(y_grouped), len(output_grouped))

In [None]:
# ndcg = ndcg_score(output_grouped, y_grouped, k=5)
# print(ndcg)
ndcg_results = []
for i in range(len(y_grouped)):
    y1 = y_grouped[i]
    y2 = output_grouped[i]
    ndcg = ndcg_score(y_score=[y2], y_true=[y1], k=5)
    ndcg_results.append(ndcg)
    if i % 2000 == 0:
        print(f'Group {i}, ndcg:{ndcg}.')
# print(ndcg_results)
print(np.mean(ndcg_results))

In [None]:
bst.dump_model('models/bst.txt')
print('saving model in json...')
bst.save_model('models/bst.json')

In [None]:
test = pd.read_csv('./data/filled_test_set.csv')

In [None]:
# 从文本文件加载模型
# with open('models/dump.raw.txt', 'r') as fd:
#     model_text = fd.read()
# 
# bst = xgb.Booster(model_file=model_text)
# 从 txt 文件加载模型
loaded_model = xgb.Booster()
loaded_model.load_model('models/bst.json')


In [None]:
from pandas import DataFrame
test = test.drop(columns=['score'])
group_test= test.groupby('srch_id')['srch_id'].count().values

dtest = xgb.DMatrix(test, group=group_test)


preds = loaded_model.predict(dtest)
print(preds)

In [26]:
result = test[['srch_id', 'prop_id']]
result['pred'] = preds
sorted_df = result.groupby('srch_id').apply(lambda x: x.sort_values(by='pred', ascending=False))
sorted_df.reset_index(drop=True, inplace=True)
print(sorted_df)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  result['pred'] = preds


         srch_id  prop_id      pred
0              1    54937  1.478797
1              1    99484  1.256358
2              1    61934  1.085763
3              1    28181  1.008531
4              1    24194  0.294274
...          ...      ...       ...
4959178   332787    32019  0.115624
4959179   332787    99509 -0.050972
4959180   332787    29018 -0.340200
4959181   332787    94437 -0.358900
4959182   332787    35240 -0.602464

[4959183 rows x 3 columns]


  sorted_df = result.groupby('srch_id').apply(lambda x: x.sort_values(by='pred', ascending=False))


In [27]:
final_res = sorted_df[['srch_id', 'prop_id']]
final_res = final_res.astype(int)
print(final_res)
final_res.to_csv('./data/bst_res.csv', index=False)

         srch_id  prop_id
0              1    54937
1              1    99484
2              1    61934
3              1    28181
4              1    24194
...          ...      ...
4959178   332787    32019
4959179   332787    99509
4959180   332787    29018
4959181   332787    94437
4959182   332787    35240

[4959183 rows x 2 columns]
