In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.model_selection import StratifiedGroupKFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.metrics import ndcg_score
from xgboost import XGBRanker
from catboost import CatBoostRanker, Pool

In [2]:
data = pd.read_csv('intern_task.csv')
data = data.drop(['feature_100', 'feature_72', 'feature_65', 'feature_64'], axis=1)
seed = 7

data.head()

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0


In [3]:
data = data.sort_values('query_id').reset_index(drop=True)
X = data.drop(['rank', 'query_id'], axis=1)
y = data['rank']
qid = data['query_id']

In [4]:
qid.unique().shape

(2000,)

Обучать модель будем в соответствии с запросами, поэтому в тест отложим 20% запросов.

In [5]:
qid[qid == qid.unique()[1600]].index[0]

198160

In [6]:
X_train, X_test = X.loc[:198159], X.loc[198160:]
y_train, y_test = y.loc[:198159], y.loc[198160:]
qid_train, qid_test = qid.loc[:198159], qid.loc[198160:]

X_train.shape[0] + X_test.shape[0] == X.shape[0]

True

Построим pipeline: стандартизация обучающих данных, применение pca.

Обучим модели на данных без манипуляций и на данных с таким pipeline и сравним метрики.

In [7]:
kfold = StratifiedGroupKFold(shuffle=False)
ranker_scores = []
pipe_scores = []

for train_index, valid_index in kfold.split(X_train, y_train, groups=qid_train):
    X_train_, X_valid_ = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid_ = y_train.iloc[train_index], y_train.iloc[valid_index]

    ranker = XGBRanker(tree_method="hist",
                       lambdarank_num_pair_per_sample=8,
                       objective="rank:ndcg",
                       lambdarank_pair_method="topk")
    
    ranker.fit(X_train_, y_train_, qid=qid.iloc[train_index])
    y_pred = ranker.predict(X_valid_)
    score = ndcg_score(np.array([y_valid_]), np.array([y_pred]), k=5)
    ranker_scores.append(score)

for train_index, valid_index in kfold.split(X_train, y_train, groups=qid_train):
    X_train_, X_valid_ = X_train.iloc[train_index], X_train.iloc[valid_index]
    y_train_, y_valid_ = y_train.iloc[train_index], y_train.iloc[valid_index]

    pipe = Pipeline([
        ('scaler', StandardScaler()),
        ('pca', PCA(n_components=10)),
    ])

    ranker = XGBRanker(tree_method="hist",
                       lambdarank_num_pair_per_sample=8,
                       objective="rank:ndcg",
                       lambdarank_pair_method="topk")
    
    ranker.fit(pipe.fit_transform(X_train_), y_train_, qid=qid.iloc[train_index])
    y_pred = ranker.predict(pipe.transform(X_valid_))
    score = ndcg_score(np.array([y_valid_]), np.array([y_pred]), k=5)
    pipe_scores.append(score)

scores = pd.DataFrame({
    'ranker': ranker_scores,
    'pipe': pipe_scores
})

scores

Unnamed: 0,ranker,pipe
0,0.74563,0.445499
1,0.893007,0.561583
2,1.0,0.286517
3,0.776573,0.117264
4,0.967199,0.292395


Хотя данные и казались сильно скоррелированными, очевидно серьезное ухудшение качества при уменьшении признакового простраства, а значит модель извлекает информацию из сжимаемых признаков и предложенный вариант обработки данных не годится.

In [8]:
scores['ranker'].mean()

0.876481735860702

In [9]:
train_pool = Pool(data=X_train, label=y_train, group_id=qid_train)
test_pool = Pool(data=X_test, label=y_test, group_id=qid_test)
    
cb = CatBoostRanker()
    
cb.fit(train_pool, eval_set=test_pool, verbose=100)

0:	test: 0.7071959	best: 0.7071959 (0)	total: 527ms	remaining: 8m 46s
100:	test: 0.7872009	best: 0.7874355 (99)	total: 33s	remaining: 4m 53s
200:	test: 0.7944766	best: 0.7944766 (200)	total: 1m 5s	remaining: 4m 21s
300:	test: 0.7981836	best: 0.7982369 (299)	total: 1m 45s	remaining: 4m 4s
400:	test: 0.8000010	best: 0.8000275 (398)	total: 2m 30s	remaining: 3m 44s
500:	test: 0.7995446	best: 0.8003527 (413)	total: 3m 13s	remaining: 3m 12s
600:	test: 0.7998918	best: 0.8003527 (413)	total: 3m 58s	remaining: 2m 38s
700:	test: 0.8012404	best: 0.8014414 (696)	total: 4m 42s	remaining: 2m
800:	test: 0.8007861	best: 0.8017564 (707)	total: 5m 26s	remaining: 1m 21s
900:	test: 0.8012802	best: 0.8017564 (707)	total: 6m 11s	remaining: 40.8s
999:	test: 0.8014876	best: 0.8018232 (961)	total: 6m 55s	remaining: 0us

bestTest = 0.8018231765
bestIteration = 961

Shrink model to first 962 iterations.


<catboost.core.CatBoostRanker at 0x1b3f1ed4d90>

In [10]:
xgb = XGBRanker(tree_method="hist",
                lambdarank_num_pair_per_sample=8,
                objective="rank:ndcg",
                lambdarank_pair_method="topk")

xgb.fit(X_train, y_train, qid=qid_train)
ndcg_score(np.array([y_test]), np.array([xgb.predict(X_test)]), k=5)

0.7076049743407979

In [11]:
ndcg_score(np.array([y_test]), np.array([cb.predict(X_test)]), k=5)

0.7939028099025153

Делаем выбор в пользу XGBoost. Подберем параметры с помощь optuna

UPD: при перезапуске ноутбука catboost показал лучшие результаты) надо было фиксировать seed