 ### Подключение

In [1]:
%load_ext autoreload
%autoreload 2

In [16]:
#!pip install umap-learn umap-learn[plot]
#!pip install opentsne
#!pip install shap
#!pip install lightgbm
#!pip install catboost
#!pip install polars
#!pip install pyarrow-hotfix
#!pip install --upgrade seaborn

In [110]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.io as pio
import plotly.express as px
import hdbscan
import shap
from bokeh.plotting import curdoc

import lightgbm as lgb
import catboost as cb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedGroupKFold

from functools import partial

from utils.distribution import get_df_info, plot_density
from utils.drplotter import DimReductionPlotter
from utils.lgbm import plot_feature_info, plot_scores, plot_tree_info
from utils.my_utils import VotingModel
from sklearn.metrics import log_loss, ndcg_score, mean_squared_error

In [20]:
data = pd.read_csv("intern_task.csv")

In [21]:
data.head(10)

Unnamed: 0,rank,query_id,feature_0,feature_1,feature_2,feature_3,feature_4,feature_5,feature_6,feature_7,...,feature_134,feature_135,feature_136,feature_137,feature_138,feature_139,feature_140,feature_141,feature_142,feature_143
0,0,10,1.0,0.0,1.0,3.0,3.0,0.333333,0.0,0.333333,...,0.0,0.0,0.454545,0.890238,8.655534,1.0,0.077778,0.002222,1.0,0.333333
1,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.773976,23.130514,0.0,0.027826,0.00043,44.0,14.666667
2,0,10,3.0,0.0,2.0,0.0,3.0,1.0,0.0,0.666667,...,0.0,0.0,0.0,0.918308,13.351339,0.0,0.014925,0.000104,22.0,7.333333
3,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.0,0.975355,18.240926,0.0,0.05314,0.000255,8.0,2.666667
4,2,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,273.0,79.670665,0.2,0.990119,31.786048,0.333333,0.046512,0.000307,24.0,8.0
5,1,10,3.0,0.0,3.0,1.0,3.0,1.0,0.0,1.0,...,0.0,0.0,0.2,0.974819,22.936731,0.333333,0.033233,9.3e-05,28.0,9.333333
6,1,10,3.0,0.0,3.0,0.0,3.0,1.0,0.0,1.0,...,3.0,16.2,0.0,0.945281,18.240926,0.0,0.013008,2.3e-05,5.0,1.666667
7,0,10,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
8,0,10,2.0,0.0,1.0,0.0,2.0,0.666667,0.0,0.333333,...,218.0,55.069946,0.0,0.448807,4.695805,0.0,0.002153,2e-06,5.0,1.666667
9,0,10,3.0,0.0,0.0,0.0,3.0,1.0,0.0,0.0,...,0.0,0.0,0.0,0.975608,0.0,0.0,0.021583,4e-05,9.0,3.0


In [22]:
df_info = get_df_info(data)
df_info.head(50)

Unnamed: 0,dtype,nunique,nan,zero,empty_str,vc_max,example1,example2,trash_score
rank,int64,5,-1.0,0.517,-1.0,"(0.517, 0.0)",4.0,1.0,0.517
query_id,int64,2000,-1.0,-1.0,-1.0,"(0.004, 22540.0)",19300.0,17440.0,0.004
feature_0,float64,16,-1.0,0.115,-1.0,"(0.338, 2.0)",6.0,7.0,0.338
feature_1,float64,10,-1.0,0.863,-1.0,"(0.863, 0.0)",4.0,9.0,0.863
feature_2,float64,15,-1.0,0.302,-1.0,"(0.33, 1.0)",0.0,24.0,0.33
feature_3,float64,10,-1.0,0.603,-1.0,"(0.603, 0.0)",5.0,8.0,0.603
feature_4,float64,16,-1.0,0.085,-1.0,"(0.346, 2.0)",10.0,11.0,0.346
feature_5,float64,44,-1.0,0.115,-1.0,"(0.7, 1.0)",0.96875,0.75,0.7
feature_6,float64,25,-1.0,0.863,-1.0,"(0.863, 0.0)",0.555556,0.2,0.863
feature_7,float64,32,-1.0,0.302,-1.0,"(0.383, 1.0)",0.666667,0.2,0.383


 Пустых значений нет, Nan тоже - скорее всего их заменили нулями - можно об этом не беспокоиться. Скорее всего думать о смысле фич тоже не особо нужно поскольку они все переименованы. Разбиваю датасет следующим образом: 80% query оставим на обучение+валидацию, оставшиеся 20% на test - на нем будем считать итоговую метрику чтобы не допустить лик таргета.

In [28]:
def my_split(data, num=-1, max_num=-1):
    queries = data['query_id'].unique()
    if num == -1:
        train_query, test_query = train_test_split(queries, test_size=0.2)
    else:
        first = int(num/max_num*len(queries))
        last = int((num + 1)/max_num*(len(queries) - 1))
        train_query = queries[first : last]
    train_mask = data['query_id'].apply(lambda x: x in train_query)
    test_mask = ~train_mask
    return train_mask, test_mask

train_mask, test_mask = my_split(data)
train, test = data[train_mask], data[test_mask]
print(train.shape, test.shape) #watch the size - check if it still is close to 0.8/0.2

(185644, 146) (49614, 146)


### Решение

Поскольку нам нужно "по вектору фичей предсказать ранк документа" видимо требуется подойти к вопросу поточечно, как к простой задаче классификации: у нас есть 5 меток 0 1 2 3 4 и надо указать к какой метке ближе наша строка независимо от других поступлений.

In [99]:
X = train.drop(["rank", "query_id"], axis=1)

y = train["rank"]
queries= train["query_id"]

params = {
    "boosting_type": "gbdt",
    "objective": "multiclass",
    "class_weight":'balanced',
    "num_class": 5,
    "learning_rate": 0.1,
    "n_estimators": 1000,
    "verbose": -1,
    "random_state": 42,
}

fitted_models = []

for num in range(5):
    idx_train, idx_valid = my_split(train, num=num, max_num=5)
    X_train, y_train = X[idx_train], y[idx_train]
    X_valid, y_valid = X[idx_valid], y[idx_valid]

    model = lgb.LGBMClassifier(**params)
    model.fit(
        X_train, y_train,
        eval_set=[(X_valid, y_valid)],
        callbacks=[lgb.log_evaluation(100), lgb.early_stopping(100)]
    )

    fitted_models.append(model)

model = VotingModel(fitted_models)

Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.18419
[200]	valid_0's multi_logloss: 1.16725
Early stopping, best iteration is:
[177]	valid_0's multi_logloss: 1.16574
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.1017
[200]	valid_0's multi_logloss: 1.08605
Early stopping, best iteration is:
[157]	valid_0's multi_logloss: 1.08198
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.07678
[200]	valid_0's multi_logloss: 1.07036
Early stopping, best iteration is:
[132]	valid_0's multi_logloss: 1.06504
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.16446
[200]	valid_0's multi_logloss: 1.15152
Early stopping, best iteration is:
[146]	valid_0's multi_logloss: 1.14298
Training until validation scores don't improve for 100 rounds
[100]	valid_0's multi_logloss: 1.17688
[200]	valid_0's multi_logloss: 1.12425
[300]

In [100]:
res1 = pd.DataFrame(model.predict(test.drop(["rank", "query_id"], axis=1)).round())
res1.to_csv('res1.csv') #save the results not to run the training in the future again

### Считаем средний ndcg@1 ndcg@3 ndcg@5

In [117]:
res1 = pd.read_csv('res1.csv')
y_pred = res1["0"]
y_test = test["rank"].reset_index(drop=True)
query_id = test['query_id'].reset_index(drop=True)

def calculate_metrics(query_id, y_test, y_pred):
    df = pd.DataFrame({'query_id': query_id, 'y_test': y_test, 'y_pred': y_pred})
    ndcg_1 = []
    ndcg_3 = []
    ndcg_5 = []
    mses = []
    
    unique_queries = df['query_id'].unique()
    
    for query in unique_queries:
        current_query_df = df[df['query_id'] == query]
        
        current_y_test = current_query_df['y_test']
        current_y_pred = current_query_df['y_pred']
        
        top_5_idx = current_y_pred.argsort()[::-1][:5]
        
        ndcg_1.append(ndcg_score([current_y_test], [current_y_pred], k=1))
        ndcg_3.append(ndcg_score([current_y_test], [current_y_pred], k=3))
        ndcg_5.append(ndcg_score([current_y_test], [current_y_pred], k=5))
        mses.append(mean_squared_error(current_y_test, current_y_pred))
    mean_ndcg_1 = np.mean(ndcg_1)
    mean_ndcg_3 = np.mean(ndcg_3)
    mean_ndcg_5 = np.mean(ndcg_5)
    mean_mse = np.mean(mses)
    
    return mean_ndcg_1, mean_ndcg_3, mean_ndcg_5, mean_mse


print(test['query_id'].shape, y_test.shape, y_pred.shape)
ndcg_1, ndcg_3, ndcg_5, mean_mse = calculate_metrics(query_id, y_test, y_pred)

print("NDCG@1:", ndcg_1)
print("NDCG@3:", ndcg_3)
print("NDCG@5:", ndcg_5)
print("Mean mse per query:", mean_mse)

(49614,) (49614,) (49614,)
NDCG@1: 0.44753975424423037
NDCG@3: 0.432826162704049
NDCG@5: 0.43644907023806795
Mean mse per query: 0.7434708317771939


Видим, что ndcg не сильно высокий: оно и логично - мы работали в рамках повышения логлосса а не этого скора.