In [126]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.metrics import ndcg_score, roc_auc_score
from catboost import CatBoostClassifier, Pool
import optuna

import warnings
warnings.filterwarnings("ignore")

In [18]:
data = pd.read_csv('data/data.csv')
data.head()

Unnamed: 0,msno,song_id,source_system_tab,source_screen_name,source_type,target,city,bd,gender,registered_via,...,indicated_age,name,country,year,song_length,genre_ids,artist_name,language,days_left,age_diff
0,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,BBzumQNXUHKdEBOB7mAJuzok+IJA1c2Ryg/yzTF6tik=,explore,Explore,online-playlist,1,1,28.824247,NONE,7,...,0,Good Grief,GB,2016.0,206471.0,359,Bastille,52.0,2103,1987.175753
1,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,bhp/MpSNoqoxOIB+/l8WPqu6jldth4DIpCm3ayXnJqM=,my library,Local playlist more,local-playlist,1,13,24.0,female,9,...,1,Lords of Cardboard,US,1999.0,284584.0,1259,Various Artists,52.0,2301,1975.0
2,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,JNWfrrC7zNN7BdMpsISKa4Mw+xVJYNnxXh3/Epw7QgY=,my library,Local playlist more,local-playlist,1,13,24.0,female,9,...,1,Hip Hop Is Dead(Album Version (Edited)),US,2006.0,225396.0,1259,Nas,52.0,2301,1982.0
3,Xumu+NIjS6QYVxDS4/t3SawvJ7viT9hPKXmf0RtLNx8=,2A87tzfnJTSWqD7gIZHisolhe4DMdzkbd6LzO1KHjNs=,my library,Local playlist more,local-playlist,1,13,24.0,female,9,...,1,Disco Africa,GB,2010.0,255512.0,1019,Soundway,-1.0,2301,1986.0
4,FGtllVqz18RPiwJj/edr2gV78zirAiY/9SmYvia+kCg=,3qm6XTZ6MOCU11x8FIVbAGH5l5uMkT3/ZalWG1oo2Gc=,explore,Explore,online-playlist,1,1,28.824247,NONE,7,...,0,Sleep Without You,QM,2016.0,187802.0,1011,Brett Young,52.0,2103,1987.175753


In [26]:
data['registration_init_time'] = pd.to_datetime(data['registration_init_time'])
data['expiration_date'] = pd.to_datetime(data['expiration_date'])
data['language'] = data['language'].astype('str')

data.dtypes

msno                              object
song_id                           object
source_system_tab                 object
source_screen_name                object
source_type                       object
target                             int64
city                               int64
bd                               float64
gender                            object
registered_via                     int64
registration_init_time    datetime64[ns]
expiration_date           datetime64[ns]
indicated_age                      int64
name                              object
country                           object
year                             float64
song_length                      float64
genre_ids                         object
artist_name                       object
language                          object
days_left                          int64
age_diff                         float64
dtype: object

In [32]:
seed = 42
X = data.drop(['msno', 'song_id', 'target', 'name'], axis=1)
y = data['target']

cat = ['source_system_tab',
       'source_screen_name',
       'source_type',
       'city',
       'gender',
       'registered_via',
       'indicated_age',
       'country',
       'genre_ids',
       'artist_name',
       'language']
#text = ['name']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.20, random_state=seed)

train_pool = Pool(X_train, y_train, cat_features=cat)
test_pool = Pool(X_test, y_test, cat_features=cat)

Обучим модель со стандартными параметрами, и оценим метрики качества

In [35]:
model = CatBoostClassifier(task_type='GPU', random_state=seed)

model.fit(train_pool, eval_set=test_pool, verbose=100)

Learning rate set to 0.034041
0:	learn: 0.6898595	test: 0.6898466	best: 0.6898466 (0)	total: 1.27s	remaining: 21m 10s
100:	learn: 0.6308454	test: 0.6289751	best: 0.6289751 (100)	total: 1m 49s	remaining: 16m 14s
200:	learn: 0.6242862	test: 0.6211490	best: 0.6211490 (200)	total: 3m 40s	remaining: 14m 37s
300:	learn: 0.6203687	test: 0.6164856	best: 0.6164856 (300)	total: 5m 28s	remaining: 12m 42s
400:	learn: 0.6177421	test: 0.6134695	best: 0.6134695 (400)	total: 7m 17s	remaining: 10m 53s
500:	learn: 0.6156692	test: 0.6111308	best: 0.6111308 (500)	total: 9m 5s	remaining: 9m 3s
600:	learn: 0.6138569	test: 0.6091208	best: 0.6091208 (600)	total: 10m 57s	remaining: 7m 16s
700:	learn: 0.6122461	test: 0.6072930	best: 0.6072930 (700)	total: 12m 47s	remaining: 5m 27s
800:	learn: 0.6109745	test: 0.6058921	best: 0.6058921 (800)	total: 14m 34s	remaining: 3m 37s
900:	learn: 0.6098772	test: 0.6047163	best: 0.6047163 (900)	total: 16m 23s	remaining: 1m 48s
999:	learn: 0.6088343	test: 0.6035645	best: 0.60

<catboost.core.CatBoostClassifier at 0x1ee5507f210>

In [38]:
roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])

0.7350934245137188

Встает нетривиальная задача посчитать NDCG@20 для бинарного таргета.

Ничего лучше эвристики поверх данных мне на ум не пришло. Хотя такой подход будет явно подгонять ранжирование под данные - это минус. В защиту этого подхода, скажу, что основывается от на здравом смысле и скорее всего не сильно противоречит реальности.

In [45]:
data.groupby(['msno', 'song_id'])['target'].mean()

msno                                          song_id                                     
++5wYjoMgQHoRuD3GbbvmphZbBBwymzv5Q4l8sywtuU=  +/lcxtBy9FuH0ObLsK9wRf3zl9zSyvDNMpTWSGCAXxc=    0.0
                                              +JGuj3rm4FBs8loN7rvI+JZ+EX3K9+WaxbDtmjs6mQc=    1.0
                                              +MRnGH0Gg7jA7izLFRU1SZtGPmWHdsWTeL9wRXChnRA=    1.0
                                              +Sm75wnBf/sjm/QMUAFx8N+Ae04kWCXGlgH50tTeM6c=    1.0
                                              +d62ngXhdNTJRLKXO8/X9+BBoj77Hs8xVHMLmYGmB4k=    0.0
                                                                                             ... 
zzqc2ja7z10FtSpagYVcAZXg/gPRq7wcDZuNFj+zJSU=  zK2kMG6yF7AOdKVQfMIPvKyTRynq+ANecPCBJ90IIeA=    0.0
                                              zqDZjACUVfphX2Me6LEbMwDWLXA4bIWCbSSD+QsIypQ=    1.0
zzzRi5ek1YCKTGns8C77xwAutE05PAPmz8T/pIIQhzE=  PgRtmmESVNtWjoZHO5a1r21vIz9sVZmcJJpFCbRa1LI=    0.0
                           

In [46]:
data.columns

Index(['msno', 'song_id', 'source_system_tab', 'source_screen_name',
       'source_type', 'target', 'city', 'bd', 'gender', 'registered_via',
       'registration_init_time', 'expiration_date', 'indicated_age', 'name',
       'country', 'year', 'song_length', 'genre_ids', 'artist_name',
       'language', 'days_left', 'age_diff'],
      dtype='object')

In [79]:
data['source_screen_name'].unique()

array(['Explore', 'Local playlist more', 'NONE', 'My library',
       'Online playlist more', 'Album more', 'Discover Feature',
       'Unknown', 'Discover Chart', 'Radio', 'Artist more', 'Search',
       'Others profile more', 'Search Trends', 'Discover Genre',
       'My library_Search', 'Search Home', 'Discover New',
       'Self profile more', 'Concert', 'Payment'], dtype=object)

In [80]:
data['source_system_tab'].unique()

array(['explore', 'my library', 'search', 'discover', 'NONE', 'radio',
       'listen with', 'notification', 'settings'], dtype=object)

In [81]:
data['source_type'].unique()

array(['online-playlist', 'local-playlist', 'local-library',
       'top-hits-for-artist', 'album', 'NONE', 'song-based-playlist',
       'radio', 'song', 'listen-with', 'artist', 'topic-article-playlist',
       'my-daily-playlist'], dtype=object)

Цель - обогатить таргет, чтобы дифференцировать различные значения. Так как у нас задача ранжирования возьмем колонки source_system_tab, source_screen_name, source_type и дадим каждому значению коэффициент likely, в зависимости от того, насколько человеку понравилась песня. Скажем, если он слушает её из своего плэйлиста - это больше, чем если он услышал её по радио или в трендах. Исходя из этой эвристики прибавим к таргету такие коэффициенты по колонкам и посчитаем требуемую метрику.

оплата > концерт > плэйлист > поиск песни > поиск по исполнителю > поиск лучшего > случайность'

In [122]:
translate_source_screen_name = {
    'Payment': 10,
    'Concert': 9,
    'My library': 8,
    'My library_Search': 8,
    'Artist more': 7,
    'Album more': 6,
    'Discover Genre': 5,
    'Online playlist more': 5,
    'Discover New': 5,
    'Search Trends': 5,
    'Discover Chart': 5,
    'Discover Feature': 5,
    'Local playlist more': 5,
    'Self profile more': 5,
    'Others profile more': 4,
    'Search Home': 4,
    'Search': 4,
    'Explore': 3,
    'Radio': 3,
    'Unknown': 2,
    'NONE': 2    
}

translate_source_system_tab = {
    'my library': 10,
    'listen with': 6,
    'explore': 5,
    'search': 5,
    'discover': 5,
    'notification': 4,
    'radio': 4,
    'settings': 2,
    'NONE': 2
}

translate_source_type = {
    'local-playlist': 10,
    'local-library': 10,
    'online-playlist': 8,
    'song': 7,
    'album': 6,
    'artist': 6,
    'topic-article-playlist': 4,
    'my-daily-playlist': 4,
    'listen-with': 4,
    'top-hits-for-artist': 3,
    'song-based-playlist': 3,
    'radio': 2,
    'NONE': 2,
}

coef = 0.01

rank = coef * X_test['source_screen_name'].map(translate_source_screen_name) +\
    coef * X_test['source_system_tab'].map(translate_source_system_tab) +\
    coef * X_test['source_type'].map(translate_source_type) +\
    y_test

rank.nunique()

60

In [123]:
from sklearn.preprocessing import LabelEncoder
le = LabelEncoder()
rank = le.fit_transform(rank)
rank

array([32, 13, 52, ..., 55, 32, 55], dtype=int64)

In [124]:
ndcg_score(np.array([rank]), np.array([model.predict_proba(X_test)[:, 1]]), k=20)

0.8083651686105533

Метрика не совсем честная, но более подходящего варианта я не приудмал.

Что касается подбора гиперпараметров, очевидно, что модель недообучилась.

In [127]:
def objective(trial):
    param = {
        "iterations": trial.suggest_int("iterations", 900, 1500),
        "depth": trial.suggest_int("depth", 2, 8),
        "boosting_type": trial.suggest_categorical("boosting_type", ["Ordered", "Plain"]),
        "bootstrap_type": trial.suggest_categorical(
            "bootstrap_type", ["Bayesian", "Bernoulli", "MVS"]
        ),
        "learning_rate": trial.suggest_float("learning_rate", 1e-3, 0.1, log=True),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 1, 100)
    }
    
    if param["bootstrap_type"] == "Bayesian":
        param["bagging_temperature"] = trial.suggest_float("bagging_temperature", 0, 10)
    elif param["bootstrap_type"] == "Bernoulli":
        param["subsample"] = trial.suggest_float("subsample", 0.1, 1)

    model = CatBoostClassifier(**param, task_type='GPU', random_state=seed)
    model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=60, verbose=False)

    ndcg20 = ndcg_score(np.array([rank]), np.array([model.predict_proba(X_test)[:, 1]]), k=20)   
    
    return ndcg20

In [128]:
study = optuna.create_study(direction="maximize")
study.optimize(objective, n_trials=50, timeout=6000)

[I 2024-04-30 20:11:11,871] A new study created in memory with name: no-name-9da4823f-30cc-4653-ab47-45745da39853
[I 2024-04-30 20:31:59,934] Trial 0 finished with value: 0.8607433562447357 and parameters: {'iterations': 1116, 'depth': 7, 'boosting_type': 'Plain', 'bootstrap_type': 'MVS', 'learning_rate': 0.005409177858928487, 'min_data_in_leaf': 21}. Best is trial 0 with value: 0.8607433562447357.
[I 2024-04-30 21:05:38,957] Trial 1 finished with value: 0.778974793565656 and parameters: {'iterations': 1350, 'depth': 8, 'boosting_type': 'Plain', 'bootstrap_type': 'Bernoulli', 'learning_rate': 0.044508932145802274, 'min_data_in_leaf': 39, 'subsample': 0.20944796485052536}. Best is trial 0 with value: 0.8607433562447357.
[I 2024-04-30 21:28:12,329] Trial 2 finished with value: 0.7987071610815604 and parameters: {'iterations': 1086, 'depth': 7, 'boosting_type': 'Ordered', 'bootstrap_type': 'MVS', 'learning_rate': 0.0747297881579277, 'min_data_in_leaf': 3}. Best is trial 0 with value: 0.86

In [129]:
print('Best hyperparameters:', study.best_params)
print('Best ndcg20:', study.best_value)

Best hyperparameters: {'iterations': 900, 'depth': 3, 'boosting_type': 'Plain', 'bootstrap_type': 'Bayesian', 'learning_rate': 0.014537014065368418, 'min_data_in_leaf': 56, 'bagging_temperature': 7.208524295714465}
Best ndcg20: 0.9292740593062361
