**Загрузим необходимый функционал**

In [1]:
import gc
import pandas as pd
from tqdm import tqdm
from src import utils
import lightgbm as lgb
from catboost import CatBoostRanker, Pool
from src.CatBoostValidation import CatBoostValidation

tqdm.pandas()

**Загрузим эмбеддинги постов и комментариев**

In [2]:
with open("auxiliary_data/text_embed_train.npy", "rb") as f:
    df_text = np.load(f)
print(df_text.shape)

with open("auxiliary_data/comments_embed_train.npy", "rb") as f:
    df_coms = np.load(f)
print(df_coms.shape)

(440435, 768)
(440435, 768)


**Загрузим датафрейм со сгенерированными признаками**

In [3]:
new = pd.read_csv("processed_train_data.csv")
new.drop(["text", "comments", "score"], axis=1, inplace=True)
new.isna().sum().sum()

0

**Создадим обощённый тренировочный датафрейм**

In [4]:
data_train = pd.DataFrame(columns=[f"f{i}" for i in range(df_coms.shape[1])], data=df_coms)
data_train["cos"] = new["cos_measure"]
data_train["euclid"] = new["euclidean_measure"]
data_train["manh"] = new["manhattan_measure"]
data_train["toxic"] = new["toxic"]
data_train["percent_words"] = new["percent_words"]
data_train["hard_sentence"] = new["hard_sentence"].astype(int)
data_train["count_words"] = new["count_words"]
data_train["unique_words"] = new["unique_words"]
data_train["equality_toxic"] = new["equality_toxic"].astype(int)
data_train["resemblance"] = new["resemblance"]

data_train["score"] = [0, 1, 2, 3, 4] * df_text[::5].shape[0]

groups = []
for i in tqdm(range(df_text[::5].shape[0])):
    groups += [i] * 5
    
data_train["group_id"] = groups

del df_coms, groups, df_text, new
gc.collect();

100%|███████████████████████████████████████████████████████████████████████| 88087/88087 [00:00<00:00, 1599901.51it/s]


**Для оценки перформанса модели выберем холдаут стратегию**

In [5]:
train_test_split_threshold = int(data_train["group_id"].quantile(0.8))

x_train = data_train[data_train["group_id"] <= train_test_split_threshold]
x_valid = data_train[data_train["group_id"] > train_test_split_threshold]
x_valid.reset_index(drop=True, inplace=True)

assert x_train.shape[0] % 5 == 0
assert x_valid.shape[0] % 5 == 0

x_train.shape, x_valid.shape

del train_test_split_threshold
gc.collect();

**Определим фичи, на которых будем обучаться, и группы для LightGBMRanker**

In [6]:
cols = list(data_train)
cols.remove("group_id")
cols.remove("score")
print(len(cols))

gc.collect();

778


In [7]:
group_train = x_train.groupby("group_id").size().reset_index(name="cnt").cnt.values
assert group_train.shape[0] * 5 == x_train.shape[0]

**Инициализируем LightGBMRanker и оценим его перформанс по NDCG@5**

In [8]:
ranker_model = lgb.LGBMRanker(
    n_estimators = 100,
    random_state = 33,
    n_jobs = 12,
    verbose = 20,
    learning_rate = 0.1
)

ranker_model.fit(
    x_train[cols], 
    x_train["score"], 
    group=group_train,
    eval_set=[(x_valid[cols], x_valid["score"])],
    eval_group=[[5] * (x_valid.shape[0] // 5)],
    eval_at=[5],
)

[LightGBM] [Debug] Dataset::GetMultiBinFromAllFeatures: sparse rate 0.001840
[LightGBM] [Debug] init for col-wise cost 0.000169 seconds, init for row-wise cost 0.440664 seconds
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 197787
[LightGBM] [Info] Number of data points in the train set: 352350, number of used features: 778
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[1]	valid_0's ndcg@5: 0.803406
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[2]	valid_0's ndcg@5: 0.813995
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[3]	valid_0's ndcg@5: 0.817683
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[4]	valid_0's ndcg@5: 0.818504
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 7
[5]	valid_0's ndcg@5: 0.818548
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[6]	valid_0's ndcg@5: 0.818783
[LightGBM] [Debug] Trained a tree with leaves = 31 and dep

[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[82]	valid_0's ndcg@5: 0.820453
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[83]	valid_0's ndcg@5: 0.820738
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[84]	valid_0's ndcg@5: 0.820543
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[85]	valid_0's ndcg@5: 0.8207
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 10
[86]	valid_0's ndcg@5: 0.820663
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 8
[87]	valid_0's ndcg@5: 0.82066
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[88]	valid_0's ndcg@5: 0.820764
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 11
[89]	valid_0's ndcg@5: 0.820783
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 13
[90]	valid_0's ndcg@5: 0.820918
[LightGBM] [Debug] Trained a tree with leaves = 31 and depth = 9
[91]	valid_0's ndcg@5: 0.820899
[LightGBM] [Debug] Trained

**Соберём пул данных для CatBoostRanker**

In [9]:
train = Pool(
    data=x_train[cols],
    label=x_train["score"],
    group_id=x_train["group_id"].values
)

test = Pool(
    data=x_valid[cols],
    label=x_valid["score"],
    group_id=x_valid["group_id"].values
)

**Инициализируем CatBoostRanker и оценим его перформанс по NDCG@5**

In [10]:
model = CatBoostRanker(
    iterations = 100,
    custom_metric = ["NDCG:top=5"],
    verbose = True,
    random_seed = 33,
    loss_function = "YetiRank",
    task_type = "GPU",
    metric_period = 20,
)
model.fit(train, eval_set=test, plot=False)

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8561528	best: 0.8561528 (0)	total: 62.8ms	remaining: 6.22s
20:	test: 0.8834403	best: 0.8834403 (20)	total: 939ms	remaining: 3.53s
40:	test: 0.8860708	best: 0.8860708 (40)	total: 1.75s	remaining: 2.52s
60:	test: 0.8863823	best: 0.8863823 (60)	total: 2.56s	remaining: 1.64s
80:	test: 0.8864115	best: 0.8864115 (80)	total: 3.35s	remaining: 786ms
99:	test: 0.8866951	best: 0.8866951 (99)	total: 4.12s	remaining: 0us
bestTest = 0.8866951384
bestIteration = 99


<catboost.core.CatBoostRanker at 0x1a162dd1580>

**В качестве основного ранжирующего алгоритма возьмём - CatBoostRanker**

**Для более устойчивой оценки перформанса модели используем стратегию кросс валидации на 5 фолдах**

In [11]:
# cols = list(data_train)
# cols.remove("group_id")
# cols.remove("score")

cbval = CatBoostValidation(
    data_train[cols],
    data_train["score"],
    data_train["group_id"]
)

In [12]:
cbval.validate()

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8543879	best: 0.8543879 (0)	total: 51.7ms	remaining: 5.12s
20:	test: 0.8821573	best: 0.8821573 (20)	total: 907ms	remaining: 3.41s
40:	test: 0.8836456	best: 0.8836456 (40)	total: 1.73s	remaining: 2.5s
60:	test: 0.8836472	best: 0.8836472 (60)	total: 2.56s	remaining: 1.63s
80:	test: 0.8838177	best: 0.8838177 (80)	total: 3.37s	remaining: 791ms
99:	test: 0.8839875	best: 0.8839875 (99)	total: 4.15s	remaining: 0us
bestTest = 0.8839875359
bestIteration = 99


Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8506871	best: 0.8506871 (0)	total: 54.7ms	remaining: 5.41s
20:	test: 0.8800754	best: 0.8800754 (20)	total: 928ms	remaining: 3.49s
40:	test: 0.8820903	best: 0.8820903 (40)	total: 1.74s	remaining: 2.51s
60:	test: 0.8825418	best: 0.8825418 (60)	total: 2.56s	remaining: 1.64s
80:	test: 0.8826839	best: 0.8826839 (80)	total: 3.38s	remaining: 792ms
99:	test: 0.8829353	best: 0.8829353 (99)	total: 4.13s	remaining: 0us
bestTest = 0.8829352762
bestIteration = 99


Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8613482	best: 0.8613482 (0)	total: 53.1ms	remaining: 5.26s
20:	test: 0.8796632	best: 0.8796632 (20)	total: 922ms	remaining: 3.47s
40:	test: 0.8823026	best: 0.8823026 (40)	total: 1.76s	remaining: 2.54s
60:	test: 0.8824984	best: 0.8824984 (60)	total: 2.6s	remaining: 1.66s
80:	test: 0.8828549	best: 0.8828549 (80)	total: 3.44s	remaining: 806ms
99:	test: 0.8830388	best: 0.8830388 (99)	total: 4.21s	remaining: 0us
bestTest = 0.8830387672
bestIteration = 99


Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8502244	best: 0.8502244 (0)	total: 51.7ms	remaining: 5.12s
20:	test: 0.8797466	best: 0.8797466 (20)	total: 920ms	remaining: 3.46s
40:	test: 0.8823074	best: 0.8823074 (40)	total: 1.74s	remaining: 2.5s
60:	test: 0.8825804	best: 0.8825804 (60)	total: 2.57s	remaining: 1.64s
80:	test: 0.8828445	best: 0.8828445 (80)	total: 3.4s	remaining: 798ms
99:	test: 0.8830881	best: 0.8830881 (99)	total: 4.17s	remaining: 0us
bestTest = 0.8830881165
bestIteration = 99


Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	test: 0.8546783	best: 0.8546783 (0)	total: 51.9ms	remaining: 5.14s
20:	test: 0.8812752	best: 0.8812752 (20)	total: 909ms	remaining: 3.42s
40:	test: 0.8823533	best: 0.8823533 (40)	total: 1.73s	remaining: 2.49s
60:	test: 0.8824615	best: 0.8824615 (60)	total: 2.54s	remaining: 1.62s
80:	test: 0.8826561	best: 0.8826561 (80)	total: 3.36s	remaining: 787ms
99:	test: 0.8827907	best: 0.8827907 (99)	total: 4.11s	remaining: 0us
bestTest = 0.8827906602
bestIteration = 99
[mean NDCG@5]: 0.8831681


**Обучим модель на полном тренировочном наборе данных и сохраним полученные веса**

In [13]:
full_train = Pool(
    data=data_train[cols],
    label=data_train["score"],
    group_id=data_train["group_id"].values
)

ranker = CatBoostRanker(
    iterations = 100,
    custom_metric = ["NDCG:top=5"],
    verbose = True,
    random_seed = 33,
    loss_function = "YetiRank",
    task_type = "GPU",
    metric_period = 20,
)
ranker.fit(full_train, plot=False)

Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time
Metric NDCG:top=5;type=Base is not implemented on GPU. Will use CPU for metric computation, this could significantly affect learning time


0:	total: 56ms	remaining: 5.54s
20:	total: 1.05s	remaining: 3.95s
40:	total: 2s	remaining: 2.88s
60:	total: 2.95s	remaining: 1.89s
80:	total: 3.89s	remaining: 913ms
99:	total: 4.79s	remaining: 0us


<catboost.core.CatBoostRanker at 0x1a16311eee0>

In [15]:
ranker.save_model("ranker", export_parameters=False)

---