**Загрузим необходимый функционал**

In [1]:
import gc
import json

import numpy as np
import pandas as pd
from catboost import CatBoostRanker, Pool
from tqdm import tqdm

from src import utils

# Register tqdm's progress-bar variants of the pandas apply methods
# (e.g. `progress_apply`). Nothing in the visible cells calls them directly;
# presumably the helpers in `src.utils` do - TODO confirm.
tqdm.pandas()

**Загрузим эмбеддинги постов и комментариев**

In [2]:
# Embeddings of the post texts, stored as a plain .npy array.
df_text = np.load("auxiliary_data/text_embed_test.npy")
print(df_text.shape)

# Embeddings of the comments, stored the same way.
df_coms = np.load("auxiliary_data/comments_embed_test.npy")
print(df_coms.shape)

(70020, 768)
(70020, 768)


**Загрузим датафрейм со сгенерированными признаками**

In [3]:
# Read the pre-computed feature table and discard the two raw text columns,
# which are not referenced anywhere below.
new = pd.read_csv("processed_test_data.csv").drop(columns=["text", "comments"])

# Cell output: total count of missing values over the whole frame.
new.isna().sum().sum()

0

**Создадим обобщённый тестовый датафрейм**

In [4]:
# Number of consecutive rows that belong to one post (one row per candidate
# comment). Consecutive blocks of this size share a single ranking group id.
COMMENTS_PER_POST = 5

n_rows = df_coms.shape[0]

# Fail fast with a readable message. Without these checks a row-count mismatch
# would either fill the feature columns with NaN silently (pandas aligns the
# assigned Series on the index) or surface as an opaque length error when the
# group ids are attached.
assert df_text.shape[0] == n_rows == len(new), (
    "post embeddings, comment embeddings and the feature table "
    "must all have the same number of rows"
)
assert n_rows % COMMENTS_PER_POST == 0, (
    f"row count {n_rows} is not a multiple of {COMMENTS_PER_POST}"
)

# Comment-embedding components become the columns f0 .. f{dim-1}.
data_test = pd.DataFrame(columns=[f"f{i}" for i in range(df_coms.shape[1])], data=df_coms)

# Hand-crafted features copied from the processed table under shorter names.
data_test["cos"] = new["cos_measure"]
data_test["euclid"] = new["euclidean_measure"]
data_test["manh"] = new["manhattan_measure"]
data_test["toxic"] = new["toxic"]
data_test["percent_words"] = new["percent_words"]
data_test["hard_sentence"] = new["hard_sentence"].astype(int)
data_test["count_words"] = new["count_words"]
data_test["unique_words"] = new["unique_words"]
data_test["equality_toxic"] = new["equality_toxic"].astype(int)
data_test["resemblance"] = new["resemblance"]

# Rows 0..4 -> group 0, rows 5..9 -> group 1, and so on.
data_test["group_id"] = [row // COMMENTS_PER_POST for row in range(n_rows)]

# The source arrays/frames are no longer needed; release the memory.
del df_coms, df_text, new
gc.collect();

100%|████████████████████████████████████████████████████████████████████████| 14004/14004 [00:00<00:00, 504457.67it/s]


**Загрузим обученную модель**

In [5]:
# Create an empty ranker and restore the trained weights from disk.
# `load_model` is the last expression, so its return value is shown as the cell output.
ranker = CatBoostRanker()
ranker.load_model(fname="models/ranker")

<catboost.core.CatBoostRanker at 0x192dae44ca0>

**Соберём пул тестовых данных для предсказания рангов**

In [6]:
# Every column except the group identifier is passed to the model as a feature.
cols = data_test.columns.tolist()
cols.remove("group_id")

# The group ids tell CatBoost which rows are ranked against each other.
group_ids = data_test["group_id"].values
test_pool = Pool(data=data_test[cols], group_id=group_ids)

**Сделаем предсказания и соберём данные в jsonl**

In [7]:
# Run the loaded ranker over the whole test pool; `preds` is later attached
# to the test frame as one value per row.
preds = ranker.predict(test_pool)

In [8]:
# Raw test set in JSON Lines format (one JSON object per line).
raw_test = pd.read_json("data/ranking_test.jsonl", lines=True)
print(raw_test.shape)

# Reshape with the project helper so that the frame lines up row-for-row with
# `preds` (presumably one row per post/comment pair - confirm in src.utils),
# then store the model output next to the texts.
test = utils.get_valid_stucture(raw_test, is_train=False)
test["preds"] = preds

(14004, 2)


0it [00:00, ?it/s]

In [9]:
# Index labels that are multiples of 5. With a default 0..n-1 index these are
# the first rows of each 5-row block (assumes `test` has such an index -
# TODO confirm against utils.get_valid_stucture).
text_index = test.index[test.index % 5 == 0]

In [10]:
# Build one record per post: the post text plus its five comments, each
# labelled with its 0-based position after sorting by prediction in
# ascending order (the lowest prediction gets score 0).
json_list = []
for start in tqdm(text_index):
    # `.loc` slicing is label-based and inclusive, so this selects 5 rows.
    ranked = test.loc[start:start + 4, ["comments", "preds"]].sort_values(by="preds")
    json_list.append(
        {
            "text": test.loc[start, "text"],
            "comments": [
                {"text": comment, "score": position}
                for position, comment in enumerate(ranked["comments"])
            ],
        }
    )

# Sanity check: exactly one record for every block of five rows.
assert len(json_list) == (test.shape[0] / 5)

100%|███████████████████████████████████████████████████████████████████████████| 14004/14004 [00:58<00:00, 240.85it/s]


In [11]:
# Serialise the records in JSON Lines format: one JSON object per line.
with open("predictions.jsonl", 'w') as out_file:
    out_file.writelines(json.dumps(record) + "\n" for record in json_list)

---