In [1]:
from pathlib import Path
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier, Pool
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score

In [2]:
# Columns that build_text() concatenates, in this order, into one text per row.
TEXT_COLUMNS = [
    "Text",
    "name",
    "address",
    "normalized_main_rubric_name_ru",
    "prices_summarized",
    "reviews_summarized",
]

def build_text(df: pd.DataFrame, columns: "list[str] | None" = None) -> pd.Series:
    """Concatenate text columns of ``df`` into one space-separated string per row.

    Args:
        df: Source frame. Requested columns that are absent from ``df`` are
            treated as empty strings instead of raising.
        columns: Column names to join, in order. When ``None`` (the default),
            the module-level ``TEXT_COLUMNS`` list is used.

    Returns:
        A Series with one string per row of ``df``: the selected columns cast
        to ``str`` and joined with a single space. Missing values contribute
        an empty string.
    """
    selected = TEXT_COLUMNS if columns is None else list(columns)
    # reindex (rather than df[selected]) creates any absent column filled with
    # NaN, which fillna then turns into "" so the join never sees "nan".
    safe = df.reindex(columns=selected).fillna("")
    return safe.astype(str).agg(" ".join, axis=1)


def resolve_label(df: pd.DataFrame) -> pd.Series:
    """Return the label column of ``df``.

    ``relevance_new`` is preferred; ``relevance`` is used as a fallback so the
    behavior matches what the error message promises.

    Args:
        df: Frame expected to contain ``relevance_new`` or ``relevance``.

    Returns:
        The first of those columns that is present in ``df``.

    Raises:
        KeyError: If neither label column is present.
    """
    for candidate in ("relevance_new", "relevance"):
        if candidate in df.columns:
            return df[candidate]
    raise KeyError("No label column found (expected relevance_new or relevance).")

In [3]:
# Input datasets: both JSON Lines files live in the same raw-data directory.
DATA_DIR = Path("data/raw")
TRAIN_PATH = DATA_DIR / "data_final_for_dls_new.jsonl"
VAL_PATH = DATA_DIR / "data_final_for_dls_eval_new.jsonl"

# TF-IDF vectorizer settings
MAX_FEATURES = 120_000  # passed as max_features
MIN_DF = 3              # passed as min_df

# CatBoost settings
DEPTH = 6    # passed as depth
LR = 0.1     # passed as learning_rate
ITERS = 1000 # passed as iterations
REG = 3      # passed as l2_leaf_reg
SEED = 42    # passed as random_seed

In [4]:
# Read the train and validation sets; both are JSON Lines (one record per line).
train_df, val_df = (
    pd.read_json(source, lines=True) for source in (TRAIN_PATH, VAL_PATH)
)
print(len(train_df), len(val_df))

35094 570


In [5]:
# Show every training row whose "Text" column equals "сигары".
train_df.loc[train_df["Text"].eq("сигары")]

Unnamed: 0,Text,address,name,normalized_main_rubric_name_ru,permalink,prices_summarized,relevance,reviews_summarized,relevance_new
0,сигары,"Москва, Дубравная улица, 34/29",Tabaccos; Магазин Tabaccos; Табаккос,Магазин табака и курительных принадлежностей,1263329400,,1.0,"Организация занимается продажей табака, курите...",1.0
16833,сигары,"Москва, Дубравная улица, 51, стр. 1",Everest Garden,Кальян-бар,135455309610,"Everest Garden предлагает широкий выбор блюд, ...",0.0,Everest Garden — ресторан с атмосферой кальяна...,0.0
20270,сигары,"Московская область, Одинцовский городской окру...",HiDar; Hi Dar; Hidar Hookah Shop,Магазин табака и курительных принадлежностей,213268529351,,1.0,Организация занимается продажей табака и курит...,1.0
22049,сигары,"Москва, проспект Вернадского, 58",Intro Hookah; Интро хука; Интро Хука,Магазин табака и курительных принадлежностей,199059312788,,0.1,Организация занимается продажей табака и курит...,0.1
24513,сигары,"Москва, Западный административный округ, район...",Who cares; Who cares hookah shop,Вейп-шоп,94001366296,,0.1,"Организация занимается продажей вейпов, электр...",0.1
25421,сигары,"Москва, Ангелов переулок, 6",Бристоль; Bristol'; Bristol,Алкогольные напитки,143074092718,Магазин продуктов «Бристоль» предлагает алкого...,0.0,Организация занимается продажей алкогольных на...,0.0


In [5]:
# Keep only rows whose relevance_new label is not 0.1, in both splits.
keep_train = train_df["relevance_new"].ne(0.1)
keep_val = val_df["relevance_new"].ne(0.1)
train_df = train_df.loc[keep_train]
val_df = val_df.loc[keep_val]
print(len(train_df), len(val_df))

30391 500


# TF-IDF

In [6]:
# Integer labels for both splits.
y_train, y_val = (
    resolve_label(frame).astype(int) for frame in (train_df, val_df)
)

# One concatenated text per row for both splits.
train_text, val_text = build_text(train_df), build_text(val_df)

# Word uni- and bi-grams; the token pattern also keeps one-character tokens.
tfidf_params = {
    "max_features": MAX_FEATURES,
    "min_df": MIN_DF,
    "ngram_range": (1, 2),
    "lowercase": True,
    "token_pattern": r"(?u)\b\w+\b",
}
vectorizer = TfidfVectorizer(**tfidf_params)

# Fit on the training text only, then apply the fitted vectorizer to validation.
x_train = vectorizer.fit_transform(train_text)
x_val = vectorizer.transform(val_text)

# CATBOOST

In [9]:
# Wrap the TF-IDF matrices and their labels as CatBoost pools.
train_pool, val_pool = Pool(x_train, y_train), Pool(x_val, y_val)

catboost_params = {
    "iterations": ITERS,
    "depth": DEPTH,
    "learning_rate": LR,
    "l2_leaf_reg": REG,
    "loss_function": "Logloss",
    "eval_metric": "Accuracy",
    "random_seed": SEED,
    "verbose": 100,
    "allow_writing_files": False,
    # "task_type": "GPU",
    # Overfitting detector settings (od_type / od_wait).
    "od_type": "Iter",
    "od_wait": 300,
}
model = CatBoostClassifier(**catboost_params)

In [10]:
# Train with the validation pool as eval set; use_best_model=True is requested.
model.fit(train_pool, eval_set=val_pool, use_best_model=True)

# Column 1 of predict_proba is thresholded at 0.5 to get hard 0/1 predictions.
val_proba = model.predict_proba(x_val)[:, 1]
y_pred = (val_proba >= 0.5).astype(int)

# Score the hard predictions against the validation labels.
acc, f1, precision, recall = (
    metric(y_val, y_pred)
    for metric in (accuracy_score, f1_score, precision_score, recall_score)
)

print(f"Validation size: {len(y_val)}")
print(f"Accuracy: {acc:.4f}")
print(f"precision: {precision:.4f}")
print(f"recall: {recall:.4f}")
print(f"F1: {f1:.4f}")

0:	learn: 0.5491757	test: 0.6540000	best: 0.6540000 (0)	total: 729ms	remaining: 12m 8s
100:	learn: 0.6562140	test: 0.6580000	best: 0.6580000 (100)	total: 1m 10s	remaining: 10m 31s
200:	learn: 0.7039255	test: 0.7080000	best: 0.7120000 (195)	total: 2m 22s	remaining: 9m 24s
300:	learn: 0.7383107	test: 0.7440000	best: 0.7440000 (299)	total: 3m 32s	remaining: 8m 14s
400:	learn: 0.7684183	test: 0.7740000	best: 0.7740000 (394)	total: 4m 43s	remaining: 7m 2s
500:	learn: 0.7952683	test: 0.8000000	best: 0.8020000 (493)	total: 6m 7s	remaining: 6m 6s
600:	learn: 0.8173472	test: 0.8120000	best: 0.8120000 (585)	total: 7m 19s	remaining: 4m 51s
700:	learn: 0.8341614	test: 0.8280000	best: 0.8300000 (695)	total: 8m 33s	remaining: 3m 38s
800:	learn: 0.8520944	test: 0.8440000	best: 0.8460000 (794)	total: 10m 46s	remaining: 2m 40s
900:	learn: 0.8688757	test: 0.8560000	best: 0.8560000 (894)	total: 11m 59s	remaining: 1m 19s
999:	learn: 0.8819387	test: 0.8680000	best: 0.8700000 (959)	total: 13m 9s	remaining: 