In [10]:
import numpy as np
import pandas as pd

from sklearn.model_selection import KFold, RandomizedSearchCV
from sklearn.metrics import balanced_accuracy_score
# from sklearn.preprocessing import LabelEncoder

import catboost as cb
from catboost import CatBoostClassifier, Pool

import xgboost as xgb
from xgboost import XGBClassifier, DMatrix

import matplotlib.pyplot as plt
import seaborn as sns
import phik
from phik.report import plot_correlation_matrix
from phik import report

import warnings
warnings.filterwarnings('ignore')

In [11]:
train = pd.read_csv('https://www.dropbox.com/scl/fi/9hb4r3uce0mqz8fkpja17/text_classification_train.csv?rlkey=w42y98wa401gelzou08pp582k&dl=1')

test = pd.read_csv('https://www.dropbox.com/scl/fi/7z7rsy14amjeugf166i1t/text_classification_test.csv?rlkey=z53jgwhijd6bpvk7n8n2munwb&dl=1')

# Baseline model

In [13]:
train.shape, test.shape

((7500, 2618), (2500, 2617))

In [15]:
train.head()

Unnamed: 0,category,text,ruBert-base_text_feature_0,ruBert-base_text_feature_1,ruBert-base_text_feature_2,ruBert-base_text_feature_3,ruBert-base_text_feature_4,ruBert-base_text_feature_5,ruBert-base_text_feature_6,ruBert-base_text_feature_7,...,labse_text_feature_758,labse_text_feature_759,labse_text_feature_760,labse_text_feature_761,labse_text_feature_762,labse_text_feature_763,labse_text_feature_764,labse_text_feature_765,labse_text_feature_766,labse_text_feature_767
0,extreme,Ледник Пасторури это цирковой ледник расположе...,0.272156,0.155383,0.060285,0.363159,-0.140391,0.507753,-0.226326,0.431878,...,-0.045795,-0.027475,0.030528,-0.052218,0.042459,-0.012714,-0.05537,-0.012433,-0.016283,-0.006994
1,martial_arts,Главные участники предстоящего Betokenoid 274 ...,0.439223,0.343683,0.093642,0.245294,0.08977,0.424717,-0.071487,0.18597,...,-0.022664,0.042258,-0.027394,-0.033566,0.016021,-0.022054,-0.040366,0.007392,-0.02907,-0.011284
2,extreme,Ttokenoid Btokenoid – карта с которой можно не...,-0.040338,0.058095,-0.091063,0.296028,-0.137103,0.931456,-0.16906,0.131503,...,-0.009464,0.011548,-0.046034,0.024588,-0.01367,-0.047028,0.009395,-0.000488,-0.06026,0.006563
3,autosport,В Сильверстоуне произошли крупные обновления а...,0.444181,0.218742,0.247859,0.234885,0.006668,0.407703,-0.115768,0.433781,...,-0.045474,-0.043454,-0.000961,-0.012203,-0.047922,-0.054657,-0.053768,0.018481,-0.039148,-0.038874
4,extreme,На протяжении более чем 30 лет Вестсайд являет...,-0.126253,-0.115856,0.131131,0.052595,0.060591,0.420976,0.090776,0.246287,...,0.004818,-0.008138,-0.013958,-0.038482,-0.002943,-0.03597,-0.01083,-0.005132,-0.04799,-0.005869


In [31]:
X = train.drop(['category', 'text'], axis=1)
y = train['category'].astype('category')
num_classes = y.nunique()
cat_features = ['category']

In [40]:
from sklearn.preprocessing import LabelEncoder

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

In [44]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X, y_encoded, test_size=0.2, random_state=42)


In [47]:
train_pool = cb.Pool(X_train, y_train)
test_pool = cb.Pool(X_test, y_test)

model = cb.CatBoostClassifier(
    iterations=1000,
    depth=6,
    loss_function='MultiClass',
    classes_count=num_classes,
    verbose=100,
    task_type="GPU",
    devices='0'
)

In [48]:
model.fit(train_pool, eval_set=test_pool, early_stopping_rounds=100)

Learning rate set to 0.126016
0:	learn: 2.4743254	test: 2.4877300	best: 2.4877300 (0)	total: 388ms	remaining: 6m 27s
100:	learn: 0.7364787	test: 1.2648581	best: 1.2648581 (100)	total: 27.9s	remaining: 4m 8s
200:	learn: 0.4491232	test: 1.0993957	best: 1.0993957 (200)	total: 52.8s	remaining: 3m 29s
300:	learn: 0.3244822	test: 1.0361701	best: 1.0361701 (300)	total: 1m 15s	remaining: 2m 55s
400:	learn: 0.2499024	test: 0.9976979	best: 0.9975969 (399)	total: 1m 38s	remaining: 2m 26s
500:	learn: 0.2021835	test: 0.9691122	best: 0.9691122 (500)	total: 1m 59s	remaining: 1m 59s
600:	learn: 0.1646249	test: 0.9439510	best: 0.9439510 (600)	total: 2m 21s	remaining: 1m 34s
700:	learn: 0.1375408	test: 0.9262325	best: 0.9262325 (700)	total: 2m 43s	remaining: 1m 9s
800:	learn: 0.1152857	test: 0.9096509	best: 0.9096003 (799)	total: 3m 5s	remaining: 46.1s
900:	learn: 0.0984762	test: 0.8976452	best: 0.8976452 (900)	total: 3m 27s	remaining: 22.8s
999:	learn: 0.0857581	test: 0.8881405	best: 0.8881405 (999)	to

<catboost.core.CatBoostClassifier at 0x233800dea40>

In [49]:
y_pred = model.predict(X_test)

# Преобразуйте числовые предсказания обратно в строковые метки
y_pred_labels = label_encoder.inverse_transform(y_pred)
y_test_labels = label_encoder.inverse_transform(y_test)

# Оцените точность модели
accuracy = balanced_accuracy_score(y_test_labels, y_pred_labels)
print(f'Accuracy: {accuracy}')

Accuracy: 0.7239302608475016


In [51]:
model.save_model('models/catboost_baseline_model.cbm')

In [68]:
feature_importances = model.get_feature_importance(data=train_pool, type=cb.EFstrType.PredictionValuesChange)

feature_importances_df = pd.DataFrame({'Feature': X.columns, 'Importance': feature_importances}).sort_values(by='Importance', ascending=False)

feature_importances_df

Unnamed: 0,Feature,Importance
641,ruBert-base_text_feature_641,1.298949
2036,labse_text_feature_188,1.298233
113,ruBert-base_text_feature_113,1.130945
86,ruBert-base_text_feature_86,0.997096
423,ruBert-base_text_feature_423,0.986294
...,...,...
1660,rubert-base-cased-conversational_text_feature_580,0.000000
1662,rubert-base-cased-conversational_text_feature_582,0.000000
1663,rubert-base-cased-conversational_text_feature_583,0.000000
1664,rubert-base-cased-conversational_text_feature_584,0.000000


отбросим фичи с важностью < 0.1

In [69]:
drop_feature_importances_df = feature_importances_df[feature_importances_df['Importance'] < 0.1]

In [70]:
drop_feature_importances_df.shape

(2353, 2)

In [73]:
X_cleaned = X.drop(drop_feature_importances_df['Feature'].to_list(), axis=1)

В дальнейшем:
1. затюнить в Оптуне
2. Обучить xgb (dart) и затюнить
3. xgb (rf) и затюнить
4. Заблендить резы