In [1]:
import random
from dataclasses import dataclass, asdict

import numpy as np
import pandas as pd
import torch
from catboost import CatBoostClassifier
from sklearn.metrics import accuracy_score, classification_report
from sklearn.model_selection import train_test_split



In [3]:
def seed_everything(seed=2024):
    """Seed every random number generator this notebook relies on.

    Seeds Python's ``random`` module, NumPy's global RNG and PyTorch
    (CPU plus all CUDA devices), and puts cuDNN into deterministic mode.

    Args:
        seed: Integer seed applied to all generators.
    """
    # The stdlib RNG must be seeded too, otherwise anything relying on
    # `random` stays non-reproducible between runs.
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Covers every GPU rather than only the current device; it is a no-op
    # when CUDA is unavailable.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # cuDNN autotuning can select different kernels from run to run.
    torch.backends.cudnn.benchmark = False


seed_everything()

### 3 Начинаем эксперимент

In [4]:
from getpass import getpass

# Ask for both ClearML credentials interactively so they are never stored
# in the notebook source; the input is not echoed back.
credential_prompts = ("Введите API Access токен: ", "Введите API Secret токен: ")
access_key, secret_key = (getpass(prompt=text) for text in credential_prompts)


In [6]:
%%capture
# Configure the ClearML SDK through environment variables: the three hosted
# clear.ml endpoints first, then the credentials.
# The capture magic above must remain the first line of the cell; it silences
# the cell's output, presumably so the `%env` echo of the keys is not saved
# into the notebook.
%env CLEARML_WEB_HOST=https://app.clear.ml/
%env CLEARML_API_HOST=https://api.clear.ml
%env CLEARML_FILES_HOST=https://files.clear.ml
# `$access_key` / `$secret_key` are expanded by IPython from the notebook
# variables of the same names, so the prompt cell has to run before this one.
%env CLEARML_API_ACCESS_KEY=$access_key
%env CLEARML_API_SECRET_KEY=$secret_key

### 4 Подгружаем данные

In [7]:
# Training table, read directly over HTTPS from the ml_instruments GitHub repository.
url = "https://github.com/a-milenkin/ml_instruments/raw/refs/heads/main/data/quickstart_train.csv"
rides_info = pd.read_csv(filepath_or_buffer=url)

In [10]:
# Sanity-check the load: row count, column dtypes and non-null counts.
rides_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2337 entries, 0 to 2336
Data columns (total 17 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   car_id                    2337 non-null   object 
 1   model                     2337 non-null   object 
 2   car_type                  2337 non-null   object 
 3   fuel_type                 2337 non-null   object 
 4   car_rating                2337 non-null   float64
 5   year_to_start             2337 non-null   int64  
 6   riders                    2337 non-null   int64  
 7   year_to_work              2337 non-null   int64  
 8   target_reg                2337 non-null   float64
 9   target_class              2337 non-null   object 
 10  mean_rating               2337 non-null   float64
 11  distance_sum              2337 non-null   float64
 12  rating_min                2337 non-null   float64
 13  speed_max                 2337 non-null   float64
 14  user_rid

In [17]:
# Column roles in the rides table.
cat_features = ['model', 'car_type', 'fuel_type']
targets = ['target_class', 'target_reg']
features2drop = ['car_id']

# Model inputs keep the frame's column order; dropped columns and targets are left out.
excluded = features2drop + targets
filtered_features = [name for name in rides_info.columns if name not in excluded]
num_features = [name for name in filtered_features if name not in cat_features]

print('categorical features:', cat_features)
print('numerical features:', num_features)
print('target features:', targets)

# Cast all categorical columns to str in a single assignment.
rides_info[cat_features] = rides_info[cat_features].astype(str)

categorical features: ['model', 'car_type', 'fuel_type']
numerical features: ['car_rating', 'year_to_start', 'riders', 'year_to_work', 'mean_rating', 'distance_sum', 'rating_min', 'speed_max', 'user_ride_quality_median', 'deviation_normal_count', 'user_uniq']
target features: ['target_class', 'target_reg']


In [18]:
# Hold out 20% of the rows for evaluation.
# NOTE(review): the split uses its own fixed seed (42) and is not stratified
# by target_class — confirm both are intended.
train, test = train_test_split(rides_info, test_size=0.2, random_state=42)

# Feature matrix and classification target for each part of the split.
X_train, y_train = train[filtered_features], train['target_class']
X_test, y_test = test[filtered_features], test['target_class']

In [None]:
# Hyperparameters for the CatBoost baseline, collected in one dict.
# NOTE(review): `cfg` is not defined in any visible cell and this cell has never
# been executed (`In [None]`) — confirm a config cell defining `cfg.seed` runs
# before this one, otherwise it raises NameError on a fresh kernel.
cb_params = {
    "depth": 4,
    "learning_rate": 0.06,
    "loss_function": "MultiClass",
    "custom_metric": ["Recall"],
    # CatBoost's key strength: handling of categorical features
    "cat_features": cat_features,
    # Regularization and speed-up
    "colsample_bylevel": 0.098,
    "subsample": 0.95,
    "l2_leaf_reg": 9,
    "min_data_in_leaf": 243,
    "max_bin": 187,
    "random_strength": 1,
    # Speed-up parameters
    "task_type": "CPU",
    "thread_count": -1,
    "bootstrap_type": "Bernoulli",
    # Important!
    "random_seed": cfg.seed,
    "early_stopping_rounds": 50,
}

In [19]:
from clearml import Task, Logger

In [None]:
# Open a ClearML task so this run is tracked under the given project and task name.
task = Task.init(project_name="ClearMl_logging", task_name="CatBoost model baseline")