In [2]:
import sys
import os

In [3]:
project_root = os.path.abspath("..")
if project_root not in sys.path:
    sys.path.append(project_root)

In [4]:
import pandas as pd
from src.setting.settings import setup_logger, dbm
import numpy as np
import seaborn as sns
import importlib
from datetime import datetime, timedelta
from typing import Tuple

In [15]:
from src.utils import decorators, logger
from src.data_loader import raw_data_loader
from src.preprocessor import preprocessing
from src.splitter import splitter
from src.pipeline import data_preparation_pipeline, model_pipeline

importlib.reload(decorators)
importlib.reload(logger)
importlib.reload(raw_data_loader)
importlib.reload(splitter)
importlib.reload(preprocessing)
importlib.reload(data_preparation_pipeline)
importlib.reload(model_pipeline)

from src.utils.decorators import start_finish_function
from src.utils.logger import CustomFormatter
from src.data_loader.raw_data_loader import get_df
from src.preprocessor.preprocessing import ManagerPreprocessing
from src.splitter.splitter import FixedDateTrainTestSplitter, TimeBasedSubsetSelector, SplitManager
from src.pipeline.data_preparation_pipeline import TrainTestPreparer
from src.pipeline.model_pipeline import transform_pipe

In [10]:
engine = dbm.get_engine()

In [11]:
df = get_df(engine)

[2025-05-01 14:59:31] [INFO] [decorators.wrapper] Старт выполнения метода 'Engine.get_df'
[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Завершение выполнения метода 'Engine.get_df'. Время выполнения: 12.87 секунд


In [12]:
feature_name = ['delivery_point', 'rasstoyanie', 'region_zagruzki', 'lat_zagruzki', 'lng_zagruzki', 'region_vygruzki', 'lat_vygruzki', 'lng_vygruzki', 'date_create', 'tonnazh', 'obem_znt', 'kolvo_gruzovykh_mest', 'lt_stoimost_perevozki']

x_train, y_train, x_test, y_test, x_subset, y_subset = TrainTestPreparer(feature_columns=feature_name).prepare(df)

[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Старт выполнения метода 'TrainTestPreparer.prepare'
[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Старт выполнения метода 'ManagerPreprocessing.preprocess'
[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Старт выполнения метода 'ManagerPreprocessing._dropna'
[2025-05-01 14:59:45] [INFO] [preprocessing.ManagerPreprocessing._dropna] Удалено пустых столбцов: 295 шт, осталось 618328
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Завершение выполнения метода 'ManagerPreprocessing._dropna'. Время выполнения: 0.45 секунд
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Старт выполнения метода 'DatetimePreprocessor.transform'
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Завершение выполнения метода 'DatetimePreprocessor.transform'. Время выполнения: 0.64 секунд
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Старт выполнения метода 'TargetCreator.transform'
[2025-05-01 14:59:58] [INFO] [decorators.wrapper] Завершение выполнения

## Предобработка

In [62]:
df['planned_delivery_days'], df['y'] = Preprocessing.create_target(df)

In [77]:
columns_params = ['delivery_point', 'rasstoyanie', 'region_zagruzki', 'lat_zagruzki', 'lng_zagruzki', 'region_vygruzki', 'lat_vygruzki', 'lng_vygruzki', 'date_create', 'tonnazh', 'obem_znt', 'kolvo_gruzovykh_mest', 'lt_stoimost_perevozki', 'voditel', 'planned_delivery_days', 'y']
df_preparation = df[columns_params].copy()
df_preparation.shape 

(618623, 16)

In [78]:
df_preparation = df_preparation.dropna(subset=[col for col in df_preparation.columns if col != 'voditel'])
df_preparation.shape 

(618328, 16)

### Категориальные фичи

In [79]:
obj_col = df_preparation.select_dtypes(include=np.object_).columns
obj_col = obj_col.drop('voditel')

for col in obj_col:
    counts = df_preparation[col].value_counts()
    valid_values = counts[counts > 5].index
    df_preparation = df_preparation[df_preparation[col].isin(valid_values)]

print(f"Финальный размер: {df_preparation.shape}")

Финальный размер: (618224, 16)


### Новые фичи

In [80]:
def assign_group(distance):
    if pd.isna(distance):
        return np.nan
    if distance == 0:
        return 0
    elif distance < 1000:
        return (distance // 100) * 100
    else:
        return (distance // 1000) * 1000

def tonnazh_group(tonnazh):
    if pd.isna(tonnazh):
        return np.nan
    if tonnazh == 0:
        return 0
    elif tonnazh < 1:
        return round((tonnazh // 0.1) * 0.1, 1)
    elif tonnazh < 5:
        return (tonnazh // 1) * 1
    elif tonnazh < 10:
        return 5
    else:
        return 10

def cost_group(cost):
    if pd.isna(cost):
        return np.nan
    if cost == 0:
        return 0
    elif cost < 10000:
        return (cost // 1000) * 1000
    elif cost < 50000:
        return (cost // 10000) * 10000
    elif 50000 < cost < 150000:
        return (cost // 50000) * 50000
    elif 150000 < cost < 500001:
        return 150000
    else:
        return 500000

In [81]:
df_preparation['distance_group'] = df_preparation['rasstoyanie'].map(assign_group)

df_preparation['distance_group'] = df_preparation['distance_group'].astype('int')

df_preparation['tonnazh_group'] = df_preparation['tonnazh'].map(tonnazh_group)

# df_preparation['tonnazh_group'] = df_preparation['tonnazh_group'].astype('int')

df_preparation['cost_group'] = df_preparation['lt_stoimost_perevozki'].map(cost_group)

df_preparation['cost_group'] = df_preparation['cost_group'].astype('int')

### Геопространственные признаки

In [82]:
R = 6371.0

# Преобразуем координаты из градусов в радианы
lat1 = np.radians(df_preparation['lat_zagruzki'].values)
lon1 = np.radians(df_preparation['lng_zagruzki'].values)
lat2 = np.radians(df_preparation['lat_vygruzki'].values)
lon2 = np.radians(df_preparation['lng_vygruzki'].values)

# Разница координат
dlat = lat2 - lat1
dlon = lon2 - lon1

# Формула гаверсинуса
a = np.sin(dlat / 2.0) ** 2 + np.cos(lat1) * np.cos(lat2) * np.sin(dlon / 2.0) ** 2
c = 2 * np.arcsin(np.sqrt(a))

# Вычисляем расстояние
df_preparation['geo_rasstoyanie_km'] = R * c

### Временные фичи

In [83]:
df_preparation['year']=df_preparation.date_create.dt.year
df_preparation['month']=df_preparation.date_create.dt.month

df_preparation.sort_values('date_create', inplace=True)

## Обучение модели

In [85]:
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.model_selection import GridSearchCV, TimeSeriesSplit

from sklearn.svm import LinearSVC

from sklearn.metrics import (make_scorer, accuracy_score, f1_score, 
                            average_precision_score, precision_score, recall_score, 
                            precision_recall_fscore_support, fbeta_score)
from sklearn.inspection import permutation_importance

import matplotlib.pyplot as plt

### Функции

In [109]:
def df_split_test_and_train(
        df, date_column='date_create',
        test_count_days=90
    ) -> Tuple[pd.DataFrame, pd.Series, pd.DataFrame, pd.Series]:
    df_split = df.copy()
    df_split = df_split.sort_values(date_column)

    split_date = df_split[date_column].max() - timedelta(test_count_days)

    train_df = df_split[df_split[date_column] < split_date]
    test_df = df_split[df_split[date_column] >= split_date]

    X_train = train_df.drop(['y'], axis=1)
    y_train = train_df.y
    X_test = test_df.drop(['y'], axis=1)
    y_test = test_df.y
    return X_train, y_train, X_test, y_test

def select_time_based_subset(df, percent=0.2, n_segments=5):
    total_len = len(df)
    segment_size = int((total_len * percent) / n_segments)
    
    indices = []
    step = (total_len - segment_size) // (n_segments - 1)

    for i in range(n_segments):
        start_idx = i * step
        end_idx = start_idx + segment_size
        indices.extend(range(start_idx, end_idx))

    df_subset = df.iloc[indices]
    return df_subset
    

def optimize_threshold(y_true, y_proba, metric=f1_score):
    thresholds = np.linspace(0.1, 0.9, 81)
    scores = [metric(y_true, (y_proba >= t).astype(int)) for t in thresholds]
    best_t = thresholds[np.argmax(scores)]
    return round(best_t, 3), round(max(scores), 3)

def optimal_threshold_cv(model, X, y, score_method=f1_score, cv_splits=3):
    thresholds = np.linspace(0.1, 0.9, 81)  # Шаг 0.01
    tss = TimeSeriesSplit(n_splits=cv_splits)

    best_thresholds = []
    best_scores = []

    for train_idx, val_idx in tss.split(X):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        y_train, y_val = y.iloc[train_idx], y.iloc[val_idx]

        model.fit(X_train, y_train)
        val_probs = model.predict_proba(X_val)[:, 1]

        scores = []
        for thresh in thresholds:
            val_pred = (val_probs >= thresh).astype(int)
            score = score_method(y_val, val_pred)
            scores.append(score)

        # Берем порог с максимальным F1 на текущем фолде
        best_idx = np.argmax(scores)
        best_thresholds.append(thresholds[best_idx])
        best_scores.append(scores[best_idx])

    optimal_threshold = np.mean(best_thresholds)
    optimal_scores = np.mean(best_scores)

    return optimal_threshold, optimal_scores

In [121]:
def get_df_p_r_f1_auc_fr(pipe, x_train, y_train, x_test, y_test, best_threshold=None):
    pipe.fit(x_train, y_train)
    
    y_train_proba = pipe.predict_proba(x_train)[:, 1]
    y_test_proba = pipe.predict_proba(x_test)[:, 1]
    
    # Подбор лучшего порога отсечения (по train)
    if not best_threshold:
        best_threshold, best_f1 = optimize_threshold(y_train, y_train_proba)
        print(f'Подобраный порог отсечения: {best_threshold}')
    else:
        print(f'Лучший порог отсечения: {best_threshold}')
        pass
    
    # === 2. Применение порога и замер на TRAIN ===
    y_best_train_pred = (y_train_proba >= best_threshold).astype(int)
    precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(
        y_train, y_best_train_pred, average='binary'
    )
    auc_fr_train = average_precision_score(y_train, y_train_proba)
    
    # === 3. Применение порога и замер на TEST ===
    y_best_test_pred = (y_test_proba >= best_threshold).astype(int)
    precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(
        y_test, y_best_test_pred, average='binary'
    )
    auc_fr_test = average_precision_score(y_test, y_test_proba)
    
    # === 4. Вывод результата ===
    results = {
        'threshold': best_threshold,
        'train': {
            'precision': round(precision_train, 4),
            'recall': round(recall_train, 4),
            'f1': round(f1_train, 4),
            'ap': round(auc_fr_train, 4)
        },
        'test': {
            'precision': round(precision_test, 4),
            'recall': round(recall_test, 4),
            'f1': round(f1_test, 4),
            'ap': round(auc_fr_test, 4)
        }
    }
    
    return pd.DataFrame(results).T

In [100]:
df_for_split = df_preparation.copy()
df_for_split.head(1)

Unnamed: 0,delivery_point,rasstoyanie,region_zagruzki,lat_zagruzki,lng_zagruzki,region_vygruzki,lat_vygruzki,lng_vygruzki,date_create,tonnazh,...,lt_stoimost_perevozki,voditel,planned_delivery_days,y,distance_group,tonnazh_group,cost_group,geo_rasstoyanie_km,year,month
324453,1,800.0,Приморский край,42.845616,132.571133,Смоленская область,55.19187,34.346061,2016-11-01,0.25,...,0.0,f920e2f0-cfcd-11e5-80e4-005056010e02,9,1,800,0.2,0,6682.931393,2016,11


### Препроцесинг

In [14]:
x_test.columns

Index(['znt_ssylka', 'delivery_point', 'rasstoyanie', 'date_create',
       'date_fakticheskaya_vygruzki', 'tonnazh', 'obem_znt', 'adres_zagruzki',
       'adres_vygruzki', 'country_zagruzki', 'region_zagruzki',
       'city_zagruzki', 'lat_zagruzki', 'lng_zagruzki', 'country_vygruzki',
       'region_vygruzki', 'city_vygruzki', 'lat_vygruzki', 'lng_vygruzki',
       'kolvo_gruzovykh_mest', 'lt_stoimost_perevozki', 'voditel', 'year',
       'month', 'planned_delivery_days', 'distance_group', 'tonnazh_group',
       'cost_group', 'geo_rasstoyanie_km'],
      dtype='object')

In [101]:
df_for_split.drop(['year', 'kolvo_gruzovykh_mest', 'lt_stoimost_perevozki', 'obem_znt', 'voditel'], axis=1, inplace=True)
df_for_split.shape

(618224, 17)

In [102]:
x_train, y_train, x_test, y_test = df_split_test_and_train(df_for_split)

In [104]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder

In [105]:
obj_columns = ['distance_group', 'cost_group', 'region_zagruzki', 'region_vygruzki']
exclude_columns = ['year', 'kolvo_gruzovykh_mest', 'lt_stoimost_perevozki', 'obem_znt', 'voditel', 'date_create']

col_for_one_idx = [list(x_train.columns).index(col_name) for col_name in obj_columns]

not_obj_columns = x_train.select_dtypes(exclude=object).columns
numeric_col = not_obj_columns.difference(exclude_columns).difference(obj_columns)
numeric_cols_idx = [list(x_train.columns).index(col_name) for col_name in numeric_col]

ohe = OneHotEncoder(handle_unknown='ignore', sparse_output=False)

t = [('OHE', ohe, col_for_one_idx),
    ('StandardScaler', StandardScaler(), numeric_cols_idx)]

col_transformer = ColumnTransformer(transformers=t, verbose_feature_names_out=True)

## Бустинг

In [30]:
import xgboost as xgb

In [33]:
xgb.__version__

'3.0.0'

In [124]:
pipe_boost = Pipeline(steps=[
    ('column_transformer', col_transformer),
    ('model_boost', xgb.XGBClassifier(eval_metric='logloss'))
])

pipe_boost.fit(x_train, y_train)

In [113]:
pipe_boost.predict_proba()

array([[0.73903394, 0.2609661 ],
       [0.4185412 , 0.5814588 ],
       [0.9321746 , 0.06782539],
       ...,
       [0.89190894, 0.10809105],
       [0.9028682 , 0.09713177],
       [0.9364233 , 0.0635767 ]], dtype=float32)

In [125]:
param_grid = {
    'model_boost__n_estimators': [100, 200],
    'model_boost__max_depth': [3, 5, 7],
    'model_boost__learning_rate': [0.01, 0.1, 0.3],
    # 'model_boost__subsample': [0.8, 1.0],
    # 'model_boost__colsample_bytree': [0.8, 1.0]
}

param_scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'ap': make_scorer(average_precision_score)
}

# Как правильно разделить данные чтобы оставить только 20% данных
df_subset = select_time_based_subset(df_for_split, percent=0.2, n_segments=5)

X_subset = df_subset.drop(columns='y')
y_subset = df_subset['y']

tss = TimeSeriesSplit(n_splits=3)

search = GridSearchCV(estimator=pipe_boost, 
                      param_grid=param_grid,
                      scoring=param_scoring, 
                      refit='ap',
                      cv=tss,
                      n_jobs=-1, 
                      verbose=10, 
                      return_train_score=True
                      )
search.fit(X_subset, y_subset)

print(f"Лучшие параметры модели: {search.best_params_}")
best_model = search.best_estimator_

# # Калибровка вероятностей
# calibrated_model = CalibratedClassifierCV(base_estimator=best_model, method='sigmoid', cv=tss)
# calibrated_model.fit(X_subset, y_subset)

# Подбор лучшего порога отсечения
optimal_threshold, optimal_f1 = optimal_threshold_cv(
    best_model, 
    X_subset, 
    y_subset, 
    cv_splits=3
)

x_train, y_train, x_test, y_test = df_split_test_and_train(df_for_split)

boost_res_df = get_df_p_r_f1_auc_fr(
    best_model,
    x_train,
    y_train,
    x_test,
    y_test,
    best_threshold=optimal_threshold
)

boost_res_df

Fitting 3 folds for each of 18 candidates, totalling 54 fits
Лучшие параметры модели: {'model_boost__learning_rate': 0.3, 'model_boost__max_depth': 3, 'model_boost__n_estimators': 200}
Лучший порог отсечения: 0.29000000000000004


Unnamed: 0,precision,recall,f1,ap
threshold,0.29,0.29,0.29,0.29
train,0.5136,0.8054,0.6272,0.6779
test,0.2996,0.8875,0.448,0.5474


## Пересборка для продакшена

In [15]:
from src.utils import decorators, logger
from src.data_loader import raw_data_loader
from src.preprocessor import preprocessing
from src.splitter import splitter
from src.pipeline import data_preparation_pipeline, model_pipeline

importlib.reload(decorators)
importlib.reload(logger)
importlib.reload(raw_data_loader)
importlib.reload(splitter)
importlib.reload(preprocessing)
importlib.reload(data_preparation_pipeline)
importlib.reload(model_pipeline)

from src.utils.decorators import start_finish_function
from src.utils.logger import CustomFormatter
from src.data_loader.raw_data_loader import get_df
from src.preprocessor.preprocessing import ManagerPreprocessing
from src.splitter.splitter import FixedDateTrainTestSplitter, TimeBasedSubsetSelector, SplitManager
from src.pipeline.data_preparation_pipeline import TrainTestPreparer
from src.pipeline.model_pipeline import transform_pipe

In [68]:
from src.features.column_transformers import FeatureSelection, CategoricalTypeCaster
from src.setting.settings import setup_logger
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

from sklearn.model_selection import GridSearchCV, TimeSeriesSplit
from sklearn.metrics import make_scorer, accuracy_score, f1_score, average_precision_score, precision_recall_fscore_support

In [10]:
engine = dbm.get_engine()

In [11]:
df = get_df(engine)

[2025-05-01 14:59:31] [INFO] [decorators.wrapper] Старт выполнения метода 'Engine.get_df'
[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Завершение выполнения метода 'Engine.get_df'. Время выполнения: 12.87 секунд


In [12]:
feature_name = ['delivery_point', 'rasstoyanie', 'region_zagruzki', 'lat_zagruzki', 'lng_zagruzki', 'region_vygruzki', 'lat_vygruzki', 'lng_vygruzki', 'date_create', 'tonnazh', 'obem_znt', 'kolvo_gruzovykh_mest', 'lt_stoimost_perevozki']

x_train, y_train, x_test, y_test, x_subset, y_subset = TrainTestPreparer(feature_columns=feature_name).prepare(df)

[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Старт выполнения метода 'TrainTestPreparer.prepare'
[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Старт выполнения метода 'ManagerPreprocessing.preprocess'
[2025-05-01 14:59:44] [INFO] [decorators.wrapper] Старт выполнения метода 'ManagerPreprocessing._dropna'
[2025-05-01 14:59:45] [INFO] [preprocessing.ManagerPreprocessing._dropna] Удалено пустых столбцов: 295 шт, осталось 618328
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Завершение выполнения метода 'ManagerPreprocessing._dropna'. Время выполнения: 0.45 секунд
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Старт выполнения метода 'DatetimePreprocessor.transform'
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Завершение выполнения метода 'DatetimePreprocessor.transform'. Время выполнения: 0.64 секунд
[2025-05-01 14:59:45] [INFO] [decorators.wrapper] Старт выполнения метода 'TargetCreator.transform'
[2025-05-01 14:59:58] [INFO] [decorators.wrapper] Завершение выполнения

In [22]:
x_train, y_train, x_test, y_test, x_subset, y_subset

(                                  znt_ssylka  delivery_point  rasstoyanie  \
 232442  604815f3-a024-11e6-80e9-00505601119a               1        300.0   
 564223  e996deeb-a004-11e6-80e9-00505601119a               1        800.0   
 324453  8651afc8-a039-11e6-80eb-00505601119b               1        800.0   
 329956  8898f219-a038-11e6-80eb-00505601119b               1       7300.0   
 150338  3e736c0c-a038-11e6-80eb-00505601119b               1        650.0   
 ...                                      ...             ...          ...   
 437895  b5513647-0830-48df-9476-d479a5983d89               1       9000.0   
 580457  f03cb792-76a0-44c1-bf45-6c210b30e3d3               1        500.0   
 358647  9488003d-4a14-451c-930b-3cf202905390               2       1760.0   
 499304  cecf7dd5-b084-11ef-9c64-3a68dd4bd927               1       5700.0   
 170521  46bb2efb-b0b0-11ef-9c64-3a68dd4bd927               1       1500.0   
 
        date_create date_fakticheskaya_vygruzki  tonnazh  obem

In [44]:
categorical_columns = ['distance_group', 'cost_group', 'region_zagruzki', 'region_vygruzki']

features = ['delivery_point', 'region_zagruzki', 'lat_zagruzki', 'lng_zagruzki',
            'region_vygruzki', 'lat_vygruzki', 'lng_vygruzki', 'month', 'planned_delivery_days',
            'geo_rasstoyanie_km', 'distance_group', 'tonnazh_group', 'cost_group',
            'rasstoyanie', 'tonnazh', 'lt_stoimost_perevozki']

pipe_xgboost = Pipeline(steps=[
    ('cast_category', CategoricalTypeCaster(categorical_columns=categorical_columns)),
    ('selection', FeatureSelection(features=features)),
    ('model', XGBClassifier(enable_categorical=True))
])

In [45]:
pipe_xgboost.fit(x_subset, y_subset)

[2025-05-01 21:09:18] [INFO] [decorators.wrapper] Старт выполнения метода 'CategoricalTypeCaster.transform'
[2025-05-01 21:09:19] [ERROR] [column_transformers.CategoricalTypeCaster.transform] Не найдены категориальные фичи: []
[2025-05-01 21:09:19] [INFO] [decorators.wrapper] Завершение выполнения метода 'CategoricalTypeCaster.transform'. Время выполнения: 0.31 секунд
[2025-05-01 21:09:19] [INFO] [decorators.wrapper] Старт выполнения метода 'FeatureSelection.transform'
[2025-05-01 21:09:19] [INFO] [decorators.wrapper] Завершение выполнения метода 'FeatureSelection.transform'. Время выполнения: 0.03 секунд


In [52]:
param_grid = {
    'model__n_estimators': [100, 200],
    # 'model_boost__max_depth': [3, 5, 7],
    # 'model_boost__learning_rate': [0.01, 0.1, 0.3],
    # 'model_boost__subsample': [0.8, 1.0],
    # 'model_boost__colsample_bytree': [0.8, 1.0]
}

param_scoring = {
    'accuracy': make_scorer(accuracy_score),
    'f1': make_scorer(f1_score),
    'ap': make_scorer(average_precision_score)
}

tss = TimeSeriesSplit(n_splits=3)

search = GridSearchCV(estimator=pipe_xgboost,
                      param_grid=param_grid,
                      scoring=param_scoring,
                      refit='ap',
                      cv=tss,
                      n_jobs=-1,
                      verbose=10,
                      return_train_score=True
                      )
search.fit(x_subset, y_subset)

print(f"Лучшие параметры модели: {search.best_params_}")
best_model = search.best_estimator_

Fitting 3 folds for each of 2 candidates, totalling 6 fits


[2025-05-01 21:41:49] [INFO] [decorators.wrapper] Старт выполнения метода 'CategoricalTypeCaster.transform'
[2025-05-01 21:41:49] [ERROR] [column_transformers.CategoricalTypeCaster.transform] Не найдены категориальные фичи: []
[2025-05-01 21:41:49] [INFO] [decorators.wrapper] Завершение выполнения метода 'CategoricalTypeCaster.transform'. Время выполнения: 0.22 секунд
[2025-05-01 21:41:49] [INFO] [decorators.wrapper] Старт выполнения метода 'FeatureSelection.transform'
[2025-05-01 21:41:49] [INFO] [decorators.wrapper] Завершение выполнения метода 'FeatureSelection.transform'. Время выполнения: 0.04 секунд


Лучшие параметры модели: {'model__n_estimators': 100}


In [71]:
thresholds = np.linspace(0.1, 0.9, 81)

best_thresholds = []
best_auc_scores = []
best_f1 = []
best_precision = []
best_recall = []

for train_idx, val_idx in tss.split(x_train):
    X_train_thresh, X_val_thresh = x_train.iloc[train_idx], x_train.iloc[val_idx]
    y_train_thresh, y_val_thresh = y_train.iloc[train_idx], y_train.iloc[val_idx]
    
    best_model.fit(X_train_thresh, y_train_thresh)
    val_probs = best_model.predict_proba(X_val_thresh)[:, 1]

    precision_scores = []
    recall_scores = []
    f1_scores = []

    for thresh in thresholds:
        val_pred = (val_probs >= thresh).astype(int)
        precision_train, recall_train, f1_train, _ = precision_recall_fscore_support(
            y_val_thresh, val_pred, average='binary'
        )
        precision_scores.append(precision_train)
        recall_scores.append(recall_train)
        f1_scores.append(f1_train)

    best_idx = np.argmax(f1_scores)
    best_thresholds.append(thresholds[best_idx])
    best_precision.append(precision_scores[best_idx])
    best_recall.append(recall_scores[best_idx])
    best_f1.append(f1_scores[best_idx])

    best_auc_scores.append(average_precision_score(y_val_thresh, val_probs))

optimal_threshold = np.mean(best_thresholds)
optimal_precision = np.mean(best_precision)
optimal_recall = np.mean(best_recall)
optimal_f1 = np.mean(best_f1)
optimal_auc = np.mean(best_auc_scores)


y_test_proba = best_model.predict_proba(x_test)[:, 1]

y_best_test_pred = (y_test_proba >= optimal_threshold).astype(int)
precision_test, recall_test, f1_test, _ = precision_recall_fscore_support(
    y_test, y_best_test_pred, average='binary'
)
auc_fr_test = average_precision_score(y_test, y_test_proba)

results = {
    'threshold': optimal_threshold,
    'train': {
        'precision': round(optimal_precision, 4),
        'recall': round(optimal_recall, 4),
        'f1': round(optimal_f1, 4),
        'ap': round(optimal_auc, 4)
    },
    'test': {
        'precision': round(precision_test, 4),
        'recall': round(recall_test, 4),
        'f1': round(f1_test, 4),
        'ap': round(auc_fr_test, 4)
    }
}

df_result = pd.DataFrame(results).T
df_result

[2025-05-01 22:30:23] [INFO] [decorators.wrapper] Старт выполнения метода 'CategoricalTypeCaster.transform'
[2025-05-01 22:30:23] [ERROR] [column_transformers.CategoricalTypeCaster.transform] Не найдены категориальные фичи: []
[2025-05-01 22:30:23] [INFO] [decorators.wrapper] Завершение выполнения метода 'CategoricalTypeCaster.transform'. Время выполнения: 0.14 секунд
[2025-05-01 22:30:23] [INFO] [decorators.wrapper] Старт выполнения метода 'FeatureSelection.transform'
[2025-05-01 22:30:23] [INFO] [decorators.wrapper] Завершение выполнения метода 'FeatureSelection.transform'. Время выполнения: 0.03 секунд
[2025-05-01 22:30:24] [INFO] [decorators.wrapper] Старт выполнения метода 'CategoricalTypeCaster.transform'
[2025-05-01 22:30:24] [ERROR] [column_transformers.CategoricalTypeCaster.transform] Не найдены категориальные фичи: []
[2025-05-01 22:30:24] [INFO] [decorators.wrapper] Завершение выполнения метода 'CategoricalTypeCaster.transform'. Время выполнения: 0.16 секунд
[2025-05-01 22:3

Unnamed: 0,precision,recall,f1,ap
threshold,0.196667,0.196667,0.196667,0.196667
train,0.4048,0.8284,0.5428,0.4494
test,0.2589,0.8977,0.4019,0.4194
