# Загрузка библиотек

In [1]:
!pip install catboost
!pip install shap
!pip install scikit-learn
!pip install hyperopt
!pip install pandas



In [1]:
import os
import pandas as pd
pd.set_option('display.max_columns', None)
import numpy as np
np.set_printoptions(precision=4)
import catboost

# Загрузка DataFrame (train and test)

In [442]:
train_df, test_df = pd.read_csv('train_AIC.csv', encoding='utf-8'), pd.read_csv('test_AIC.csv', encoding='utf-8')

### Распределение фичей (category, int, float)

In [5]:
train_df['НРП'] = train_df['НРП'].apply(int)
test_df['НРП'] = test_df['НРП'].apply(int)

In [427]:
cat_features = ['Поставщик', 'Материал', 'Категорийный менеджер', 'Операционный менеджер', 'Завод', 'Закупочная организация',
                'Балансовая единица', 'Вариант поставки', 'ЕИ', 'Группа материалов', 'НРП', 'Месяц1', 'Месяц2', 'Месяц3', 'День недели 2',
                'Отмена полного деблокирования заказа на закупку', 'Изменение позиции заказа на закупку: изменение даты поставки на бумаге',
                'Изменение позиции заказа на закупку: дата поставки', 'Согласование заказа 1', 'Согласование заказа 2',
                'Согласование заказа 3']

int_features = ['Длительность', 'До поставки', 'Количество позиций', 'Количество обработчиков 7', 'Количество обработчиков 15',
               'Количество обработчиков 30', 'Количество изменений после согласований', 'Дней между 0_1', 'Дней между 1_2',
               'Дней между 2_3', 'Дней между 3_4', 'Дней между 4_5', 'Дней между 5_6', 'Дней между 6_7', 'Дней между 7_8']

float_features = ['Сумма', 'Количество', 'Количество циклов согласования']

# Создание датасета для обучения модели

In [5]:
y = train_df.y
x = train_df.drop('y', axis=1)

In [6]:
from sklearn.model_selection import train_test_split
x_train, x_validation, y_train, y_validation = train_test_split(x, y, train_size = 0.8, random_state=42)

In [7]:
from hyperopt import fmin, hp, tpe, Trials, space_eval, STATUS_OK
from catboost import CatBoostClassifier

# Поиск гиперпараметров

In [None]:
def objective(search):
    model = CatBoostClassifier(**search,
                                custom_loss=['Logloss'],
                                eval_metric='F1',
                                task_type="GPU",
                                verbose=False,
                                early_stopping_rounds=100,
                                random_seed=42)

    model.fit(X = x_train, y = y_train, eval_set=(x_validation, y_validation), verbose=False)
    return {'loss': model.get_best_score()['validation']['F1'], 'status': STATUS_OK}

search_medium = {'learning_rate': hp.uniform('learning_rate', 0.01, 1),
                'iterations': hp.randint('iterations', 500, 3000),
                'l2_leaf_reg': hp.randint('l2_leaf_reg', 0, 5),
                'random_strength': hp.randint('random_strength', 0, 4),
                'depth': hp.randint('depth', 0, 16)
               }

algorithm=tpe.suggest

In [None]:
medium_params = fmin(
  fn=objective,
  space=search_medium,
  algo=algorithm,
  max_evals=100
)

 16%|███████                                     | 16/100 [10:22<1:21:05, 57.92s/trial, best loss: 0.35164059743619897]

job exception: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/cuda/methods/oblivious_tree_doc_parallel_structure_searcher.cpp:160: 1 0



 16%|███████▎                                      | 16/100 [10:23<54:31, 38.94s/trial, best loss: 0.35164059743619897]


CatBoostError: C:/Go_Agent/pipelines/BuildMaster/catboost.git/catboost/cuda/methods/oblivious_tree_doc_parallel_structure_searcher.cpp:160: 1 0

In [None]:
catboost_hyperparams_medium = space_eval(search_medium, medium_params)
print(catboost_hyperparams_medium)

In [11]:
catboost_hyperparams_medium = {'depth': 9, 'iterations': 2692, 'l2_leaf_reg': 2, 'learning_rate': 0.19302925722420228, 'random_strength': 2}

In [23]:
cat_features = list(range(0, x.shape[1]))
print(cat_features)

[0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42]


# Создание и обучение модели

In [12]:
model = CatBoostClassifier(
    **catboost_hyperparams_medium,
    random_seed=42,
    eval_metric='F1'
)

model.fit(
    x_train, y_train,
    cat_features=cat_features,
    eval_set=(x_validation, y_validation),
    verbose=50,
    plot=True
)

MetricVisualizer(layout=Layout(align_self='stretch', height='500px'))

0:	learn: 0.1397500	test: 0.1316554	best: 0.1316554 (0)	total: 164ms	remaining: 7m 22s
50:	learn: 0.6491593	test: 0.7029296	best: 0.7029296 (50)	total: 10.4s	remaining: 9m 1s
100:	learn: 0.7281867	test: 0.7526844	best: 0.7526844 (100)	total: 21.5s	remaining: 9m 12s
150:	learn: 0.7652472	test: 0.7757921	best: 0.7759082 (147)	total: 32.7s	remaining: 9m 11s
200:	learn: 0.7927419	test: 0.7869751	best: 0.7871885 (197)	total: 43.9s	remaining: 9m 3s
250:	learn: 0.8112512	test: 0.7924947	best: 0.7925301 (249)	total: 55s	remaining: 8m 54s
300:	learn: 0.8275907	test: 0.7960951	best: 0.7963970 (294)	total: 1m 6s	remaining: 8m 45s
350:	learn: 0.8422220	test: 0.8014238	best: 0.8014238 (350)	total: 1m 17s	remaining: 8m 38s
400:	learn: 0.8556542	test: 0.8024702	best: 0.8031762 (395)	total: 1m 28s	remaining: 8m 27s
450:	learn: 0.8673745	test: 0.8033755	best: 0.8044891 (438)	total: 1m 40s	remaining: 8m 17s
500:	learn: 0.8789628	test: 0.8053217	best: 0.8054970 (477)	total: 1m 51s	remaining: 8m 7s
550:	l

<catboost.core.CatBoostClassifier at 0x17d813890>

### Предсказание модели и последующая запись в submission

In [14]:
submission = pd.read_csv('/Users/andreyboriskin/Downloads/Проект_сбер/74abd23d767d00a2fcc9a2cbe53bc4e7')

In [40]:
preds = model.predict(train_df_20)
preds

array([0, 0, 0, ..., 0, 0, 0])

In [16]:
submission['value'] = preds
submission.to_csv('submission_1.csv', index=False)

In [None]:
model.save_model('model.cmb')