In [None]:
!pip install -U lightautoml

In [None]:
!pip install --upgrade torch

In [None]:
!pip install textstat

In [4]:
# Colab-only: mount Google Drive at /content/drive; the dataset paths used
# later in this notebook live under that mount point.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [33]:
# Работа с файловой системой и HTTP-запросами
import os
import requests

# Библиотеки для работы с данными и моделями
import numpy as np  # Массивы и математика
import pandas as pd  # Работа с табличными данными
from sklearn.metrics import roc_auc_score, accuracy_score, classification_report  # Метрики оценки моделей
from sklearn.model_selection import train_test_split  # Разделение данных на обучающую и тестовую выборки
from sklearn.preprocessing import LabelEncoder  # Кодирование категориальных признаков

# Работа с нейронными сетями
import torch  # Фреймворк для глубокого обучения
from transformers import AutoTokenizer, AutoModel  # Токенизатор и модель из библиотеки Transformers

# Работа с текстами
import textstat  # Лингвистические метрики текстов
import re  # Регулярные выражения

# Прогресс-бар для циклов
from tqdm import tqdm  # Удобный прогресс-бар

# Работа с временем и датами
from datetime import datetime  # Работа с временными метками

# LightAutoML для автоматического машинного обучения
from lightautoml.automl.presets.tabular_presets import TabularAutoML, TabularUtilizedAutoML  # Предустановки моделей
from lightautoml.tasks import Task  # Определение задачи (например, классификация, регрессия)
from lightautoml.report.report_deco import ReportDeco, ReportDecoUtilized  # Генерация отчетов
from lightautoml.addons.tabular_interpretation import SSWARM  # Интерпретация моделей

# typing для аннотаций типов
from typing import List  # Аннотация списка


In [5]:
# Load the tokenizer and the encoder from one and the same Hugging Face checkpoint
EMBEDDING_CHECKPOINT = "sergeyzh/rubert-tiny-turbo"
tokenizer = AutoTokenizer.from_pretrained(EMBEDDING_CHECKPOINT)
model = AutoModel.from_pretrained(EMBEDDING_CHECKPOINT)

# Text normalisation helper (sample implementation)
def preprocess_text(text):
    """Return ``text`` lower-cased; any non-string value is mapped to ``""``."""
    return text.lower() if isinstance(text, str) else ""

# Pull the first date-looking token out of a text and parse it
def extract_first_date(text):
    """Parse the first ``d.m.y``-style token found in ``text``.

    The token is located with a regex (1-2 digits, separator, 1-2 digits,
    separator, 2-4 digits; separators are ``.``, ``/`` or ``-``) and then
    tried against day-month-year layouts in a fixed order.

    Returns:
        A ``datetime`` for the first layout that parses, or ``None`` when
        ``text`` is not a string, contains no such token, or the token does
        not parse under any layout. Only the first regex hit is considered.
    """
    if not isinstance(text, str):
        return None

    found = re.search(r'\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b', text)
    if found is None:
        return None

    candidate = found.group()
    # Four-digit-year layouts are tried before the two-digit-year ones.
    layouts = (
        "%d.%m.%Y", "%d/%m/%Y", "%d-%m-%Y",
        "%d.%m.%y", "%d/%m/%y", "%d-%m-%y",
    )
    for layout in layouts:
        try:
            parsed = datetime.strptime(candidate, layout)
        except ValueError:
            continue
        return parsed

    return None

# Day gap between an extracted date and a reference date
def calculate_days_difference(extracted_date, reference_date):
    """Return ``(reference_date - extracted_date).days``.

    Returns ``None`` when either argument is missing according to ``pd.isna``
    (``None``, ``NaN``, ``NaT``). The result is positive when the extracted
    date lies before the reference date.
    """
    for value in (extracted_date, reference_date):
        if pd.isna(value):
            return None
    gap = reference_date - extracted_date
    return gap.days

# Text embedding via the module-level tokenizer/model pair
def get_embeddings(text: str):
    """Encode ``text`` and return its averaged last hidden state as a NumPy array.

    The text is tokenised (with truncation), run through ``model`` with
    gradient tracking disabled, and ``last_hidden_state`` is averaged over
    dimension 1 (presumably the token axis — confirm against the model's
    output layout). Size-1 dimensions are squeezed away before conversion.
    """
    encoded = tokenizer(text, return_tensors='pt', truncation=True, padding=True)
    with torch.no_grad():
        hidden_states = model(**encoded).last_hidden_state
    pooled = hidden_states.mean(dim=1)
    return pooled.squeeze().numpy()

# Readability / size statistics computed with textstat
def get_textstat_features(text):
    """Return a dict of textstat metrics for ``text``.

    Each key is the name of the textstat function that produced the value.
    ``text_standard`` is requested with ``float_output=True``. A non-string
    input yields an empty dict. Key insertion order is kept stable because it
    determines the column order of any frame built from these dicts.
    """
    if not isinstance(text, str):
        return {}

    # Metrics computed before text_standard, in output order.
    leading_metrics = (
        'flesch_reading_ease',
        'smog_index',
        'flesch_kincaid_grade',
        'coleman_liau_index',
        'automated_readability_index',
        'dale_chall_readability_score',
        'difficult_words',
        'linsear_write_formula',
        'gunning_fog',
    )
    # Metrics computed after text_standard, in output order.
    trailing_metrics = ('syllable_count', 'lexicon_count', 'sentence_count')

    features = {}
    for metric in leading_metrics:
        features[metric] = getattr(textstat, metric)(text)
    # The only metric that needs an extra argument.
    features['text_standard'] = textstat.text_standard(text, float_output=True)
    for metric in trailing_metrics:
        features[metric] = getattr(textstat, metric)(text)
    return features

# Build the tabular feature frame from a DataFrame with a free-text column
def extract_features(df, text_column):
    """Turn a frame with a free-text column into a purely tabular feature frame.

    Pipeline, in order:
      1. normalise the text with ``preprocess_text``;
      2. mask date-like tokens with ``[DATE]`` and remember the first one
         parsed by ``extract_first_date``;
      3. mask contract-number-like tokens with ``[CONTRACT_NUMBER]`` and flag
         rows that contained one;
      4. compute ``date_difference`` (days between the row's ``Date`` and the
         extracted date) when a ``Date`` column exists, otherwise fill ``None``;
      5. embed the masked text with ``get_embeddings`` (one column per
         embedding component, named ``embedding_<i>``);
      6. add ``word_count`` and the ``get_textstat_features`` metrics;
      7. drop the text column.

    Args:
        df: input frame; it is NOT modified.
        text_column: name of the column holding the raw text.

    Returns:
        A new DataFrame with a fresh ``0..n-1`` index containing the original
        non-text columns followed by the generated features.
    """
    contract_pattern = r'(\b\d{2,}-\d{4,}\b)'
    date_pattern = r'\b\d{1,2}[./-]\d{1,2}[./-]\d{2,4}\b'

    def replace_dates_and_extract(text):
        # Returns (masked text, first parsed date or None).
        if not isinstance(text, str):
            return text, None
        match = re.search(date_pattern, text)
        if match is None:
            return text, None
        extracted_date = extract_first_date(match.group())
        return re.sub(date_pattern, '[DATE]', text), extracted_date

    def replace_contract_numbers(text):
        # Returns (masked text, whether a contract number was present).
        if not isinstance(text, str):
            return text, False
        if re.search(contract_pattern, text) is None:
            return text, False
        return re.sub(contract_pattern, '[CONTRACT_NUMBER]', text), True

    # reset_index returns a new frame, so the caller's object is never
    # mutated, and every derived frame below shares this RangeIndex. Without
    # it, a caller-supplied non-default index would misalign the axis=1 concat.
    df = df.reset_index(drop=True)
    texts = df[text_column].apply(preprocess_text)

    # Dates are masked BEFORE contract numbers: the contract pattern also
    # matches the "mm-yyyy" tail of a "dd-mm-yyyy" date, which would destroy
    # the date and raise a false contract-number flag.
    date_results = [replace_dates_and_extract(text) for text in texts]
    extracted_dates = [extracted for _, extracted in date_results]

    # List comprehensions (instead of zip(*...) unpacking) also work for an
    # empty frame, where unpacking would raise ValueError.
    contract_results = [replace_contract_numbers(text) for text, _ in date_results]
    has_contract_number = [flag for _, flag in contract_results]
    df[text_column] = [text for text, _ in contract_results]

    # Day difference against the row's own 'Date', when that column exists
    if 'Date' in df.columns:
        df['Date'] = pd.to_datetime(df['Date'])
        df['date_difference'] = [
            calculate_days_difference(extracted_date, reference_date)
            for extracted_date, reference_date in zip(extracted_dates, df['Date'])
        ]
    else:
        df['date_difference'] = None

    masked_texts = df[text_column]

    # progress_apply only exists after tqdm.pandas() has been called; fall
    # back to a plain apply so the function also works without it.
    apply_with_progress = getattr(masked_texts, 'progress_apply', masked_texts.apply)
    embeddings = apply_with_progress(get_embeddings)

    # One column per embedding component
    embeddings_df = pd.DataFrame(embeddings.tolist(), index=df.index)
    embeddings_df.columns = [f'embedding_{i}' for i in range(embeddings_df.shape[1])]

    # Whitespace-separated token count of the masked text
    word_count = masked_texts.apply(lambda x: len(x.split()))

    textstat_features = masked_texts.apply(get_textstat_features)
    textstat_df = pd.DataFrame(textstat_features.tolist(), index=df.index)

    flags_df = pd.DataFrame({
        'has_contract_number': has_contract_number,
        'word_count': word_count
    }, index=df.index)

    features = pd.concat([df, flags_df, embeddings_df, textstat_df], axis=1)
    return features.drop(columns=[text_column])

# Read a delimited file and normalise its Date / Price columns
def load_data(file_path, sep='\t', header=None, names=None):
    """Load a delimited text file into a DataFrame and clean two columns.

    Args:
        file_path: path or buffer accepted by ``pd.read_csv``.
        sep: field separator (tab by default).
        header: forwarded to ``pd.read_csv``.
        names: column names forwarded to ``pd.read_csv``.

    Returns:
        The loaded frame, where ``Date`` (if present) is parsed with the
        ``%d.%m.%Y`` format and ``Price`` is converted to float; price values
        that cannot be parsed become ``NaN``.

    Raises:
        KeyError: if the loaded frame has no ``Price`` column.
    """
    frame = pd.read_csv(file_path, sep=sep, header=header, names=names)

    if 'Date' in frame.columns:
        frame['Date'] = pd.to_datetime(frame['Date'], format='%d.%m.%Y')

    # Both ',' and '-' are rewritten to '.' before numeric parsing
    # (presumably they act as decimal separators in the raw amounts — confirm).
    price = frame['Price'].replace({',': '.', '-': '.'}, regex=True)
    price = pd.to_numeric(price, errors='coerce')
    frame['Price'] = price.apply(lambda x: float(x))
    return frame


The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/1.29k [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/1.16M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/2.41M [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/732 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/712 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/117M [00:00<?, ?B/s]

In [7]:
# Dataset locations on the mounted Google Drive
DATA_DIR = '/content/drive/MyDrive/biv hack/for-teams'
train_file_path = f'{DATA_DIR}/train_dataset.csv'
test_file_path = f'{DATA_DIR}/payments_training.tsv'


In [10]:
# Columns kept from both files before feature extraction
KEPT_COLUMNS = ['Content', 'Date', 'TARGET', 'Price']

# Register Series.progress_apply, which extract_features uses for the embeddings
tqdm.pandas()

# Comma-separated file with a header row; the header is replaced by these names
train = load_data(
    train_file_path,
    sep=',',
    header=0,
    names=['ID', 'Content', 'reasoning', 'TARGET', 'Date', 'Price', 'Fold'],
)[KEPT_COLUMNS]

# Tab-separated file without a header row
test = load_data(
    test_file_path,
    sep='\t',
    header=None,
    names=['ID', 'Date', 'Price', 'Content', 'TARGET'],
)[KEPT_COLUMNS]

# Replace the raw text column with engineered features in both frames
train = extract_features(train, 'Content')
test = extract_features(test, 'Content')

100%|██████████| 5970/5970 [01:08<00:00, 87.00it/s] 
100%|██████████| 500/500 [00:07<00:00, 63.90it/s]


In [14]:
# Stack both feature frames row-wise into one table with a fresh 0..n-1 index.
# NOTE(review): this folds `test` into the training table, so the visible part
# of the notebook keeps no separate hold-out set — confirm this is intended.
full_train = pd.concat([train, test], ignore_index=True)

Unnamed: 0,Date,TARGET,Price,date_difference,has_contract_number,word_count,embedding_0,embedding_1,embedding_2,embedding_3,...,coleman_liau_index,automated_readability_index,dale_chall_readability_score,difficult_words,linsear_write_formula,gunning_fog,text_standard,syllable_count,lexicon_count,sentence_count
0,2024-11-07,SERVICE,32600.0,76.0,False,8,-0.014197,-0.113569,-0.311699,-0.756214,...,8.66,7.9,19.82,0,3.0,3.20,0.0,8,8,1
1,2024-11-07,SERVICE,30900.0,-121.0,False,6,-0.268744,-0.247321,0.181615,-0.624568,...,9.15,9.8,22.36,0,2.0,2.40,10.0,6,6,1
2,2024-11-07,FOOD_GOODS,4210.0,,False,14,0.404704,0.311802,0.141604,-0.036590,...,16.54,12.7,23.16,0,2.5,2.80,3.0,14,14,2
3,2024-11-07,TAX,4630.0,,False,2,-0.004122,0.046135,-0.452642,-0.301388,...,33.20,31.4,19.53,0,0.0,0.80,0.0,2,2,1
4,2024-11-07,SERVICE,8000.0,316.0,False,7,-0.482871,-0.159998,0.082481,-0.421207,...,6.56,7.0,19.77,0,2.5,2.80,7.0,7,7,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
6465,2024-11-07,TAX,2610.0,,False,2,0.107667,0.274753,-0.246876,-0.057394,...,7.10,10.2,19.53,0,0.0,0.80,0.0,2,2,1
6466,2024-11-07,BANK_SERVICE,31200.0,,False,15,-0.277803,0.602821,-0.585012,-0.808449,...,21.57,18.1,20.17,0,6.5,6.00,0.0,15,15,1
6467,2024-11-07,SERVICE,18200.0,197.0,False,7,0.053888,0.204427,-0.376022,-0.771425,...,9.87,9.7,22.03,0,2.5,2.80,10.0,7,7,1
6468,2024-11-07,FOOD_GOODS,2870000.0,,False,19,0.116363,0.089819,-0.153850,-0.324475,...,6.43,8.2,22.94,0,0.9,1.52,23.0,19,19,5


In [15]:
N_THREADS = 4  # passed to torch.set_num_threads, AutoML cpu_limit and the reader's n_jobs
N_FOLDS = 5  # passed as the reader's 'cv' setting
RANDOM_STATE = 42  # seed for np.random.seed and the reader's random_state
TEST_SIZE = 0.2  # NOTE(review): not referenced in the visible part of the notebook — confirm it is still needed
TIMEOUT = 3600  # AutoML time budget; the run log reports it in seconds
TARGET_NAME = 'TARGET'  # label column, used as the 'target' role

In [16]:
# Seed every RNG the notebook uses so reruns are comparable, then cap the
# number of CPU threads torch may use.
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)  # torch keeps its own RNG, separate from NumPy's
torch.set_num_threads(N_THREADS)

In [29]:
# Multiclass task optimised and scored with the 'auc_mu' metric.
task = Task('multiclass', metric='auc_mu')

In [30]:
# Column roles handed to AutoML: the label column, plus columns listed under
# 'drop' (presumably excluded from the feature set — confirm in LightAutoML docs).
roles = {
    'target': TARGET_NAME,
    'drop': ['Date']
}

In [31]:
# AutoML preset constrained by the TIMEOUT budget and N_THREADS CPUs;
# reader_params forwards the parallelism, fold count and seed to the data reader.
automl = TabularUtilizedAutoML(
    task = task,
    timeout = TIMEOUT,
    cpu_limit = N_THREADS,
    reader_params = {'n_jobs': N_THREADS, 'cv': N_FOLDS, 'random_state': RANDOM_STATE},
)

In [32]:
# Train on the combined table and keep the predictions returned for the
# training rows (out-of-fold, judging by the variable name — confirm).
out_of_fold_predictions = automl.fit_predict(full_train, roles = roles, verbose = 1)

[21:23:03] Start automl [1mutilizator[0m with listed constraints:


INFO:lightautoml.addons.utilization.utilization:Start automl [1mutilizator[0m with listed constraints:


[21:23:03] - time: 3600.00 seconds


INFO:lightautoml.addons.utilization.utilization:- time: 3600.00 seconds


[21:23:03] - CPU: 4 cores


INFO:lightautoml.addons.utilization.utilization:- CPU: 4 cores


[21:23:03] - memory: 16 GB



INFO:lightautoml.addons.utilization.utilization:- memory: 16 GB



[21:23:03] [1mIf one preset completes earlier, next preset configuration will be started[0m



INFO:lightautoml.addons.utilization.utilization:[1mIf one preset completes earlier, next preset configuration will be started[0m







[21:23:03] Start 0 automl preset configuration:


INFO:lightautoml.addons.utilization.utilization:Start 0 automl preset configuration:


[21:23:03] [1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}


INFO:lightautoml.addons.utilization.utilization:[1mconf_0_sel_type_0.yml[0m, random state: {'reader_params': {'random_state': 42}, 'nn_params': {'random_state': 42}, 'general_params': {'return_all_predictions': False}}
INFO3:lightautoml.addons.utilization.utilization:Found reader_params in kwargs, need to combine
INFO3:lightautoml.addons.utilization.utilization:Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 42}


[21:23:03] Stdout logging level is INFO.


INFO:lightautoml.automl.presets.base:Stdout logging level is INFO.


[21:23:03] Task: multiclass



INFO:lightautoml.automl.presets.base:Task: multiclass



[21:23:03] Start automl preset with listed constraints:


INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:


[21:23:03] - time: 3599.99 seconds


INFO:lightautoml.automl.presets.base:- time: 3599.99 seconds


[21:23:03] - CPU: 4 cores


INFO:lightautoml.automl.presets.base:- CPU: 4 cores


[21:23:03] - memory: 16 GB



INFO:lightautoml.automl.presets.base:- memory: 16 GB



[21:23:04] [1mTrain data shape: (6470, 331)[0m



INFO:lightautoml.reader.base:[1mTrain data shape: (6470, 331)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []


[21:23:31] Layer [1m1[0m train process start. Time left 3572.98 secs


INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 3572.98 secs


[21:23:35] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455

[21:26:53] Time limit exceeded after calculating fold 1



INFO:lightautoml.ml_algo.base:Time limit exceeded after calculating fold 1



[21:26:53] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.9999686556128986[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.9999686556128986[0m


[21:26:53] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed


[21:26:53] Time left 3370.85 secs



INFO:lightautoml.automl.base:Time left 3370.85 secs



[21:26:54] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 32, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.5, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 2, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 200, 'random_state': 42}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 200 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's multi_logloss: 0.405473	valid's Opt metric: 0.999796
DEBUG:lightautoml.ml_algo.boost_lgbm:[200]	valid's multi_logloss: 0.143938	valid's Opt metric: 0.999886
DEBUG:lightautoml.ml_algo.boost_lgbm:[300]	valid's multi_logloss: 0.074

[21:30:50] Time limit exceeded after calculating fold 0



INFO:lightautoml.ml_algo.base:Time limit exceeded after calculating fold 0



[21:30:50] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.9999674819262452[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.9999674819262452[0m


[21:30:50] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed


[21:30:50] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task_type': 'CPU', 'thread_count': 2, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.01, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 5, 'min_data_in_leaf': 1, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m =====
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.9420114	best: 0.9420114 (0)	total: 149ms	remaining: 7m 25s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.9989178	best: 0.9989191 (99)	total: 10.3s	remaining: 4m 55s
DEBUG:lightautoml.ml_algo.boost_cb:200:	test: 0.99968

[21:38:51] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.9998929318521557[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.9998929318521557[0m


[21:38:51] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed


[21:38:51] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 131.67 secs


INFO:lightautoml.ml_algo.tuning.optuna:Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 131.67 secs
INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-c9bf9d50-243b-4c84-9933-2346d6e366ad
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.9127349	best: 0.9127349 (0)	total: 72.1ms	remaining: 3m 36s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.9984723	best: 0.9984723 (100)	total: 5.48s	remaining: 2m 37s
DEBUG:lightautoml.ml_algo.boost_cb:200:	test: 0.9994257	best: 0.9994257 (200)	total: 11.5s	remaining: 2m 40s
DEBUG:lightautoml.ml_algo.boost_cb:300:	test: 0.9997145	best: 0.9997145 (300)	total: 18.3s	remaining: 2m 44s
DEBUG:lightautoml.ml_algo.boost_cb:400:	test: 0.9998243	best: 0.9998243 (400)	total: 23.1s	remaining: 2m 30s
DEBUG:lightautoml.ml_algo.boost_cb:500:	test: 0.9998594	best: 0.9998594 (499)	total: 30.4s	remaining: 2m 31s
DEBUG:lightautoml.ml_algo.boost_cb:600:	test: 0.9998814	best: 0.9998852 (597)	to

[21:41:20] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed


INFO:lightautoml.ml_algo.tuning.optuna:Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
INFO2:lightautoml.ml_algo.tuning.optuna:The set of hyperparameters [1m{'max_depth': 4, 'nan_mode': 'Max', 'l2_leaf_reg': 0.0024430162614261413, 'min_data_in_leaf': 4}[0m
 achieve 0.9999 auc_mu


[21:41:20] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task_type': 'CPU', 'thread_count': 2, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.0024430162614261413, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 4, 'min_data_in_leaf': 4, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Max', 'verbose': 100, 'allow_writing_files': False}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m =====
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.9127349	best: 0.9127349 (0)	total: 134ms	remaining: 6m 42s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.9984723	best: 0.9984723 (100)	total: 6.69s	remaining: 3m 12s
DEBUG:lightautoml.ml_al

[21:48:51] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.9998910678315942[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.9998910678315942[0m


[21:48:51] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed


[21:48:51] Time left 2052.31 secs



INFO:lightautoml.automl.base:Time left 2052.31 secs



[21:48:51] Time limit exceeded in one of the tasks. AutoML will blend level 1 models.



INFO:lightautoml.automl.base:Time limit exceeded in one of the tasks. AutoML will blend level 1 models.



[21:48:51] [1mLayer 1 training completed.[0m



INFO:lightautoml.automl.base:[1mLayer 1 training completed.[0m



[21:48:51] Blending: optimization starts with equal weights and score [1m0.9999214439186962[0m


INFO:lightautoml.automl.blend:Blending: optimization starts with equal weights and score [1m0.9999214439186962[0m


[21:48:54] Blending: iteration [1m0[0m: score = [1m0.9999643664645991[0m, weights = [1m[0.3497075  0.65029246 0.         0.        ][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m0[0m: score = [1m0.9999643664645991[0m, weights = [1m[0.3497075  0.65029246 0.         0.        ][0m


[21:48:58] Blending: iteration [1m1[0m: score = [1m0.9999683926479371[0m, weights = [1m[0.92367506 0.07632492 0.         0.        ][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m1[0m: score = [1m0.9999683926479371[0m, weights = [1m[0.92367506 0.07632492 0.         0.        ][0m


[21:49:02] Blending: iteration [1m2[0m: score = [1m0.9999683926479371[0m, weights = [1m[0.92367506 0.07632492 0.         0.        ][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m2[0m: score = [1m0.9999683926479371[0m, weights = [1m[0.92367506 0.07632492 0.         0.        ][0m


[21:49:02] Blending: no score update. Terminated



INFO:lightautoml.automl.blend:Blending: no score update. Terminated



[21:49:02] [1mAutoml preset training completed in 1558.41 seconds[0m



INFO:lightautoml.automl.presets.base:[1mAutoml preset training completed in 1558.41 seconds[0m



[21:49:02] Model description:
Final prediction for new objects (level 0) = 
	 0.92368 * (2 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.07632 * (1 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) 



INFO:lightautoml.automl.presets.base:Model description:
Final prediction for new objects (level 0) = 
	 0.92368 * (2 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.07632 * (1 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) 







[21:49:02] Start 1 automl preset configuration:


INFO:lightautoml.addons.utilization.utilization:Start 1 automl preset configuration:


[21:49:02] [1mconf_1_sel_type_1.yml[0m, random state: {'reader_params': {'random_state': 43}, 'nn_params': {'random_state': 43}, 'general_params': {'return_all_predictions': False}}


INFO:lightautoml.addons.utilization.utilization:[1mconf_1_sel_type_1.yml[0m, random state: {'reader_params': {'random_state': 43}, 'nn_params': {'random_state': 43}, 'general_params': {'return_all_predictions': False}}
INFO3:lightautoml.addons.utilization.utilization:Found reader_params in kwargs, need to combine
INFO3:lightautoml.addons.utilization.utilization:Merged variant for reader_params = {'n_jobs': 4, 'cv': 5, 'random_state': 43}


[21:49:02] Stdout logging level is INFO.


INFO:lightautoml.automl.presets.base:Stdout logging level is INFO.


[21:49:02] Task: multiclass



INFO:lightautoml.automl.presets.base:Task: multiclass



[21:49:02] Start automl preset with listed constraints:


INFO:lightautoml.automl.presets.base:Start automl preset with listed constraints:


[21:49:02] - time: 2041.46 seconds


INFO:lightautoml.automl.presets.base:- time: 2041.46 seconds


[21:49:02] - CPU: 4 cores


INFO:lightautoml.automl.presets.base:- CPU: 4 cores


[21:49:02] - memory: 16 GB



INFO:lightautoml.automl.presets.base:- memory: 16 GB



[21:49:02] [1mTrain data shape: (6470, 331)[0m



INFO:lightautoml.reader.base:[1mTrain data shape: (6470, 331)[0m

INFO3:lightautoml.reader.base:Feats was rejected during automatic roles guess: []


[21:49:26] Layer [1m1[0m train process start. Time left 2017.49 secs


INFO:lightautoml.automl.base:Layer [1m1[0m train process start. Time left 2017.49 secs


[21:49:28] Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'tol': 1e-06, 'max_iter': 100, 'cs': [1e-05, 5e-05, 0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5, 1, 5, 10, 50, 100, 500, 1000, 5000, 10000, 50000, 100000], 'early_stopping': 2, 'categorical_idx': [323, 324, 325, 326, 327, 328, 329, 330, 331, 332, 333, 334, 335, 336, 337, 338, 339, 340, 341, 342, 343, 344, 345, 346, 347, 348, 349, 350, 351, 352, 353, 354, 355, 356, 357, 358, 359, 360, 361, 362, 363, 364, 365, 366, 367, 368, 369, 370, 371, 372, 373, 374, 375, 376, 377, 378, 379, 380, 381, 382, 383, 384, 385, 386, 387, 388, 389, 390, 391, 392, 393, 394, 395, 396, 397, 398, 399, 400, 401, 402, 403, 404, 405, 406, 407, 408, 409, 410, 411, 412, 413, 414, 415, 416, 417, 418, 419, 420, 421, 422, 423, 424, 425, 426, 427, 428, 429, 430, 431, 432, 433, 434, 435, 436, 437, 438, 439, 440, 441, 442, 443, 444, 445, 446, 447, 448, 449, 450, 451, 452, 453, 454, 455

[21:51:14] Time limit exceeded after calculating fold 0



INFO:lightautoml.ml_algo.base:Time limit exceeded after calculating fold 0



[21:51:15] Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.999958992533401[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m finished. score = [1m0.999958992533401[0m


[21:51:15] [1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_0_Mod_0_LinearL2[0m fitting and predicting completed


[21:51:15] Time left 1908.88 secs



INFO:lightautoml.automl.base:Time left 1908.88 secs

INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 200 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's multi_logloss: 0.404184	valid's Opt metric: 0.999597
DEBUG:lightautoml.ml_algo.boost_lgbm:[200]	valid's multi_logloss: 0.147159	valid's Opt metric: 0.999792
DEBUG:lightautoml.ml_algo.boost_lgbm:[300]	valid's multi_logloss: 0.0814334	valid's Opt metric: 0.99985
DEBUG:lightautoml.ml_algo.boost_lgbm:[400]	valid's multi_logloss: 0.0590462	valid's Opt metric: 0.999875
DEBUG:lightautoml.ml_algo.boost_lgbm:[500]	valid's multi_logloss: 0.0496246	valid's Opt metric: 0.99989
DEBUG:lightautoml.ml_algo.boost_lgbm:[600]	valid's multi_logloss: 0.04453	valid's Opt metric: 0.999902
DEBUG:lightautoml.ml_algo.boost_lgbm:[700]	valid's multi_logloss: 0.0418296	valid's Opt metric: 0.999912
DEBUG:lightautoml.ml_algo.boost_lgbm:[800]	valid's multi_logloss: 0.0404346	valid's Opt metric: 0.999918
DEBUG:lightaut

[21:55:44] [1mSelector_LightGBM[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mSelector_LightGBM[0m fitting and predicting completed


[21:55:45] Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task': 'train', 'learning_rate': 0.01, 'num_leaves': 32, 'feature_fraction': 0.7, 'bagging_fraction': 0.7, 'bagging_freq': 1, 'max_depth': -1, 'verbosity': -1, 'reg_alpha': 0.5, 'reg_lambda': 0.0, 'min_split_gain': 0.0, 'zero_as_missing': False, 'num_threads': 2, 'max_bin': 255, 'min_data_in_bin': 3, 'num_trees': 3000, 'early_stopping_rounds': 200, 'random_state': 42}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m =====
INFO3:lightautoml.ml_algo.boost_lgbm:Training until validation scores don't improve for 200 rounds
DEBUG:lightautoml.ml_algo.boost_lgbm:[100]	valid's multi_logloss: 0.40865	valid's Opt metric: 0.999713
DEBUG:lightautoml.ml_algo.boost_lgbm:[200]	valid's multi_logloss: 0.150874	valid's Opt metric: 0.999832
DEBUG:lightautoml.ml_algo.boost_lgbm:[300]	valid's multi_logloss: 0.0830

[21:59:12] Time limit exceeded after calculating fold 0



INFO:lightautoml.ml_algo.base:Time limit exceeded after calculating fold 0



[21:59:12] Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.9999376379271946[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m finished. score = [1m0.9999376379271946[0m


[21:59:12] [1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_1_Mod_0_LightGBM[0m fitting and predicting completed


[21:59:12] Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task_type': 'CPU', 'thread_count': 2, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.01, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 5, 'min_data_in_leaf': 1, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Min', 'verbose': 100, 'allow_writing_files': False}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m =====
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.9288391	best: 0.9288391 (0)	total: 107ms	remaining: 5m 19s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.9991463	best: 0.9991463 (100)	total: 10.2s	remaining: 4m 53s
DEBUG:lightautoml.ml_algo.boost_cb:200:	test: 0.9996

[22:03:54] Time limit exceeded after calculating fold 2



INFO:lightautoml.ml_algo.base:Time limit exceeded after calculating fold 2



[22:03:54] Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.999903471376689[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m finished. score = [1m0.999903471376689[0m


[22:03:54] [1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_1_Mod_2_CatBoost[0m fitting and predicting completed


[22:03:54] Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 1.00 secs


INFO:lightautoml.ml_algo.tuning.optuna:Start hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ... Time budget is 1.00 secs
INFO:optuna.storages._in_memory:A new study created in memory with name: no-name-6daab39e-5567-4646-877d-8b72d052e3da
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.8978760	best: 0.8978760 (0)	total: 140ms	remaining: 6m 59s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.9987609	best: 0.9987609 (100)	total: 6.51s	remaining: 3m 6s
DEBUG:lightautoml.ml_algo.boost_cb:200:	test: 0.9994449	best: 0.9994643 (197)	total: 11.7s	remaining: 2m 42s
DEBUG:lightautoml.ml_algo.boost_cb:300:	test: 0.9996842	best: 0.9996903 (297)	total: 19.2s	remaining: 2m 52s
DEBUG:lightautoml.ml_algo.boost_cb:400:	test: 0.9998058	best: 0.9998058 (400)	total: 24.1s	remaining: 2m 36s
DEBUG:lightautoml.ml_algo.boost_cb:500:	test: 0.9998563	best: 0.9998584 (497)	total: 30.3s	remaining: 2m 31s
DEBUG:lightautoml.ml_algo.boost_cb:600:	test: 0.9998769	best: 0.9998774 (594)	total:

[22:04:51] Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed


INFO:lightautoml.ml_algo.tuning.optuna:Hyperparameters optimization for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m completed
INFO2:lightautoml.ml_algo.tuning.optuna:The set of hyperparameters [1m{'max_depth': 4, 'nan_mode': 'Max', 'l2_leaf_reg': 0.0024430162614261413, 'min_data_in_leaf': 4}[0m
 achieve 0.9999 auc_mu


[22:04:51] Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...


INFO:lightautoml.ml_algo.base:Start fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m ...
DEBUG:lightautoml.ml_algo.base:Training params: {'task_type': 'CPU', 'thread_count': 2, 'random_seed': 42, 'num_trees': 3000, 'learning_rate': 0.03, 'l2_leaf_reg': 0.0024430162614261413, 'bootstrap_type': 'Bernoulli', 'grow_policy': 'SymmetricTree', 'max_depth': 4, 'min_data_in_leaf': 4, 'one_hot_max_size': 10, 'fold_permutation_block': 1, 'boosting_type': 'Plain', 'boost_from_average': True, 'od_type': 'Iter', 'od_wait': 100, 'max_bin': 32, 'feature_border_type': 'GreedyLogSum', 'nan_mode': 'Max', 'verbose': 100, 'allow_writing_files': False}
INFO2:lightautoml.ml_algo.base:===== Start working with [1mfold 0[0m for [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m =====
INFO3:lightautoml.ml_algo.boost_cb:0:	test: 0.8978760	best: 0.8978760 (0)	total: 73ms	remaining: 3m 38s
DEBUG:lightautoml.ml_algo.boost_cb:100:	test: 0.9987609	best: 0.9987609 (100)	total: 7.95s	remaining: 3m 48s
DEBUG:lightautoml.ml_alg

[22:10:27] Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.999873872636593[0m


INFO:lightautoml.ml_algo.base:Fitting [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m finished. score = [1m0.999873872636593[0m


[22:10:27] [1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed


INFO:lightautoml.ml_algo.base:[1mLvl_0_Pipe_1_Mod_3_Tuned_CatBoost[0m fitting and predicting completed


[22:10:27] Time left 756.83 secs



INFO:lightautoml.automl.base:Time left 756.83 secs



[22:10:27] Time limit exceeded in one of the tasks. AutoML will blend level 1 models.



INFO:lightautoml.automl.base:Time limit exceeded in one of the tasks. AutoML will blend level 1 models.



[22:10:27] [1mLayer 1 training completed.[0m



INFO:lightautoml.automl.base:[1mLayer 1 training completed.[0m



[22:10:27] Blending: optimization starts with equal weights and score [1m0.9998976090442869[0m


INFO:lightautoml.automl.blend:Blending: optimization starts with equal weights and score [1m0.9998976090442869[0m


[22:10:30] Blending: iteration [1m0[0m: score = [1m0.9999661329131817[0m, weights = [1m[0.41783333 0.5821666  0.         0.        ][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m0[0m: score = [1m0.9999661329131817[0m, weights = [1m[0.41783333 0.5821666  0.         0.        ][0m


[22:10:33] Blending: iteration [1m1[0m: score = [1m0.9999693315333398[0m, weights = [1m[0.48410362 0.5158964  0.         0.        ][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m1[0m: score = [1m0.9999693315333398[0m, weights = [1m[0.48410362 0.5158964  0.         0.        ][0m


[22:10:36] Blending: iteration [1m2[0m: score = [1m0.9999693315333398[0m, weights = [1m[0.48410362 0.5158964  0.         0.        ][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m2[0m: score = [1m0.9999693315333398[0m, weights = [1m[0.48410362 0.5158964  0.         0.        ][0m


[22:10:36] Blending: no score update. Terminated



INFO:lightautoml.automl.blend:Blending: no score update. Terminated



[22:10:36] [1mAutoml preset training completed in 1294.19 seconds[0m



INFO:lightautoml.automl.presets.base:[1mAutoml preset training completed in 1294.19 seconds[0m



[22:10:36] Model description:
Final prediction for new objects (level 0) = 
	 0.48410 * (1 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.51590 * (1 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) 



INFO:lightautoml.automl.presets.base:Model description:
Final prediction for new objects (level 0) = 
	 0.48410 * (1 averaged models Lvl_0_Pipe_0_Mod_0_LinearL2) +
	 0.51590 * (1 averaged models Lvl_0_Pipe_1_Mod_0_LightGBM) 







[22:10:36] Blending: optimization starts with equal weights and score [1m0.999968955925877[0m


INFO:lightautoml.automl.blend:Blending: optimization starts with equal weights and score [1m0.999968955925877[0m


[22:10:38] Blending: iteration [1m0[0m: score = [1m0.9999708472263775[0m, weights = [1m[0.76295125 0.23704872][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m0[0m: score = [1m0.9999708472263775[0m, weights = [1m[0.76295125 0.23704872][0m


[22:10:39] Blending: iteration [1m1[0m: score = [1m0.9999708472263775[0m, weights = [1m[0.76295125 0.23704872][0m


INFO:lightautoml.automl.blend:Blending: iteration [1m1[0m: score = [1m0.9999708472263775[0m, weights = [1m[0.76295125 0.23704872][0m


[22:10:39] Blending: no score update. Terminated



INFO:lightautoml.automl.blend:Blending: no score update. Terminated



In [35]:
# Run inference with the `automl` object on `test` (the frame whose 'TARGET'
# column serves as ground truth in the metric cells further down).
# The returned object's `.data` is what the decoding cell passes to argmax.
test_predictions = automl.predict(test)

In [48]:
# Fetch the label -> column-index mapping from the reader found at the first
# entry of every nesting level of the model, then invert it so that a column
# index resolves back to its label.
class_labels = automl.outer_pipes[0].ml_algos[0].models[0][0].reader.class_mapping
label_class = dict(zip(class_labels.values(), class_labels.keys()))

# Convert the per-class scores into label values: pick the best-scoring column
# in each row and look that index up in the inverted mapping.
predicted_indices = np.argmax(test_predictions.data, axis=1)
predicted_classes = list(map(label_class.__getitem__, predicted_indices))

In [51]:
from sklearn.metrics import precision_score

In [None]:
# Weighted-average precision of the decoded predictions against the true
# 'TARGET' labels; left as the last expression so the cell displays it.
precision_score(
    y_true=test['TARGET'].values,
    y_pred=predicted_classes,
    average='weighted',
)

In [None]:
# Share of rows whose decoded prediction equals the true 'TARGET' label.
# accuracy_score (already imported in the notebook's import cell) checks that
# both inputs have the same length; the previous `ndarray == list` comparison
# does not compare element-wise when the lengths differ, so averaging it would
# not yield an accuracy.
accuracy = accuracy_score(test['TARGET'].values, predicted_classes)
accuracy  # bare last expression so the cell actually displays the metric

# Сохранение модели

In [34]:
# Serialize the `automl` object to 'automl_model.pkl' (relative path, so the
# file lands in the current working directory).
# `pickle` is imported here because the notebook's import cell does not list
# it; without this the cell only works when the name is left over in the kernel.
import pickle

with open('automl_model.pkl', 'wb') as f:
    pickle.dump(automl, f)

In [None]:
# Restore the object stored in 'automl_model.pkl' into `loaded_automl`.
# SECURITY: pickle.load can execute arbitrary code embedded in the file, so
# only load a pickle that this notebook produced itself - never an untrusted
# or downloaded file.
# `pickle` is imported here because the notebook's import cell does not list it.
import pickle

with open('automl_model.pkl', 'rb') as f:
    loaded_automl = pickle.load(f)

In [None]:
# Run inference with the model restored from disk on `test_final`.
# NOTE(review): `test_final` is not defined in the visible part of the
# notebook - confirm it exists before this cell runs.
# This rebinds `test_predictions`, replacing the predictions that were made
# earlier for `test` with the in-memory `automl` object.
test_predictions = loaded_automl.predict(test_final)

In [None]:
# Fetch the label -> column-index mapping from the restored model's reader.
# The cell that decodes predictions of the in-memory `automl` object reaches
# the reader through outer_pipes[0].ml_algos[0].models[0][0]; the restored
# object is the same pickled instance, so use `.reader` only when it exists
# and fall back to that nested path otherwise.
class_labels = (
    loaded_automl.reader
    if hasattr(loaded_automl, 'reader')
    else loaded_automl.outer_pipes[0].ml_algos[0].models[0][0].reader
).class_mapping
# Invert the mapping so a column index resolves back to its label.
label_class = {ind: class_ for class_, ind in class_labels.items()}

# Convert the per-class scores into label values: best-scoring column per row,
# then look that index up in the inverted mapping.
predicted_indices = np.argmax(test_predictions.data, axis=1)
predicted_classes = [label_class[idx] for idx in predicted_indices]