In [None]:
import joblib
import os
import pickle
import random
import string
import warnings

import numpy as np
import pandas as pd

# рисовалки
import altair as alt
import matplotlib.pyplot as plt
import seaborn as sns
from colorama import Fore, Style

# sklearn
from sklearn.compose import ColumnTransformer
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_score
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.pipeline import Pipeline

# бустинги
from catboost import CatBoostClassifier
from lightgbm import LGBMClassifier
from xgboost import XGBClassifier

# torch
import torch
import torch.nn as nn
import torch.optim as optim
from torch.nn import Dropout
from torch.utils.data import TensorDataset, DataLoader

from transformers import AutoTokenizer, AutoModel, BertForPreTraining

from tqdm import tqdm, notebook
notebook.tqdm.pandas()

# монтируем Google Drive
from google.colab import drive
drive.mount('/content/drive')

RANDOM_STATE=42
warnings.filterwarnings('ignore')

In [None]:
def get_barplot(df):
    """Draw a bar chart of observation counts per label and print a dataset summary.

    The frame must expose a ``label`` column, an ``id`` column and a custom
    ``name`` attribute (used in the plot title and in the printed header).
    Besides the chart, the function prints shape / missing values / duplicate
    counts and displays ``describe()``, the first and the last three rows.
    """

    label_counts = df['label'].value_counts().sort_index()

    # Bar chart: one bar per label, ordered by label value.
    _, ax = plt.subplots(figsize=(12, 5))
    sns.barplot(x=label_counts.index, y=label_counts.values,
                color='#64EDC1', ec='#0B6145', alpha=0.9, ax=ax)
    ax.set_title(f'распределение категорий в {df.name}', fontsize=16, fontweight='bold')
    ax.set_xlabel('категории')
    ax.set_ylabel('Количество')
    ax.tick_params(axis='x', labelsize=8.5, labelrotation=0)
    ax.minorticks_on()
    ax.grid(which='major', linewidth=.5)
    ax.grid(which='minor', linewidth=.25, linestyle='--')

    # Text summary, built piece by piece and printed space-separated.
    separator = '-----' * 20
    header = f'\n{Fore.RED}{Style.BRIGHT}ДАТАСЕТ {df.name}{Style.RESET_ALL}\n'
    stats = f'размер: {df.shape}\n пропусков:{df.isna().sum().sum()}\n явных дубликатов: {df.duplicated().sum()}\n'
    id_info = f'значений в id: {df.id.nunique()}, они идут по индексу равномерно? {df.index.is_monotonic_increasing}'
    print(separator, header, stats, id_info)

    display(df.describe().T, df.head(3), df.tail(3))

# get_barplot(train)
# get_barplot(test)

In [None]:
def remove_column_and_save(df, column_name, dir_path='/'):
    """Drop a column from the frame and save both the column and the remainder to CSV.

    Args:
        df: DataFrame carrying a custom ``name`` attribute (used in file names).
        column_name: name of the column to split off.
        dir_path: target directory; a trailing separator is optional.

    Returns:
        A new DataFrame without ``column_name``; its ``name`` attribute is
        copied from ``df``.

    Side effects:
        Writes ``column_<name>_<column>.csv`` (the removed column only) and
        ``df_<name>.csv`` (the remaining columns) into ``dir_path``, both
        without the index.
    """

    # os.path.join keeps the result correct whether or not dir_path ends with a
    # separator (plain concatenation glued the directory name to the file name).
    col_path = os.path.join(dir_path, f'column_{df.name}_{column_name}.csv')
    df_path = os.path.join(dir_path, f'df_{df.name}.csv')

    df[[column_name]].to_csv(col_path, index=False)
    new_df = df.drop(column_name, axis=1)
    new_df.name = df.name  # drop() does not carry custom attributes over
    new_df.to_csv(df_path, index=False)

    print(f'столбец {column_name} сохранён в {col_path}',
          f'датафрейм без {column_name} сохранён в {df_path}')
    return new_df


def restore_column_from_file(df, column_name, file_path):
    """Re-insert a previously saved column as the first column of ``df``.

    Args:
        df: DataFrame to modify in place.
        column_name: name of the column stored in the CSV file.
        file_path: CSV file containing ``column_name`` (for example one written
            by ``remove_column_and_save``, which stores the column without an index).

    Returns:
        The same ``df`` object with the column inserted at position 0.

    Raises:
        ValueError: if the file has a different number of rows than ``df``
            or ``column_name`` is already present in ``df``.
    """

    # Read the column by name. Using index_col=0 here would turn a
    # single-column file into an index-only frame and make the lookup fail.
    restored = pd.read_csv(file_path, usecols=[column_name])[column_name]
    # The file stores no index, so rows are matched by position.
    df.insert(loc=0, column=column_name, value=restored.to_numpy())
    return df

# folder_name = 'new_data/'
# train = remove_column_and_save(train, 'id', folder_name)
# test = remove_column_and_save(test, 'id', folder_name)

In [None]:
def csv_splitter(file_name: str, chunk_size: int, header=True, get_names=False, encoding='utf-8'):
    """
    Split one CSV file into several smaller files of at most ``chunk_size`` rows.

    Args:
        file_name: path of the source file WITHOUT the ``.csv`` extension.
        chunk_size: maximum number of data rows per output file (must be > 0).
        header: whether to write the column header into every output file.
        get_names: if not ``False``, return the list of created file names.
        encoding: encoding used to read the source file.

    Returns:
        List of created file names when ``get_names`` is not ``False``,
        otherwise ``None``.

    Raises:
        ValueError: if ``chunk_size`` is not positive.

    Output files are named ``<file_name>_small_<i>.csv``.
    """

    if chunk_size <= 0:
        raise ValueError('chunk_size must be a positive integer')

    df = pd.read_csv(f'{file_name}.csv', encoding=encoding)

    # Ceiling division: no trailing empty file when the row count is an exact
    # multiple of chunk_size. An empty source still yields a single file.
    num_chunks = max(1, -(-len(df) // chunk_size))

    name_files_list = []
    for i in range(num_chunks):
        start_index = i * chunk_size
        chunk = df.iloc[start_index:start_index + chunk_size]
        output_file = f'{file_name}_small_{i}.csv'
        chunk.to_csv(output_file, index=False, header=header)
        name_files_list.append(output_file)
    del df

    print(f'успешный успех! Файл "{file_name}" разделён на {len(name_files_list)} частей')
    if get_names is not False:
        return name_files_list

# train_files_list = csv_splitter('new_data/labels/df_train', 5000, get_names=True)
# print(f'TRAIN names:\n{train_files_list}')

## sbert_large_nlu_ru - https://huggingface.co/ai-forever/sbert_large_nlu_ru 

вот код для получения эмбеддингов:

In [None]:
# Load the SBERT tokenizer and model from the Hugging Face hub and move the
# model to the GPU when one is available (CPU otherwise).
tokenizer = AutoTokenizer.from_pretrained("ai-forever/sbert_large_nlu_ru")
model = AutoModel.from_pretrained("ai-forever/sbert_large_nlu_ru")
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def mean_pooling(model_output, attention_mask):
    """Average token embeddings over the sequence axis, ignoring masked positions.

    ``model_output[0]`` is taken as the token embeddings; ``attention_mask`` is
    broadcast to the same size so masked tokens contribute zero to the sum.
    The divisor is clamped to 1e-9 to avoid division by zero for fully masked rows.
    """
    token_embeddings = model_output[0]
    mask = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
    masked_sum = (token_embeddings * mask).sum(dim=1)
    token_counts = mask.sum(dim=1).clamp(min=1e-9)
    return masked_sum / token_counts

def get_embeddings(sentences):
    """Return mean-pooled embeddings for ``sentences`` as nested Python lists.

    Uses the module-level ``tokenizer``, ``model`` and ``device``. Inputs are
    padded and truncated to at most 24 tokens; inference runs without gradients.
    """
    batch = tokenizer(sentences, padding=True, truncation=True, max_length=24, return_tensors='pt')
    batch = batch.to(device)
    with torch.no_grad():
        output = model(**batch)
    pooled = mean_pooling(output, batch['attention_mask'])
    return pooled.to("cpu").tolist()

Ниже приведены два варианта кода, которые, по сути, делают одно и то же. Разница только в том, что первый вариант собирает эмбеддинги в цикле из 22 файлов, на которые был порезан файл train, а второй получает эмбеддинги из единого файла. Так пришлось сделать, потому что в Google Colab такой большой файл не загружался, а на Kaggle он загружается без проблем.

**код для Google Colab:**

In [None]:
# Base folder on the mounted Google Drive (note the trailing slash).
path = '/content/drive/MyDrive/Colab Notebooks/'

# `path` already ends with '/', so this f-string produces a double slash in the file path.
test = pd.read_csv(f'{path}/test.csv')
test.head(3)

In [None]:
# TRAIN

# Base folder on the mounted Google Drive; the chunked train files are read from its train_small/ subfolder.
path = '/content/drive/MyDrive/Colab Notebooks/'

def process_data(df):
    """Embed every value of ``df['text']`` with ``get_embeddings`` and stack the results.

    Each call to ``get_embeddings`` is made with a single text; the per-text
    results are stacked into an array and axis 1 (size 1) is squeezed out.
    """
    per_text = [get_embeddings(text) for text in df['text']]
    return np.squeeze(np.array(per_text), axis=1)

# Paths of the chunked train CSV files
file_paths = [f'{path}train_small/df_train_small_{i}.csv' for i in range(22)]

embeddings_list = []  # per-file embedding arrays, kept in file order

for file_path in notebook.tqdm(file_paths):
    data = pd.read_csv(file_path)
    part_embeddings = process_data(data)  # embed the current chunk
    embeddings_list.append(part_embeddings)
    del data

# Stack all chunks row-wise. np.concatenate takes the embedding width from the
# parts themselves, so no dimension has to be hard-coded here.
embeddings = np.concatenate(embeddings_list, axis=0)

In [None]:
# TEST

# NOTE(review): this reads test.csv from the train_small/ subfolder, while the
# earlier cell reads it from the base folder — confirm which location is correct.
file_paths = [f'{path}train_small/test.csv']
embeddings_list_test = []

for file_path in notebook.tqdm(file_paths):
    data = pd.read_csv(file_path)
    part_embeddings = process_data(data)
    embeddings_list_test.append(part_embeddings)
    del data

# Stack the parts row-wise; the embedding width is taken from the parts themselves.
embeddings_test = np.concatenate(embeddings_list_test, axis=0)

In [None]:
# Output folder for the serialized embeddings; created if it does not exist yet.
folder_name = f'{path}all_embeddings'
os.makedirs(folder_name, exist_ok=True)

# Persist the train and test embedding arrays with joblib.
joblib.dump(embeddings, os.path.join(folder_name, 'SBERT_embeddings_list_train'))
joblib.dump(embeddings_test, os.path.join(folder_name, 'SBERT_embeddings_list_test'))

**код для Kaggle (для целого файла):**

In [None]:
# Load the full train and test tables from the Kaggle input folder and show their shapes.
df_train = pd.read_csv('../input/all-data/df_train.csv')
df_test = pd.read_csv('../input/all-data/df_test.csv')
df_train.shape, df_test.shape
# recorded output: ((105950, 2), (1952, 2))

In [None]:
# TRAIN
y_train = df_train['label']
# Keep the embeddings in a separate variable instead of overwriting the 'text'
# column: the cell stays re-runnable and the raw texts remain available.
train_embeddings = df_train['text'].progress_apply(get_embeddings)
X_train = np.array(train_embeddings.tolist())
# Flatten each sample's embedding to one row.
X_train = X_train.reshape(X_train.shape[0], -1)

In [None]:
# TEST
y_test = df_test['label']
# Keep the embeddings in a separate variable instead of overwriting the 'text'
# column, so later cells can still embed the raw test texts with another model.
test_embeddings = df_test['text'].progress_apply(get_embeddings)
X_test = np.array(test_embeddings.tolist())
# Flatten each sample's embedding to one row.
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
# Save the embeddings and the labels to pickle files in the working directory
sbert_artifacts = (
    ('SBERT_embeddings_list_train_2.pickle', X_train),
    ('y_train.pickle', y_train),
    ('SBERT_embeddings_list_test_2.pickle', X_test),
    ('y_test.pickle', y_test),
)
for pickle_name, payload in sbert_artifacts:
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f)

## rubert-tiny2 - https://huggingface.co/cointegrated/rubert-tiny2

вот код для получения эмбеддингов:

In [None]:
# Load the rubert-tiny2 tokenizer and model; this rebinds the module-level
# `tokenizer` and `model` names.
tokenizer = AutoTokenizer.from_pretrained("cointegrated/rubert-tiny2")
model = AutoModel.from_pretrained("cointegrated/rubert-tiny2")

# Run on the GPU when available, otherwise on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

def embed_bert_cls(text, model=model, device=device, tokenizer=tokenizer):
    """Return the normalized first-token embedding of ``text`` as a numpy vector.

    ``model``, ``device`` and ``tokenizer`` default to the objects that exist
    at definition time. The hidden state at position 0 is normalized with
    ``torch.nn.functional.normalize`` and the first row of the batch is returned.
    """
    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')
    on_device = {key: tensor.to(device) for key, tensor in encoded.items()}
    with torch.no_grad():
        output = model(**on_device)
    cls_vectors = torch.nn.functional.normalize(output.last_hidden_state[:, 0, :])
    # Optional zero-padding of the vector up to a fixed width (up to 2048):
    # cls_vectors = torch.nn.functional.pad(cls_vectors, (0, 2048 - cls_vectors.shape[1]))
    return cls_vectors[0].cpu().numpy()

def filling_embeddings(df, list_):
    """Append the embedding of every ``df['text']`` value to ``list_`` (in place).

    Args:
        df: DataFrame with a ``text`` column.
        list_: list that receives one ``embed_bert_cls`` result per row.
    """
    for text in notebook.tqdm(df['text']):
        list_.append(embed_bert_cls(text))
    # Release cached CUDA memory once per call; doing it after every single
    # text only slows the loop down without changing the results.
    torch.cuda.empty_cache()

In [None]:
# Accumulators for the train and test embeddings (one entry per text).
embeddings_list, test_embeddings_list = [], []

# Embed the 22 train chunks one file at a time; results are appended to embeddings_list.
for i in notebook.tqdm(range(22)):
    data = pd.read_csv(f'../input/df_train_small_{i}.csv')
    filling_embeddings(data, embeddings_list)
    del data

# Embed the test file.
test_data = pd.read_csv(f'../input/test.csv')
filling_embeddings(test_data, test_embeddings_list)

In [None]:
# Output folder for the serialized embeddings; created if it does not exist yet.
folder_name = 'all_embeddings'
os.makedirs(folder_name, exist_ok=True)

# Persist the train and test embedding lists with joblib.
joblib.dump(embeddings_list, os.path.join(folder_name, 'embeddings_list_train_312'))
joblib.dump(test_embeddings_list, os.path.join(folder_name, 'embeddings_list_test_312'))

## rubert-base-cased - https://huggingface.co/DeepPavlov/rubert-base-cased

вот код для получения эмбеддингов:

In [None]:
# Load rubert-base-cased. The Auto* classes are the ones this notebook imports
# (BertTokenizer / BertModel are not imported anywhere in it).
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'DeepPavlov/rubert-base-cased'
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModel.from_pretrained(model_name).to(device)

def get_embeddings(text):
    """Return the first-token embedding of ``text`` from the module-level model.

    Args:
        text: a single input string.

    Returns:
        Tensor holding the first-position hidden state for the single input
        row (leading batch axis of size 1), left on ``device``; callers move
        it to the CPU themselves.
    """
    max_length = 512
    # Let the tokenizer truncate: the special tokens stay in place (slicing the
    # id list could cut off the final one) and an attention mask is produced.
    # A single sequence needs no padding, so no pad tokens reach the model.
    encoded = tokenizer(text, truncation=True, max_length=max_length, return_tensors='pt')
    encoded = {key: tensor.to(device) for key, tensor in encoded.items()}
    with torch.no_grad():
        outputs = model(**encoded)
    # outputs[0] holds the per-token hidden states; position 0 is the first token.
    embeddings = outputs[0][:, 0, :]
    return embeddings

In [None]:
# Paths of the 22 train chunks and an accumulator for per-file embedding arrays.
file_paths = [f'../input/df_train_small_{i}.csv' for i in range(22)]
embeddings_list = []

for file_path in notebook.tqdm(file_paths):
    df = pd.read_csv(file_path)
    # Replace each text with the tensor returned by get_embeddings ...
    df['text'] = df['text'].progress_apply(lambda x: get_embeddings(x))
    # ... then move every tensor to the CPU and stack them into one numpy array.
    embeddings = np.array(df['text'].apply(lambda x: x.cpu().numpy()).tolist())
    embeddings_list.append(embeddings)
    del df

# Join all chunks row-wise and flatten each sample's embedding to one row.
X_train = np.concatenate(embeddings_list, axis=0)
X_train = X_train.reshape(X_train.shape[0], -1)

In [None]:
y_test = df_test['label']
# Keep the embeddings in a separate variable instead of overwriting the 'text'
# column, so the cell stays re-runnable and df_test keeps its raw texts.
test_embeddings = df_test['text'].progress_apply(get_embeddings)
# Move every tensor to the CPU and stack them into one numpy array.
X_test = np.array(test_embeddings.apply(lambda x: x.cpu().numpy()).tolist())
# Flatten each sample's embedding to one row.
X_test = X_test.reshape(X_test.shape[0], -1)

In [None]:
# Save the embeddings and the labels to pickle files in the working directory
rubert_artifacts = (
    ('RUBERT_BASE_CASED_embeddings_list_train.pickle', X_train),
    ('y_train.pickle', y_train),
    ('RUBERT_BASE_CASED_embeddings_list_test.pickle', X_test),
    ('y_test.pickle', y_test),
)
for pickle_name, payload in rubert_artifacts:
    with open(pickle_name, 'wb') as f:
        pickle.dump(payload, f)

In [None]:
# Load the saved rubert-base embeddings and the label columns
path = '../input/embeddings/embeddings/'
path__ = '../input/rubert-base/'

# NOTE(review): these file names match the ones written with pickle.dump in the
# saving cell, but they are read with joblib.load — confirm the two formats are compatible.
X_train_r = joblib.load(os.path.join(path__, 'RUBERT_BASE_CASED_embeddings_list_train.pickle'))
X_test_r = joblib.load(os.path.join(path__, 'RUBERT_BASE_CASED_embeddings_list_test.pickle'))
y_train = pd.read_csv(f'{path}column_train_label.csv')['label']
y_test = pd.read_csv(f'{path}column_test_label.csv')['label']
# Turn each loaded container into a list with one numpy array per element.
X_train_r = [np.array(i) for i in X_train_r]
X_test_r = [np.array(i) for i in X_test_r]

# модели - обучение, тестирование, алгоритмы на GPU

класс, в котором формируется предсказание при помощи табличных алгоритмов:

In [None]:
class GetFinalTable:
    """Collects model results in a single class-level summary table."""

    # Shared across all callers: one row per recorded result.
    pivot_table = pd.DataFrame(columns=['модель', 'Precision на train при cv', 'Precision на test'])

    @staticmethod
    def add_result(model, precision_train='', precision_test=''):
        """Append one row (model name, train precision, test precision) to the table."""
        table = GetFinalTable.pivot_table
        next_row = len(table)
        table.loc[next_row] = [model, precision_train, precision_test]


class ModelAnalyzer:
    """Builds predictions, computes Precision and records it in GetFinalTable."""

    @staticmethod
    def get_metrics(name, model, y='', y_pred='', best_score_='', average='micro', test_df=False):
        """Compute Precision and append a row to the summary table.

        Args:
            name: label written to the model column of the table.
            model: not used by this method (kept for interface compatibility).
            y, y_pred: true and predicted labels passed to ``precision_score``.
            best_score_: cross-validation score recorded for train runs.
            average: averaging mode for ``precision_score``.
            test_df: if not ``False``, the computed precision is recorded as
                the test score; otherwise ``best_score_`` is recorded as the
                train score.

        Returns:
            The shared ``GetFinalTable.pivot_table``.
        """

        precision = precision_score(y, y_pred, average=average)

        if test_df is not False:
            GetFinalTable.add_result(name, '', precision)
        else:
            GetFinalTable.add_result(name, best_score_, '')

        return GetFinalTable.pivot_table

    def choosing_best_model(self, name, model, X, y, param_grid, cv=3, n_iter=250, rs=RANDOM_STATE,
                            X_test=False, y_test=False):
        """
        Tune a model with RandomizedSearchCV, or score an already fitted model.

        Test mode (``X_test`` is not ``False``): ``model`` must already be
        fitted; predicts on ``X_test``, records the test precision and returns
        the predictions. ``X``, ``y`` and ``param_grid`` are ignored.

        Train mode: wraps ``model`` in a one-step pipeline, searches
        ``param_grid`` (keys prefixed with ``classifier__``), fits the pipeline
        once on all of ``X``/``y`` with the best parameters, records the best
        cross-validation score and returns ``(pipeline, y_pred)``, where
        ``y_pred`` are predictions on ``X``.
        """

        if X_test is not False:
            y_pred_Xtest = model.predict(X_test)
            self.get_metrics(name, model, y=y_test, y_pred=y_pred_Xtest, test_df=True)
            return y_pred_Xtest

        pipeline = Pipeline([('classifier', model)])
        # refit=False: the search only selects parameters. The final model is
        # fitted exactly once below, instead of once inside the search and
        # again by hand.
        grid = RandomizedSearchCV(pipeline, param_distributions=param_grid, cv=cv,
                                  scoring='precision_micro',
                                  refit=False, n_iter=n_iter, random_state=rs, n_jobs=-1)
        search = grid.fit(X, y)
        print('---' * 10, f'\nподобранные параметры:\n{search.best_params_}\n', '---' * 10)

        # Fit the caller's own pipeline/model object with the best parameters.
        pipeline.set_params(**search.best_params_)
        pipeline.fit(X, y)
        y_pred = pipeline.predict(X)
        self.get_metrics(name, model, y=y, y_pred=y_pred, best_score_=search.best_score_)

        return pipeline, y_pred

In [None]:
# for convenience, keep the summary-table column names in variables:
del_str_train = 'Precision на train при cv'
del_str_test = 'Precision на test'

# the ModelAnalyzer instance used by the model cells below:
analyzer = ModelAnalyzer()

In [None]:
def reduce_size(X, y, test_size=0.5, random_state=RANDOM_STATE):
    """Keep a stratified subset of the samples, preserving the class distribution.

    Splits ``X``/``y`` with ``train_test_split`` (stratified by ``y``) and
    returns only the train part; ``test_size`` is the share that is dropped.
    Row and class counts are printed before and after.
    """

    print(f'Было строк: {len(X)}\nБыло классов: {np.unique(y).size}')
    parts = train_test_split(X, y, test_size=test_size, stratify=y, random_state=random_state)
    X_kept, y_kept = parts[0], parts[2]
    print(f'Стало строк: {len(X_kept)}\nСтало классов: {np.unique(y_kept).size}')
    return X_kept, y_kept

# NOTE(review): X_train_emb is not assigned by any earlier cell of this notebook
# (it is loaded further down) — confirm the intended execution order.
# The cell overwrites its own inputs, so every re-run halves the data again.
X_train_emb, y_train = reduce_size(X_train_emb, y_train)

# Было строк: 105950
# Было классов: 47
# Стало строк: 52975
# Стало классов: 47

### CatBoostClassifier

In [None]:
%%time
# Storage for (pipeline, train predictions, summary row) keyed by model name.
pipeline_dict = dict()
model = CatBoostClassifier(random_seed=RANDOM_STATE, verbose=False, loss_function='MultiClass', task_type='GPU')
name_model = 'BERT312_CatBoostClassifier'
# Search space; the 'classifier__' prefix targets the pipeline step created in choosing_best_model.
params = {
    'classifier__bagging_temperature': range(1, 9),
    'classifier__thread_count': range(1, 6),
    'classifier__iterations': range(501, 1501),
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.5],
    'classifier__depth': range(1, 13),
    'classifier__l2_leaf_reg': [0.1, 1, 3, 10, 20],
    'classifier__border_count': [32, 64, 128, 256]
}
# n_iter=1 with cv=2: a single random parameter combination is evaluated on two folds.
pipeline, y_pred_train =\
    analyzer.choosing_best_model(name_model,
                                 model, X_train_emb, y_train, param_grid=params, cv=2, n_iter=1)

pipeline_dict[name_model] = (pipeline, y_pred_train, GetFinalTable.pivot_table.iloc[[-1]])

# Persist the fitted pipeline next to the notebook.
with open(f'{name_model}.pickle', 'wb') as f:
    pickle.dump(pipeline_dict[name_model][0], f)

# Show the last summary row without the test-precision column.
display(GetFinalTable.pivot_table.drop(columns=del_str_test).iloc[[-1]].round(2))

# ------------------------------ 
# подобранные параметры:
# {'classifier__thread_count': 4, 'classifier__learning_rate': 0.05, 'classifier__l2_leaf_reg': 20, 'classifier__iterations': 1318, 'classifier__depth': 7, 'classifier__border_count': 64, 'classifier__bagging_temperature': 5}
#  ------------------------------
# CPU times: user 8min 45s, sys: 2min 3s, total: 10min 49s
# Wall time: 8min 4s

# модель	Precision на train при cv
# 1	BERT312_CatBoostClassifier	0.6

### XGBClassifier

In [None]:
%%time
# NOTE(review): re-creating the dict here discards entries stored by earlier cells — confirm this is intended.
pipeline_dict = dict()
model = XGBClassifier(objective='multi:softmax', num_class=47, random_state=RANDOM_STATE, tree_method='gpu_hist')
name_model = 'BERT312_XGBClassifier'
# Search space; the 'classifier__' prefix targets the pipeline step created in choosing_best_model.
params = {
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.5],
    'classifier__max_depth': [3, 6, 9],
    'classifier__n_estimators': [100, 200, 300],
    'classifier__reg_lambda': [0.1, 1, 10]
}

# 20 random parameter combinations, each evaluated on two folds.
pipeline, y_pred_train = analyzer.choosing_best_model(name_model, model, X_train_emb, y_train, 
                                                      param_grid=params, cv=2, n_iter=20)

pipeline_dict[name_model] = (pipeline, y_pred_train, GetFinalTable.pivot_table.iloc[[-1]])

# Persist the fitted pipeline next to the notebook.
with open(f'{name_model}.pickle', 'wb') as f:
    pickle.dump(pipeline_dict[name_model][0], f)

# Show the last summary row without the test-precision column.
display(GetFinalTable.pivot_table.drop(columns=del_str_test).iloc[[-1]].round(2))

# ------------------------------ 
# подобранные параметры:
# {'classifier__reg_lambda': 10, 'classifier__n_estimators': 300, 'classifier__max_depth': 9, 'classifier__learning_rate': 0.1}
#  ------------------------------
# CPU times: user 12min 45s, sys: 6.52 s, total: 12min 52s
# Wall time: 1h 11min 19s

# модель	Precision на train при cv
# 2	BERT312_XGBClassifier	0.61

### LGBMClassifier

In [None]:
%%time
# NOTE(review): re-creating the dict here discards entries stored by earlier cells — confirm this is intended.
pipeline_dict = dict()
model = LGBMClassifier(objective='multiclass', num_class=47, random_state=RANDOM_STATE, device='gpu')
name_model = 'BERT312_LGBMClassifier'
# Search space; the 'classifier__' prefix targets the pipeline step created in choosing_best_model.
params = {
    'classifier__learning_rate': [0.01, 0.05, 0.1, 0.5],
    'classifier__max_depth': range(1, 13),
    'classifier__reg_alpha': [0.1, 1, 3, 10, 20],
    'classifier__num_boost_round': range(501, 1501)
}
# 20 random parameter combinations, each evaluated on two folds.
pipeline, y_pred_train = analyzer.choosing_best_model(name_model, model, X_train_emb, y_train, param_grid=params, cv=2, n_iter=20)

pipeline_dict[name_model] = (pipeline, y_pred_train, GetFinalTable.pivot_table.iloc[[-1]])

# Persist the fitted pipeline next to the notebook.
with open(f'{name_model}.pickle', 'wb') as f:
    pickle.dump(pipeline_dict[name_model][0], f)

# Show the last summary row without the test-precision column.
display(GetFinalTable.pivot_table.drop(columns=del_str_test).iloc[[-1]].round(2))

# ------------------------------ 
# подобранные параметры:
# {'classifier__reg_alpha': 1, 'classifier__num_boost_round': 945, 'classifier__max_depth': 2, 'classifier__learning_rate': 0.05}
#  ------------------------------
# CPU times: user 19min 30s, sys: 45.6 s, total: 20min 15s
# Wall time: 2h 21min 27s
# модель	Precision на train при cv
# 3	BERT312_LGBMClassifier	0.61

In [None]:
# evaluation on the test set:

# Load the fitted XGB pipeline from the pickle written by the XGBClassifier
# cell, so this cell does not depend on a variable that is defined nowhere.
with open('BERT312_XGBClassifier.pickle', 'rb') as f:
    model_XGB = pickle.load(f)

# In test mode the X / y / param_grid arguments are ignored, hence the empty strings.
pr = analyzer.choosing_best_model('FIN_BERT312_XGBClassifier', model_XGB,
                                  '', '', '', X_test=X_test_emb, y_test=y_test)
GetFinalTable.pivot_table.drop(columns='Precision на train при cv').iloc[[-1]]

## NeuralNetworkClassifier 

In [None]:
# Seed every random number generator used below so runs are repeatable.
random.seed(RANDOM_STATE)
np.random.seed(RANDOM_STATE)
torch.manual_seed(RANDOM_STATE)

if torch.cuda.is_available():
    torch.cuda.manual_seed_all(RANDOM_STATE)
    torch.backends.cudnn.deterministic = True
    # The cuDNN autotuner may pick different algorithms from run to run;
    # switching it off is the companion setting to `deterministic`.
    torch.backends.cudnn.benchmark = False

class NeuralNetworkClassifier:
    """
    Training wrapper around NeuralNet.

    Fits the network on ``self.device`` (GPU when available), prints training
    progress and plots the per-epoch training and validation loss. Training
    ends after all requested epochs, or earlier when the validation loss has
    not improved for ``patience`` consecutive epochs.
    """

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.1, learning_rate=0.001, patience=3):
        """Build the network, loss, Adam optimizer and early-stopping state.

        Args:
            input_size: number of input features.
            hidden_size: sequence of hidden-layer widths passed to NeuralNet.
            output_size: number of output classes.
            dropout_rate: dropout probability used inside NeuralNet.
            learning_rate: Adam learning rate.
            patience: number of epochs without validation improvement
                tolerated before training stops.
        """

        self.model = NeuralNet(input_size, hidden_size, output_size, dropout_rate)
        self.criterion = nn.CrossEntropyLoss()
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate)
        self.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
        self.model.to(self.device)
        self.patience = patience
        self.best_loss = float('inf')
        self.epochs_without_improvement = 0
        self.train_loss_history = []
        self.val_loss_history = []

    def train(self, X_train, y_train, X_val, y_val, batch_size=32, num_epochs=10, progress_bar=1000):
        """Train the network with early stopping on the validation loss.

        Args:
            X_train, y_train: training features and integer class labels.
            X_val, y_val: validation features and labels.
            batch_size: mini-batch size for training and validation.
            num_epochs: maximum number of epochs.
            progress_bar: print the current batch loss every this many steps.

        After every epoch the sample-weighted mean training loss and the
        validation loss are appended to the history lists; the loss curves are
        plotted when training ends.
        """

        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(self.device)
        y_train_tensor = torch.tensor(np.array(y_train), dtype=torch.long).to(self.device)
        dataset = TensorDataset(X_train_tensor, y_train_tensor)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=True)

        total_steps = len(dataloader)
        self.model.train()  # make sure dropout / batch norm run in training mode
        for epoch in range(num_epochs):
            epoch_loss_sum = 0.0
            epoch_samples = 0
            for i, (inputs, labels) in enumerate(dataloader):
                self.optimizer.zero_grad()
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                loss.backward()
                self.optimizer.step()

                # Accumulate a sample-weighted sum so the epoch value is the
                # mean over all samples, not just the last batch's loss.
                batch_loss = loss.item()
                epoch_loss_sum += batch_loss * inputs.size(0)
                epoch_samples += inputs.size(0)

                # Progress output
                if (i+1) % progress_bar == 0:
                    print(f'Epoch [{epoch+1}/{num_epochs}], Step [{i+1}/{total_steps}], Loss: {batch_loss:.4f}')

            # Performance on the validation data
            val_loss = self.evaluate(X_val, y_val, batch_size)

            # Improvement check
            if val_loss < self.best_loss:
                self.best_loss = val_loss
                self.epochs_without_improvement = 0
            else:
                self.epochs_without_improvement += 1

            # Record this epoch's losses
            self.val_loss_history.append(val_loss)
            self.train_loss_history.append(epoch_loss_sum / max(epoch_samples, 1))

            # Early stopping
            if self.epochs_without_improvement >= self.patience:
                print(f'Early stopping! No improvement for {self.patience} epochs.')
                self.plot_loss()
                return

        self.plot_loss()

    def plot_loss(self):
        """Plot the recorded training and validation loss per epoch."""

        plt.figure(figsize=(12, 5))
        plt.plot(self.train_loss_history, label='Train Loss', c='#B03A2E')
        plt.plot(self.val_loss_history, label='Validation Loss', c='#148F77')
        plt.xlabel('Epoch')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss', fontsize=16, fontweight='bold')
        plt.legend(facecolor='oldlace', edgecolor='#7B6DA5')
        plt.minorticks_on()
        plt.grid(which='major', linewidth=.5)
        plt.grid(which='minor', linewidth=.25, linestyle='--')
        plt.show()

    def evaluate(self, X_val, y_val, batch_size):
        """Return the sample-weighted mean loss on the validation data.

        The model is switched to eval mode for the pass and back to train
        mode afterwards.
        """

        X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(self.device)
        y_val_tensor = torch.tensor(np.array(y_val), dtype=torch.long).to(self.device)
        dataset = TensorDataset(X_val_tensor, y_val_tensor)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        total_loss = 0
        total_samples = 0
        self.model.eval()

        with torch.no_grad():
            for inputs, labels in dataloader:
                outputs = self.model(inputs)
                loss = self.criterion(outputs, labels)
                total_loss += loss.item() * inputs.size(0)
                total_samples += inputs.size(0)

        self.model.train()
        return total_loss / total_samples

    def predict(self, X_test, batch_size):
        """Return the predicted class index for every row of ``X_test`` as a list.

        The model is switched to eval mode for the pass and back to train
        mode afterwards.
        """

        X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(self.device)
        dataset = TensorDataset(X_test_tensor)
        dataloader = DataLoader(dataset, batch_size=batch_size, shuffle=False)

        predictions = []
        self.model.eval()

        with torch.no_grad():
            for inputs in dataloader:
                outputs = self.model(inputs[0])
                # Index of the largest logit per row is the predicted class.
                _, predicted = torch.max(outputs, 1)
                predictions.extend(predicted.cpu().numpy())

        self.model.train()
        return predictions
    

class NeuralNet(nn.Module):
    """Feed-forward network with several hidden layers (subclass of PyTorch's nn.Module)."""

    def __init__(self, input_size, hidden_sizes, output_size, dropout_rate):
        """Create one Linear + BatchNorm1d pair per hidden width, plus the output layer."""

        super().__init__()
        self.hidden_layers = nn.ModuleList()
        self.bn_layers = nn.ModuleList()
        self.hidden_sizes = hidden_sizes
        self.dropout = nn.Dropout(dropout_rate)
        self.activation = nn.LeakyReLU(negative_slope=0.2, inplace=True)

        # Chain the hidden layers: each one consumes the previous layer's width.
        in_features = input_size
        for width in hidden_sizes:
            self.hidden_layers.append(nn.Linear(in_features, width))
            self.bn_layers.append(nn.BatchNorm1d(width))
            in_features = width

        self.output_layer = nn.Linear(hidden_sizes[-1], output_size)

    def forward(self, x):
        """
        Forward pass: every hidden block applies linear -> batch norm ->
        dropout -> activation; the output layer then produces the final scores.
        """

        for linear, batch_norm in zip(self.hidden_layers, self.bn_layers):
            # dropout acts as a form of regularization
            x = self.activation(self.dropout(batch_norm(linear(x))))

        return self.output_layer(x)


class NeuralNetworkClassifierL2(NeuralNetworkClassifier):
    """NeuralNetworkClassifier variant whose Adam optimizer applies ``weight_decay`` (L2 regularization)."""

    def __init__(self, input_size, hidden_size, output_size, dropout_rate=0.1,
                 learning_rate=0.001, patience=3, weight_decay=0.001):
        """Initialize the base classifier, then swap in an optimizer with weight decay."""
        super().__init__(input_size, hidden_size, output_size,
                         dropout_rate, learning_rate, patience)

        # Replaces the optimizer created by the base class.
        self.optimizer = optim.Adam(self.model.parameters(), lr=learning_rate, weight_decay=weight_decay)

In [None]:
# load the embeddings and the label columns:
path = '../input/embeddings/embeddings/'
path_ = '../input/embeddings-list-2048/'

X_train_emb = joblib.load(os.path.join(path_, 'embeddings_list_train_2048'))
X_test_emb = joblib.load(os.path.join(path_, 'embeddings_list_test_2048'))
y_train = pd.read_csv(f'{path}column_train_label.csv')['label']
y_test = pd.read_csv(f'{path}column_test_label.csv')['label']

In [None]:
# The hidden layers are defined here: how many there are and how many neurons each has.
# The widths decrease step by step, since the vectors have 2048 dimensions and there are 47 classes.
hidden_sizes = [2048, 1536, 768, 384]

# Create the L2-regularized classifier with an explicit learning rate and
# patience: if the validation loss does not improve for 3 epochs, training is
# stopped to avoid overfitting.
classifier = NeuralNetworkClassifierL2(input_size=2048, hidden_size=hidden_sizes, 
                                       output_size=47, learning_rate=1e-5, patience=3)
# Train and predict.
# NOTE(review): the test set is passed as the validation set, so early stopping
# is tuned on the same data the final prediction is made on — consider a
# separate validation split.
classifier.train(X_train_emb, y_train, batch_size=64, num_epochs=300, X_val=X_test_emb, y_val=y_test)
y_pred = classifier.predict(X_test_emb, batch_size=64)

![loss_0_696721.jpg](https://ltdfoto.ru/images/2023/11/01/loss_0_696721.jpg)

## DummyClassifier

In [None]:
def model_comparison(X_train, y_train, X_test, y_test, model_prediction, strategy='uniform'):
    """
    Compare the model's micro-averaged precision with a dummy baseline on a bar chart.

    Args:
        X_train, y_train: data used to fit the DummyClassifier.
        X_test, y_test: data the dummy model is scored on.
        model_prediction: the model's predictions for ``X_test``. When it is
            ``None`` or an empty string, the previously recorded score
            0.701332 is plotted instead.
        strategy: DummyClassifier strategy.

    The dummy classifier is seeded with RANDOM_STATE so that random strategies
    give the same bar on every run.
    """

    dummy_clf = DummyClassifier(strategy=strategy, random_state=RANDOM_STATE)
    dummy_clf.fit(X_train, y_train)
    dummy_pred = dummy_clf.predict(X_test)
    dummy_precision = precision_score(y_test, dummy_pred, average='micro')

    # Use the supplied predictions when there are any; fall back to the
    # recorded score only for the placeholder call with an empty argument.
    no_prediction = model_prediction is None or (
        isinstance(model_prediction, str) and model_prediction == '')
    if no_prediction:
        model_precision = 0.701332
    else:
        model_precision = precision_score(y_test, model_prediction, average='micro')

    fig, ax = plt.subplots(figsize=(12, 5))
    ax.bar(x=[0], height=[model_precision], label='Модель', alpha=0.6, color='#48C9B0', ec='#2E6D57')
    ax.bar(x=[1], height=[dummy_precision], label='Dummy', alpha=0.6, color='#EC7063', ec='#3167A8')

    # Value labels above the bars
    ax.text(0, model_precision, f'{model_precision:.3f}', ha='center', va='bottom')
    ax.text(1, dummy_precision, f'{dummy_precision:.3f}', ha='center', va='bottom')

    ax.set_xticks([0, 1])
    ax.set_xticklabels(['Модель', 'Dummy'])
    plt.legend(facecolor='oldlace', edgecolor='#7B6DA5', loc='upper center')
    plt.title(f'Сравнение предсказаний модели и dummy-модели\nСтратегия: {strategy}',
              fontweight='bold', fontsize=16)
    plt.ylim([0, 1])
    plt.minorticks_on()
    plt.grid(which='major', linewidth=.5)
    plt.grid(which='minor', linewidth=.25, linestyle='--')
    
# model_comparison(X_train_emb, y_train, X_test_emb, y_test, '')

![dummy_bert1.jpg](https://ltdfoto.ru/images/2023/11/01/dummy_bert1.jpg)

_____

_____

_____