<h1><center><b>Импорт библиотек</b></center></h1>

In [None]:
from pathlib import Path

import numpy as np
import pandas as pd
import torch
from navec import Navec
from pytorch_lightning.loggers import TensorBoardLogger
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split
from tqdm import tqdm
from utils.dataset_mlp import DataModule
from utils.model_coca import ClassifierCOCA
from utils.model_mlp import ClassifierMLP
from utils.utils import (cleanhtml, lemmatize, normalization_df, predict,
                         train_model_mvp, transform_tokens)

<h1><center><b>Установка путей и констант</b></center></h1>

In [None]:
# Project directories, all relative to the notebook's working directory.
DATA_PATH = Path('data')
IMAGES_PATH = DATA_PATH / 'images'
LOG_PATH = Path('logs')
CHECKPOINT_PATH = Path('checkpoints')
MODELS_PATH = Path('models')

# Archive that is later passed to Navec.load().
TEXT_MODEL = MODELS_PATH / 'navec_hudlit_v1_12B_500K_300d_100q.tar'  # 471 MB

# Create the data, log and checkpoint directories (no-op when they exist).
for _directory in (DATA_PATH, LOG_PATH, CHECKPOINT_PATH):
    _directory.mkdir(parents=True, exist_ok=True)


In [None]:
# Training parameters.
SEED = 13               # random seed
# NOTE(review): GPU_ID is not referenced by the cells visible in this notebook;
# the training cell passes gpu_id=[0] — confirm which one is intended.
GPU_ID = 1              # index of the GPU
VAL_SIZE = 0.15         # size of the validation split
NUM_EPOCHS = 25         # number of training epochs
BATCH_SIZE = 256        # batch size
LEARNING_RATE = 1e-3    # learning rate
NUM_WORKERS = 12        # number of CPU cores used
NUM_CLASSES = 874       # number of classes
IMAGE_SIZE = 256        # image size used for resizing
IMAGE_EMB_LEN = 300     # length of the image embedding

# Use CUDA when it is available, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

<h2><center><b>Чтение данных</b></center></h2>

In [None]:
# Load the Navec model from TEXT_MODEL and the labelled training table.
navec = Navec.load(TEXT_MODEL)
train_df = pd.read_parquet(DATA_PATH / 'train.parquet', engine='fastparquet')


def _lemmatized_title(row):
    """Run the row's ``text_fields`` through ``cleanhtml`` and lemmatize its 'title' entry."""
    cleaned_fields = cleanhtml(row.text_fields)
    return lemmatize(cleaned_fields['title'], navec)


train_df['title'] = train_df.apply(_lemmatized_title, axis=1)

# Replace the raw category ids with their integer category codes.
# NOTE(review): the code -> original id mapping is not stored anywhere here.
category_codes = train_df['category_id'].astype('category').cat.codes
train_df['category_id'] = category_codes
train_df.head(3)


<h2><center><b>Разбиение на тренировочную и тестовую выборки</b></center></h2>

In [None]:
# Hold out 25% of the labelled rows; the split is reproducible through SEED.
train, test = train_test_split(train_df, test_size=0.25, random_state=SEED)


def _prepare_split(split_df):
    """Drop rows with an empty title, transform the title tokens, keep model columns.

    The filtered frame is copied explicitly before the ``title`` column is
    reassigned, so the assignment never targets a slice of ``train_df``
    (the original code triggered pandas' SettingWithCopyWarning here).

    Returns a new frame with columns ``product_id``, ``title``, ``category_id``.
    """
    # An empty title is detected through its string form '[]'.
    has_title = split_df['title'].astype(str) != '[]'
    prepared = split_df.loc[
        has_title, ['product_id', 'title', 'category_id']].copy()
    prepared['title'] = prepared['title'].apply(transform_tokens)
    return prepared


train = _prepare_split(train)
test = _prepare_split(test)

<h2><center><b>Загрузка модели</b></center></h2>

In [None]:
# Hyper-parameters and optimizer settings used to build the CoCa classifier.
model_hparams = {'batch_size': BATCH_SIZE,
                 'len_image_emb': IMAGE_EMB_LEN,
                 'num_classes': NUM_CLASSES,
                 'image_size': IMAGE_SIZE
                 }
optimizer_params = {'name': 'RAdam',
                    'lr': LEARNING_RATE}
model_coca = ClassifierCOCA(model_hparams=model_hparams,
                            optimizer_params=optimizer_params)

# Build the checkpoint location from CHECKPOINT_PATH instead of a hard-coded
# string, and map the stored tensors onto the active device so a checkpoint
# saved on a GPU can still be restored on a CPU-only machine.
coca_checkpoint_file = (CHECKPOINT_PATH / 'coca_RAdam'
                        / 'epoch=1-val_loss=5.23014.ckpt')
checkpoint = torch.load(coca_checkpoint_file, map_location=device)
model_coca.load_state_dict(checkpoint['state_dict'])

# Assigning the result keeps the (very long) module repr out of the cell output.
# NOTE(review): the model is not switched to eval mode here — confirm that
# predict() does so before computing embeddings.
model_coca = model_coca.to(device)


<h2><center><b>Получение предсказаний</b></center></h2>

In [None]:
# Both splits are read from the 'train' image directory.
images_path = IMAGES_PATH / 'train'

# Run predict() on each split in turn; every call yields an
# (image embeddings, text embeddings) pair.
split_embeddings = [predict(split_df, model_coca, images_path, device)
                    for split_df in (train, test)]
(image_embd_train, text_embd_train), (image_embd_test, text_embd_test) = split_embeddings


<h2><center><b>Нормализация и разделение данных</b></center></h2>

In [None]:
# Class labels of each split as NumPy arrays.
y_train = train['category_id'].to_numpy()
y_test = test['category_id'].to_numpy()

# Feature matrices built by normalization_df from the image and text embeddings.
train_emb = normalization_df(image_embd_train, text_embd_train)
test_emb = normalization_df(image_embd_test, text_embd_test)

In [None]:
# Append the labels as the last column of each feature matrix.
train_labels_column = y_train[:, None]
test_labels_column = y_test[:, None]
train_values = np.concatenate((train_emb, train_labels_column), axis=1)
test_values = np.concatenate((test_emb, test_labels_column), axis=1)

<h1><center><b>Обучение классификатора</b></center></h1>

In [None]:
# Make cuDNN operations deterministic (when a GPU is used) for reproducibility.
# The original spelled the attribute "determinstic", which silently created an
# unused attribute and left the real flag untouched.
torch.backends.cudnn.deterministic = True
torch.backends.cudnn.benchmark = False

# TensorBoard logger for this run.
name = 'mvp'
tensor_logger = TensorBoardLogger(LOG_PATH, name=name)

# Data module built from the embedding + label matrix.
datamodule = DataModule(train_values,
                        batch_size=BATCH_SIZE,
                        num_workers=NUM_WORKERS,
                        val_size=VAL_SIZE,
                        seed=SEED)

# IMAGE_EMB_LEN replaces the duplicated literal 300 (same value).
model_hparams = {'batch_size': BATCH_SIZE,
                 'len_image_emb': IMAGE_EMB_LEN,
                 'num_classes': NUM_CLASSES
                 }
optimizer_params = {'name': 'AdamW',
                    'lr': LEARNING_RATE}

# NOTE(review): gpu_id is hard-coded to [0] although the config cell defines
# GPU_ID = 1 — left unchanged; confirm which device is intended.
trainer = train_model_mvp(
    model_hparams=model_hparams,
    optimizer_params=optimizer_params,
    ckpt_path=None,
    logger=tensor_logger,
    scheduler=None,
    checkpoint_path=CHECKPOINT_PATH / f'{name}_{optimizer_params["name"]}',
    device=device,
    gpu_id=[0],
    num_epochs=NUM_EPOCHS,
    data_module=datamodule,
    seed=SEED,
    unfreeze_epoch=None
)


<h2><center><b>Расчет метрик</b></center></h2>

In [None]:
# Inference: rebuild the MLP classifier and restore its trained weights.
model_hparams = {'batch_size': BATCH_SIZE,
                 'len_image_emb': IMAGE_EMB_LEN,
                 'num_classes': NUM_CLASSES
                 }
optimizer_params = {'name': 'RAdam',
                    'lr': LEARNING_RATE}
model = ClassifierMLP(model_hparams=model_hparams,
                      optimizer_params=optimizer_params)
# The model is only used for prediction below, so disable training-mode
# behaviour of layers such as dropout / batch-norm.
model.eval()

# Resolve the checkpoint under CHECKPOINT_PATH (the directory the training
# cell writes to) instead of an absolute, user-specific path.
# NOTE(review): assumes the notebook runs from the project root — confirm.
# map_location='cpu' matches the model, which is never moved off the CPU.
mlp_checkpoint_file = (CHECKPOINT_PATH / 'mvp_AdamW'
                       / 'epoch=7-val_loss=1.43775.ckpt')
checkpoint = torch.load(mlp_checkpoint_file, map_location='cpu')
model.load_state_dict(checkpoint['state_dict'])




In [None]:
# test_values holds the features in all but the last column and the class
# label in the last one.
test_features = torch.from_numpy(test_values[:, :-1]).float()
test_labels = test_values[:, -1].astype(int)

# Pure inference: evaluation mode and no autograd bookkeeping.
model.eval()
with torch.no_grad():
    preds = model(test_features)
predicted = preds.argmax(dim=1)

# Give scikit-learn plain 1-D integer arrays instead of a float column
# and a torch tensor.
f1_score(test_labels, predicted.cpu().numpy(), average='macro')

<h2><center><b>Тестовый инференс</b></center></h2>

In [22]:
# Read the unlabelled test table.
test_parquet_file = DATA_PATH / 'test.parquet'
test_data = pd.read_parquet(test_parquet_file, engine='fastparquet')

In [23]:
# Lemmatize the title extracted from each row's text_fields.
test_data['title'] = test_data.apply(lambda x: lemmatize(
    cleanhtml(x.text_fields)['title'], navec), axis=1)
test_data = test_data[['product_id', 'title']]
# NOTE(review): unlike the train/validation splits, titles are not passed
# through transform_tokens here — confirm predict() accepts both forms.
images_path = IMAGES_PATH / 'test'
image_test, text_test = predict(test_data, model_coca, images_path, device)
test_emb = normalization_df(image_test, text_test)

# Pure inference: evaluation mode, float32 input, no autograd bookkeeping.
model.eval()
with torch.no_grad():
    preds = model(torch.from_numpy(test_emb).float())
predicted = preds.argmax(dim=1)

# The classifier predicts the integer codes produced by
# train_df['category_id'].astype('category').cat.codes, not the original ids.
# Rebuild the same (sorted) category list from train.parquet and decode the
# codes so that 'predicted_category_id' really contains category ids.
category_ids = pd.read_parquet(
    DATA_PATH / 'train.parquet', engine='fastparquet',
    columns=['category_id'])['category_id'].astype('category').cat.categories
predicted_codes = predicted.cpu().numpy()
predicted_category_ids = category_ids.to_numpy()[predicted_codes]

result = pd.DataFrame({'id': test_data.product_id,
                       'predicted_category_id': predicted_category_ids})
result.to_parquet('result.parquet')

100%|██████████| 16860/16860 [03:54<00:00, 71.92it/s]
