In [1]:
import os
import pickle
from pprint import pprint
from typing import NamedTuple, Tuple

import pandas as pd
import pyarrow.parquet as pq
import torch
import torch.nn.functional as F
from optuna.samplers import TPESampler
from sklearn.metrics import f1_score
from torch import nn
from torch.utils.data import Dataset

import optuna
from utils.DataProcessor import DataProcessor
from utils.eda import eda
from utils.Oversampler import Oversampler
from utils.TargetEncodersWrapper import TargetEncodersWrapper
from utils.TextProcessor import TextProcessor

In [2]:
SEED = 8
LOG_INTERVAL = 11
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")

DEVICE

device(type='cuda')

In [3]:
torch.manual_seed(SEED)

<torch._C.Generator at 0x7fec1f6d2770>

# Data Processing

In [4]:
data_processor = DataProcessor(embedding_model='cc.ru.300.bin')

## Reading

In [5]:
train_data = pq.read_table('./data/train.parquet').to_pandas().set_index('product_id')
test_data = pq.read_table('./data/test.parquet').to_pandas().set_index('product_id')

## EDA

Let's look at the dataframes and their sizes.

In [6]:
print(train_data.shape)
train_data.iloc[0:3]

(91120, 7)


Unnamed: 0_level_0,category_id,sale,shop_id,shop_title,rating,text_fields,category_name
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
325286,12171,False,9031,Aksik,5.0,"{""title"": ""Зарядный кабель Borofone BX1 Lightn...",Все категории->Электроника->Смартфоны и телефо...
888134,14233,False,18305,Sela,5.0,"{""title"": ""Трусы Sela"", ""description"": ""Трусы-...",Все категории->Одежда->Женская одежда->Белье и...
1267173,13429,False,16357,ЮНЛАНДИЯ канцтовары,5.0,"{""title"": ""Гуашь \""ЮНЫЙ ВОЛШЕБНИК\"", 12 цветов...",Все категории->Хобби и творчество->Рисование->...


In [7]:
print(test_data.shape)
test_data.iloc[0:3]

(16860, 5)


Unnamed: 0_level_0,sale,shop_id,shop_title,rating,text_fields
product_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
1997646,False,22758,Sky_Electronics,5.0,"{""title"": ""Светодиодная лента Smart led Strip ..."
927375,False,17729,Di-Di Market,4.405941,"{""title"": ""Стекло ПЛЕНКА керамик матовое Honor..."
1921513,False,54327,VisionStore,4.0,"{""title"": ""Проводные наушники с микрофоном jac..."


Make sure the columns are in appropriate formats.

In [8]:
print(train_data.dtypes, '\n')
print(test_data.dtypes)

category_id        int64
sale                bool
shop_id            int64
shop_title        object
rating           float64
text_fields       object
category_name     object
dtype: object 

sale              bool
shop_id          int64
shop_title      object
rating         float64
text_fields     object
dtype: object


Missing values.

In [9]:
eda(train_data)

Missing values percentage:
category_id      0.0
sale             0.0
shop_id          0.0
shop_title       0.0
rating           0.0
text_fields      0.0
category_name    0.0
dtype: float64

Unique values percentage:
category_id       0.959175
sale              0.002195
shop_id          11.577041
shop_title       11.577041
rating            2.543898
text_fields      99.518218
category_name     0.959175
dtype: float64



In [10]:
eda(test_data)

Missing values percentage:
sale           0.0
shop_id        0.0
shop_title     0.0
rating         0.0
text_fields    0.0
dtype: float64

Unique values percentage:
sale            0.011862
shop_id        31.049822
shop_title     31.049822
rating          5.041518
text_fields    99.768683
dtype: float64



Class balance in the train set.

In [11]:
print(f'Number of unique categories: {train_data["category_id"].nunique()}')
train_class_balance = train_data['category_id'].value_counts()
print(train_class_balance.iloc[:5])
print(train_class_balance.iloc[-5:])

Number of unique categories: 874
11937    6590
14922    3709
13651    1463
13143    1460
12980    1222
Name: category_id, dtype: int64
12808    2
12901    1
11549    1
11875    1
12836    1
Name: category_id, dtype: int64


In [12]:
train_class_balance.describe()

count     874.000000
mean      104.256293
std       300.107191
min         1.000000
25%        14.000000
50%        38.000000
75%        99.000000
max      6590.000000
Name: category_id, dtype: float64

The classes are imbalanced. Let us oversample the rare classes. The simple approach is to replicate the rare classes samples.

## Oversampling of Rare Classes

Let us bring each class number of examples up to `lower_limit`.

In [13]:
oversampler = Oversampler()
train_data = oversampler.oversample(train_data, target='category_id', lower_limit=50, reset_index=True)
train_data['category_id'].value_counts()

11937    6590
14922    3709
13651    1463
13143    1460
12980    1222
         ... 
13346      50
14585      50
11636      50
15042      50
13376      50
Name: category_id, Length: 874, dtype: int64

I have conducted experiments on datasets with and without oversampling. The best architecture (described further at `NN` section) was trained with the same parameters.

Resulted F1-Weighted scores:

- without oversampling: `0.884`
- with oversampling: `0.895`

## Dictionary category_id -> category_name

I have built a mapping from `category_id` to `category_name` and saved it to file.

In [14]:
category_name_path = 'category_name.pickle'

with open(category_name_path, 'rb') as f:
    category_name = pickle.load(f)

category_name[2601]

'Все категории->Одежда->Женская одежда->Белье и купальники->Майки и топы бельевые'

## Feature Engineering

### Unwrapping the text data dictionaries

In [15]:
train_data['title'], train_data['description'], train_data['attributes'], train_data['characteristics'] =\
    zip(*train_data['text_fields'].apply(data_processor.process_json))

### Categorical Features Encoding

#### Sale

In [16]:
train_class_balance = train_data['category_id'].value_counts()
train_class_balance_sale = train_data[train_data['sale']]['category_id'].value_counts()
(train_class_balance_sale / train_class_balance).dropna()[0:10]

2599    0.037037
2601    0.038095
2730    0.016000
2744    0.008850
2746    0.058824
2748    0.006329
2769    0.001618
2803    0.010373
2804    0.002288
2824    0.022388
Name: category_id, dtype: float64

The `sale` feature seems to be useless. Let us drop it.

#### Shop ID

Shops, in contrast, could include some useful information. Many sellers have limited assortiment of goods, so knowing a seller could help predicting their goods category.

I applied the multiclass target encoding for the `shop_id` (= `shop_title`) feature. However, it hasn't improved the target metric.

In [17]:
# shop_encoders_path = './shop_encoders.pickle'
# shop_encoders_wrapper = TargetEncodersWrapper()
# shop_encoders = shop_encoders_wrapper.load(encoders_path=shop_encoders_path)

# if shop_encoders is None:
#     df_shop_encodings = shop_encoders_wrapper.fit_transform(train_data, 'shop_title', 'category_id', shop_encoders_path)
# else:
#     df_shop_encodings = shop_encoders_wrapper.transform(train_data, 'shop_title', shop_encoders)
    
# train_data = pd.concat([train_data, df_shop_encodings], axis=1)
# train_data

## Text Processing

In [18]:
def get_df_with_processed_text_features(df: pd.DataFrame,
                                        text_features: list,
                                        processor: TextProcessor,
                                        path: str):
    """ Loads the dataframe from `path`.
    If `path` is not found, saves the dataframe concatenated with
    embedded `text_features` to `path`.
    Returns: `df` with embeddings concatenated to the right of the table.
    """

    if not os.path.exists(path):
        for text_feature in text_features:
            df[text_feature] = df[text_feature].apply(lambda x: processor.process(x))
        df.to_csv(path)
    else:
        df = pd.read_csv(path, index_col=0).fillna('')

    return df

In [19]:
train_data_path = './train_data_preprocessed_final.csv'
text_features = ['title', 'description', 'attributes', 'characteristics']
train_data = get_df_with_processed_text_features\
    (train_data, text_features, data_processor.text_processor, train_data_path)

train_data = train_data.drop(columns=['rating', 'sale', 'shop_id', 'shop_title', 'text_fields'])

## Text Vectorization with FastText

In [20]:
embeddings_folder = 'embeddings'

df_embeddings = data_processor.embedder.generate_embeddings(train_data, text_features, embeddings_folder, postfix='train')
df_embeddings.iloc[0:3]

Unnamed: 0,title.0,title.1,title.2,title.3,title.4,title.5,title.6,title.7,title.8,title.9,...,characteristics.290,characteristics.291,characteristics.292,characteristics.293,characteristics.294,characteristics.295,characteristics.296,characteristics.297,characteristics.298,characteristics.299
0,0.051914,-0.040258,0.025391,0.048084,0.01103,-0.028035,-0.020593,-0.007132,0.027099,-0.02896,...,-0.078753,-0.106597,0.048499,0.030822,-0.017637,0.039964,-0.015342,-0.041639,-0.041431,-0.07937
1,0.072009,0.059222,0.020265,-0.006977,-0.083824,0.064424,-0.004561,-0.011842,-0.014513,0.026763,...,0.003411,-0.099625,0.045395,-0.020554,-0.018369,-0.017763,0.008698,0.044429,0.01726,-0.02623
2,0.001555,0.030092,-0.002284,-0.01154,0.035198,0.011356,-0.013616,0.011064,-0.01944,-0.035191,...,0.001901,-0.043206,-0.053606,-0.018003,-0.010898,0.009663,-0.004444,0.099461,0.008132,-0.051409


# Train-valid-test split

Firstly, accumulate all the data in one dataframe. Then split it taking only features used while training.

In [21]:
targets = ['category_id', 'category_name']
target = targets[0]

text_features = ['title', 'description', 'attributes', 'characteristics']
embedded_features = df_embeddings.columns.to_list()
unused_features = ['product_id']

training_features = embedded_features + [f for f in train_data.columns.to_list() if f not in targets + text_features + unused_features]

In [22]:
def data_split(df: pd.DataFrame, training_features: list[str], target: str, data_frac: float, train_frac: float, valid_frac: float, test_frac: float) -> pd.DataFrame:
    assert train_frac + valid_frac + test_frac == 1
    # make stratified subsets
    df_result = df.groupby(target, group_keys=False).apply(lambda x: x.sample(frac=data_frac, random_state=SEED))
    df_result = df_result[training_features + targets]

    # split the df_result into train and test subsets
    df_train = df_result.groupby(target, group_keys=False).apply(lambda x: x.sample(frac=train_frac, random_state=SEED))
    df_valid_test = df_result[~df_result.isin(df_train)].dropna(how='all')
    df_valid = df_valid_test.groupby(target, group_keys=False).apply(lambda x: x.sample(frac=valid_frac/(valid_frac+test_frac), random_state=SEED))
    df_test = df_valid_test[~df_valid_test.isin(df_valid)].dropna(how='all')

    class_balance = {
        "train": df_train[target].value_counts(),
        "valid": df_valid[target].value_counts(),
        "test": df_test[target].value_counts(),
    }
    df_class_balance = pd.DataFrame(class_balance).sort_values(by='train', ascending=False)
    df_class_balance.index.name = 'category_id'

    return (
        df_train[training_features],
        df_train[target],
        df_valid[training_features],
        df_valid[target],
        df_test[training_features],
        df_test[target],
        df_class_balance
        )

In [23]:
df = pd.concat([train_data, df_embeddings], axis=1)

X_train, y_train, X_valid, y_valid, X_test, y_test, df_class_balance =\
    data_split(df, training_features, target, data_frac = 1, train_frac = 0.7, valid_frac = 0.15, test_frac = 0.15)

print(X_train.shape)
print(X_valid.shape)
print(X_test.shape)

df_class_balance

(78441, 1200)
(16818, 1200)
(16822, 1200)


Unnamed: 0_level_0,train,valid,test
category_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
11937.0,4613,988,989
14922.0,2596,556,557
13651.0,1024,220,219
13143.0,1022,219,219
12980.0,855,184,183
...,...,...,...
13632.0,35,8,7
2751.0,35,8,7
12554.0,35,8,7
12428.0,35,8,7


# NN

## Data Loaders

In [24]:
num_samples = len(train_data)
num_features = len(training_features)
num_classes = len(df_class_balance)

class_map = pd.Series(data=range(num_classes), index=df_class_balance.index).to_dict()

In [25]:
class KEDataset(Dataset):
    def __init__(self, X: pd.DataFrame, y: pd.Series, is_labeled: bool = True):
        self.X = X.to_numpy(dtype=float)
        self.y = y.to_numpy(dtype=int)
        
    def __len__(self):
        return len(self.y)
    
    def __getitem__(self, idx):
        return self.X[idx], class_map[self.y[idx]]
    

def get_data_loaders(X_train, y_train, X_valid, y_valid, X_test, y_test, batch_size: int):
    train_loader = torch.utils.data.DataLoader(KEDataset(X_train, y_train), batch_size=batch_size, shuffle=True)
    valid_loader = torch.utils.data.DataLoader(KEDataset(X_valid, y_valid), batch_size = batch_size, shuffle=True)
    test_loader = torch.utils.data.DataLoader(KEDataset(X_test, y_test), batch_size = batch_size)
    return train_loader, valid_loader, test_loader

## NN Architecture

I'm introducing the `FC_layer` class used as a building block of MLP.

It's extremely useful to parametrize the number of layers in MLP for hyperparameter search.

In [26]:
class FC_layer(NamedTuple):
    num_neurons: int
    activation: str
    dropout_proba: float


class NN(nn.Module):
    def __init__(self,
                 n_input: int,
                 n_output: int,
                 hidden_layers: Tuple[FC_layer]):
        super().__init__()

        layers = []
        for hidden_layer in hidden_layers:
            # the layer itself
            layer = nn.Linear(n_input, hidden_layer.num_neurons)
            layers.append(layer)
            # batch normalization
            bn = nn.BatchNorm1d(hidden_layer.num_neurons)
            layers.append(bn)
            # activation function
            activation = getattr(nn, hidden_layer.activation)()
            layers.append(activation)
            # dropout layer
            dropout = nn.Dropout(hidden_layer.dropout_proba)
            layers.append(dropout)

            n_input = hidden_layer.num_neurons  # the last layer n_input == the last hidden layer n_output

        output_layer = nn.Linear(n_input, n_output)
        layers.append(output_layer)

        self.extractor = nn.Sequential(*layers)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        input = torch.flatten(x, start_dim=1)
        out = self.extractor(input)
        return out

## Training Loop

In [27]:
def train(model, dataloader, optimizer, criterion, epoch, verbose: bool = True):
    model.train()
    optimizer.zero_grad()

    y_preds = []
    y_targets = []

    for X_batch, y_batch in dataloader:
        X_batch = X_batch.to(DEVICE)
        y_batch = y_batch.to(DEVICE)

        y_pred = model(X_batch)

        loss = criterion(y_pred, y_batch)
        loss.backward()
        optimizer.step()

        y_preds += y_pred.argmax(-1).tolist()
        y_targets += y_batch.tolist()
        
    weighted_f1 = f1_score(y_targets, y_preds, average='weighted')

    if verbose and (epoch % LOG_INTERVAL == 0 or epoch == 1):
        print(f'EPOCH: {epoch}')
        print(f'Training loss: {loss.item():.4f}')
        print(f'Training Weighted F1: {weighted_f1:.4f}')

    return weighted_f1

In [28]:
def test(model, dataloader, epoch, verbose: bool = True):
    model.eval()

    with torch.no_grad():
        y_preds = []
        y_targets = []
        
        for X_batch, y_batch in dataloader:
            X_batch = X_batch.to(DEVICE)
            y_batch = y_batch.to(DEVICE)
            y_pred = model(X_batch)
            loss = F.cross_entropy(y_pred, y_batch)

            y_preds += y_pred.argmax(-1).tolist()
            y_targets += y_batch.tolist()

    weighted_f1 = f1_score(y_targets, y_preds, average='weighted')

    if verbose and (epoch % LOG_INTERVAL == 0 or epoch == 1):
        print(f'Validation loss: {loss.item():.4f}')
        print(f'Validation Weighted F1: {weighted_f1:.4f}')
        print("_" * 60)

    return weighted_f1

## Baseline Model Training

In [25]:
hidden_layers = (
    FC_layer(2200, "LeakyReLU", 0),
    FC_layer(1200, "ReLU", 0),
    )

model = NN(
    n_input=num_features,
    n_output=num_classes,
    hidden_layers=hidden_layers
).to(DEVICE)

model.double()

print(model)

NN(
  (extractor): Sequential(
    (0): Linear(in_features=2074, out_features=2200, bias=True)
    (1): BatchNorm1d(2200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): LeakyReLU(negative_slope=0.01)
    (3): Dropout(p=0, inplace=False)
    (4): Linear(in_features=2200, out_features=1200, bias=True)
    (5): BatchNorm1d(1200, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU()
    (7): Dropout(p=0, inplace=False)
    (8): Linear(in_features=1200, out_features=874, bias=True)
  )
)


In [None]:
optimizer = torch.optim.AdamW(model.parameters())
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=5, gamma=0.1)
criterion = nn.CrossEntropyLoss()

NUM_EPOCHS = 20
BATCH_SIZE = 256
train_loader, valid_loader, test_loader = get_data_loaders(X_train, y_train, X_valid, y_valid, X_test, y_test, BATCH_SIZE)

for epoch in range(1, NUM_EPOCHS + 1):
    train(model, train_loader, optimizer, criterion, epoch)
    scheduler.step()
    test(model, valid_loader, epoch)

Evaluation on the Test Set:

In [32]:
_ = test(model, test_loader, LOG_INTERVAL)

Validation loss: 0.1730
Validation Weighted F1: 0.8623


## Hyperparameters Search

I have ran 100 Optuna trials to find the best values for the following hyperparameters:

- number of hidden layers
- number of neurons in each hidden layer
- activation function of each hidden layer
- dropout probability after each hidden layer
- learning rate and weight decay for `AdamW` optimizer
- step size and gamma for `StepLR` scheduler

In [26]:
def define_model(trial):
    n_layers = trial.suggest_int("num_layers", 0, 5)
    layers = []

    for i in range(n_layers):
        out_features = trial.suggest_int(f"num_neurons_l{i}", 500, 3000)
        activation_name = trial.suggest_categorical(f"activation_l{i}", ["ReLU", "LeakyReLU", "ELU"])
        dropout_proba = trial.suggest_uniform(f"dropout_l{i}", 0, 0.3)

        layer = FC_layer(out_features, activation_name, dropout_proba)
        layers.append(layer)

    return NN(n_input=num_features, n_output=num_classes, hidden_layers=tuple(layers))

I chose the validation set F1-Weighted score as the objective for maximization.

In [27]:
def objective(trial):
    # generate the model
    model = define_model(trial).to(DEVICE)
    model.double()
    # optimizer parameters
    lr = trial.suggest_loguniform("lr", 1e-5, 1e-1)
    weight_decay = trial.suggest_uniform("weight_decay", 0, 1e-3)
    optimizer = torch.optim.AdamW(model.parameters(), lr=lr, weight_decay=weight_decay)
    # scheduler parameters
    step_size = trial.suggest_int("step_size", 1, 14)
    gamma = trial.suggest_uniform("gamma", 0.1, 0.95)
    scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=step_size, gamma=gamma)

    with open(f"./optuna/trial_{trial.number}.log", "w") as log_file:
        pprint(trial.params.items(), log_file)
        pprint(model, log_file)
        
    for epoch in range(1, NUM_EPOCHS + 1):
        train_f1w = train(model, train_loader, optimizer, criterion, epoch, verbose=False)
        scheduler.step()
        val_f1w = test(model, valid_loader, epoch, verbose=False)
        
    return val_f1w

In [28]:
def print_study_stats(study):
    pruned_trials = [t for t in study.trials if t.state == optuna.structs.TrialState.PRUNED]
    complete_trials = [t for t in study.trials if t.state == optuna.structs.TrialState.COMPLETE]

    print("Study statistics: ")
    print("  Number of finished trials: ", len(study.trials))
    print("  Number of pruned trials: ", len(pruned_trials))
    print("  Number of complete trials: ", len(complete_trials))

    print("Best trial:")
    trial = study.best_trial

    print("  Value: ", trial.value)

    print("  Params: ")
    for key, value in trial.params.items():
        print("    {}: {}".format(key, value))

In [None]:
optuna.logging.set_verbosity(optuna.logging.INFO)

NUM_EPOCHS = 15
BATCH_SIZE = 1024
train_loader, valid_loader, test_loader = get_data_loaders(X_train, y_train, X_valid, y_valid, X_test, y_test, BATCH_SIZE)
criterion = nn.CrossEntropyLoss()

sampler = TPESampler(seed=SEED)
pruner = optuna.pruners.MedianPruner()
study = optuna.create_study(direction="maximize", sampler=sampler, pruner=pruner)

study.optimize(objective, n_trials=100)

print_study_stats(study)

## Best Model Training

After the hyperparameters search, the best model has shown `0.8959` F1-Weighted score on the test set:

In [69]:
_ = test(model, test_loader, LOG_INTERVAL)
torch.save(model, './models/model_best_train.pt')

Validation loss: 0.5056
Validation Weighted F1: 0.8959


The best model has the following architecture:

In [34]:
hidden_layers = (
    FC_layer(2432, "ELU", 0.13118521108680903),
    )

model = NN(
    n_input=num_features,
    n_output=num_classes,
    hidden_layers=hidden_layers
).to(DEVICE)

model.double()

print(model)

NN(
  (extractor): Sequential(
    (0): Linear(in_features=1200, out_features=2432, bias=True)
    (1): BatchNorm1d(2432, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ELU(alpha=1.0)
    (3): Dropout(p=0.13118521108680903, inplace=False)
    (4): Linear(in_features=2432, out_features=874, bias=True)
  )
)


The final model will have the whole dataset for training:

In [30]:
BATCH_SIZE = 256

dataset = df[training_features + targets]
train_loader = torch.utils.data.DataLoader(KEDataset(dataset[training_features], dataset[target]), batch_size=BATCH_SIZE, shuffle=False)
len(train_loader) * BATCH_SIZE

112128

Run the training loop:

In [35]:
NUM_EPOCHS = 33

optimizer = torch.optim.AdamW(model.parameters(), lr=6.490126177038149e-05, weight_decay=0.00015206958986612742)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=3, gamma=0.65)
criterion = nn.CrossEntropyLoss()

for epoch in range(1, NUM_EPOCHS + 1):
    train(model, train_loader, optimizer, criterion, epoch)
    scheduler.step()

torch.save(model, './models/model_submission.pt')

EPOCH: 1
Training loss: 3.3702
Training Weighted F1: 0.5070
EPOCH: 11
Training loss: 1.2742
Training Weighted F1: 0.9248
EPOCH: 22
Training loss: 0.6895
Training Weighted F1: 0.9729
EPOCH: 33
Training loss: 0.6635
Training Weighted F1: 0.9813


## Predictions for Submission

### Test Data Processing

In [31]:
# unwrapping the text data jsons
test_data['title'], test_data['description'], test_data['attributes'], test_data['characteristics'] =\
    zip(*test_data['text_fields'].apply(data_processor.process_json))

# text features processing
test_data_path = './test_data_preprocessed_final.csv'
test_data = get_df_with_processed_text_features\
    (test_data, text_features, data_processor.text_processor, test_data_path)

# generate embeddings for textual data
df_test_embeddings = data_processor.embedder.generate_embeddings(test_data, text_features, embeddings_folder, postfix='test')

# construct a dataset
BATCH_SIZE = 1024
df_submit = pd.concat([test_data, df_test_embeddings], axis=1)[training_features]
dl_submit = torch.utils.data.DataLoader(KEDataset(df_submit, pd.Series([2601] * len(df_submit))), batch_size=BATCH_SIZE, shuffle=False)

### Saving the Predictions

In [34]:
model.eval()
with torch.no_grad():
    y_preds = []
    for X_batch, _ in dl_submit:
        X_batch = X_batch.to(DEVICE)
        y_pred = model(X_batch)
        y_preds += y_pred.argmax(-1).tolist()

inv_class_map = {v: k for k, v in class_map.items()}
y_preds = pd.Series(y_preds).apply(lambda x: int(inv_class_map[x]))

In [35]:
result = pd.DataFrame({
    'product_id': test_data.index.to_series().reset_index(drop=True),
    'predicted_category_id': y_preds
    })
result

Unnamed: 0,product_id,predicted_category_id
0,1997646,13495
1,927375,14922
2,1921513,2803
3,1668662,12524
4,1467778,13887
...,...,...
16855,1914264,11645
16856,1310569,12357
16857,978095,13651
16858,797547,2740


In [36]:
result.to_parquet('result.parquet')

## Prediction Analysis

In [43]:
pred_categories = result.set_index('product_id')['predicted_category_id'].apply(lambda x: category_name[x])
titles_preds = pd.concat([test_data['title'], pred_categories], axis=1)

In [47]:
for i in range(10):
    print("Title:", titles_preds.iloc[i]['title'])
    print("Prediction:", titles_preds.iloc[i]['predicted_category_id'])
    print()

Title: светодиодный лента smart led strip light пульт метр usb bluetooth
Prediction: Все категории->Товары для дома->Товары для праздников->Новогодние товары->Гирлянды

Title: стекло пленка керамика матовый honor lite pro 30s psmart p40 lite
Prediction: Все категории->Электроника->Смартфоны и телефоны->Аксессуары и запчасти->Защитные стекла и пленки->Защитные стекла

Title: проводной наушник микрофон jack ios android
Prediction: Все категории->Электроника->Наушники и аудиотехника->Наушники->Проводные наушники

Title: декоративный табличка правило кухня подставка горячее разделочный доска
Prediction: Все категории->Товары для дома->Товары для кухни->Кухонные аксессуары->Скатерти и подставки под горячее

Title: подставка ложка керамический подложка клубника лаванда лимон
Prediction: Все категории->Товары для дома->Товары для кухни->Порядок на кухне->Подставки для столовых приборов

Title: футболка женский принт премиальный хлопок
Prediction: Все категории->Одежда->Женская одежда->Футболк