In [1]:
# Параметры
BATCH_SIZE = 16384
model_path_1 = 'fold_0_3.1.5.1_MLP.pth'
model_path_2 = 'fold_1_3.1.5.1_MLP.pth'
model_path_3 = 'fold_2_3.1.5.1_MLP.pth'
model_path_4 = 'fold_3_3.1.5.1_MLP.pth'


test_csv = 'test_pairs.csv'  # Путь к тестовым данным
models_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/'
custom_data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/custom_data/'
data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/data/'
test_output_path = '3.1.5.1_ensemble_predictions.csv' 
results_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/'
orig_data_folder = 'C:/Users/Николай/PycharmProjects/VKRecSys/data/'

NUM_CROSS_LAYERS = 1

In [2]:
import numpy as np
import torch
import torch.nn.functional as F
import pandas as pd
from tqdm import tqdm
import torch.nn as nn

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

test = pd.read_csv(f'{orig_data_folder}{test_csv}')

In [4]:
items_meta = pd.read_parquet(f'{orig_data_folder}items_meta.parquet', engine='pyarrow')
items_meta['item_id'] = items_meta['item_id'].astype('category')
items_meta['source_id'] = items_meta['source_id'].astype('category')
items_meta.set_index('item_id', inplace=True)

users_meta = pd.read_parquet(f'{orig_data_folder}users_meta.parquet', engine='pyarrow')
users_meta['user_id'] = users_meta['user_id'].astype('category')
users_meta['gender'] = users_meta['gender'].astype('category')
users_meta['age'] = users_meta['age'].replace({1: 0, 2: 1})
users_meta.set_index('user_id', inplace=True)

In [5]:
def normalize_column(column):
    return (column - column.min()) / (column.max() - column.min())


users_meta['age'] = normalize_column(users_meta['age'])
items_meta['duration'] = normalize_column(items_meta['duration'])

In [6]:
item_embeddings_dict = items_meta['embeddings'].to_dict()

In [7]:
class MLPModel(nn.Module):
    def __init__(self, num_users, num_items, num_sources, input_dim, hidden_dim=2048, output_dim=3):
        super(MLPModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, 256)
        self.item_embedding = nn.Embedding(num_items, 256)
        self.source_embedding = nn.Embedding(num_sources, 256)
        
        # MLP модуль
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.fc2 = nn.Linear(hidden_dim, 2048)
        self.fc3 = nn.Linear(2048, 1024)
        self.fc4 = nn.Linear(1024, 512)
        self.fc5 = nn.Linear(512, 512)
        self.fc6 = nn.Linear(512, 256)
        self.fc7 = nn.Linear(256, 256)
        self.fc8 = nn.Linear(256, 128)
        self.fc9 = nn.Linear(128, output_dim)
        self.activation = nn.Softplus()
    
    def forward(self, user_ids, item_ids, source_ids, age, duration, gender, embeddings):
        user_emb = self.user_embedding(user_ids)
        item_emb = self.item_embedding(item_ids)
        source_emb = self.source_embedding(source_ids)

        x = torch.cat((user_emb, item_emb, source_emb, age, duration, gender, embeddings), dim=1)
        x = self.activation(self.fc1(x))
        x = self.activation(self.fc2(x))
        x = self.activation(self.fc3(x))
        x = self.activation(self.fc4(x))
        x = self.activation(self.fc5(x))
        x = self.activation(self.fc6(x))
        x = self.activation(self.fc7(x))
        x = self.activation(self.fc8(x))
        x = self.fc9(x)
        return x

In [8]:
# Размер входных данных
num_users = users_meta.index.nunique()
num_items = items_meta.index.nunique()
num_sources = items_meta['source_id'].nunique()
input_dim = 1 + 1 + 1 + 256 + 256 + 256 + 32

In [9]:
# Загрузка моделей
model_paths = [model_path_1, model_path_2, model_path_3, model_path_4]
models = []

In [10]:
for path in model_paths:
    model = MLPModel(num_users, num_items, num_sources, input_dim).to(device)
    model.load_state_dict(torch.load(f"{models_folder}{path}")["model_state_dict"])
    model.eval()
    models.append(model)

  model.load_state_dict(torch.load(f"{models_folder}{path}")["model_state_dict"])


In [11]:
# Оценивание каждой модели и усреднение результатов
model_paths = [model_path_1, model_path_2, model_path_3, model_path_4]
all_predictions = []

In [12]:
for model_path in model_paths:
    print(f"Загрузка модели: {model_path}")
    model = MLPModel(num_users, num_items, num_sources, input_dim).to(device)
    model.load_state_dict(torch.load(f"{models_folder}{model_path}")["model_state_dict"])
    model.eval()
    
    predictions = []
    num_samples = len(test)
    num_batches = (num_samples + BATCH_SIZE - 1) // BATCH_SIZE
    
    with torch.no_grad():
        for batch_idx in tqdm(range(num_batches), desc=f"Оценивание {model_path}"):
            start_idx = batch_idx * BATCH_SIZE
            end_idx = min(start_idx + BATCH_SIZE, num_samples)
            batch = test.iloc[start_idx:end_idx]

            batch_user_ids = torch.tensor(batch['user_id'].values, dtype=torch.long, device=device)
            batch_item_ids = torch.tensor(batch['item_id'].values, dtype=torch.long, device=device)
            batch_source_ids = torch.tensor(items_meta.loc[batch['item_id'].values, 'source_id'].cat.codes.values, dtype=torch.long, device=device)
            batch_age = torch.tensor(users_meta.loc[batch['user_id'].values, 'age'].values, dtype=torch.float32, device=device).unsqueeze(1)
            batch_duration = torch.tensor(items_meta.loc[batch['item_id'].values, 'duration'].values, dtype=torch.float32, device=device).unsqueeze(1)
            batch_gender = torch.tensor(users_meta.loc[batch['user_id'].values, 'gender'].cat.codes.values, dtype=torch.float32, device=device).unsqueeze(1)

            item_indices = batch_item_ids.cpu().numpy()
            embeddings = torch.tensor(np.stack(items_meta.loc[item_indices, 'embeddings'].values), device=device, dtype=torch.float32)

            outputs = model(batch_user_ids, batch_item_ids, batch_source_ids, batch_age, batch_duration, batch_gender, embeddings)
            probabilities = F.softmax(outputs, dim=1)

            # Взвешенные предсказания
            class_weights = torch.tensor([0, 1, 2], device=probabilities.device, dtype=probabilities.dtype)
            weighted_predictions = torch.sum(probabilities * class_weights, dim=1).cpu().numpy()

            predictions.extend(weighted_predictions)
    
    all_predictions.append(predictions)
    del model
    torch.cuda.empty_cache()

# Усреднение предсказаний
ensemble_predictions = np.mean(all_predictions, axis=0)

Загрузка модели: fold_0_3.1.5.1_MLP.pth


  model.load_state_dict(torch.load(f"{models_folder}{model_path}")["model_state_dict"])
Оценивание fold_0_3.1.5.1_MLP.pth: 100%|██████████| 102/102 [00:22<00:00,  4.63it/s]


Загрузка модели: fold_1_3.1.5.1_MLP.pth


  model.load_state_dict(torch.load(f"{models_folder}{model_path}")["model_state_dict"])
Оценивание fold_1_3.1.5.1_MLP.pth: 100%|██████████| 102/102 [00:28<00:00,  3.52it/s]


Загрузка модели: fold_2_3.1.5.1_MLP.pth


  model.load_state_dict(torch.load(f"{models_folder}{model_path}")["model_state_dict"])
Оценивание fold_2_3.1.5.1_MLP.pth: 100%|██████████| 102/102 [00:24<00:00,  4.17it/s]


Загрузка модели: fold_3_3.1.5.1_MLP.pth


  model.load_state_dict(torch.load(f"{models_folder}{model_path}")["model_state_dict"])
Оценивание fold_3_3.1.5.1_MLP.pth: 100%|██████████| 102/102 [00:28<00:00,  3.63it/s]


In [13]:
# Сохранение результатов
test['predict'] = ensemble_predictions
output_path = f"{results_folder}{test_output_path}"
test[['user_id', 'item_id', 'predict']].to_csv(output_path, index=False)
print(f"Результаты сохранены в {output_path}")

Результаты сохранены в C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/3.1.5.1_ensemble_predictions.csv
