In [1]:
# Single source of settings for this inference notebook.
# NOTE(review): the folder entries are absolute, machine-specific paths; anyone else
# running the notebook has to edit them here.
CONFIG = {
    # Folder prefixes (each ends with '/', file names below are appended directly).
    'data_folder': 'C:/Users/Николай/PycharmProjects/VKRecSys/data/',
    'models_folder': 'C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/',
    'results_folder': 'C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/',

    # File names inside the folders above.
    'test_path': 'test_pairs.csv',
    'model_path': '4.4.pth',
    'output_path': '4.4_predictions.csv',
    'items_meta_path': 'items_meta.parquet',
    'users_meta_path': 'users_meta.parquet',

    # Widths of the learned embedding tables in the model.
    'user_emb_size': 256,
    'item_emb_size': 256,
    'source_emb_size': 256,

    # Passed to torch.set_printoptions: number of digits after the decimal point.
    'torch_precision': 40,

    # Runtime settings.
    'DEVICE': 'cuda',      # used only when CUDA is available, otherwise CPU is used
    'SEED': 42,
    'BATCH_SIZE': 16384,   # rows of the test frame scored per forward pass
    'output_dim': 3,       # number of outputs produced by the model's last layer
}

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm

In [3]:
# Seed NumPy, the torch CPU generator and every CUDA generator with the same value.
np.random.seed(CONFIG['SEED'])
torch.manual_seed(CONFIG['SEED'])
torch.cuda.manual_seed_all(CONFIG['SEED'])

# Number of digits shown when tensors are printed.
torch.set_printoptions(precision=CONFIG['torch_precision'])

# Honour the configured device only when CUDA is present; otherwise run on CPU.
if torch.cuda.is_available():
    device = torch.device(CONFIG['DEVICE'])
else:
    device = torch.device("cpu")

In [4]:
def _min_max_scale(column):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1."""
    lowest, highest = column.min(), column.max()
    return (column - lowest) / (highest - lowest)


# (user_id, item_id) pairs to score.
test = pd.read_csv(CONFIG['data_folder'] + CONFIG['test_path'])

# Item metadata, indexed by item_id (stored as a categorical index).
items_meta = (
    pd.read_parquet(CONFIG['data_folder'] + CONFIG['items_meta_path'], engine='pyarrow')
    .astype({'item_id': 'category'})
    .set_index('item_id')
)

# User metadata, indexed by user_id (stored as a categorical index).
users_meta = (
    pd.read_parquet(CONFIG['data_folder'] + CONFIG['users_meta_path'], engine='pyarrow')
    .astype({'user_id': 'category'})
    .set_index('user_id')
)

# Scale the two continuous features into [0, 1] using the range found in the metadata itself.
# NOTE(review): presumably this must match the scaling used when the model was trained - confirm.
items_meta['duration'] = _min_max_scale(items_meta['duration'])
users_meta['age'] = _min_max_scale(users_meta['age'])

In [5]:
class MLPModel(nn.Module):
    """Multi-layer perceptron over id embeddings and dense user/item features.

    User, item and source ids are looked up in three embedding tables. The
    resulting vectors are concatenated with the dense inputs and passed through
    eight Softplus-activated linear layers and one final linear layer whose
    output is scaled element-wise by a learnable per-output weight vector.

    Attribute names (``fc1`` .. ``fc9``, ``class_weights``, the embeddings) are
    also the keys of the module's state dict, so they must stay as they are for
    saved checkpoints to load.
    """

    # The default values below are evaluated once, when this class statement runs,
    # so `users_meta`, `items_meta` and `CONFIG` must already exist at that point.
    def __init__(self, 
                 input_dim, 
                 num_users=users_meta.index.nunique(), 
                 num_items=items_meta.index.nunique(), 
                 num_sources=items_meta['source_id'].nunique(),
                 output_dim=CONFIG['output_dim'],
                 dropout_rate=0.2):  # dropout_rate parameter was added; it is not used in this class
        """Build the embedding tables and the stack of linear layers.

        Args:
            input_dim: width of the concatenated feature vector fed to ``fc1``.
            num_users: number of rows in the user embedding table.
            num_items: number of rows in the item embedding table.
            num_sources: number of rows in the source embedding table.
            output_dim: number of outputs of the final layer.
            dropout_rate: accepted for interface compatibility; unused here.
        """
        super().__init__()

        # Id -> dense vector lookup tables; widths come from CONFIG.
        self.user_embedding = nn.Embedding(num_users, CONFIG['user_emb_size'])
        self.item_embedding = nn.Embedding(num_items, CONFIG['item_emb_size'])
        self.source_embedding = nn.Embedding(num_sources, CONFIG['source_emb_size'])

        # Hidden stack: input_dim -> 1024 -> 1024 -> 512 -> 256 -> 256 -> 128 -> 128 -> 64.
        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 256)
        self.fc6 = nn.Linear(256, 128)
        self.fc7 = nn.Linear(128, 128)
        self.fc8 = nn.Linear(128, 64)
        # Output head.
        self.fc9 = nn.Linear(64, output_dim)

        # Shared activation and a learnable per-output scale initialised to ones.
        self.softplus = nn.Softplus()
        self.class_weights = nn.Parameter(torch.ones(output_dim))

    def forward(self, user_ids, item_ids, source_ids, age, duration, gender, embeddings):
        """Return the un-normalised per-output scores for a batch.

        The id arguments index the embedding tables; ``age``, ``duration``,
        ``gender`` and ``embeddings`` are concatenated to the looked-up vectors
        along dim 1, so they must be 2-D with the same number of rows.
        No softmax is applied here.
        """
        features = torch.cat(
            [
                self.user_embedding(user_ids),
                self.item_embedding(item_ids),
                self.source_embedding(source_ids),
                age,
                duration,
                gender,
                embeddings,
            ],
            dim=1,
        )

        hidden_layers = (
            self.fc1, self.fc2, self.fc3, self.fc4,
            self.fc5, self.fc6, self.fc7, self.fc8,
        )
        for layer in hidden_layers:
            features = self.softplus(layer(features))

        # Final linear layer, scaled element-wise by the learnable per-output weights.
        return self.fc9(features) * self.class_weights

In [6]:
# Width of the vector that MLPModel.forward concatenates and feeds to fc1.
input_dim = (
    CONFIG['user_emb_size'] + CONFIG['item_emb_size'] + CONFIG['source_emb_size']
    + 3    # age, duration and gender: one column each
    + 32   # remaining columns; presumably the width of the item 'embeddings' vectors - TODO confirm
)

In [7]:
# Build the network on the selected device, then restore the trained weights.
model = MLPModel(input_dim).to(device)

# map_location=device remaps the stored tensors onto the device chosen earlier, so a
# checkpoint whose tensors were saved from a CUDA device still loads after a CPU fallback.
# NOTE: torch.load unpickles the file; only load checkpoints from a trusted source.
checkpoint = torch.load(f"{CONFIG['models_folder']}{CONFIG['model_path']}", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])

# Switch to inference mode; as the last expression this also displays the model summary.
model.eval()

  model.load_state_dict(torch.load(f"{CONFIG['models_folder']}{CONFIG['model_path']}")['model_state_dict'])


MLPModel(
  (user_embedding): Embedding(183404, 256)
  (item_embedding): Embedding(337727, 256)
  (source_embedding): Embedding(19613, 256)
  (fc1): Linear(in_features=803, out_features=1024, bias=True)
  (fc2): Linear(in_features=1024, out_features=1024, bias=True)
  (fc3): Linear(in_features=1024, out_features=512, bias=True)
  (fc4): Linear(in_features=512, out_features=256, bias=True)
  (fc5): Linear(in_features=256, out_features=256, bias=True)
  (fc6): Linear(in_features=256, out_features=128, bias=True)
  (fc7): Linear(in_features=128, out_features=128, bias=True)
  (fc8): Linear(in_features=128, out_features=64, bias=True)
  (fc9): Linear(in_features=64, out_features=3, bias=True)
  (softplus): Softplus(beta=1.0, threshold=20.0)
)

In [8]:
# Score every row of `test` in batches and collect one scalar prediction per row.
predictions = []
num_samples = len(test)
num_batches = (num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

# Numeric value assigned to each of the three outputs. It never changes, so it is built once
# here (same device and dtype as the model output) instead of once per batch.
class_values = torch.tensor([0, 1, 2], device=model.class_weights.device, dtype=model.class_weights.dtype)

with torch.no_grad():
    for batch_idx in tqdm(range(num_batches), desc="Predicting", unit="batch"):
        start_idx = batch_idx * CONFIG['BATCH_SIZE']
        end_idx = min(start_idx + CONFIG['BATCH_SIZE'], num_samples)
        batch = test.iloc[start_idx:end_idx]

        # Host-side id arrays, reused for both the tensors and the metadata lookups
        # (avoids copying the ids back from the device just to index a DataFrame).
        user_ids = batch['user_id'].values
        item_ids = batch['item_id'].values

        # One lookup per metadata frame; rows come back in batch order.
        user_rows = users_meta.loc[user_ids, ['age', 'gender']]
        item_rows = items_meta.loc[item_ids, ['source_id', 'duration', 'embeddings']]

        batch_user_ids = torch.tensor(user_ids, dtype=torch.long, device=device)
        batch_item_ids = torch.tensor(item_ids, dtype=torch.long, device=device)
        batch_source_ids = torch.tensor(item_rows['source_id'].values, dtype=torch.long, device=device)

        # Scalar features become (batch, 1) columns so they can be concatenated along dim 1.
        batch_age = torch.tensor(user_rows['age'].values, dtype=torch.float32,
                                 device=device).unsqueeze(1)
        batch_duration = torch.tensor(item_rows['duration'].values, dtype=torch.float32,
                                      device=device).unsqueeze(1)
        batch_gender = torch.tensor(user_rows['gender'].values, dtype=torch.float32,
                                    device=device).unsqueeze(1)

        # Each cell of the 'embeddings' column holds one vector; stack them into a 2-D array.
        embeddings = torch.tensor(np.stack(item_rows['embeddings'].values), device=device,
                                  dtype=torch.float32)

        outputs = model(batch_user_ids, batch_item_ids, batch_source_ids, batch_age, batch_duration, batch_gender,
                        embeddings)
        probabilities = F.softmax(outputs, dim=1)

        # Weighted predictions: probability-weighted sum of the class values (0, 1, 2).
        weighted_predictions = torch.sum(probabilities * class_values, dim=1).cpu().numpy()

        predictions.extend(weighted_predictions)

Predicting: 100%|██████████| 102/102 [00:26<00:00,  3.89batch/s]


In [9]:
# Добавление предсказанных значений в DataFrame
test['predict'] = predictions

test.to_csv(f"{CONFIG['results_folder']}{CONFIG['output_path']}", index=False)
print(f"Predictions saved to f'{CONFIG['results_folder']}{CONFIG['output_path']}'")

Predictions saved to f'C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/4.4_predictions.csv'
