In [1]:
# Every folder lives under one project root, so the machine-specific part of
# the configuration is written in a single place.
_PROJECT_ROOT = 'C:/Users/Николай/PycharmProjects/VKRecSys/'

CONFIG = dict(
    # Folders; each ends with '/' because file names are appended to them directly.
    data_folder=f'{_PROJECT_ROOT}data/',
    models_folder=f'{_PROJECT_ROOT}B.Processing/Модели/',
    results_folder=f'{_PROJECT_ROOT}C.Results/',

    # File names, combined with the folders above when files are read or written.
    test_path='test_pairs.csv',
    model_path='5.7.pth',
    output_path='5.7_predictions.csv',
    items_meta_path='items_meta.parquet',
    users_meta_path='users_meta.parquet',

    # Widths of the learned ID embedding tables.
    user_emb_size=256,
    item_emb_size=256,
    source_emb_size=256,
    torch_precision=40,  # Number of digits after the decimal point when printing tensors

    # Runtime settings.
    DEVICE='cuda',
    SEED=42,
    BATCH_SIZE=16384,
    output_dim=3,
)

In [2]:
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.optim import Adam
from tqdm import tqdm

In [3]:
# Print tensors with the configured number of digits.
torch.set_printoptions(precision=CONFIG['torch_precision'])

# Use the configured device only when CUDA is actually available; otherwise run on the CPU.
device = torch.device(CONFIG['DEVICE']) if torch.cuda.is_available() else torch.device("cpu")

# Seed the torch CPU generator, all CUDA generators and NumPy's global generator, in that order.
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
    _seed_fn(CONFIG['SEED'])

In [4]:
def _min_max_scale(column):
    """Linearly rescale a numeric Series so its minimum maps to 0 and its maximum to 1."""
    lowest = column.min()
    highest = column.max()
    return (column - lowest) / (highest - lowest)


_data_dir = CONFIG['data_folder']

# (user, item) pairs to score.
test = pd.read_csv(_data_dir + CONFIG['test_path'])

# User metadata, indexed by a categorical user_id, with age rescaled to [0, 1].
users_meta = (
    pd.read_parquet(_data_dir + CONFIG['users_meta_path'], engine='pyarrow')
    .astype({'user_id': 'category'})
    .set_index('user_id')
    .assign(age=lambda frame: _min_max_scale(frame['age']))
)

# Item metadata, indexed by a categorical item_id, with duration rescaled to [0, 1].
items_meta = (
    pd.read_parquet(_data_dir + CONFIG['items_meta_path'], engine='pyarrow')
    .astype({'item_id': 'category'})
    .set_index('item_id')
    .assign(duration=lambda frame: _min_max_scale(frame['duration']))
)

In [5]:
# Model definition
class TwoTowerModel(nn.Module):
    """Two-tower network producing ``output_dim`` raw scores for a (user, item) pair.

    Each tower concatenates learned ID embeddings with side features and runs
    the result through three linear layers (1024 -> 512 -> 256), each followed
    by GELU. The two 256-wide tower outputs are concatenated and passed through
    one final linear layer; no activation is applied to that output.

    Args:
        user_input_dim: Width of the concatenated user-tower input.
        item_input_dim: Width of the concatenated item-tower input.
        num_users: Number of rows in the user-ID embedding table.
        num_items: Number of rows in the item-ID embedding table.
        num_sources: Number of rows in the source-ID embedding table.
        output_dim: Number of values produced per pair.

    Note:
        The default argument values are evaluated once, when this class
        statement runs, from the module-level ``users_meta``, ``items_meta``
        and ``CONFIG`` objects. Submodule attribute names are the keys of the
        model's state dict, so they must not be renamed if saved checkpoints
        are to keep loading.
    """

    def __init__(self,
                 user_input_dim,
                 item_input_dim,
                 num_users=users_meta.index.nunique(),
                 num_items=items_meta.index.nunique(),
                 num_sources=items_meta['source_id'].nunique(),
                 output_dim=CONFIG['output_dim']):
        super().__init__()

        # Hidden widths shared by both towers.
        wide, middle, narrow = 1024, 512, 256

        # User tower: ID embedding followed by three linear layers.
        self.user_embedding = nn.Embedding(num_users, CONFIG['user_emb_size'])
        self.user_fc1 = nn.Linear(user_input_dim, wide)
        self.user_fc2 = nn.Linear(wide, middle)
        self.user_fc3 = nn.Linear(middle, narrow)

        # Item tower: item and source ID embeddings followed by three linear layers.
        self.item_embedding = nn.Embedding(num_items, CONFIG['item_emb_size'])
        self.source_embedding = nn.Embedding(num_sources, CONFIG['source_emb_size'])
        self.item_fc1 = nn.Linear(item_input_dim, wide)
        self.item_fc2 = nn.Linear(wide, middle)
        self.item_fc3 = nn.Linear(middle, narrow)

        # Head over the concatenated tower outputs (narrow + narrow = 512 inputs).
        self.output_layer = nn.Linear(2 * narrow, output_dim)

        # Activation shared by every hidden layer.
        self.gelu = nn.GELU()

    def _run_tower(self, features, layers):
        """Apply each linear layer in ``layers`` followed by GELU, in order."""
        hidden = features
        for layer in layers:
            hidden = self.gelu(layer(hidden))
        return hidden

    def forward(self, user_ids, age, gender,
                item_ids, source_ids, duration, embeddings):
        """Score a batch of (user, item) pairs.

        Args:
            user_ids: Indices into the user embedding table.
            age: User feature concatenated after the user embedding along dim 1.
            gender: User feature concatenated after ``age`` along dim 1.
            item_ids: Indices into the item embedding table.
            source_ids: Indices into the source embedding table.
            duration: Item feature concatenated after the two item-side
                embeddings along dim 1.
            embeddings: Additional per-item feature vectors concatenated last
                along dim 1.

        Returns:
            The output of the final linear layer for each pair.

        Note:
            The ID tensors are assumed to be 1-D and the feature tensors 2-D
            with the same leading (batch) size, as required by the dim-1
            concatenation — TODO confirm against callers.
        """
        # User tower
        user_features = torch.cat((self.user_embedding(user_ids), age, gender), dim=1)
        user_repr = self._run_tower(
            user_features, (self.user_fc1, self.user_fc2, self.user_fc3))

        # Item tower
        item_features = torch.cat(
            (self.item_embedding(item_ids), self.source_embedding(source_ids),
             duration, embeddings),
            dim=1)
        item_repr = self._run_tower(
            item_features, (self.item_fc1, self.item_fc2, self.item_fc3))

        # Join both representations and map them to the output scores.
        return self.output_layer(torch.cat((user_repr, item_repr), dim=1))

In [9]:
# Width of the user-tower input: the user-ID embedding plus two scalar features.
user_input_dim = sum((
    CONFIG['user_emb_size'],  # user-ID embedding
    1,                        # age
    1,                        # gender
))

# Width of the item-tower input: both item-side ID embeddings, one scalar and a
# fixed-width feature vector.
item_input_dim = sum((
    CONFIG['item_emb_size'],    # item-ID embedding
    CONFIG['source_emb_size'],  # source-ID embedding
    1,                          # duration
    32,                         # presumably the length of each items_meta 'embeddings' vector — TODO confirm
))

In [11]:
# Build the network with its default table sizes and move it to the selected device.
model = TwoTowerModel(user_input_dim, item_input_dim).to(device)

# map_location=device remaps the stored tensors onto the device chosen above, so a
# checkpoint saved from a CUDA session still loads when `device` has fallen back to CPU.
# NOTE(review): torch.load unpickles the file, which can execute arbitrary code. Only
# load checkpoints from a trusted source, or pass weights_only=True if the checkpoint
# contains nothing but tensors and plain Python containers.
checkpoint_path = f"{CONFIG['models_folder']}{CONFIG['model_path']}"
model.load_state_dict(
    torch.load(checkpoint_path, map_location=device)['model_state_dict']
)

# Switch to evaluation mode; as the cell's last expression it also displays the module summary.
model.eval()

  model.load_state_dict(torch.load(f"{CONFIG['models_folder']}{CONFIG['model_path']}")['model_state_dict'])


TwoTowerModel(
  (user_embedding): Embedding(183404, 256)
  (user_fc1): Linear(in_features=258, out_features=1024, bias=True)
  (user_fc2): Linear(in_features=1024, out_features=512, bias=True)
  (user_fc3): Linear(in_features=512, out_features=256, bias=True)
  (item_embedding): Embedding(337727, 256)
  (source_embedding): Embedding(19613, 256)
  (item_fc1): Linear(in_features=545, out_features=1024, bias=True)
  (item_fc2): Linear(in_features=1024, out_features=512, bias=True)
  (item_fc3): Linear(in_features=512, out_features=256, bias=True)
  (output_layer): Linear(in_features=512, out_features=3, bias=True)
  (gelu): GELU(approximate='none')
)

In [13]:
# Score every row of `test` in batches and collect one expected-class value per row.
predictions = []
num_samples = len(test)
batch_size = CONFIG['BATCH_SIZE']
num_batches = (num_samples + batch_size - 1) // batch_size  # ceiling division

# Class indices used to collapse the class probabilities into a single weighted
# score. Loop-invariant, so it is built once; the model and all of its inputs are
# float32 on `device`, hence so are the probabilities it is multiplied with.
class_weights = torch.tensor([0, 1, 2], device=device, dtype=torch.float32)

with torch.no_grad():
    for batch_idx in tqdm(range(num_batches), desc="Predicting", unit="batch"):
        start_idx = batch_idx * batch_size
        end_idx = min(start_idx + batch_size, num_samples)
        batch = test.iloc[start_idx:end_idx]

        # Host-side id arrays: reused both as metadata lookup keys and as model inputs,
        # which avoids copying the ids to the device and back just to index a DataFrame.
        user_ids = batch['user_id'].values
        item_ids = batch['item_id'].values

        # One metadata lookup per table instead of one per feature.
        user_rows = users_meta.loc[user_ids, ['age', 'gender']]
        item_rows = items_meta.loc[item_ids, ['source_id', 'duration', 'embeddings']]

        batch_user_ids = torch.tensor(user_ids, dtype=torch.long, device=device)
        batch_item_ids = torch.tensor(item_ids, dtype=torch.long, device=device)
        batch_source_ids = torch.tensor(item_rows['source_id'].values,
                                        dtype=torch.long, device=device)

        # Scalar features get a trailing axis so they can be concatenated along dim 1.
        batch_age = torch.tensor(user_rows['age'].values, dtype=torch.float32,
                                 device=device).unsqueeze(1)
        batch_gender = torch.tensor(user_rows['gender'].values, dtype=torch.float32,
                                    device=device).unsqueeze(1)
        batch_duration = torch.tensor(item_rows['duration'].values, dtype=torch.float32,
                                      device=device).unsqueeze(1)

        # Each cell of 'embeddings' holds one vector; stack them into a 2-D array.
        embeddings = torch.tensor(np.stack(item_rows['embeddings'].values), device=device,
                                  dtype=torch.float32)

        outputs = model(batch_user_ids, batch_age, batch_gender,
                        batch_item_ids, batch_source_ids, batch_duration, embeddings)
        probabilities = F.softmax(outputs, dim=1)

        # Weighted prediction: sum over classes of probability * class index.
        weighted_predictions = torch.sum(probabilities * class_weights, dim=1).cpu().numpy()

        predictions.extend(weighted_predictions)

Predicting: 100%|██████████| 102/102 [00:22<00:00,  4.49batch/s]


In [14]:
# Attach the predicted values to the DataFrame; `predictions` was filled in row order.
test['predict'] = predictions

# Build the destination once so the file written and the path reported cannot diverge.
output_file = f"{CONFIG['results_folder']}{CONFIG['output_path']}"
test.to_csv(output_file, index=False)
print(f"Predictions saved to '{output_file}'")

Predictions saved to f'C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/5.7_predictions.csv'
