In [1]:
# Central notebook configuration. Later cells read every path, embedding width
# and training hyperparameter through CONFIG['<key>'].
CONFIG = dict(
    # --- Locations: folders end with '/' because file names are appended directly
    data_folder='C:/Users/Николай/PycharmProjects/VKRecSys/data/',
    models_folder='C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/',

    # --- File names, relative to the folders above
    train_path='train_interactions.parquet',
    items_meta_path='items_meta.parquet',
    users_meta_path='users_meta.parquet',
    model_path='5.2.pth',

    # --- Embedding widths. The trailing numbers are the author's notes,
    # presumably the count of distinct values of each feature (TODO confirm).
    user_emb_size=256,      # 183404
    item_emb_size=256,      # 337727
    source_emb_size=256,    # 19613
    age_emb_size=64,        # 43
    duration_emb_size=128,  # ~175
    gender_emb_size=16,     # 3
    torch_precision=40,     # number of decimal places for printing numbers

    # --- Runtime and training hyperparameters
    DEVICE='cuda',
    SEED=42,
    BATCH_SIZE=16384,
    LR=0.001,
    EPOCHS=3,
    output_dim=3,
)

In [2]:
# Import libs
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch
import torch.nn as nn
from torch.optim import Adam

In [3]:
# Device and seed
# Seed every random number generator this notebook touches (NumPy, torch CPU,
# and all CUDA devices) with the same configured value.
np.random.seed(CONFIG['SEED'])
torch.manual_seed(CONFIG['SEED'])
torch.cuda.manual_seed_all(CONFIG['SEED'])

# Number of digits shown when tensors are printed.
torch.set_printoptions(precision=CONFIG['torch_precision'])

# Use the configured device only when CUDA is actually available; otherwise
# fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(CONFIG['DEVICE'])
else:
    device = torch.device("cpu")

In [4]:
# Loading data
# --- Interactions -----------------------------------------------------------
# `train` is modified in place on purpose: it is by far the largest frame and
# chained (copying) operations would duplicate it in memory.
train = pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['train_path']}", engine='pyarrow')

# Fold the two feedback flags into one signed value (dislike 1 -> -1), removing
# the `dislike` column in the same step, and store the result compactly as int8.
train['like'] = (train['like'] + train.pop('dislike').replace({1: -1})).astype('int8')
train.rename(columns={'like': 'target'}, inplace=True)

# Shift the signed value to non-negative class ids.
# For 0/1 flags this gives: 0 = dislike, 1 = no reaction, 2 = like.
train['target'] = train['target'].replace({-1: 0, 0: 1, 1: 2})

# --- Item metadata, indexed by item_id --------------------------------------
items_meta = (
    pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['items_meta_path']}", engine='pyarrow')
    .astype({'item_id': 'category', 'source_id': 'category'})
    .set_index('item_id')
    # Shifted so the value can index an embedding table; presumably 5 is the
    # smallest duration in the data -- TODO confirm.
    .assign(duration=lambda frame: frame['duration'] - 5)
)

# --- User metadata, indexed by user_id --------------------------------------
users_meta = (
    pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['users_meta_path']}", engine='pyarrow')
    .astype({'user_id': 'category'})
    .assign(
        # Gender codes 1/2 are remapped to 0/1.
        gender=lambda frame: frame['gender'].replace({1: 0, 2: 1}),
        # Shifted so the value can index an embedding table; presumably 18 is
        # the smallest age in the data -- TODO confirm.
        age=lambda frame: frame['age'] - 18,
    )
    .set_index('user_id')
)

In [5]:
# Model definition
class MLPModel(nn.Module):
    """Feed-forward classifier over learned ID embeddings plus a dense item vector.

    Six embedding tables (user, item, source, age, duration, gender) are looked
    up and concatenated with the caller-supplied ``embeddings`` tensor. The
    result passes through nine linear layers, with GELU after each of the first
    eight, and the last layer returns ``output_dim`` raw scores per row.

    Args:
        input_dim: Width of the concatenated feature vector fed to ``fc1``.
        num_users, num_items, num_sources, num_ages, num_durations, num_genders:
            Number of rows in the corresponding embedding table. The defaults
            are evaluated once, when this class statement runs, from the global
            ``users_meta`` / ``items_meta`` frames, so those must already exist.
            They are distinct-value counts; ids passed to ``forward`` must be
            smaller than them.
        output_dim: Number of scores produced per row.
        dropout_rate: Accepted but unused; no dropout layer is built or applied.
    """

    def __init__(self,
                 input_dim,
                 num_users=users_meta.index.nunique(),
                 num_items=items_meta.index.nunique(),
                 num_sources=items_meta['source_id'].nunique(),
                 num_ages=users_meta['age'].nunique(),
                 num_durations=items_meta['duration'].nunique(),
                 num_genders=users_meta['gender'].nunique(),
                 output_dim=CONFIG['output_dim'],
                 dropout_rate=0.2):
        super().__init__()

        # Embedding tables; widths come from CONFIG. Creation order is kept
        # fixed so seeded initialisation and the state_dict layout stay stable.
        self.user_embedding = nn.Embedding(num_users, CONFIG['user_emb_size'])
        self.item_embedding = nn.Embedding(num_items, CONFIG['item_emb_size'])
        self.source_embedding = nn.Embedding(num_sources, CONFIG['source_emb_size'])
        self.age_embedding = nn.Embedding(num_ages, CONFIG['age_emb_size'])
        self.duration_embedding = nn.Embedding(num_durations, CONFIG['duration_emb_size'])
        self.gender_embedding = nn.Embedding(num_genders, CONFIG['gender_emb_size'])

        # Linear stack registered as fc1 .. fc9, each layer mapping one width
        # to the next.
        layer_widths = (input_dim, 1024, 1024, 512, 256, 256, 128, 128, 64, output_dim)
        for position, (fan_in, fan_out) in enumerate(zip(layer_widths[:-1], layer_widths[1:]), start=1):
            self.add_module(f"fc{position}", nn.Linear(fan_in, fan_out))

        self.gelu = nn.GELU()

    def forward(self, user_ids, item_ids, source_ids, age_ids, duration_ids, gender_ids, embeddings):
        """Return the raw ``output_dim`` scores for a batch.

        The six id tensors are embedded and concatenated with ``embeddings``
        along dim 1, so all inputs must share the same leading (batch) size.
        """
        features = torch.cat(
            (
                self.user_embedding(user_ids),
                self.item_embedding(item_ids),
                self.source_embedding(source_ids),
                self.age_embedding(age_ids),
                self.duration_embedding(duration_ids),
                self.gender_embedding(gender_ids),
                embeddings,
            ),
            dim=1,
        )

        # GELU follows every hidden layer; the output layer stays linear.
        hidden_layers = (self.fc1, self.fc2, self.fc3, self.fc4,
                         self.fc5, self.fc6, self.fc7, self.fc8)
        hidden = features
        for layer in hidden_layers:
            hidden = self.gelu(layer(hidden))

        return self.fc9(hidden)

In [6]:
# Input dimension
# Width of the vector the model's first linear layer receives: the six learned
# embedding widths plus 32 for the dense per-item vector that the training loop
# reads from items_meta['embeddings'] and concatenates in forward().
input_dim = 32 + sum(
    CONFIG[size_key]
    for size_key in (
        'user_emb_size',
        'item_emb_size',
        'source_emb_size',
        'age_emb_size',
        'duration_emb_size',
        'gender_emb_size',
    )
)

In [7]:
# Model creation
# Build the network with default table sizes, then move its parameters to the
# selected device before the optimizer captures them.
model = MLPModel(input_dim=input_dim)
model = model.to(device)

# Multi-class loss over the raw scores returned by the model.
criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=CONFIG['LR'])

In [8]:
# Train
# Mini-batch training over `train` in its stored row order (no shuffling).
# Each batch is a contiguous slice; per-row user/item features are fetched from
# the metadata frames by id and converted to tensors on `device`.
num_samples = len(train)
num_batches = (num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

for epoch in range(CONFIG['EPOCHS']):
    # Make training mode explicit so the loop does not depend on whatever mode
    # an earlier cell may have left the model in.
    model.train()
    running_loss = 0.0
    with tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{CONFIG['EPOCHS']}", unit="batch") as t:
        for batch_idx in t:
            start_idx = batch_idx * CONFIG['BATCH_SIZE']
            end_idx = min(start_idx + CONFIG['BATCH_SIZE'], num_samples)
            batch = train.iloc[start_idx:end_idx]

            # Host-side id arrays, reused for both the metadata lookups and the
            # id tensors. This avoids copying ids back from the device.
            user_ids_np = batch['user_id'].values
            item_ids_np = batch['item_id'].values

            # One label lookup per metadata frame instead of one per feature.
            item_rows = items_meta.loc[item_ids_np, ['source_id', 'duration', 'embeddings']]
            user_rows = users_meta.loc[user_ids_np, ['age', 'gender']]

            batch_user_ids = torch.tensor(user_ids_np, dtype=torch.long, device=device)
            batch_item_ids = torch.tensor(item_ids_np, dtype=torch.long, device=device)
            batch_source_ids = torch.tensor(item_rows['source_id'].values, dtype=torch.long, device=device)
            batch_age_ids = torch.tensor(user_rows['age'].values, dtype=torch.long, device=device)
            batch_duration_ids = torch.tensor(item_rows['duration'].values, dtype=torch.long, device=device)
            batch_gender_ids = torch.tensor(user_rows['gender'].values, dtype=torch.long, device=device)

            # The 'embeddings' column holds one vector per item; stack them into
            # a (batch, width) float matrix.
            embeddings = torch.tensor(np.stack(item_rows['embeddings'].values), device=device, dtype=torch.float32)

            targets = torch.tensor(batch['target'].values, dtype=torch.long, device=device)

            optimizer.zero_grad()
            outputs = model(batch_user_ids, batch_item_ids, batch_source_ids, batch_age_ids, batch_duration_ids, batch_gender_ids, embeddings)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            running_loss += loss.item()
            t.set_postfix(mean_loss=f"{running_loss / (batch_idx + 1):.6f}")

    # With an empty `train` there are no batches; report NaN instead of raising
    # ZeroDivisionError.
    epoch_loss = running_loss / num_batches if num_batches else float('nan')
    print(f"Epoch [{epoch+1}/{CONFIG['EPOCHS']}], Loss: {epoch_loss:.4f}")

Epoch 1/3: 100%|██████████| 8891/8891 [52:43<00:00,  2.81batch/s, mean_loss=0.133795]


Epoch [1/3], Loss: 0.1338


Epoch 2/3: 100%|██████████| 8891/8891 [1:00:54<00:00,  2.43batch/s, mean_loss=0.124481]


Epoch [2/3], Loss: 0.1245


Epoch 3/3: 100%|██████████| 8891/8891 [58:56<00:00,  2.51batch/s, mean_loss=0.122420]  

Epoch [3/3], Loss: 0.1224





In [9]:
# Save model_state
# Build the checkpoint path once so the file written and the path reported can
# never disagree.
checkpoint_path = f"{CONFIG['models_folder']}{CONFIG['model_path']}"

# Only the parameters are stored, under the "model_state_dict" key, so loading
# requires re-creating MLPModel first.
torch.save({"model_state_dict": model.state_dict()}, checkpoint_path)

# The earlier message embedded a stray literal "f" before the quoted path.
print(f"Модель сохранена в '{checkpoint_path}'")

Модель сохранена в f'C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/5.2.pth'
