In [1]:
CONFIG = {
    'data_folder' : 'C:/Users/Николай/PycharmProjects/VKRecSys/data/',
    'custom_data_folder' : 'C:/Users/Николай/PycharmProjects/VKRecSys/custom_data/',
    'models_folder' : 'C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/',
    
    
    'train_path' : 'train_interactions.parquet',
    'items_meta_path' : 'av4_items_meta.parquet',
    'users_meta_path' : 'av4_users_meta.parquet',
    'model_path' : '5.6.pth',
    
    'user_emb_size' : 256, # 183404
    'item_emb_size' : 256, # 337727
    'source_emb_size' : 256, # 19613
    'torch_precision' : 40, # number of decimal places for printing numbers
        
    'DEVICE' : 'cuda',
    'SEED' : 42,
    'BATCH_SIZE' : 16384,
    'LR' : 0.001,
    'EPOCHS' : 3,
    'output_dim' : 3
    
}

In [2]:
import numpy as np
import torch
import torch.nn as nn
from torch.optim import Adam
import pandas as pd
from tqdm import tqdm

In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

torch.manual_seed(42)  
torch.cuda.manual_seed_all(42)  
np.random.seed(42)  

In [4]:
# Loading data
train = pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['train_path']}", engine='pyarrow')
train['like'] = train['like'] + train['dislike'].replace({1: -1})
train.drop(columns=['dislike'], inplace=True)
train['like'] = train['like'].astype('int8')
train.rename(columns={'like': 'target'}, inplace=True)
train['target'] = train['target'].replace({-1: 0, 0: 1, 1: 2})
items_meta = pd.read_parquet(f"{CONFIG['custom_data_folder']}{CONFIG['items_meta_path']}", engine='pyarrow')
users_meta = pd.read_parquet(f"{CONFIG['custom_data_folder']}{CONFIG['users_meta_path']}", engine='pyarrow')
users_meta['gender'] = users_meta['gender'].replace({1: 0, 2: 1})
# Normalization of numeric features
users_meta['age'] = (users_meta['age'] - users_meta['age'].min()) / (users_meta['age'].max() - users_meta['age'].min())
items_meta['duration'] = (items_meta['duration'] - items_meta['duration'].min()) / (
items_meta['duration'].max() - items_meta['duration'].min())

In [5]:
folds = pd.read_csv(f"{CONFIG['data_folder']}fold.csv")

In [6]:
# Model definition
class MLPModel(nn.Module):
    def __init__(self, 
                 input_dim, 
                 num_users=users_meta.index.nunique(), 
                 num_items=items_meta.index.nunique(), 
                 num_sources=items_meta['source_id'].nunique(),
                 output_dim=CONFIG['output_dim'],
                 dropout_rate=0.2):  # Добавлен параметр dropout_rate
        
        super(MLPModel, self).__init__()
        self.user_embedding = nn.Embedding(num_users, CONFIG['user_emb_size'])
        self.item_embedding = nn.Embedding(num_items, CONFIG['item_emb_size'])
        self.source_embedding = nn.Embedding(num_sources, CONFIG['source_emb_size'])

        self.fc1 = nn.Linear(input_dim, 1024)
        self.fc2 = nn.Linear(1024, 1024)
        self.fc3 = nn.Linear(1024, 512)
        self.fc4 = nn.Linear(512, 256)
        self.fc5 = nn.Linear(256, 256)
        self.fc6 = nn.Linear(256, 128)
        self.fc7 = nn.Linear(128, 128)
        self.fc8 = nn.Linear(128, 64)
        self.fc9 = nn.Linear(64, output_dim)
        
        self.gelu = nn.GELU()

    def forward(self, user_ids, item_ids, source_ids, embeddings, u, i):
        user_emb = self.user_embedding(user_ids)
        item_emb = self.item_embedding(item_ids)
        source_emb = self.source_embedding(source_ids)

        x = torch.cat((user_emb, item_emb, source_emb, embeddings, u, i), dim=1)
        
        x = self.gelu(self.fc1(x))
        
        x = self.gelu(self.fc2(x))
        
        x = self.gelu(self.fc3(x))
        
        x = self.gelu(self.fc4(x))
        
        x = self.gelu(self.fc5(x))
        
        x = self.gelu(self.fc6(x))
        
        x = self.gelu(self.fc7(x))
        
        x = self.gelu(self.fc8(x))
        
        x = self.fc9(x)
        return x

In [7]:
input_dim = 1 + 1 + 1 + CONFIG['user_emb_size'] + CONFIG['item_emb_size'] + CONFIG['source_emb_size'] + 32 + 24

In [8]:
import gc  # Для сборщика мусора

for fold in range(4):
    print(f"Обучение модели для fold {fold}...")
    
    # Разделение данных на train и validation
    train_data = train[folds['fold'] != fold]
    val_data = train[folds['fold'] == fold]
    
    # Model creation
    model = MLPModel(input_dim).to(device)
    criterion = nn.CrossEntropyLoss()
    optimizer = Adam(model.parameters(), lr=CONFIG['LR'])
    
    # Обучение модели
    num_samples = len(train_data)
    num_batches = (num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']
    
    for epoch in range(CONFIG['EPOCHS']):
        running_loss = 0.0
        with tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{CONFIG['EPOCHS']}", unit="batch") as t:
            for batch_idx in t:
                start_idx = batch_idx * CONFIG['BATCH_SIZE']
                end_idx = min(start_idx + CONFIG['BATCH_SIZE'], num_samples)
                batch = train.iloc[start_idx:end_idx]
    
                batch_user_ids = torch.tensor(batch['user_id'].values, dtype=torch.long, device=device)
                batch_item_ids = torch.tensor(batch['item_id'].values, dtype=torch.long, device=device)
                batch_source_ids = torch.tensor(items_meta.loc[batch['item_id'].values, 'source_id'].values, dtype=torch.long, device=device)
                
                users_features = users_meta.loc[batch['user_id'].values]  
                users_features = torch.tensor(users_features.values, dtype=torch.float32, device=device)
                
                items_features = items_meta.loc[batch['item_id'].values].drop(columns=['source_id', 'embeddings'])
                items_features = torch.tensor(items_features.values, dtype=torch.float32, device=device)
                
                item_indices = batch_item_ids.cpu().numpy()
                embeddings = torch.tensor(np.stack(items_meta.loc[item_indices, 'embeddings'].values), device=device, dtype=torch.float32)
                
                
                targets = torch.tensor(batch['target'].values, dtype=torch.long, device=device)
    
                optimizer.zero_grad()
                outputs = model(batch_user_ids, batch_item_ids, batch_source_ids, embeddings, users_features, items_features)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()
                
                running_loss += loss.item()
                t.set_postfix(mean_loss=f"{running_loss / (batch_idx + 1):.6f}")
    
        print(f"Epoch [{epoch+1}/{CONFIG['EPOCHS']}], Fold {fold}, Loss: {running_loss / num_batches:.4f}")
    
    # Сохранение модели для текущего fold
    fold_model_path = f"{CONFIG['models_folder']}_{CONFIG['model_path']}_fold_{fold}"
    torch.save({"model_state_dict": model.state_dict()}, fold_model_path)
    print(f"Модель для fold {fold} сохранена в {fold_model_path}")

    # Очистка VRAM
    del model, optimizer, criterion  # Удаляем объекты модели и оптимизатора
    torch.cuda.empty_cache()  # Очищаем видеопамять
    gc.collect()  # Сбор мусора в системе
    print(f"VRAM очищена после fold {fold}.")

Обучение модели для fold 0...


Epoch 1/3: 100%|██████████| 6664/6664 [28:54<00:00,  3.84batch/s, mean_loss=0.130121]


Epoch [1/3], Fold 0, Loss: 0.1301


Epoch 2/3: 100%|██████████| 6664/6664 [33:37<00:00,  3.30batch/s, mean_loss=0.122709]


Epoch [2/3], Fold 0, Loss: 0.1227


Epoch 3/3: 100%|██████████| 6664/6664 [32:13<00:00,  3.45batch/s, mean_loss=0.119192]


Epoch [3/3], Fold 0, Loss: 0.1192
Модель для fold 0 сохранена в C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/_5.6.pth_fold_0
VRAM очищена после fold 0.
Обучение модели для fold 1...


Epoch 1/3: 100%|██████████| 6667/6667 [34:28<00:00,  3.22batch/s, mean_loss=0.130236]


Epoch [1/3], Fold 1, Loss: 0.1302


Epoch 2/3: 100%|██████████| 6667/6667 [33:02<00:00,  3.36batch/s, mean_loss=0.122780]


Epoch [2/3], Fold 1, Loss: 0.1228


Epoch 3/3: 100%|██████████| 6667/6667 [31:49<00:00,  3.49batch/s, mean_loss=0.119294]


Epoch [3/3], Fold 1, Loss: 0.1193
Модель для fold 1 сохранена в C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/_5.6.pth_fold_1
VRAM очищена после fold 1.
Обучение модели для fold 2...


Epoch 1/3: 100%|██████████| 6670/6670 [33:30<00:00,  3.32batch/s, mean_loss=0.130327]


Epoch [1/3], Fold 2, Loss: 0.1303


Epoch 2/3: 100%|██████████| 6670/6670 [34:06<00:00,  3.26batch/s, mean_loss=0.122799]


Epoch [2/3], Fold 2, Loss: 0.1228


Epoch 3/3: 100%|██████████| 6670/6670 [34:47<00:00,  3.20batch/s, mean_loss=0.119276]


Epoch [3/3], Fold 2, Loss: 0.1193
Модель для fold 2 сохранена в C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/_5.6.pth_fold_2
VRAM очищена после fold 2.
Обучение модели для fold 3...


Epoch 1/3: 100%|██████████| 6673/6673 [35:19<00:00,  3.15batch/s, mean_loss=0.130021]


Epoch [1/3], Fold 3, Loss: 0.1300


Epoch 2/3: 100%|██████████| 6673/6673 [35:02<00:00,  3.17batch/s, mean_loss=0.122762]


Epoch [2/3], Fold 3, Loss: 0.1228


Epoch 3/3: 100%|██████████| 6673/6673 [35:11<00:00,  3.16batch/s, mean_loss=0.119233]


Epoch [3/3], Fold 3, Loss: 0.1192
Модель для fold 3 сохранена в C:/Users/Николай/PycharmProjects/VKRecSys/B.Processing/Модели/_5.6.pth_fold_3
VRAM очищена после fold 3.
