In [1]:
# --- Paths ------------------------------------------------------------------
# Input dataset directory and the files read from it.
orig_data_folder = "/kaggle/input/vkrecsys/"
train_parquet = "train_interactions.parquet"
folds_path = "fold.csv"

# Output directory for checkpoints and the base checkpoint file name.
models_folder = "/kaggle/working/"
model_path = "3.1.5_MLP.pth"

# NOTE(review): absolute path on a local Windows machine, unlike the Kaggle paths
# above; no visible cell reads it — confirm it is still needed.
custom_data_folder = "C:/Users/Николай/PycharmProjects/VKRecSys/custom_data/"

# --- Model / training hyper-parameters ---------------------------------------
BATCH_SIZE = 2 ** 14   # 16384 rows per optimisation step
NUM_CROSS_LAYERS = 1   # not referenced by any visible cell
LR = 1e-3              # Adam learning rate
EPOCHS = 3             # passes over the training rows per fold

In [2]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
import pandas as pd
from tqdm import tqdm

In [3]:
# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Seed every random number generator used by the notebook with the same value
# so that repeated runs start from the same state.
np.random.seed(42)
torch.manual_seed(42)
torch.cuda.manual_seed_all(42)

In [4]:
# Interaction log: the separate like/dislike columns are folded into a single
# class label named `target`.
train = pd.read_parquet(f'{orig_data_folder}{train_parquet}', engine='pyarrow')
# Signed reaction: a dislike value of 1 is turned into -1 and added to `like`
# (assumes both columns are 0/1 flags — TODO confirm against the dataset schema).
train['like'] = train['like'] + train['dislike'].replace({1: -1})
train.drop(columns=['dislike'], inplace=True)
train['like'] = train['like'].astype('int8')
train.rename(columns={'like' : 'target'}, inplace=True)
# Shift the signed value -1/0/1 to the class indices 0/1/2 used as the loss target.
train['target'] = train['target'].replace({-1:0, 0:1, 1:2})

# Item metadata, indexed by item_id so rows can be fetched with .loc[item_id].
items_meta = pd.read_parquet(f'{orig_data_folder}items_meta.parquet', engine='pyarrow')
items_meta['item_id'] = items_meta['item_id'].astype('category')
# source_id is categorical; its category codes are what the model's source embedding receives.
items_meta['source_id'] = items_meta['source_id'].astype('category')
items_meta.set_index('item_id', inplace=True)

# User metadata, indexed by user_id so rows can be fetched with .loc[user_id].
users_meta = pd.read_parquet(f'{orig_data_folder}users_meta.parquet', engine='pyarrow')
users_meta['user_id'] = users_meta['user_id'].astype('category')
users_meta['gender'] = users_meta['gender'].astype('category')
# NOTE(review): this remaps age values 1 -> 0 and 2 -> 1, which looks like an encoding
# meant for a binary column such as gender rather than for age — confirm the intent.
users_meta['age'] = users_meta['age'].replace({1:0, 2:1})
users_meta.set_index('user_id', inplace=True)

# Convert the embeddings column into a dict keyed by item_id.
item_embeddings_dict = items_meta['embeddings'].to_dict()

In [5]:
# Fold assignment table; its `fold` column is later used as a row mask over `train`.
# The file name comes from the `folds_path` config constant instead of being repeated here.
folds = pd.read_csv(f'{orig_data_folder}{folds_path}')

In [6]:
def normalize_column(column):
    """Min-max scale ``column`` into the [0, 1] range.

    Args:
        column: Numeric pandas Series (or NumPy array) to rescale.

    Returns:
        ``(column - min) / (max - min)`` computed element-wise. When every
        value is identical the range is zero; in that case a column of zeros
        is returned instead of the NaN values a 0/0 division would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    shifted = column - lowest
    if span == 0:
        # Constant column: multiply by 0.0 so the result is float zeros
        # (and any missing values stay missing).
        return shifted * 0.0
    return shifted / span

In [7]:
# Rescale the two continuous features to [0, 1] before they are fed to the model.
users_meta['age'] = users_meta['age'].pipe(normalize_column)
items_meta['duration'] = items_meta['duration'].pipe(normalize_column)

In [8]:
# Embedding-table sizes.
# The model indexes its user/item embedding tables with the raw ids, so each table
# needs at least `max id + 1` rows. `nunique()` only equals that when the ids are
# exactly 0..n-1; with any gap it would produce an out-of-range lookup.
# For contiguous ids both expressions give the same value.
num_users = int(train['user_id'].max()) + 1
num_items = int(train['item_id'].max()) + 1
# Sources are fed to the model as category codes, which run 0..n_categories-1.
num_sources = len(items_meta['source_id'].cat.categories)

In [9]:
class MLPModel(nn.Module):
    """Feed-forward classifier over user, item and source features.

    User, item and source ids are each mapped to a learned 256-dimensional
    embedding. The three embeddings are concatenated with the remaining
    feature tensors and passed through nine linear layers; Softplus is applied
    after every layer except the last, whose raw outputs are returned.

    Args:
        num_users: Number of rows in the user embedding table.
        num_items: Number of rows in the item embedding table.
        num_sources: Number of rows in the source embedding table.
        input_dim: Width of the concatenated input vector seen by ``fc1``.
        hidden_dim: Output width of ``fc1``.
        output_dim: Output width of the last layer ``fc9``.
    """

    def __init__(self, num_users, num_items, num_sources, input_dim, hidden_dim=2048, output_dim=3):
        super().__init__()

        # Learned lookup tables, one per id type.
        self.user_embedding = nn.Embedding(num_users, 256)
        self.item_embedding = nn.Embedding(num_items, 256)
        self.source_embedding = nn.Embedding(num_sources, 256)

        # MLP stack: consecutive widths define layers fc1 .. fc9, created in
        # order so parameter names and initialisation order stay fixed.
        widths = (input_dim, hidden_dim, 2048, 1024, 512, 512, 256, 256, 128, output_dim)
        for number, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f"fc{number}", nn.Linear(fan_in, fan_out))

        self.activation = nn.Softplus()

    def forward(self, user_ids, item_ids, source_ids, age, duration, gender, embeddings):
        """Return the ``output_dim`` raw scores for each row of the batch.

        The id tensors are looked up in the embedding tables; ``age``,
        ``duration``, ``gender`` and ``embeddings`` are concatenated as given
        along dim 1, so they must be 2-D with one row per sample.
        """
        features = torch.cat(
            (
                self.user_embedding(user_ids),
                self.item_embedding(item_ids),
                self.source_embedding(source_ids),
                age,
                duration,
                gender,
                embeddings,
            ),
            dim=1,
        )

        hidden = features
        for layer in (self.fc1, self.fc2, self.fc3, self.fc4,
                      self.fc5, self.fc6, self.fc7, self.fc8):
            hidden = self.activation(layer(hidden))

        # No activation on the final layer.
        return self.fc9(hidden)

# Width of the concatenated input vector passed to the model's first linear layer:
# three 256-d learned embeddings (user, item, source), the item content embedding
# (assumed 32-d — TODO confirm) and three scalar features (age, duration, gender).
input_dim = 3 * 256 + 32 + 3

In [10]:
import gc  # garbage collector, used to release memory between folds

# Train one independent model per fold and save its weights.
for fold in range(4):
    print(f"Обучение модели для fold {fold}...")

    # Rows whose fold label equals `fold` are held out; all other rows are trained on.
    train_data = train[folds['fold'] != fold]
    val_data = train[folds['fold'] == fold]  # not used inside this loop

    # Fresh model, optimiser and loss for every fold so no state carries over.
    model = MLPModel(num_users, num_items, num_sources, input_dim).to(device)
    optimizer = Adam(model.parameters(), lr=LR)
    criterion = nn.CrossEntropyLoss()

    num_samples = len(train_data)
    num_batches = (num_samples + BATCH_SIZE - 1) // BATCH_SIZE  # ceiling division

    # NOTE(review): batches are taken in row order and are not reshuffled between
    # epochs — confirm this is intended.
    for epoch in range(EPOCHS):
        running_loss = 0.0
        with tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{EPOCHS} (Fold {fold})", unit="batch") as t:
            for batch_idx in t:
                start_idx = batch_idx * BATCH_SIZE
                end_idx = min(start_idx + BATCH_SIZE, num_samples)
                batch = train_data.iloc[start_idx:end_idx]

                # Keep the ids as NumPy arrays: they are used both as pandas lookup
                # keys and as tensor data, so there is no need to copy them back
                # from the GPU later.
                user_ids_np = batch['user_id'].values
                item_ids_np = batch['item_id'].values

                # One metadata lookup per table per batch instead of one per feature.
                item_rows = items_meta.loc[item_ids_np, ['source_id', 'duration', 'embeddings']]
                user_rows = users_meta.loc[user_ids_np, ['age', 'gender']]

                batch_user_ids = torch.tensor(user_ids_np, dtype=torch.long, device=device)
                batch_item_ids = torch.tensor(item_ids_np, dtype=torch.long, device=device)
                # Categorical features enter the model as their integer category codes.
                batch_source_ids = torch.tensor(item_rows['source_id'].cat.codes.values, dtype=torch.long, device=device)
                # Scalar features get a trailing dimension so they can be concatenated column-wise.
                batch_age = torch.tensor(user_rows['age'].values, dtype=torch.float32, device=device).unsqueeze(1)
                batch_duration = torch.tensor(item_rows['duration'].values, dtype=torch.float32, device=device).unsqueeze(1)
                batch_gender = torch.tensor(user_rows['gender'].cat.codes.values, dtype=torch.float32, device=device).unsqueeze(1)

                # Each cell of `embeddings` holds one vector; stack them into a 2-D array.
                embeddings = torch.tensor(np.stack(item_rows['embeddings'].values), device=device, dtype=torch.float32)

                targets = torch.tensor(batch['target'].values, dtype=torch.long, device=device)

                optimizer.zero_grad()
                outputs = model(batch_user_ids, batch_item_ids, batch_source_ids, batch_age, batch_duration, batch_gender, embeddings)
                loss = criterion(outputs, targets)
                loss.backward()
                optimizer.step()

                running_loss += loss.item()
                # Running mean of the per-batch losses seen so far in this epoch.
                t.set_postfix(mean_loss=f"{running_loss / (batch_idx + 1):.6f}")

        print(f"Epoch [{epoch+1}/{EPOCHS}], Fold {fold}, Loss: {running_loss / num_batches:.4f}")

    # Save the weights of the model trained for this fold.
    fold_model_path = f"{models_folder}fold_{fold}_{model_path}"
    torch.save({"model_state_dict": model.state_dict()}, fold_model_path)
    print(f"Модель для fold {fold} сохранена в {fold_model_path}")

    # Release memory before the next fold: drop the references, let Python collect
    # the objects first, and only then ask PyTorch to return its cached GPU blocks.
    del model, optimizer, criterion
    gc.collect()
    torch.cuda.empty_cache()
    print(f"VRAM очищена после fold {fold}.")

Обучение модели для fold 0...


Epoch 1/3 (Fold 0): 100%|██████████| 6664/6664 [22:10<00:00,  5.01batch/s, mean_loss=0.138331]


Epoch [1/3], Fold 0, Loss: 0.1383


Epoch 2/3 (Fold 0): 100%|██████████| 6664/6664 [21:52<00:00,  5.08batch/s, mean_loss=0.125388]


Epoch [2/3], Fold 0, Loss: 0.1254


Epoch 3/3 (Fold 0): 100%|██████████| 6664/6664 [21:47<00:00,  5.10batch/s, mean_loss=0.123699]


Epoch [3/3], Fold 0, Loss: 0.1237
Модель для fold 0 сохранена в /kaggle/working/fold_0_3.1.5_MLP.pth
VRAM очищена после fold 0.
Обучение модели для fold 1...


Epoch 1/3 (Fold 1): 100%|██████████| 6667/6667 [21:49<00:00,  5.09batch/s, mean_loss=0.136219]


Epoch [1/3], Fold 1, Loss: 0.1362


Epoch 2/3 (Fold 1): 100%|██████████| 6667/6667 [21:45<00:00,  5.11batch/s, mean_loss=0.125394]


Epoch [2/3], Fold 1, Loss: 0.1254


Epoch 3/3 (Fold 1): 100%|██████████| 6667/6667 [21:44<00:00,  5.11batch/s, mean_loss=0.123899]


Epoch [3/3], Fold 1, Loss: 0.1239
Модель для fold 1 сохранена в /kaggle/working/fold_1_3.1.5_MLP.pth
VRAM очищена после fold 1.
Обучение модели для fold 2...


Epoch 1/3 (Fold 2): 100%|██████████| 6670/6670 [21:51<00:00,  5.09batch/s, mean_loss=0.135451]


Epoch [1/3], Fold 2, Loss: 0.1355


Epoch 2/3 (Fold 2): 100%|██████████| 6670/6670 [21:54<00:00,  5.07batch/s, mean_loss=0.125385]


Epoch [2/3], Fold 2, Loss: 0.1254


Epoch 3/3 (Fold 2): 100%|██████████| 6670/6670 [21:52<00:00,  5.08batch/s, mean_loss=0.124018]


Epoch [3/3], Fold 2, Loss: 0.1240
Модель для fold 2 сохранена в /kaggle/working/fold_2_3.1.5_MLP.pth
VRAM очищена после fold 2.
Обучение модели для fold 3...


Epoch 1/3 (Fold 3): 100%|██████████| 6673/6673 [21:53<00:00,  5.08batch/s, mean_loss=0.135774]


Epoch [1/3], Fold 3, Loss: 0.1358


Epoch 2/3 (Fold 3): 100%|██████████| 6673/6673 [21:57<00:00,  5.07batch/s, mean_loss=0.125414]


Epoch [2/3], Fold 3, Loss: 0.1254


Epoch 3/3 (Fold 3): 100%|██████████| 6673/6673 [21:55<00:00,  5.07batch/s, mean_loss=0.124055]


Epoch [3/3], Fold 3, Loss: 0.1241
Модель для fold 3 сохранена в /kaggle/working/fold_3_3.1.5_MLP.pth
VRAM очищена после fold 3.
