In [1]:
# --- Training hyperparameters -------------------------------------------
EPOCHS = 3                # full passes over the interaction log
LR = 1e-3                 # Adam learning rate
BATCH_SIZE = 16_384       # rows of the interaction log per optimisation step
NUM_CROSS_LAYERS = 1      # depth handed to the DCN cross network

# --- File-system locations ----------------------------------------------
# Folder prefix used when reading the parquet inputs.
orig_data_folder = "/kaggle/input/vkrecsys/"
# Local development folder; not referenced by the cells visible in this notebook.
custom_data_folder = "C:/Users/Николай/PycharmProjects/VKRecSys/custom_data/"
# Checkpoint destination is models_folder + model_path.
models_folder = "/kaggle/working/"
model_path = "3.1.3_DCN_MLP.pth"
# File name of the interaction log inside orig_data_folder.
train_parquet = "train_interactions.parquet"

In [2]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader
import torch.nn as nn
from torch.optim import Adam
import torch.nn.functional as F
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import LabelEncoder
import torch.nn as nn
import torch.optim as optim
from tqdm import tqdm

In [3]:
# Run on the GPU when one is visible to torch, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")

# Seed every random number generator the notebook draws from with the same
# value: torch (CPU), torch (all CUDA devices), then NumPy's global generator.
for seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
    seed_fn(42)

In [4]:
# Interaction log: the frame the training loop later slices into batches.
train = pd.read_parquet(f'{orig_data_folder}{train_parquet}', engine='pyarrow')
# Fold the two feedback columns into a single signed value: a dislike of 1
# contributes -1. Presumably 'like' and 'dislike' are 0/1 flags, so the sum
# lies in {-1, 0, 1} — TODO confirm against the dataset description.
train['like'] = train['like'] + train['dislike'].replace({1: -1})
train.drop(columns=['dislike'], inplace=True)
train['like'] = train['like'].astype('int8')
train.rename(columns={'like' : 'target'}, inplace=True)
# Shift the signed value to class indices (-1 -> 0, 0 -> 1, 1 -> 2) so it can
# be used directly as a class label.
train['target'] = train['target'].replace({-1:0, 0:1, 1:2})

# Item metadata, indexed by item_id; source_id is made categorical so that
# its integer codes can be used for embedding lookups.
items_meta = pd.read_parquet(f'{orig_data_folder}items_meta.parquet', engine='pyarrow')
items_meta['item_id'] = items_meta['item_id'].astype('category')
items_meta['source_id'] = items_meta['source_id'].astype('category')
items_meta.set_index('item_id', inplace=True)

# User metadata, indexed by user_id; gender is made categorical so that its
# integer codes can be fed to the model.
users_meta = pd.read_parquet(f'{orig_data_folder}users_meta.parquet', engine='pyarrow')
users_meta['user_id'] = users_meta['user_id'].astype('category')
users_meta['gender'] = users_meta['gender'].astype('category')
# NOTE(review): this remaps the values 1 and 2 of 'age' to 0 and 1 and leaves
# every other value untouched. It looks like a mapping written for a
# binary-coded column — confirm it is really intended for 'age'.
users_meta['age'] = users_meta['age'].replace({1:0, 2:1})
users_meta.set_index('user_id', inplace=True)

In [5]:
class EmbeddingLayer(nn.Module):
    """Thin ``nn.Module`` wrapper around a single ``nn.Embedding`` table."""

    def __init__(self, num_embeddings, embedding_dim):
        """Create the lookup table.

        Args:
            num_embeddings: number of rows in the table (valid indices are
                ``0 .. num_embeddings - 1``).
            embedding_dim: width of every embedding vector.
        """
        super().__init__()
        # The attribute name is part of the interface: other cells read
        # ``.embedding.embedding_dim`` / ``.embedding.weight`` and the saved
        # state dict uses the key ``embedding.weight``.
        self.embedding = nn.Embedding(
            num_embeddings=num_embeddings,
            embedding_dim=embedding_dim,
        )

    def forward(self, x):
        """Return the table rows selected by the integer index tensor ``x``."""
        table = self.embedding
        return table(x)

In [6]:
def normalize_column(column):
    """Min-max scale ``column`` into the range [0, 1].

    Args:
        column: a pandas Series or NumPy array of numeric values.

    Returns:
        ``(column - min) / (max - min)`` with the same shape as ``column``.
        When every value is identical (``max == min``) the result is all
        zeros instead of the NaNs a 0/0 division would produce.
    """
    lowest = column.min()
    span = column.max() - lowest
    # A constant column has zero range; dividing by 1 instead maps every value
    # to 0.0 and keeps the result finite. The scalar check leaves non-scalar
    # ranges (e.g. a DataFrame normalised column-wise) on the original path.
    if np.ndim(span) == 0 and span == 0:
        span = 1
    return (column - lowest) / span

In [7]:
# Rescale the two numeric side features to [0, 1] in place on their frames.
for frame, column_name in ((users_meta, 'age'), (items_meta, 'duration')):
    frame[column_name] = normalize_column(frame[column_name])

In [8]:
# Sizes of the embedding tables.
#
# The training loop indexes the user/item tables with the raw ids from `train`,
# so each table needs `max id + 1` rows. The number of distinct ids is only
# large enough when the ids happen to be contiguous from 0; sizing by the
# largest id is correct in both cases and gives the same value when they are.
num_users = int(train['user_id'].max()) + 1
num_items = int(train['item_id'].max()) + 1
# Sources are looked up by categorical code, which ranges over the categories.
num_sources = len(items_meta['source_id'].cat.categories)

In [9]:
# One trainable 256-wide table per id space, created in user -> item -> source
# order and moved to the training device.
user_embedding, item_embedding, source_embedding = (
    EmbeddingLayer(table_rows, 256).to(device)
    for table_rows in (num_users, num_items, num_sources)
)

In [10]:
# Width of one concatenated model input row: the three learned embeddings,
# the precomputed item vector and the three scalar side features.
input_dim = sum((
    user_embedding.embedding.embedding_dim,
    item_embedding.embedding.embedding_dim,
    source_embedding.embedding.embedding_dim,
    32,  # items_meta 'embeddings' vector — assumed to be 32 wide, TODO confirm
    1,   # gender
    1,   # age
    1,   # duration
))

In [11]:
class DCN(nn.Module):
    """Stack of cross layers: ``x <- x0 * (x @ w) + b + x``.

    Each layer owns a weight of shape ``(input_dim, 1)`` and a bias of shape
    ``(input_dim,)``, both drawn from a standard normal distribution.
    """

    def __init__(self, input_dim, num_cross_layers):
        """Create ``num_cross_layers`` weight/bias pairs for ``input_dim`` features."""
        super().__init__()
        self.input_dim = input_dim
        self.num_cross_layers = num_cross_layers

        # Cross-layer parameters. All weights are drawn before any bias.
        weights = []
        for _ in range(num_cross_layers):
            weights.append(nn.Parameter(torch.randn(input_dim, 1)))
        self.cross_weights = nn.ParameterList(weights)

        biases = []
        for _ in range(num_cross_layers):
            biases.append(nn.Parameter(torch.randn(input_dim)))
        self.cross_biases = nn.ParameterList(biases)

    def forward(self, x):
        """Apply the cross layers to ``x`` (last dimension must be ``input_dim``)."""
        # Keep the original input: every layer multiplies it back in.
        x0 = x
        for layer_idx in range(self.num_cross_layers):
            weight = self.cross_weights[layer_idx]
            bias = self.cross_biases[layer_idx]
            # One scalar per row, broadcast over the features of x0.
            projection = x @ weight
            x = x0 * projection + bias + x
        return x

class DCNWithMLP(nn.Module):
    """Nine-layer softplus MLP producing ``output_dim`` class logits.

    A ``BatchNorm1d`` and a ``DCN`` module are also constructed and registered
    (so they appear in ``parameters()`` and in the state dict), but
    ``forward`` does not apply either of them: the data path is the MLP only.
    """

    def __init__(self, input_dim, num_cross_layers=3, hidden_dim=2048, output_dim=3):
        """Build the registered sub-modules.

        Args:
            input_dim: width of one input row.
            num_cross_layers: depth passed to the (currently unused) ``DCN``.
            hidden_dim: output width of the first linear layer.
            output_dim: number of logits returned by the last linear layer.
        """
        super().__init__()

        # Input normalisation (registered, not applied in forward).
        self.batch_norm = nn.BatchNorm1d(input_dim)

        # Cross network (registered, not applied in forward).
        self.dcn = DCN(input_dim, num_cross_layers)

        # MLP: consecutive widths define fc1 .. fc9, created in that order.
        widths = [input_dim, hidden_dim, 2048, 1024, 512, 512, 256, 256, 128, output_dim]
        for position, (fan_in, fan_out) in enumerate(zip(widths, widths[1:]), start=1):
            setattr(self, f"fc{position}", nn.Linear(fan_in, fan_out))
        self.softplus = nn.Softplus()

    def forward(self, x):
        """Return raw logits for ``x``; no softmax is applied."""
        # fc1 .. fc8 are each followed by softplus; fc9 is left linear.
        for position in range(1, 9):
            hidden_layer = getattr(self, f"fc{position}")
            x = self.softplus(hidden_layer(x))
        return self.fc9(x)

# Depth of the cross network, taken from the notebook configuration
num_cross_layers = NUM_CROSS_LAYERS

# Build the network, then move it to the selected device
model = DCNWithMLP(input_dim=input_dim, num_cross_layers=num_cross_layers)
model = model.to(device)

In [12]:
# Cross-entropy loss for multi-class classification
criterion = nn.CrossEntropyLoss()

# A single Adam instance updates the network and all three embedding tables;
# parameters are collected in model -> user -> item -> source order.
optimizer = Adam(
    [
        parameter
        for module in (model, user_embedding, item_embedding, source_embedding)
        for parameter in module.parameters()
    ],
    lr=LR,
)

In [13]:
# Stack the per-item 'embeddings' vectors into one float32 tensor on the
# training device; row i belongs to the i-th row of items_meta.
item_embeddings_array = torch.tensor(
    np.stack(items_meta['embeddings'].values),
    dtype=torch.float32,
    device=device,
)

# Map each item id to its row position in items_meta for fast access
item_id_to_index = dict(zip(items_meta.index, range(len(items_meta.index))))

In [14]:
num_samples = len(train)
num_batches = (num_samples + BATCH_SIZE - 1) // BATCH_SIZE
# Snapshot of the initial user-embedding weights; not read again in this cell.
user_embedding_weights_before = user_embedding.embedding.weight.clone().detach().cpu()

# Loop-invariant feature tables, one entry per row of items_meta / users_meta.
# Building them once replaces the per-batch pandas label lookups; a batch then
# only needs the row positions of its ids and gathers everything on `device`.
item_source_codes = torch.tensor(items_meta['source_id'].cat.codes.values, dtype=torch.long, device=device)
item_durations = torch.tensor(items_meta['duration'].values, dtype=torch.float32, device=device)
user_ages = torch.tensor(users_meta['age'].values, dtype=torch.float32, device=device)
user_gender_codes = torch.tensor(users_meta['gender'].cat.codes.values, dtype=torch.float32, device=device)


def _lookup_rows(index, labels, what):
    """Return a long tensor with the row position of every label in ``index``.

    Raises:
        KeyError: if any label is absent from ``index`` (the same condition
            under which a ``.loc`` lookup with these labels would fail).
    """
    rows = index.get_indexer(labels)
    not_found = rows < 0
    if not_found.any():
        raise KeyError(f"{what} ids missing from metadata index, e.g. {list(labels[not_found][:5])}")
    return torch.tensor(rows, dtype=torch.long, device=device)


for epoch in range(EPOCHS):
    running_loss = 0.0  # accumulated loss, used for the running mean
    with tqdm(range(num_batches), desc=f"Epoch {epoch+1}/{EPOCHS}", unit="batch") as t:
        for batch_idx in t:
            start_idx = batch_idx * BATCH_SIZE
            end_idx = min(start_idx + BATCH_SIZE, num_samples)

            # Batches are consecutive slices of `train` (no shuffling).
            batch = train.iloc[start_idx:end_idx]
            batch_user_values = batch['user_id'].values
            batch_item_values = batch['item_id'].values

            batch_user_ids = torch.tensor(batch_user_values, dtype=torch.long, device=device)
            batch_item_ids = torch.tensor(batch_item_values, dtype=torch.long, device=device)

            # Resolve each id to its metadata row once, then reuse the positions
            # for every feature that comes from that frame.
            item_rows = _lookup_rows(items_meta.index, batch_item_values, "item")
            user_rows = _lookup_rows(users_meta.index, batch_user_values, "user")

            batch_source_ids = item_source_codes[item_rows]
            batch_age_ids = user_ages[user_rows].unsqueeze(1)
            batch_duration_ids = item_durations[item_rows].unsqueeze(1)
            batch_gender_ids = user_gender_codes[user_rows].unsqueeze(1)

            user_emb = user_embedding(batch_user_ids)
            item_emb = item_embedding(batch_item_ids)
            source_emb = source_embedding(batch_source_ids)

            # Precomputed item vectors: item_embeddings_array is aligned with
            # the row order of items_meta, so the same positions select them.
            embeddings = item_embeddings_array[item_rows]

            inputs = torch.cat((
                user_emb,
                item_emb,
                source_emb,
                batch_age_ids,
                batch_duration_ids,
                batch_gender_ids,
                embeddings
            ), dim=1).float()

            targets = torch.tensor(batch['target'].values, dtype=torch.long, device=device)

            optimizer.zero_grad()
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

            batch_loss = loss.item()  # loss of the current batch
            running_loss += batch_loss  # accumulate the total loss

            # Show the current and the running-mean loss in the progress bar
            t.set_postfix(
                batch_loss=f"{batch_loss:.6f}",
                mean_loss=f"{running_loss / (batch_idx + 1):.6f}"
            )

    # Mean loss over the epoch; the guard avoids dividing by zero on an empty frame
    epoch_loss = running_loss / max(num_batches, 1)
    print(f"Epoch [{epoch+1}/{EPOCHS}], Mean Loss: {epoch_loss:.4f}")

Epoch 1/3: 100%|██████████| 8891/8891 [30:19<00:00,  4.89batch/s, batch_loss=0.130457, mean_loss=0.133235]


Epoch [1/3], Mean Loss: 0.1332


Epoch 2/3: 100%|██████████| 8891/8891 [30:21<00:00,  4.88batch/s, batch_loss=0.129136, mean_loss=0.125161]


Epoch [2/3], Mean Loss: 0.1252


Epoch 3/3: 100%|██████████| 8891/8891 [29:42<00:00,  4.99batch/s, batch_loss=0.127979, mean_loss=0.123919]

Epoch [3/3], Mean Loss: 0.1239





In [15]:
# Save the trained network and the three embedding tables in one checkpoint
save_path = f'{models_folder}{model_path}'

modules_to_save = {
    "model": model,
    "user_embedding": user_embedding,
    "item_embedding": item_embedding,
    "source_embedding": source_embedding,
}
# Keys follow the "<name>_state_dict" pattern.
state = {
    f"{module_name}_state_dict": module.state_dict()
    for module_name, module in modules_to_save.items()
}

torch.save(state, save_path)
print(f"Модель и эмбеддинги сохранены в {save_path}")

Модель и эмбеддинги сохранены в /kaggle/working/3.1.3_DCN_MLP.pth
