In [1]:
number = '7.6.20'

In [2]:
CONFIG = {
    'data_folder' : 'C:/Users/Николай/PycharmProjects/VKRecSys/data/',
    'val_pred_folder' : 'C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/val/',
    'test_pred_folder' : 'C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/test/',
    
    'train_path' : 'train_interactions.parquet',
    'test_path': 'test_pairs.csv',  
    'items_meta_path' : 'items_meta.parquet',
    'users_meta_path' : 'users_meta.parquet',
    'folds_path' : 'fold.csv',
    'val_output_path' : f'{number}_val',
    'test_output_path' : f'{number}_test',
    
    'user_emb_size' : 256, 
    'item_emb_size' : 256, 
    'source_emb_size' : 256, 
    'age_emb_size' : 256, 
    'duration_emb_size' : 256, 
    'gender_emb_size' : 256, 
    
    'DEVICE' : 'cuda',
    'SEED' : 42,
    'BATCH_SIZE' : 16384,
    'LR' : 0.001,
    'EPOCHS' : 1,
    'output_dim' : 3
    
}

In [3]:
# Import libs
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.optim import Adam

In [4]:
# Device, torch decimal places and seed for reproducibility
device = torch.device(CONFIG['DEVICE'] if torch.cuda.is_available() else "cpu")
torch.set_printoptions(precision=40) 

torch.manual_seed(CONFIG['SEED'])  
torch.cuda.manual_seed_all(CONFIG['SEED'])  
np.random.seed(CONFIG['SEED'])  

In [5]:
# Load and prepare data
train = pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['train_path']}", engine='pyarrow')
train['target'] = train['like'] + train['dislike'].replace({1: -1})
train.drop(columns=['like', 'dislike'], inplace=True)
train['target'] = train['target'].astype('int8')
train['target'] = train['target'].replace({-1:0, 0:1, 1:2})

test = pd.read_csv(f"{CONFIG['data_folder']}{CONFIG['test_path']}")
test_to_save = test.copy()

items_meta = pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['items_meta_path']}", engine='pyarrow')
items_meta['duration'] = items_meta['duration'] - 5
items_meta['item_id'] = items_meta['item_id'].astype('category')
items_meta['source_id'] = items_meta['source_id'].astype('category')
items_meta.set_index('item_id', inplace=True)

users_meta = pd.read_parquet(f"{CONFIG['data_folder']}{CONFIG['users_meta_path']}", engine='pyarrow')
users_meta['age'] = users_meta['age'] - 18
users_meta['gender'] = users_meta['gender'].replace({1:0, 2:1})
users_meta['user_id'] = users_meta['user_id'].astype('category')
users_meta['gender'] = users_meta['gender'].astype('category')
users_meta.set_index('user_id', inplace=True)

In [6]:
user_history = {
    user_id: {
        "embeddings_like": [],
        "embeddings_dislike": [],
        "embeddings_ignore": [],
    }
    for user_id in range(train['user_id'].nunique())
}

In [8]:
import torch.nn.init as init

class Model(nn.Module):
    def __init__(self, 
                 input_dim, 
                 num_users=users_meta.index.nunique(), 
                 num_items=items_meta.index.nunique(), 
                 num_sources=items_meta['source_id'].nunique(),
                 num_ages=users_meta['age'].nunique(),
                 num_durations=items_meta['duration'].nunique(),
                 num_genders=users_meta['gender'].nunique(), 
                 output_dim=CONFIG['output_dim']): 
        
        super(Model, self).__init__()
        self.user_embedding = nn.Embedding(num_users, CONFIG['user_emb_size'])
        self.item_embedding = nn.Embedding(num_items, CONFIG['item_emb_size'])
        self.source_embedding = nn.Embedding(num_sources, CONFIG['source_emb_size'])
        self.age_embedding = nn.Embedding(num_ages, CONFIG['age_emb_size'])
        self.duration_embedding = nn.Embedding(num_durations, CONFIG['duration_emb_size'])
        self.gender_embedding = nn.Embedding(num_genders, CONFIG['gender_emb_size'])
        
        self.user_embedding_dcn = nn.Embedding(num_users, CONFIG['user_emb_size'])
        self.item_embedding_dcn = nn.Embedding(num_items, CONFIG['item_emb_size'])
        self.source_embedding_dcn = nn.Embedding(num_sources, CONFIG['source_emb_size'])
        self.age_embedding_dcn = nn.Embedding(num_ages, CONFIG['age_emb_size'])
        self.duration_embedding_dcn = nn.Embedding(num_durations, CONFIG['duration_emb_size'])
        self.gender_embedding_dcn = nn.Embedding(num_genders, CONFIG['gender_emb_size'])
         
        self.fc1 = nn.Linear(input_dim, input_dim)
        self.fc2 = nn.Linear(input_dim, input_dim)
        self.fc3 = nn.Linear(input_dim, input_dim)
        self.fc4 = nn.Linear(input_dim + input_dim, output_dim)
        
        self.relu = nn.ReLU()
        
        self.num_cross_layers = 3
        
        self.cross_weights = nn.ParameterList(
            [nn.Parameter(torch.randn(input_dim, input_dim)) for _ in range(self.num_cross_layers)]
        )
        self.cross_biases = nn.ParameterList(
            [nn.Parameter(torch.randn(input_dim)) for _ in range(self.num_cross_layers)]
        )
        
        # Инициализация весов
        self._initialize_weights()

    def _initialize_weights(self):
        # Xavier для линейных слоев
        for layer in [self.fc1, self.fc2, self.fc3, self.fc4]:
            init.xavier_uniform_(layer.weight)
            if layer.bias is not None:
                init.zeros_(layer.bias)
        
        # Xavier для эмбеддингов
        for embedding in [self.user_embedding, self.item_embedding, self.source_embedding, 
                          self.age_embedding, self.duration_embedding, self.gender_embedding,
                          self.user_embedding_dcn, self.item_embedding_dcn, self.source_embedding_dcn, 
                          self.age_embedding_dcn, self.duration_embedding_dcn, self.gender_embedding_dcn]:
            init.xavier_uniform_(embedding.weight)
        
        # Xavier для DCNv2 слоев
        for weight in self.cross_weights:
            init.xavier_uniform_(weight)
        for bias in self.cross_biases:
            init.zeros_(bias)

    def DCNv2_forward(self, x):
        # Инициализируем x0
        x0 = x
        for i in range(self.num_cross_layers):
            x = x0 * (x @ self.cross_weights[i]) + self.cross_biases[i] + x
        return x
    
    def forward(self, user_ids, item_ids, source_ids, age_ids, duration_ids, gender_ids, embeddings, prev_embeddings):

        user_emb = self.user_embedding(user_ids)
        item_emb = self.item_embedding(item_ids)
        source_emb = self.source_embedding(source_ids)
        age_emb = self.age_embedding(age_ids)
        duration_emb = self.duration_embedding(duration_ids)
        gender_emb = self.gender_embedding(gender_ids)
        
        user_emb_dcn = self.user_embedding_dcn(user_ids)
        item_emb_dcn = self.item_embedding_dcn(item_ids)
        source_emb_dcn = self.source_embedding_dcn(source_ids)
        age_emb_dcn = self.age_embedding_dcn(age_ids)
        duration_emb_dcn = self.duration_embedding_dcn(duration_ids)
        gender_emb_dcn = self.gender_embedding_dcn(gender_ids)
        
        x = torch.cat((user_emb, item_emb, source_emb, age_emb, duration_emb, gender_emb, embeddings, prev_embeddings), dim=1)
        x_dcn = torch.cat((user_emb_dcn, item_emb_dcn, source_emb_dcn, age_emb_dcn, duration_emb_dcn, gender_emb_dcn, embeddings, prev_embeddings), dim=1)
        
        x_dcn = self.DCNv2_forward(x_dcn)
        
        x = self.relu(self.fc1(x))
        
        
        x_combined = torch.cat((x, x_dcn), dim=1)
        
        x_out = self.fc4(x_combined)
        
        return x_out

In [9]:
# Input dimension
input_dim = (CONFIG['user_emb_size'] + 
             CONFIG['item_emb_size'] + 
             CONFIG['source_emb_size'] + 
             CONFIG['age_emb_size'] +
             CONFIG['duration_emb_size'] + 
             CONFIG['gender_emb_size'] + 
             32 + 96)

In [10]:
# Model, criterion and optimizer
model = Model(input_dim).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = Adam(model.parameters(), lr=CONFIG['LR'])

In [11]:
train_targets = train['target'].tolist()
train_items = train['item_id'].tolist()
test_items = test['item_id'].tolist()
items_embs = items_meta['embeddings'].to_dict()
train_users = train['user_id'].tolist()
test_users = test['user_id'].tolist()

In [12]:
from scipy.spatial.distance import cosine
def get_user_embedding_with_targets(user_id, user_history, item_emb):
    
    like_emb = np.full(32, 0)
    dislike_emb = np.full(32, 0)
    ignore_emb = np.full(32, 0)

    liked_embs = user_history[user_id]['embeddings_like']
    disliked_embs = user_history[user_id]['embeddings_dislike']
    ignore_embs = user_history[user_id]['embeddings_dislike']

    if liked_embs:
        like_emb = min(liked_embs, key=lambda emb: cosine(emb, item_emb))
    
    if disliked_embs:
        dislike_emb = min(disliked_embs, key=lambda emb: cosine(emb, item_emb))

    if ignore_embs:
        ignore_emb = min(ignore_embs, key=lambda emb: cosine(emb, item_emb))

    return np.concatenate((like_emb, dislike_emb, ignore_emb))

In [13]:
def update_user_history(user_id, user_history, item_emb, target):
    
    if target == 2:
        user_history[user_id]['embeddings_like'].append(item_emb)
    
    elif target == 1:
        user_history[user_id]['embeddings_ignore'].append(item_emb)
        
    else:
        user_history[user_id]['embeddings_dislike'].append(item_emb)

In [14]:
# Training
train_num_samples = len(train)
train_num_batches = (train_num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

test_num_samples = len(test)
test_num_batches = (test_num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

for epoch in range(CONFIG['EPOCHS']):
##################################################################TRAIN##################################################################
    model.train()
    train_running_loss = 0.0

    with tqdm(range(train_num_batches), desc=f"Epoch {epoch + 1}/{CONFIG['EPOCHS']}", unit="batch") as t:
        for batch_idx in t:
            start_idx = batch_idx * CONFIG['BATCH_SIZE']
            end_idx = min(start_idx + CONFIG['BATCH_SIZE'], train_num_samples)
            
            batch_main = train.iloc[start_idx:end_idx]
            
            batch_user_values = batch_main['user_id'].values
            batch_item_values = batch_main['item_id'].values
            
            batch_users_meta = users_meta.loc[batch_user_values]
            batch_items_meta = items_meta.loc[batch_item_values]
            
            targets = batch_main['target'].values
            embeddings = np.stack(batch_items_meta['embeddings'].values)
            
            batch_embeddings = []
            
            for i in range(start_idx, end_idx):
                user_id = train_users[i]
                target = train_targets[i]
                item = train_items[i]
                emb = items_embs[item]
                user_emb = get_user_embedding_with_targets(user_id, user_history, emb)
                update_user_history(user_id, user_history, emb, target)
                batch_embeddings.append(user_emb)
            
            targets = torch.tensor(targets, dtype=torch.long, device=device)
            embeddings = torch.tensor(embeddings, device=device, dtype=torch.float32)

            batch_user_values = torch.tensor(batch_user_values, dtype=torch.long, device=device)
            batch_item_values = torch.tensor(batch_item_values, dtype=torch.long, device=device)

            batch_gender_values = torch.tensor(batch_users_meta['gender'].values, dtype=torch.long, device=device)
            batch_age_values = torch.tensor(batch_users_meta['age'].values, dtype=torch.long, device=device)
            batch_source_values = torch.tensor(batch_items_meta['source_id'].values, dtype=torch.long, device=device)
            batch_duration_values = torch.tensor(batch_items_meta['duration'].values, dtype=torch.long, device=device)

                        
            batch_embeddings = torch.tensor(np.array(batch_embeddings), dtype=torch.float32, device=device)
            
            optimizer.zero_grad()
            
            outputs = model(batch_user_values, 
                            batch_item_values, 
                            batch_source_values, 
                            batch_age_values, 
                            batch_duration_values, 
                            batch_gender_values, 
                            embeddings,
                            batch_embeddings)
            
            batch_loss = criterion(outputs, targets)
            batch_loss.backward()
            optimizer.step()

            train_running_loss += batch_loss.item()
            t.set_postfix(train_mean_loss=f"{train_running_loss / (batch_idx + 1):.6f}")
        
##################################################################EVAL##################################################################
    model.eval()
    
    outputs_list = []

    with torch.no_grad():
        with tqdm(range(test_num_batches), desc=f"Epoch {epoch + 1}/{CONFIG['EPOCHS']}", unit="batch") as v:
            for batch_idx in v:
                start_idx = batch_idx * CONFIG['BATCH_SIZE']
                end_idx = min(start_idx + CONFIG['BATCH_SIZE'], test_num_samples)
                
                batch_main = test.iloc[start_idx:end_idx]
                
                batch_user_values = batch_main['user_id'].values
                batch_item_values = batch_main['item_id'].values
    
                batch_users_meta = users_meta.loc[batch_user_values]
                batch_items_meta = items_meta.loc[batch_item_values]
                
                embeddings = np.stack(batch_items_meta['embeddings'].values)
                
                batch_embeddings = []
                
                for i in range(start_idx, end_idx):
                    user_id = test_users[i]
                    item = test_items[i]
                    emb = items_embs[item]
                    user_emb = get_user_embedding_with_targets(user_id, user_history, emb)
                    batch_embeddings.append(user_emb)
                    
                embeddings = torch.tensor(embeddings, device=device, dtype=torch.float32)

                batch_user_values = torch.tensor(batch_user_values, dtype=torch.long, device=device)
                batch_item_values = torch.tensor(batch_item_values, dtype=torch.long, device=device)
    
                batch_gender_values = torch.tensor(batch_users_meta['gender'].values, dtype=torch.long, device=device)
                batch_age_values = torch.tensor(batch_users_meta['age'].values, dtype=torch.long, device=device)
                batch_source_values = torch.tensor(batch_items_meta['source_id'].values, dtype=torch.long, device=device)
                batch_duration_values = torch.tensor(batch_items_meta['duration'].values, dtype=torch.long, device=device)
         
                batch_embeddings = torch.tensor(np.array(batch_embeddings), dtype=torch.float32, device=device)
                
                outputs = model(batch_user_values, 
                                batch_item_values, 
                                batch_source_values, 
                                batch_age_values, 
                                batch_duration_values, 
                                batch_gender_values, 
                                embeddings,
                                batch_embeddings)
                
                probabilities = F.softmax(outputs, dim=1)
                class_weights = torch.tensor([0, 1, 2], device=probabilities.device, dtype=probabilities.dtype)
                weighted_predictions = torch.sum(probabilities * class_weights, dim=1).cpu().numpy()
        
                outputs_list.extend(weighted_predictions)

##################################################################SAVE##################################################################
    df_outputs = pd.DataFrame(outputs_list, columns=['predict'])
    test_to_save['predict'] = df_outputs['predict']
    output_path = f"{CONFIG['test_pred_folder']}{CONFIG['test_output_path']}_e{epoch}.csv"
    test_to_save.to_csv(output_path, index=False)

    train_loss = train_running_loss / train_num_batches

    print('Outputs saved at', output_path)
    print(f"Epoch [{epoch + 1}/{CONFIG['EPOCHS']}]: Train Loss: {train_loss:.6f}")

Epoch 1/1: 100%|██████████| 7/7 [00:06<00:00,  1.10batch/s, train_mean_loss=0.444874]
Epoch 1/1: 100%|██████████| 102/102 [00:26<00:00,  3.88batch/s]


Outputs saved at C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/test/7.6.20_test_e0.csv
Epoch [1/1]: Train Loss: 0.444874


In [15]:
user_history[3810]

{'embeddings_like': [],
 'embeddings_dislike': [],
 'embeddings_ignore': [array([-1.62824228e-01, -2.78125286e-01,  4.80833650e-01,  7.75979906e-02,
         -1.17562540e-01,  1.50914816e-03,  9.31575820e-02,  4.45409156e-02,
          7.31663927e-02,  4.11580727e-02,  1.32241309e-01,  8.08857828e-02,
         -4.90048807e-03, -3.78018357e-02,  2.55614311e-01, -8.63022581e-02,
          7.60158710e-03, -1.77995354e-01, -5.19777741e-03, -1.96822826e-02,
         -1.20560318e-01,  4.60970943e-04, -2.21246984e-02, -1.96457729e-01,
          8.00502449e-02, -1.20071866e-01, -1.42696891e-02, -7.86759779e-02,
         -1.86622128e-01, -2.31833026e-01,  1.19009323e-01,  1.00954428e-01],
        dtype=float32),
  array([-0.01001257, -0.3558621 ,  0.00093723,  0.19572876, -0.15319543,
          0.04580693,  0.07114328,  0.07296939, -0.08358033, -0.06345286,
          0.18956222,  0.0527563 ,  0.00340621, -0.10992459,  0.22952643,
         -0.04658013,  0.07291584, -0.22779834, -0.11227264,  0.0