In [14]:
# Experiment identifier; CONFIG uses it as the prefix of the prediction file names.
number = "7.11.5"

In [15]:
# Central experiment settings: folders, file names, embedding widths and
# training hyperparameters. Folder values end with '/' because full paths
# are built by plain string concatenation (folder + file name).
_PROJECT_ROOT = 'C:/Users/Николай/PycharmProjects/VKRecSys/'

CONFIG = dict(
    # Folders
    data_folder=_PROJECT_ROOT + 'data/',
    val_pred_folder=_PROJECT_ROOT + 'C.Results/val/',
    test_pred_folder=_PROJECT_ROOT + 'C.Results/test/',

    # Input files (relative to data_folder) and output name stems
    train_path='train_interactions.parquet',
    test_path='test_pairs.csv',
    items_meta_path='items_meta.parquet',
    users_meta_path='users_meta.parquet',
    folds_path='fold.csv',
    val_output_path=f'{number}_val',
    test_output_path=f'{number}_test',

    # Width of each learned embedding table
    user_emb_size=256,
    item_emb_size=256,
    source_emb_size=256,
    age_emb_size=256,
    duration_emb_size=256,
    gender_emb_size=256,

    # Training setup
    DEVICE='cuda',
    SEED=42,
    BATCH_SIZE=16384,
    LR=0.001,
    EPOCHS=1,
    output_dim=3,
)

In [3]:
# Import libs
import numpy as np
from tqdm import tqdm
import pandas as pd
import torch.nn.functional as F
import torch
import torch.nn as nn
from torch.optim import Adam

In [4]:
# Runtime setup: compute device, tensor print precision and RNG seeding.
# Use the configured device only when CUDA is actually available; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device(CONFIG['DEVICE'])
else:
    device = torch.device("cpu")

# Print tensors with 40 digits of precision
torch.set_printoptions(precision=40)

# Seed torch, the CUDA generators and NumPy's global RNG with the same value (in this order)
for _seed_fn in (torch.manual_seed, torch.cuda.manual_seed_all, np.random.seed):
    _seed_fn(CONFIG['SEED'])

In [5]:
# Load and prepare data
def _data_file(config_key):
    """Return the full path of the data file whose name is stored under `config_key` in CONFIG."""
    return f"{CONFIG['data_folder']}{CONFIG[config_key]}"


# Interactions: combine the like / dislike flags into one class label.
# like + (dislike with 1 -> -1) gives -1 / 0 / +1, which is then remapped to class ids 0 / 1 / 2.
train = pd.read_parquet(_data_file('train_path'), engine='pyarrow')
train['target'] = (
    (train['like'] + train['dislike'].replace({1: -1}))
    .astype('int8')
    .replace({-1: 0, 0: 1, 1: 2})
)
train = train.drop(columns=['like', 'dislike'])

# Test pairs; an untouched copy is kept to attach predictions to later
test = pd.read_csv(_data_file('test_path'))
test_to_save = test.copy()

# Item metadata, indexed by item_id: duration is shifted down by 5, ids become categoricals
items_meta = pd.read_parquet(_data_file('items_meta_path'), engine='pyarrow')
items_meta = items_meta.assign(
    duration=items_meta['duration'] - 5,
    item_id=items_meta['item_id'].astype('category'),
    source_id=items_meta['source_id'].astype('category'),
).set_index('item_id')

# User metadata, indexed by user_id: age is shifted down by 18, gender codes 1/2 become 0/1
users_meta = pd.read_parquet(_data_file('users_meta_path'), engine='pyarrow')
users_meta = users_meta.assign(
    age=users_meta['age'] - 18,
    gender=users_meta['gender'].replace({1: 0, 2: 1}).astype('category'),
    user_id=users_meta['user_id'].astype('category'),
).set_index('user_id')

In [6]:
# Per-user history slots for item embeddings (newest, second newest, third newest).
# Every user gets their own zero vector of length 32 in each slot.
for _emb_slot in ('prev_emb', 'prev_emb_2', 'prev_emb_3'):
    users_meta[_emb_slot] = [np.zeros(32, dtype=np.float32) for _ in range(len(users_meta))]

In [7]:
# Per-user history slots for the targets that accompany the stored embeddings; all start at 0.
for _target_slot in ('prev_target', 'prev_target_2', 'prev_target_3'):
    users_meta[_target_slot] = 0

In [8]:
# Model definition
import torch.nn.init as init

class Model(nn.Module):
    """Embedding + MLP classifier for (user, item) pairs.

    Six id features (user, item, source, age, duration, gender) are embedded and
    concatenated with a dense `embeddings` tensor, then passed through twelve
    GELU-activated linear layers that narrow the representation to 8 values.
    Those 8 values are concatenated with `ids` (99 extra columns, so that the
    final layer sees 8 + 33 + 33 + 33 inputs) and projected to `output_dim` logits.

    The default vocabulary sizes are read from `users_meta` / `items_meta` once,
    when the class statement is executed.
    """

    def __init__(self, 
                 input_dim, 
                 num_users=users_meta.index.nunique(), 
                 num_items=items_meta.index.nunique(), 
                 num_sources=items_meta['source_id'].nunique(),
                 num_ages=users_meta['age'].nunique(),
                 num_durations=items_meta['duration'].nunique(),
                 num_genders=users_meta['gender'].nunique(), 
                 output_dim=CONFIG['output_dim']): 
        super().__init__()

        # Embedding tables. Creation order is kept fixed because each constructor
        # draws from the global RNG, so the order affects the initial weights.
        embedding_specs = (
            ('user_embedding', num_users, 'user_emb_size'),
            ('item_embedding', num_items, 'item_emb_size'),
            ('source_embedding', num_sources, 'source_emb_size'),
            ('age_embedding', num_ages, 'age_emb_size'),
            ('duration_embedding', num_durations, 'duration_emb_size'),
            ('gender_embedding', num_genders, 'gender_emb_size'),
        )
        for attr_name, vocab_size, width_key in embedding_specs:
            setattr(self, attr_name, nn.Embedding(vocab_size, CONFIG[width_key]))

        # Hidden stack fc1..fc12: consecutive widths define each layer's (in, out) sizes
        widths = (input_dim, 1024, 512, 512, 256, 128, 128, 64, 32, 32, 16, 8, 8)
        for layer_no, (fan_in, fan_out) in enumerate(zip(widths[:-1], widths[1:]), start=1):
            setattr(self, f'fc{layer_no}', nn.Linear(fan_in, fan_out))

        # Output layer: 8 hidden values plus three 33-wide blocks supplied through `ids`
        self.fc13 = nn.Linear(8 + 33 + 33 + 33, output_dim)

        self.gelu = nn.GELU()

        self._initialize_weights()

    def _initialize_weights(self):
        """Apply Xavier-uniform init to all Linear / Embedding weights and zero the Linear biases."""
        for module in self.modules():
            if isinstance(module, (nn.Linear, nn.Embedding)):
                init.xavier_uniform_(module.weight)
            if isinstance(module, nn.Linear) and module.bias is not None:
                init.zeros_(module.bias)

    def forward(self, user_ids, item_ids, source_ids, age_ids, duration_ids, gender_ids, embeddings, ids):
        """Return the raw class logits for a batch.

        The six `*_ids` tensors index the matching embedding tables, `embeddings`
        is concatenated with the looked-up vectors before the hidden stack, and
        `ids` is concatenated with the 8-wide hidden output before the last layer.
        """
        features = [
            self.user_embedding(user_ids),
            self.item_embedding(item_ids),
            self.source_embedding(source_ids),
            self.age_embedding(age_ids),
            self.duration_embedding(duration_ids),
            self.gender_embedding(gender_ids),
            embeddings,
        ]
        x = torch.cat(features, dim=1)

        # fc1 .. fc12, each followed by GELU
        for layer_no in range(1, 13):
            x = self.gelu(getattr(self, f'fc{layer_no}')(x))

        return self.fc13(torch.cat((x, ids), dim=1))

In [9]:
# Input dimension of the first linear layer:
# the six learned embedding widths plus 32 extra dense inputs.
input_dim = 32 + sum(
    CONFIG[size_key]
    for size_key in (
        'user_emb_size',
        'item_emb_size',
        'source_emb_size',
        'age_emb_size',
        'duration_emb_size',
        'gender_emb_size',
    )
)

In [10]:
# Build the network on the selected device, with cross-entropy loss and an Adam optimizer
model = Model(input_dim)
model = model.to(device)

criterion = nn.CrossEntropyLoss()
optimizer = Adam(params=model.parameters(), lr=CONFIG['LR'])

In [11]:
# Training
# Each epoch does three things: one sequential pass over `train`, inference on `test`,
# and a CSV export of the test predictions.
#
# Besides the id features, the model receives a per-user history: the embeddings of the
# three previously seen items, each followed by its target re-centred to -1 / 0 / 1.
# That history lives in the `prev_*` columns of `users_meta` and is updated row by row.

HISTORY_EMB_DIM = 32  # length of one stored item embedding (same as the notebook's initial zero vectors)


def reset_user_history():
    """Zero every user's history slots.

    Called at the start of each epoch so that a second epoch, or a re-run of this
    cell, does not start from the history left behind by the previous pass.
    """
    n_users = len(users_meta)
    for slot in ('prev_emb', 'prev_emb_2', 'prev_emb_3'):
        users_meta[slot] = [np.zeros(HISTORY_EMB_DIM, dtype=np.float32) for _ in range(n_users)]
    for slot in ('prev_target', 'prev_target_2', 'prev_target_3'):
        users_meta[slot] = 0


def read_user_history(user_id):
    """Return the user's history as one flat vector: emb, target, emb_2, target_2, emb_3, target_3."""
    return np.concatenate([
        users_meta.at[user_id, 'prev_emb'], [users_meta.at[user_id, 'prev_target']],
        users_meta.at[user_id, 'prev_emb_2'], [users_meta.at[user_id, 'prev_target_2']],
        users_meta.at[user_id, 'prev_emb_3'], [users_meta.at[user_id, 'prev_target_3']],
    ], axis=0)


def push_user_history(user_id, item_emb, signed_target):
    """Shift the user's history by one slot (the oldest entry is dropped) and store the newest one."""
    users_meta.at[user_id, 'prev_emb_3'] = users_meta.at[user_id, 'prev_emb_2']
    users_meta.at[user_id, 'prev_target_3'] = users_meta.at[user_id, 'prev_target_2']
    users_meta.at[user_id, 'prev_emb_2'] = users_meta.at[user_id, 'prev_emb']
    users_meta.at[user_id, 'prev_target_2'] = users_meta.at[user_id, 'prev_target']
    users_meta.at[user_id, 'prev_emb'] = item_emb
    users_meta.at[user_id, 'prev_target'] = signed_target


def build_model_inputs(batch_main, targets=None):
    """Assemble the positional arguments of `model(...)` for one slice of `train` / `test`.

    Args:
        batch_main: rows with `user_id` and `item_id` columns.
        targets: class ids (0 / 1 / 2) of the rows, or None for inference. When given,
            each row is pushed into its user's history right after that row's history
            features are read, so later rows of the same user (also within this batch)
            already see it. When None, the history is only read.

    Returns:
        Tuple of tensors on `device`, in the order expected by `Model.forward`:
        user ids, item ids, source ids, ages, durations, genders, item embeddings,
        history features.
    """
    user_ids = batch_main['user_id'].values
    item_ids = batch_main['item_id'].values

    batch_users_meta = users_meta.loc[user_ids]
    batch_items_meta = items_meta.loc[item_ids]

    item_embs = np.stack(batch_items_meta['embeddings'].values)

    history_rows = []
    for row, user_id in enumerate(user_ids):
        history_rows.append(read_user_history(user_id))
        if targets is not None:
            # .copy(): a plain row slice is a view that would keep the whole
            # batch array alive for as long as the user's history references it.
            push_user_history(user_id, item_embs[row].copy(), targets[row] - 1)

    def as_long(values):
        return torch.tensor(values, dtype=torch.long, device=device)

    return (
        as_long(user_ids),
        as_long(item_ids),
        as_long(batch_items_meta['source_id'].values),
        as_long(batch_users_meta['age'].values),
        as_long(batch_items_meta['duration'].values),
        as_long(batch_users_meta['gender'].values),
        torch.tensor(item_embs, device=device, dtype=torch.float32),
        torch.tensor(np.array(history_rows), dtype=torch.float32, device=device),
    )


train_num_samples = len(train)
train_num_batches = (train_num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

test_num_samples = len(test)
test_num_batches = (test_num_samples + CONFIG['BATCH_SIZE'] - 1) // CONFIG['BATCH_SIZE']

for epoch in range(CONFIG['EPOCHS']):
    # ------------------------------ TRAIN ------------------------------
    reset_user_history()
    model.train()
    train_running_loss = 0.0

    with tqdm(range(train_num_batches), desc=f"Epoch {epoch + 1}/{CONFIG['EPOCHS']}", unit="batch") as t:
        for batch_idx in t:
            start_idx = batch_idx * CONFIG['BATCH_SIZE']
            end_idx = min(start_idx + CONFIG['BATCH_SIZE'], train_num_samples)

            # Batches are consecutive slices of `train`, taken in row order (no shuffling)
            batch_main = train.iloc[start_idx:end_idx]
            targets = batch_main['target'].values

            inputs = build_model_inputs(batch_main, targets)
            targets = torch.tensor(targets, dtype=torch.long, device=device)

            optimizer.zero_grad()

            outputs = model(*inputs)

            batch_loss = criterion(outputs, targets)
            batch_loss.backward()
            optimizer.step()

            train_running_loss += batch_loss.item()
            t.set_postfix(train_mean_loss=f"{train_running_loss / (batch_idx + 1):.6f}")

    # ------------------------------ EVAL -------------------------------
    # Test rows use the history accumulated during the training pass and do not modify it.
    model.eval()

    outputs_list = []

    with torch.no_grad():
        with tqdm(range(test_num_batches), desc=f"Epoch {epoch + 1}/{CONFIG['EPOCHS']}", unit="batch") as v:
            for batch_idx in v:
                start_idx = batch_idx * CONFIG['BATCH_SIZE']
                end_idx = min(start_idx + CONFIG['BATCH_SIZE'], test_num_samples)

                batch_main = test.iloc[start_idx:end_idx]

                outputs = model(*build_model_inputs(batch_main))

                # Expected class id under the softmax distribution: sum_k k * p_k, a value in [0, 2]
                probabilities = F.softmax(outputs, dim=1)
                class_weights = torch.tensor([0, 1, 2], device=probabilities.device, dtype=probabilities.dtype)
                weighted_predictions = torch.sum(probabilities * class_weights, dim=1).cpu().numpy()

                outputs_list.extend(weighted_predictions)

    # ------------------------------ SAVE -------------------------------
    df_outputs = pd.DataFrame(outputs_list, columns=['predict'])
    test_to_save['predict'] = df_outputs['predict']
    output_path = f"{CONFIG['test_pred_folder']}{CONFIG['test_output_path']}_e{epoch}.csv"
    test_to_save.to_csv(output_path, index=False)

    # max(..., 1) keeps the report from dividing by zero when `train` is empty
    train_loss = train_running_loss / max(train_num_batches, 1)

    print('Outputs saved at', output_path)
    print(f"Epoch [{epoch + 1}/{CONFIG['EPOCHS']}]: Train Loss: {train_loss:.6f}")

Epoch 1/1: 100%|██████████| 8891/8891 [10:46:35<00:00,  4.36s/batch, train_mean_loss=0.127721]  
Epoch 1/1: 100%|██████████| 102/102 [01:00<00:00,  1.70batch/s]


Outputs saved at C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/test/7.11.4_test_e0.csv
Epoch [1/1]: Train Loss: 0.127721


In [21]:
# Re-export the predictions left in `outputs_list` by the training cell, using the
# file name derived from the current CONFIG, and report that run's mean training loss.
# NOTE: this cell relies on `epoch`, `outputs_list` and `train_running_loss` still
# holding the values from the last execution of the training cell.
output_path = f"{CONFIG['test_pred_folder']}{CONFIG['test_output_path']}_e{epoch}.csv"

df_outputs = pd.DataFrame(outputs_list, columns=['predict'])
test_to_save['predict'] = df_outputs['predict']
test_to_save.to_csv(output_path, index=False)

train_loss = train_running_loss / train_num_batches

print('Outputs saved at', output_path)
print(f"Epoch [{epoch + 1}/{CONFIG['EPOCHS']}]: Train Loss: {train_loss:.6f}")

Outputs saved at C:/Users/Николай/PycharmProjects/VKRecSys/C.Results/test/7.11.5_test_e0.csv
Epoch [1/1]: Train Loss: 0.127721


In [22]:
# Preview the first five rows of the exported predictions
test_to_save.head(n=5)

Unnamed: 0,user_id,item_id,predict
0,1,7363,1.107743
1,1,73770,1.171896
2,1,75700,1.254958
3,1,81204,1.118529
4,1,110249,1.064205


In [36]:
import numpy as np
from collections import deque
import time

# Micro-benchmark: cost of keeping a rolling window of the last `maxlen` values
# with collections.deque versus numpy.roll.
# Timing uses time.perf_counter(), a monotonic high-resolution clock meant for
# measuring short intervals; time.time() is a wall clock that can be adjusted
# while the benchmark runs.

# Number of values pushed through each window
N = 100000
maxlen = 3

# --- Approach 1: deque with a fixed maximum length ---
start_time_deque = time.perf_counter()
history_deque = deque(maxlen=maxlen)
for i in range(N):
    history_deque.append(i)  # the oldest element is discarded automatically
end_time_deque = time.perf_counter()

time_deque = end_time_deque - start_time_deque

# --- Approach 2: numpy.roll ---
start_time_roll = time.perf_counter()
history_roll = np.zeros(maxlen, dtype=int)
for i in range(N):
    history_roll = np.roll(history_roll, -1)  # returns a new shifted array on every step
    history_roll[-1] = i
end_time_roll = time.perf_counter()

time_roll = end_time_roll - start_time_roll

# Report the results
print(f"Время работы deque: {time_deque:.6f} секунд")
print(f"Время работы numpy.roll: {time_roll:.6f} секунд")

Время работы deque: 0.005745 секунд
Время работы numpy.roll: 0.816467 секунд
