In [1]:
import torch
import pandas as pd
from pathlib import Path
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from random import sample
import os
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import math
import torch.nn as nn


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


In [3]:
model_path = 'embeding_models/embeding_v3_8_dim.pt'
model = torch.jit.load(model_path).encoder.cpu()

In [4]:
BASE_DATE = datetime.strptime("2025-01-03", '%Y-%m-%d').timestamp()
print(BASE_DATE)

1735858800.0


In [5]:
tracks_raw_data = pd.read_json(Path('../data_v2/tracks_artists.jsonl'), lines=True)

# processs ARTIST HASH

def postprocess_hash_to_list(x):
    str_x = str(x)
    if len(str_x) < 8:
        str_x = "0" * (8 - len(str_x)) + str_x
    return [int(x) for x in str_x]

tracks_raw_data["id_artist_hash"] = tracks_raw_data["id_artist_hash"].apply(postprocess_hash_to_list)

In [17]:
for i in range(8):
    print(tracks_raw_data.iloc[i].values)
    print(len(tracks_raw_data.iloc[i].explode().values))
    print(tracks_raw_data.iloc[i].explode().values)

['0RNxWy0PC3AyH4ThH3aGK6' 0.55 0.0488966953 list([1, 0]) -0.7453601641
 0.673 0.377 0 0.23568333330000002 0.0697 0.586 0.0 0.332 0.713
 0.4042408189 list([0, 0, 0, 0, 0, 0, 0]) list([5, 2, 3, 5, 1, 3, 7, 9])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
48
['0RNxWy0PC3AyH4ThH3aGK6' 0.55 0.0488966953 1 0 -0.7453601641 0.673 0.377
 0 0.23568333330000002 0.0697 0.586 0.0 0.332 0.713 0.4042408189 0 0 0 0 0
 0 0 5 2 3 5 1 3 7 9 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0]
['2W889aLIKxULEefrleFBFI' 0.54 0.0480552431 list([1, 0]) -0.4908675752
 0.20400000000000001 0.151 2 0.2973666667 0.041800000000000004
 0.9470000000000001 9.15e-06 0.321 0.134 0.417007801
 list([0, 0, 0, 0, 0, 0, 0]) list([5, 7, 1, 1, 8, 0, 2, 7])
 list([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0])]
48
['2W889aLIKxULEefrleFBFI' 0.54 0.0480552431 1 0 -0.4908675752
 0.20400000000000001 0.151 2 0.2973666667 0.041800000000000004
 0.9470000000000001 9.15e-06 0.321 0.134 0.417007801 0 0 0 0 0 0 0 5 7 1 1
 8 0 2 7 0 0 0

In [6]:
class TracksDataset(Dataset):
    def __init__(self, tracks_data: pd.DataFrame):
        self.data = tracks_data

    def __len__(self):
        return len(self.data)

    def get_item(self, idx):
        return self.data.iloc[idx].values

    def __getitem__(self, idx):
        unpacked_data = self.data.iloc[idx].drop("id_track").explode().values.astype(np.float64)
        return torch.from_numpy(unpacked_data)

In [7]:
embedings_per_id = pd.DataFrame()
embedings_per_id["id_track"] = tracks_raw_data["id_track"]
embedings_per_id["embeding"] = [model(torch.Tensor(x)).detach().cpu().numpy() for x in TracksDataset(tracks_raw_data)]

In [8]:
test_intigers = [687, 852, 528, 562, 426, 1092, 171, 250, 223, 265, 981, 607, 738, 1020, 510, 899, 596, 1047, 826, 669, 923, 905, 1063, 139, 1031]

In [9]:
files_to_load = [file for file in os.listdir("../data_v2/sessions") if int(file.split(".")[0].split("_")[-1]) not in test_intigers]

raw_sessions_data = pd.concat([pd.read_json(Path(f'../data_v2/sessions/{file}'), lines=True) for file in files_to_load])
print(raw_sessions_data.head())

   session_id               timestamp  user_id                track_id  \
0       89426 2024-09-17 10:51:23.000      260  1xPec5BN0Zxv77zrWKq43S   
1       89427 2023-05-20 00:35:36.227      260  7s0lDK7y3XLmI7tcsRAbW0   
2       89427 2023-05-20 00:38:48.054      260  48lQegoLqGAzaRLnMwK0mO   
3       89427 2023-05-20 00:43:30.802      260  48lQegoLqGAzaRLnMwK0mO   
4       89427 2023-05-20 00:44:59.387      260  4usVYcPlxRgRet6YashdCJ   

  event_type  
0       play  
1       play  
2       play  
3       like  
4       play  


In [10]:
merged_sesions_data = raw_sessions_data.merge(embedings_per_id, left_on="track_id", right_on="id_track").drop("id_track", axis=1)
merged_sesions_data["timestamp"] = (pd.to_datetime(merged_sesions_data["timestamp"], format='mixed').apply(lambda x: x.timestamp())).div(BASE_DATE)
merged_sesions_data = pd.get_dummies(merged_sesions_data, columns=["event_type"], dtype = int)

merged_sesions_data = [pd.DataFrame(y) for _, y in merged_sesions_data.groupby('session_id', as_index=False) if len(y) > 1]

NUMBER_OF_INPUTS = 12
EMBEDING_SIZE = 8

In [11]:
class CustomUserSesionsDataset(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data

    def __len__(self):
        return len(self.data)

    def get_item(self, idx):
        return self.data.iloc[idx].values

    def __getitem__(self, idx):
        sessions = []
        for session in self.data[idx].drop("track_id", axis=1).drop("session_id", axis=1).drop("user_id", axis=1).values:
            unpacked_data = np.array([])
            for data in session:
                unpacked_data = np.append(unpacked_data, data)
            sessions.append(torch.tensor(unpacked_data))
        return sessions

In [12]:
user_sesions_dataset = CustomUserSesionsDataset(merged_sesions_data)

In [13]:
COMPARE_PERCENTAGE = 0.25

def pad_sessions_collate(batch, pad_value=0.0):
    input_sequences = [
    ]
    output_sequences = [
    ]

    for batch_element in batch:
        number_of_sequence_data = len(batch_element)
        number_of_reference_data = math.ceil(number_of_sequence_data * COMPARE_PERCENTAGE)

        input_sequences.append(torch.stack(batch_element[:number_of_sequence_data - number_of_reference_data]))
        output_sequences.append(sample(batch_element[number_of_sequence_data - number_of_reference_data:], 1)[0])

    x_lens = [len(x) for x in input_sequences]

    padded_input_sequence = pad_sequence(input_sequences, batch_first=True, padding_value=pad_value)

    return padded_input_sequence, torch.stack([x[1:EMBEDING_SIZE + 1] for x in output_sequences]), x_lens


In [14]:
train_loader = DataLoader(user_sesions_dataset, batch_size=8192, shuffle=True, collate_fn=pad_sessions_collate)

In [26]:
# class UsertPreferenceGenerator(torch.nn.Module):
#     def __init__(self):
#         super(UsertPreferenceGenerator, self).__init__()
#         self.num_layers = 1
#         self.hidden_size = 256
#         self.rnn = torch.nn.LSTM(input_size=NUMBER_OF_INPUTS, hidden_size=self.hidden_size, num_layers=self.num_layers, batch_first=True)
#         self.act2 = torch.nn.LeakyReLU()
#         self.act1 = torch.nn.Hardsigmoid()
#         self.fc1 = torch.nn.Linear(self.hidden_size, 128)
#         self.fc2 = torch.nn.Linear(128, EMBEDING_SIZE)

#     def init_hidden(self, batch_size):
#         hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size)
#         state = torch.zeros(self.num_layers, batch_size, self.hidden_size)
#         return hidden, state

#     def forward(self, x, x_lens, hidden):

#         x_packed = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
#         packed_output, hidden = self.rnn(x_packed, hidden)
#         output, _ = pad_packed_sequence(packed_output, batch_first=True)
#         x = self.fc1(self.act1(output))
#         x = self.fc2(self.act2(x))

#         return x, hidden


class UsertPreferenceGenerator(torch.nn.Module):
    def __init__(self):
        super(UsertPreferenceGenerator, self).__init__()
        self.num_layers = 3
        self.hidden_size = 32
        self.rnn = torch.nn.LSTM(input_size=NUMBER_OF_INPUTS, hidden_size=self.hidden_size, num_layers=self.num_layers, batch_first=True)
        self.act2 = torch.nn.ReLU()
        self.act1 = torch.nn.Hardsigmoid()
        self.fc1 = torch.nn.Linear(self.hidden_size, 256)
        self.fc2 = torch.nn.Linear(256, EMBEDING_SIZE)

    def init_hidden(self, batch_size):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        state = torch.zeros(self.num_layers, batch_size, self.hidden_size)
        return hidden, state

    def forward(self, x, x_lens, hidden):

        x_packed = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
        packed_output, hidden = self.rnn(x_packed, hidden)
        output, _ = pad_packed_sequence(packed_output, batch_first=True)
        x = self.fc1(self.act1(output))
        x = self.fc2(self.act2(x))

        return x, hidden


class UsertPreferenceGenerator(torch.nn.Module):
    def __init__(self):
        super(UsertPreferenceGenerator, self).__init__()
        self.num_layers = 1
        self.hidden_size = 8
        self.rnn = torch.nn.LSTM(input_size=NUMBER_OF_INPUTS, hidden_size=self.hidden_size, num_layers=self.num_layers, batch_first=True)
        self.fc1 = torch.nn.Linear(self.hidden_size, EMBEDING_SIZE)

    def reset_hidden_state(self, batch_size, target_device):
        hidden = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(target_device)
        state = torch.zeros(self.num_layers, batch_size, self.hidden_size).to(target_device)
        dummy_input_seq = torch.zeros(batch_size, 1, NUMBER_OF_INPUTS).to(target_device)
        _, _ = self.rnn(dummy_input_seq, (hidden, state))


    def forward(self, x, x_lens):
        x_packed = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
        packed_output, (hidden, cell) = self.rnn(x_packed)
        hidden = hidden[-1]
        # output, _ = pad_packed_sequence(packed_output, batch_first=True)
        x = self.fc1(hidden)
        # x = self.fc2(self.act2(x))

        return x

In [20]:
model = UsertPreferenceGenerator().to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
loss_fun = nn.L1Loss()

# torch.onnx.export(model,[] ,'loop.onnx', verbose=True)

UsertPreferenceGenerator(
  (rnn): LSTM(12, 512, batch_first=True)
  (act2): LeakyReLU(negative_slope=0.01)
  (fc2): Linear(in_features=512, out_features=8, bias=True)
)


In [27]:
# Training loop
model = UsertPreferenceGenerator().to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
loss_fun = nn.L1Loss()


model.train()
for epoch in range(3):
    loss_sum = 0
    for x, targets, x_lens in train_loader:
        x = x.float().to(device)
        targets = targets.to(torch.float).to(device)

        # hidden, state = model.init_hidden(x.size(0))
        # hidden, state = hidden.to(device), state.to(device)
        model.reset_hidden_state(x.size(0), device)

        preds = model(x, x_lens)
        loss = loss_fun(preds, targets)
        loss.backward()
        loss_sum += loss.item()
        optimizer.step()
    print(f"Epoch: {epoch}, loss: {loss_sum:.3}")

UsertPreferenceGenerator(
  (rnn): LSTM(12, 8, batch_first=True)
  (fc1): Linear(in_features=8, out_features=8, bias=True)
)
Epoch: 0, loss: 49.5
Epoch: 1, loss: 48.4
Epoch: 2, loss: 47.2


In [22]:
torch.save(model, 'recomendation_models/user_preference_model_v3_8_dim.pt')

In [None]:
import torch.autograd.profiler as profiler

with profiler.profile(use_cuda=True) as prof:
    preds, _ = model(x, x_lens, (hidden, state))
print(prof.key_averages().table(sort_by="cuda_time_total"))


-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                                                   Name    Self CPU %      Self CPU   CPU total %     CPU total  CPU time avg     Self CUDA   Self CUDA %    CUDA total  CUDA time avg    # of Calls  
-------------------------------------------------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  ------------  
                             aten::_pad_packed_sequence        10.52%       1.156ms        32.01%       3.519ms       3.519ms     909.000us         8.04%       3.522ms       3.522ms             1  
                                             aten::sort        12.44%       1.368ms        27.00%       2.968ms       1.484ms       1.377ms        12.18%       2.978ms       1.489ms             2  
         

STAGE:2024-12-24 14:20:35 4443:4443 ActivityProfilerController.cpp:314] Completed Stage: Warm Up
STAGE:2024-12-24 14:20:35 4443:4443 ActivityProfilerController.cpp:320] Completed Stage: Collection
STAGE:2024-12-24 14:20:35 4443:4443 ActivityProfilerController.cpp:324] Completed Stage: Post Processing


In [None]:
model([[0.993923, 4.7692013 ,  0.50016433,  1.8391552 , -4.93348   ,  1.0898505 ,
          1.9289817 , 10.784728  , -3.9150748 ,  5.587146  ,  7.239292  ,
          7.1324625 ,  2.55823   ,  4.176082  ,  8.504707  ,  5.0005746 ,
         -2.202329, 0, 1, 0]], [1], torch.zeros(1, 1, 32))

AttributeError: 'list' object has no attribute 'device'

In [28]:
i = 0
for data in merged_sesions_data:
    sessions = []
    i += 1
    for session in (
        data.drop("track_id", axis=1)
        .drop("session_id", axis=1)
        .drop("user_id", axis=1)
        .values
    ):
        unpacked_data = np.array([])
        for element in session:
            unpacked_data = np.append(unpacked_data, element)
        sessions.append(torch.tensor(unpacked_data))
    sessions = torch.stack(sessions)
    # print(data)
    print(model.cpu()(torch.stack([sessions]).float(), torch.tensor([len(sessions)])))
    print("____________________________")
    if i > 8:
        break

tensor([[-0.0105,  0.0447, -0.1620, -0.3420,  0.1107,  0.2987,  0.1665,  0.0876]],
       grad_fn=<AddmmBackward0>)
____________________________
tensor([[ 0.0065,  0.2074, -0.0183, -0.4650, -0.2124,  0.1552,  0.2210,  0.1181]],
       grad_fn=<AddmmBackward0>)
____________________________
tensor([[ 0.0681,  0.0448, -0.1201, -0.4092, -0.0575,  0.2778,  0.1861,  0.2018]],
       grad_fn=<AddmmBackward0>)
____________________________
tensor([[ 0.0075,  0.0817, -0.0888, -0.3866, -0.1440,  0.2165,  0.1444,  0.1451]],
       grad_fn=<AddmmBackward0>)
____________________________
tensor([[ 0.0271,  0.1080, -0.0817, -0.4007, -0.1632,  0.2157,  0.1812,  0.1765]],
       grad_fn=<AddmmBackward0>)
____________________________
tensor([[ 0.0096,  0.0772, -0.1237, -0.3234, -0.0748,  0.2153,  0.1484,  0.1144]],
       grad_fn=<AddmmBackward0>)
____________________________
tensor([[-0.0395,  0.0302, -0.1360, -0.3503, -0.0037,  0.3085,  0.1338,  0.1378]],
       grad_fn=<AddmmBackward0>)
______________

KeyboardInterrupt: 

In [25]:
model_scripted = torch.jit.script(model.cpu())
model_scripted.save("recomendation_models/scripted_user_preference_model_v4_8_dim.pt")