In [1]:
import torch
import pandas as pd
from pathlib import Path
from datetime import datetime
from torch.utils.data import Dataset, DataLoader
from random import sample
import os
import numpy as np
from torch.nn.utils.rnn import pad_sequence, pack_padded_sequence, pad_packed_sequence
import math
import torch.nn as nn


In [2]:
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

Device: cuda


In [3]:
model_path = 'embeding_models/embeding_v1_16_dim.pt'
embeding_model = torch.jit.load(model_path).encoder.cpu()

In [4]:
BASE_DATE = datetime.strptime("2025-01-03", '%Y-%m-%d').timestamp()
print(BASE_DATE)

1735858800.0


In [5]:
tracks_raw_data = pd.read_json(Path('../data_v2/tracks_artists.jsonl'), lines=True)

# processs ARTIST HASH

def postprocess_hash_to_list(x):
    str_x = str(x)
    if len(str_x) < 8:
        str_x = "0" * (8 - len(str_x)) + str_x
    return [int(x) for x in str_x]

tracks_raw_data["id_artist_hash"] = tracks_raw_data["id_artist_hash"].apply(postprocess_hash_to_list)

In [6]:
class TracksDataset(Dataset):
    def __init__(self, tracks_data: pd.DataFrame):
        self.data = tracks_data

    def __len__(self):
        return len(self.data)

    def get_item(self, idx):
        return self.data.iloc[idx].values

    def __getitem__(self, idx):
        unpacked_data = self.data.iloc[idx].drop("id_track").explode().values.astype(np.float64)
        return torch.from_numpy(unpacked_data)

In [7]:
embedings_per_id = pd.DataFrame()
embedings_per_id["id_track"] = tracks_raw_data["id_track"]
embedings_per_id["embeding"] = [embeding_model(torch.Tensor(x.float())).detach().cpu().numpy() for x in TracksDataset(tracks_raw_data)]

In [8]:
# Generated using "sample"
test_intigers = [687, 852, 528, 562, 426, 1092, 171, 250, 223, 265, 981, 607, 738, 1020, 510, 899, 596, 1047, 826, 669, 923, 905, 1063, 139, 1031]

In [9]:
files_to_load = [file for file in os.listdir("../data_v2/sessions") if int(file.split(".")[0].split("_")[-1]) not in test_intigers]

raw_sessions_data = pd.concat([pd.read_json(Path(f'../data_v2/sessions/{file}'), lines=True) for file in files_to_load])
print(raw_sessions_data.head())

   session_id               timestamp  user_id                track_id  \
0       89426 2024-09-17 10:51:23.000      260  1xPec5BN0Zxv77zrWKq43S   
1       89427 2023-05-20 00:35:36.227      260  7s0lDK7y3XLmI7tcsRAbW0   
2       89427 2023-05-20 00:38:48.054      260  48lQegoLqGAzaRLnMwK0mO   
3       89427 2023-05-20 00:43:30.802      260  48lQegoLqGAzaRLnMwK0mO   
4       89427 2023-05-20 00:44:59.387      260  4usVYcPlxRgRet6YashdCJ   

  event_type  
0       play  
1       play  
2       play  
3       like  
4       play  


In [73]:
merged_sesions_data = raw_sessions_data.merge(embedings_per_id, left_on="track_id", right_on="id_track").drop("id_track", axis=1)
merged_sesions_data["timestamp"] = (pd.to_datetime(merged_sesions_data["timestamp"], format='mixed').apply(lambda x: x.timestamp())).div(BASE_DATE)
merged_sesions_data = pd.get_dummies(merged_sesions_data, columns=["event_type"], dtype = int)

merged_sesions_data = [pd.DataFrame(y) for _, y in merged_sesions_data.groupby('session_id', as_index=False) if len(y) > 3]

In [17]:
EMBEDING_SIZE = 16
NUMBER_OF_INPUTS = EMBEDING_SIZE + 4

In [11]:
print(merged_sesions_data[0].iloc[0].to_dict())

{'session_id': 124, 'timestamp': 0.9939232413373714, 'user_id': 101, 'track_id': '1ZAkJE2vi1wbo7tyvgWuXN', 'embeding': array([ 1.02536   ,  1.1276305 ,  2.40779   , -0.29763827, -1.450943  ,
       -2.0151002 ,  0.3454442 ,  0.9167252 ], dtype=float32), 'event_type_like': 0, 'event_type_play': 1, 'event_type_skip': 0}


In [74]:
class CustomUserSesionsDataset(Dataset):
    def __init__(self, data: pd.DataFrame):
        self.data = data

    def __len__(self):
        return len(self.data)

    def get_item(self, idx):
        return self.data.iloc[idx].values

    def flatten_array(self, array):
        unpacked_data = np.array([])
        for data in array:
            unpacked_data = np.append(unpacked_data, data)
        return unpacked_data

    def __getitem__(self, idx):
        sessions = [torch.tensor(self.flatten_array(session)) for session in self.data[idx].drop("track_id", axis=1).drop("session_id", axis=1).drop("user_id", axis=1).values]
        return sessions

In [75]:
user_sesions_dataset = CustomUserSesionsDataset(merged_sesions_data)

In [76]:
COMPARE_PERCENTAGE = 0.25

def pad_sessions_collate(batch, pad_value=0.0):
    input_sequences = [
    ]
    output_sequences = [
    ]

    for batch_element in batch:
        number_of_sequence_data = len(batch_element)
        number_of_reference_data = math.ceil(number_of_sequence_data * COMPARE_PERCENTAGE)

        input_sequences.append(torch.stack(batch_element[:number_of_sequence_data - number_of_reference_data]))
        output_sequences.append(sample(batch_element[number_of_sequence_data - number_of_reference_data:], 1)[0])

    x_lens = [len(x) for x in input_sequences]

    padded_input_sequence = pad_sequence(input_sequences, batch_first=True, padding_value=pad_value)

    return padded_input_sequence, torch.stack([x[1:EMBEDING_SIZE + 1] for x in output_sequences]), x_lens


In [77]:
train_loader = DataLoader(user_sesions_dataset, batch_size=8192, shuffle=True, collate_fn=pad_sessions_collate)

In [None]:
class UsertPreferenceGenerator(torch.nn.Module):
    def __init__(self):
        super(UsertPreferenceGenerator, self).__init__()
        self.num_layers = 5
        self.hidden_size = 4*EMBEDING_SIZE
        self.rnn = torch.nn.LSTM(input_size=NUMBER_OF_INPUTS, hidden_size=self.hidden_size, num_layers=self.num_layers, batch_first=True)
        self.fc1 = torch.nn.Linear(self.hidden_size, 2048)
        self.act2 = torch.nn.LeakyReLU(negative_slope=0.15)
        self.fc2 = torch.nn.Linear(2048, EMBEDING_SIZE)
        self.init_hidden(1)

    def init_hidden(self, batch_size):
        self.hidden = (torch.zeros(self.num_layers, batch_size, self.hidden_size),torch.zeros(self.num_layers, batch_size, self.hidden_size))
        return self.hidden

    def forward(self, x, x_lens):
        x_packed = pack_padded_sequence(x, x_lens, batch_first=True, enforce_sorted=False)
        packed_output, self.hidden = self.rnn(x_packed, (self.hidden[0].to(x.device), self.hidden[1].to(x.device)))
        hidden = self.hidden[0][-1]
        # output, _ = pad_packed_sequence(packed_output, batch_first=True)
        x = self.fc1(hidden)
        x = self.fc2(self.act2(x))
        # x = self.act2(x)

        return x

In [134]:
model = UsertPreferenceGenerator().to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.0001)
loss_fun = nn.MSELoss()

# torch.onnx.export(model,[] ,'loop.onnx', verbose=True)

UsertPreferenceGenerator(
  (rnn): LSTM(20, 64, num_layers=5, batch_first=True)
  (fc1): Linear(in_features=64, out_features=2048, bias=True)
  (act2): LeakyReLU(negative_slope=0.15)
  (fc2): Linear(in_features=2048, out_features=16, bias=True)
)


In [135]:
# Training loop
model = UsertPreferenceGenerator().to(device)
print(model)

optimizer = torch.optim.Adam(model.parameters(), lr = 0.00005)
loss_fun = nn.MSELoss()


model.train()
for epoch in range(4):
    loss_sum = 0
    for x, targets, x_lens in train_loader:
        x = x.float().to(device)
        targets = targets.to(torch.float).to(device)

        # hidden, state = model.init_hidden(x.size(0))
        # hidden, state = hidden.to(device), state.to(device)
        # model.reset_hidden_state(x.size(0), device)
        model.init_hidden(x.size(0))
        preds = model(x, x_lens)
        loss = loss_fun(preds, targets)
        loss.backward()
        loss_sum += loss.item()
        optimizer.step()
    print(f"Epoch: {epoch}, loss: {loss_sum:.3}")

UsertPreferenceGenerator(
  (rnn): LSTM(20, 64, num_layers=5, batch_first=True)
  (fc1): Linear(in_features=64, out_features=2048, bias=True)
  (act2): LeakyReLU(negative_slope=0.15)
  (fc2): Linear(in_features=2048, out_features=16, bias=True)
)
Epoch: 0, loss: 1.54e+02
Epoch: 1, loss: 1.47e+02
Epoch: 2, loss: 1.36e+02
Epoch: 3, loss: 1.15e+02


In [136]:
i = 0
model.eval()
previos_user = 0
for data in merged_sesions_data:
    sessions = []
    if previos_user == data["user_id"].values[0] and i > 2:
        continue
    elif previos_user != data["user_id"].values[0]:
        i = 0
    i += 1
    for session in (
        data.drop("track_id", axis=1)
        .drop("session_id", axis=1)
        .drop("user_id", axis=1)
        .values
    ):
        unpacked_data = np.array([])
        for element in session:
            unpacked_data = np.append(unpacked_data, element)
        sessions.append(torch.tensor(unpacked_data))
    sessions = torch.stack(sessions)
    previos_user = data["user_id"].values[0]
    model.init_hidden(1)
    print(data["user_id"].values[0])
    print(model(torch.stack([sessions]).float().to(device), torch.tensor([len(sessions)])))
    print("____________________________")

101
tensor([[ 4.1354, -3.2477,  4.4248, -4.4678,  4.1620,  4.3366,  4.3668, -4.1995,
          4.6507,  4.4015,  3.9576,  4.1079,  3.4182,  4.2094,  4.2788, -3.6079]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
____________________________
101
tensor([[ 4.0140, -3.1610,  4.2924, -4.3355,  4.0369,  4.2071,  4.2385, -4.0719,
          4.5104,  4.2704,  3.8425,  3.9863,  3.3184,  4.0832,  4.1542, -3.5051]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
____________________________
101
tensor([[ 4.2626, -3.3387,  4.5647, -4.6078,  4.2934,  4.4733,  4.5029, -4.3348,
          4.7986,  4.5395,  4.0794,  4.2350,  3.5232,  4.3433,  4.4110, -3.7162]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
____________________________
102
tensor([[ 2.9374, -2.3882,  3.1290, -3.1784,  2.9421,  3.0673,  3.1026, -2.9637,
          3.2749,  3.1184,  2.8220,  2.9139,  2.4367,  2.9865,  3.0610, -2.5926]],
       device='cuda:0', grad_fn=<AddmmBackward0>)
____________________________
102
tensor([

In [137]:
reference_embeding = torch.tensor([ 1.9414, -1.6596,  2.0431, -2.1083,  1.9402,  2.0081,  2.0438, -1.9451,
          2.1338,  2.0530,  1.8675,  1.9206,  1.6198,  1.9776,  2.0474, -1.7319])
for closest_track in sorted(embedings_per_id.iterrows(), key=lambda x: abs(torch.dist(torch.from_numpy(x[1]["embeding"]), reference_embeding)))[:20]:
    print(closest_track)
    print(torch.dist(torch.from_numpy(closest_track[1]["embeding"]), reference_embeding))
    print("_______________________________________________")

(9401, id_track                               6vJ6FR4WtlKSxe8lmh3F1C
embeding    [1.1440185, -2.2282503, 2.69161, -2.33038, 0.6...
Name: 9401, dtype: object)
tensor(4.0372)
_______________________________________________
(17408, id_track                               0TU7xhMfZbI1okpLyHptFV
embeding    [1.7516255, -1.0265095, 1.4417157, -2.5329847,...
Name: 17408, dtype: object)
tensor(4.3454)
_______________________________________________
(17346, id_track                               7BR3N1IbXTjEJffLd8aics
embeding    [1.7520267, -1.0005742, 1.4708413, -2.5598593,...
Name: 17346, dtype: object)
tensor(4.3739)
_______________________________________________
(17251, id_track                               7u32XGioKOcqZYrEZphaF5
embeding    [1.5983043, 0.21562803, 1.3008103, -2.2185829,...
Name: 17251, dtype: object)
tensor(4.3804)
_______________________________________________
(7876, id_track                               5IzZpz0vA73IIjqFPpXSXP
embeding    [1.5966141, 0.21843353, 1.296

In [138]:
reference_embeding = torch.tensor([ 3.4030, -2.7226,  3.6306, -3.6759,  3.4126,  3.5589,  3.5932, -3.4391,
          3.8069,  3.6148,  3.2643,  3.3756,  2.8162,  3.4569,  3.5316, -2.9874])
for closest_track in sorted(embedings_per_id.iterrows(), key=lambda x: abs(torch.dist(torch.from_numpy(x[1]["embeding"]), reference_embeding)))[:20]:
    print(closest_track)
    print(torch.dist(torch.from_numpy(closest_track[1]["embeding"]), reference_embeding))
    print("_______________________________________________")

(8262, id_track                               1NmVZsG18CzCAtw7rnV3yA
embeding    [2.5190046, -1.0258049, 4.23624, -4.2312994, 1...
Name: 8262, dtype: object)
tensor(5.3576)
_______________________________________________
(16761, id_track                               4bXCcoesMt8u99xMsbLr9U
embeding    [2.5197072, -1.0250027, 4.2366147, -4.231409, ...
Name: 16761, dtype: object)
tensor(5.3578)
_______________________________________________
(16789, id_track                               1kTPQnabROVkW9bUXdCGrB
embeding    [2.5151258, -1.0272943, 4.2377143, -4.20871, 1...
Name: 16789, dtype: object)
tensor(5.3675)
_______________________________________________
(13422, id_track                               4L560fic5PaTuCFm6CviKa
embeding    [2.5144317, -1.0278113, 4.237182, -4.20825, 1....
Name: 13422, dtype: object)
tensor(5.3690)
_______________________________________________
(16531, id_track                               7hZuICN5eaCuQyp443RCt6
embeding    [2.511023, -1.0319961, 4.259

In [139]:
model_scripted = torch.jit.script(model.cpu())
print(model)
print(model_scripted)
model_scripted.save("recomendation_models/scripted_user_preference_model_v19_16_dim_ok.pt")

UsertPreferenceGenerator(
  (rnn): LSTM(20, 64, num_layers=5, batch_first=True)
  (fc1): Linear(in_features=64, out_features=2048, bias=True)
  (act2): LeakyReLU(negative_slope=0.15)
  (fc2): Linear(in_features=2048, out_features=16, bias=True)
)
RecursiveScriptModule(
  original_name=UsertPreferenceGenerator
  (rnn): RecursiveScriptModule(original_name=LSTM)
  (fc1): RecursiveScriptModule(original_name=Linear)
  (act2): RecursiveScriptModule(original_name=LeakyReLU)
  (fc2): RecursiveScriptModule(original_name=Linear)
)
