In [2]:
import torch
import pandas as pd
from pathlib import Path
from datetime import datetime
from torch.utils.data import Dataset


In [3]:
# Run on the GPU when CUDA is usable, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

Device: cuda


In [4]:
# Location of the serialized TorchScript embedding model (name kept exactly as on disk).
model_path = 'embeding_models/embeding_v0.pt'
# map_location remaps the stored tensors onto `device`, so the model ends up on the
# same device the inputs are later moved to, regardless of where it was saved from.
model = torch.jit.load(model_path, map_location=device)

In [5]:
# Reference instant (2025-01-03, midnight) as a POSIX timestamp; release dates are
# divided by this value during normalization.
# NOTE(review): the datetime is naive, so .timestamp() interprets it in the local
# timezone of the machine running the notebook — the printed value can differ per host.
BASE_DATE = datetime(2025, 1, 3).timestamp()
print(BASE_DATE)

1735858800.0


In [None]:
tracks_path = Path('../data_v2/tracks_artists.jsonl')
tracks_raw_data = pd.read_json(tracks_path, lines=True)


# Release date: parse each value (format inferred per element), convert it to a
# POSIX timestamp in seconds, then scale by BASE_DATE.
release_datetimes = pd.to_datetime(tracks_raw_data["release_date"], format='mixed')
release_seconds = release_datetimes.apply(lambda moment: moment.timestamp())
tracks_raw_data["release_date"] = release_seconds / BASE_DATE

# Duration: scale by the largest duration present in the column.
longest_duration = tracks_raw_data["duration_ms"].max()
tracks_raw_data["duration_ms"] = tracks_raw_data["duration_ms"] / longest_duration

# Tempo: scale by the largest tempo present in the column.
highest_tempo = tracks_raw_data["tempo"].max()
tracks_raw_data["tempo"] = tracks_raw_data["tempo"] / highest_tempo

# Explicit flag: encode as a two-element list, [1, 0] for falsy and [0, 1] for truthy.
tracks_raw_data["explicit"] = tracks_raw_data["explicit"].apply(
    lambda flag: [1, 0] if not flag else [0, 1]
)

# PROCESS ARTIST HASH

def postprocess_hash_to_list(x):
    """Turn a hash value into a list of its decimal digits.

    The value is converted with ``str`` and left-padded with zeros to at least
    8 characters; longer values are kept at their full length. Every character
    is then converted with ``int``, so a non-digit character raises ValueError.
    """
    padded = str(x).rjust(8, "0")
    return [int(digit) for digit in padded]

# Replace every artist hash with its zero-padded list of digits.
artist_hash_digits = tracks_raw_data["id_artist_hash"].apply(postprocess_hash_to_list)
tracks_raw_data["id_artist_hash"] = artist_hash_digits


id_track                                       0RNxWy0PC3AyH4ThH3aGK6
popularity                                                       0.55
duration_ms                                                    201467
explicit                                                            0
release_date                                                     1929
danceability                                                    0.673
energy                                                          0.377
key                                                                 0
loudness                                                     0.235683
speechiness                                                    0.0697
acousticness                                                    0.586
instrumentalness                                                  0.0
liveness                                                        0.332
valence                                                         0.713
tempo               

In [7]:
class TracksDataset(Dataset):
    """Dataset view over a tracks DataFrame.

    Indexing yields one row as a flat ``torch.Tensor``: the ``id_track`` entry is
    dropped and every list-valued cell is spliced into the output element by element.
    """

    def __init__(self, tracks_data: pd.DataFrame):
        # The frame is stored by reference, not copied.
        self.data = tracks_data

    def __len__(self):
        """Number of rows in the wrapped frame."""
        return len(self.data)

    def get_item(self, idx):
        """Return the raw values of row ``idx`` (positional), ``id_track`` included."""
        return self.data.iloc[idx].values

    def __getitem__(self, idx):
        """Return row ``idx`` (positional) flattened into a tensor, without ``id_track``.

        An out-of-range ``idx`` propagates the IndexError raised by ``iloc``.
        """
        row_values = self.data.iloc[idx].drop("id_track").values
        flattened = []
        for cell in row_values:
            if type(cell) is list:
                # List cells contribute each of their elements.
                flattened.extend(cell)
            else:
                flattened.append(cell)
        return torch.Tensor(flattened)

In [33]:
user_id = 101
session_raw_data = pd.read_json(Path(f'../data_v2/sessions/sessions_user_{user_id}.jsonl'), lines=True)

# Keep only the 'like' events of this user's sessions.
filtered_data = session_raw_data[session_raw_data['event_type'] == 'like']
if filtered_data.empty:
    raise ValueError(f"User {user_id} has no 'like' events in the session log")

# Despite the variable name, this is the most recent *like* event (largest timestamp).
latest_play = filtered_data.loc[filtered_data['timestamp'].idxmax()]
print(latest_play)

latest_track_id = latest_play['track_id']

# Boolean mask over the catalogue rows that match the liked track id.
is_liked_track = (tracks_raw_data['id_track'] == latest_track_id).to_numpy()
if not is_liked_track.any():
    raise ValueError(f"Track {latest_track_id} is not present in the tracks catalogue")

last_liked_track = tracks_raw_data[is_liked_track]
# Position (not index label) of the first match: this number is later used with
# `iloc` and as a dataset index, both of which are positional.
last_liked_row_number = int(is_liked_track.argmax())




session_id                           493
timestamp     2024-11-25 20:55:36.783000
user_id                              101
track_id          4mmkhcEm1Ljy1U9nwtsxUo
event_type                          like
Name: 1687, dtype: object


In [34]:

# Wrap the preprocessed tracks frame so each row can be fetched as a flat tensor.
tracks_raw_dataset = TracksDataset(tracks_data=tracks_raw_data)


In [36]:

model.eval()

# Smallest distance seen so far and the catalogue row that produced it.
min_distance = float('inf')
closest_track = None
closest_track_id = None

# Inference only: without no_grad every forward pass would record an autograd
# graph that is never used.
with torch.no_grad():
    reference_embedding = model.encoder(tracks_raw_dataset[last_liked_row_number].to(device))

    for i, track in enumerate(tracks_raw_dataset):
        # Never compare the reference track with itself.
        if i == last_liked_row_number:
            continue
        # Embed the current track.
        track_embedding = model.encoder(track.to(device))

        # Distance between the reference and current embeddings (torch.dist default norm).
        distance = torch.dist(reference_embedding, track_embedding).item()
        # Remember the track if it is closer than anything seen before.
        if distance < min_distance:
            min_distance = distance
            closest_track = tracks_raw_data.iloc[i]

if closest_track is None:
    # Happens when the catalogue holds no other track, or no distance compared
    # smaller than infinity (e.g. all NaN).
    raise ValueError("No candidate track with a comparable distance was found")

closest_track_id = closest_track['id_track']
closest_track_genre = closest_track['genre_hot_one']
last_liked_track_genre = tracks_raw_data.iloc[last_liked_row_number]['genre_hot_one']

print(f"Track ID with smallest distance: {closest_track_id}")
print(f"Track genre with smallest distance: {closest_track_genre}")
print(f"Track genre last liked: {last_liked_track_genre}")

print(f"Smallest Distance: {min_distance}")



Track ID with smallest distance: 4R5bSS8yoCl2czeWLr61aO
Track genre with smallest distance: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Track genre last liked: [0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0]
Smallest Distance: 0.11900537461042404
