In [15]:
import torch
import pandas as pd
from pathlib import Path
from datetime import datetime
from torch.utils.data import Dataset
import heapq
import numpy as np

In [16]:
import os
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging setting (synchronous
# kernel launches) -- confirm it is still wanted for regular runs.
os.environ['CUDA_LAUNCH_BLOCKING'] = '1'
os.environ["CUDA_VISIBLE_DEVICES"] = "0"  # or another available GPU


In [17]:
# Use the GPU when PyTorch can see one, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Report the PyTorch build and the CUDA version it was built against.
for version_info in (torch.__version__, torch.version.cuda):
    print(version_info)

torch.cuda.empty_cache()


Device: cuda
2.5.1+cu124
12.4


In [18]:
# Load the TorchScript autoencoder straight onto the selected device.
# Without map_location the weights stay on the device they were saved from, which
# can differ from `device` (the inputs are later moved there with `.to(device)`)
# and makes loading fail on a CPU-only machine if the file was saved from CUDA.
model_path = 'embeding_models/embeding_v0.pt'
model = torch.jit.load(model_path, map_location=device)

In [19]:
# Reference date, as POSIX seconds, used as the divisor when release dates are normalised.
# NOTE(review): the datetime is naive, so the result depends on the machine's
# timezone (the recorded output 1735858800.0 is one hour before midnight UTC).
# Confirm this matches the value used when the embedding model was trained.
base_datetime = datetime.strptime("2025-01-03", '%Y-%m-%d')
BASE_DATE = base_datetime.timestamp()
print(BASE_DATE)

1735858800.0


In [20]:
# Load the track/artist table (JSON Lines: one record per line).
tracks_raw_data = pd.read_json(Path('../data_v2/tracks_artists.jsonl'), lines=True)


# NORMALIZE DATES: parse each release date, convert it to POSIX seconds, scale by BASE_DATE
release_seconds = pd.to_datetime(tracks_raw_data["release_date"], format='mixed').apply(
    lambda moment: moment.timestamp()
)
tracks_raw_data["release_date"] = release_seconds / BASE_DATE

# NORMALIZE DURATION: scale by the longest duration present in this table
longest_duration = tracks_raw_data["duration_ms"].max()
tracks_raw_data["duration_ms"] = tracks_raw_data["duration_ms"] / longest_duration

# NORMALIZE TEMPO: scale by the highest tempo present in this table
highest_tempo = tracks_raw_data["tempo"].max()
tracks_raw_data["tempo"] = tracks_raw_data["tempo"] / highest_tempo

# EXPLICIT ENCODING: two-element list, [0, 1] for truthy values and [1, 0] otherwise
tracks_raw_data["explicit"] = tracks_raw_data["explicit"].apply(
    lambda is_explicit: [0, 1] if is_explicit else [1, 0]
)

# PROCESS ARTIST HASH

def postprocess_hash_to_list(x):
    """Convert a hash value into the list of its decimal digits.

    The value is rendered with ``str`` and left-padded with zeros to a minimum
    width of 8 characters; longer values are kept at full length (no truncation).
    Every character is then converted with ``int``, so any non-digit character
    raises ``ValueError``.
    """
    padded = str(x).rjust(8, "0")
    return [int(digit) for digit in padded]

# Replace every artist hash with its digit list (zero-padded to at least 8 digits).
tracks_raw_data["id_artist_hash"] = tracks_raw_data["id_artist_hash"].apply(postprocess_hash_to_list)


In [21]:
class TracksDataset(Dataset):
    """Dataset view over a tracks DataFrame that yields one flat feature tensor per row.

    The frame must contain an ``id_track`` column; it is dropped from the
    features. Cells holding a sequence (list, tuple or NumPy array) are
    flattened into the feature vector, all other cells are used as single values.
    """

    def __init__(self, tracks_data: pd.DataFrame):
        """Keep a reference to ``tracks_data`` (the frame is not copied)."""
        self.data = tracks_data

    def __len__(self):
        """Return the number of rows in the wrapped frame."""
        return len(self.data)

    def get_item(self, idx):
        """Return the raw values of row ``idx``, including ``id_track``."""
        return self.data.iloc[idx].values

    def __getitem__(self, idx):
        """Return row ``idx`` as a flat 1-D float32 tensor without ``id_track``.

        Positional lookup goes through ``iloc``, so an out-of-range ``idx``
        raises ``IndexError`` -- plain ``for item in dataset`` loops rely on
        that to terminate.
        """
        row = self.data.iloc[idx].drop("id_track")
        unpacked_data = []
        for value in row.values:
            # isinstance (rather than an exact type check) also flattens tuples,
            # arrays and list subclasses instead of nesting them in the vector.
            if isinstance(value, (list, tuple, np.ndarray)):
                unpacked_data.extend(value)
            else:
                unpacked_data.append(value)
        # torch.tensor with an explicit dtype replaces the legacy torch.Tensor(...)
        # constructor; the resulting dtype (float32) is unchanged.
        return torch.tensor(unpacked_data, dtype=torch.float32)

In [22]:
# Pick one user and load their profile row.
user_id = 106
user_raw_data = pd.read_json(Path('../data_v2/users.jsonl'), lines=True)
user_data = user_raw_data[user_raw_data['user_id'] == user_id]

user_genre = user_data['genre_hot_one'].iloc[0]
print(user_genre)

session_raw_data = pd.read_json(Path(f'../data_v2/sessions/sessions_user_{user_id}.jsonl'), lines=True)

# Most recent 'like' event of this user (idxmax raises if the user has no likes).
filtered_data = session_raw_data[session_raw_data['event_type'] == 'like']
latest_play = filtered_data.loc[filtered_data['timestamp'].idxmax()]

print(latest_play)

# Tracks that appear in the user's events strictly after that last like.
session_track_ids = set(session_raw_data[session_raw_data['timestamp'] > latest_play['timestamp']]['track_id'])
print(session_track_ids)

latest_track_id = latest_play['track_id']

is_last_liked = tracks_raw_data['id_track'] == latest_track_id
last_liked_track = tracks_raw_data[is_last_liked]
# The row number is used positionally later on (list indexing and .iloc), so it
# is taken from the row position rather than from the index label; the two only
# coincide while the frame keeps its default 0..n-1 index.
last_liked_row_number = int(np.flatnonzero(is_last_liked.to_numpy())[0])




[0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1]
session_id                          3553
timestamp     2024-11-19 14:04:50.946000
user_id                              106
track_id          00TU7szqwBrxWTd9SO7ihK
event_type                          like
Name: 3197, dtype: object
{'2IBXG98HZdppZf20F1UaAV', '5cZqsjVs6MevCnAkasbEOX', '54Mjd3irHbiPNjW2MNgzKV', '1cCbsojaA6GIT7Y3zuMJ1q', '6FqNHDHEsUTKqVWXBkMd9y', '2u8UO2DyOaroGATrPjjZHe', '1WMMrUKNFsnEG2csDURxhb', '1PtJclc46wTk367PlsU6Uj'}


In [23]:

# Wrap the preprocessed tracks frame so each row can be read as a feature tensor.
tracks_raw_dataset = TracksDataset(tracks_raw_data)
# Put the loaded model into evaluation mode. The call is the cell's last
# expression, so the notebook displays the returned module.
model.eval()  


RecursiveScriptModule(
  original_name=Autoencoder
  (encoder): RecursiveScriptModule(
    original_name=Encoder
    (fc1): RecursiveScriptModule(original_name=Linear)
    (act1): RecursiveScriptModule(original_name=ReLU6)
    (fc5): RecursiveScriptModule(original_name=Linear)
  )
  (decoder): RecursiveScriptModule(
    original_name=Decoder
    (fc1): RecursiveScriptModule(original_name=Linear)
    (act1): RecursiveScriptModule(original_name=ReLU6)
    (fc5): RecursiveScriptModule(original_name=Linear)
  )
)

In [24]:

NUM_CLOSEST = 5  # how many nearest tracks to report

closest_tracks = []
track_embeddings = []

# Encode every track once. This is inference only, so autograd is switched off;
# otherwise each stored embedding would keep its computation graph alive.
with torch.no_grad():
    for track in tracks_raw_dataset:
        track_embeddings.append(model.encoder(track.to(device)))

reference_embedding = track_embeddings[last_liked_row_number]

# Walk the cached embeddings directly; iterating the dataset again would rebuild
# every feature tensor just to obtain an index.
for i, embedding in enumerate(track_embeddings):
    if i == last_liked_row_number:
        continue

    # Compute the distance to the reference track
    distance = torch.dist(reference_embedding, embedding).item()

    # Keep only the NUM_CLOSEST nearest tracks. heapq is a min-heap, so distances
    # are stored negated: the heap root is then the farthest of the kept tracks.
    if len(closest_tracks) < NUM_CLOSEST:
        heapq.heappush(closest_tracks, (-distance, i))
    elif -closest_tracks[0][0] > distance:
        # Heap is full: replace the farthest kept track only if this one is nearer
        heapq.heapreplace(closest_tracks, (-distance, i))


# Order from nearest to farthest; entries stay (negated distance, row number)
closest_tracks = sorted(closest_tracks, key=lambda x: -x[0])

# Fetch the details of the nearest tracks
closest_tracks_details = [tracks_raw_data.iloc[i] for _, i in closest_tracks]



for track_detail in closest_tracks_details:
    print(track_detail)
    closest_track_id = track_detail['id_track']
    closest_track_genre = track_detail['genre_hot_one']
    is_in_session = closest_track_id in session_track_ids


    print(f"Track ID with smallest distance: {closest_track_id}")
    print(f"Track genre with smallest distance: {closest_track_genre}")
    print(f"Is in user session: {'Yes' if is_in_session else 'No'}")

last_liked_track_genre = tracks_raw_data.iloc[last_liked_row_number]['genre_hot_one']
print(f"Track genre last liked: {last_liked_track_genre}")
if closest_tracks:
    # Stored values are negated distances, so flip the sign back before printing
    print(f"Smallest Distance: {-closest_tracks[0][0]}")



id_track                                       4PBou1i63krEp3jKCPhxVD
popularity                                                       0.63
duration_ms                                                  0.071008
explicit                                                       [1, 0]
release_date                                                 0.286696
danceability                                                    0.763
energy                                                          0.939
key                                                                 8
loudness                                                     0.080433
speechiness                                                    0.0488
acousticness                                                    0.422
instrumentalness                                                0.136
liveness                                                        0.119
valence                                                         0.448
tempo               

In [None]:

# Genre vectors do not depend on the user, so read them out of the frame once
# instead of calling .iloc for every track of every user.
track_genres = tracks_raw_data['genre_hot_one'].tolist()

for user_id in range(1011, 1051):
    session_raw_data = pd.read_json(Path(f'../data_v2/sessions/sessions_user_{user_id}.jsonl'), lines=True)
    print(user_id)
    user_data = user_raw_data[user_raw_data['user_id'] == user_id]
    user_genre = user_data['genre_hot_one'].iloc[0]

    # Most recent 'like' event of this user
    filtered_data = session_raw_data[session_raw_data['event_type'] == 'like']
    latest_play = filtered_data.loc[filtered_data['timestamp'].idxmax()]
    # A set gives the same membership answers as the list and matches the
    # single-user cell; here it covers every track in the user's sessions.
    session_track_ids = set(session_raw_data['track_id'])
    latest_track_id = latest_play['track_id']

    is_last_liked = tracks_raw_data['id_track'] == latest_track_id
    last_liked_track = tracks_raw_data[is_last_liked]
    # Row position (not index label): it indexes track_embeddings and .iloc below
    last_liked_row_number = int(np.flatnonzero(is_last_liked.to_numpy())[0])

    reference_embedding = track_embeddings[last_liked_row_number]

    closest_track = None
    closest_row_number = None
    min_distance = float('inf')

    for i, track in enumerate(track_embeddings):
        if i == last_liked_row_number:
            continue

        # Only tracks sharing at least one genre with the user are candidates;
        # checking this first avoids computing distances that would be discarded.
        track_genre = track_genres[i]
        has_matching_genre = any(u == 1 and t == 1 for u, t in zip(user_genre, track_genre))
        if not has_matching_genre:
            continue

        distance = torch.dist(reference_embedding, track).item()
        if distance < min_distance:
            closest_row_number = i
            min_distance = distance

    if closest_row_number is None:
        # No candidate shares a genre with this user; there is nothing to evaluate
        # (subscripting the None placeholder would raise TypeError).
        print(f"No genre-matching track found for user {user_id}")
        continue

    closest_track = tracks_raw_data.iloc[closest_row_number]
    closest_track_id = closest_track['id_track']
    is_in_session = closest_track_id in session_track_ids

    if is_in_session:
        print(f"Good recomendations {closest_track_id}")
        print(latest_track_id)



1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
Good recomendations 0O7lENhqOySbsL743G7PqD
6do7nVH4UOuZlKcA2ZBHbh
1047
1048
1049
1050
