In [1]:
!pip install scikit-learn



In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import random
import time 
from sklearn.preprocessing import normalize

In [4]:
# Number of nearest-neighbour users consulted for each seed user.
K = 5

# Location of the tab-separated listening log.
file_path = "lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"

# The file has no header row, so the column names are supplied explicitly.
LOG_COLUMNS = [
    "user_id",
    "timestamp",
    "artist_id",
    "artist_name",
    "track_id",
    "track_name",
]

# Read the interaction log; rows the parser cannot handle are dropped
# instead of aborting the load.
interactions = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=LOG_COLUMNS,
    on_bad_lines='skip',
)

In [5]:
# One row per (user, artist) pair; "playcount" is the number of log entries
# for that pair.
playcounts = (
    interactions
    .groupby(["user_id", "artist_name"])
    .size()
    .reset_index(name="playcount")
)

# Assign each distinct user / artist a dense integer index in order of first
# appearance in `playcounts`.
distinct_users = playcounts["user_id"].unique()
distinct_artists = playcounts["artist_name"].unique()
user_id_to_idx = dict(zip(distinct_users, range(len(distinct_users))))
artist_name_to_idx = dict(zip(distinct_artists, range(len(distinct_artists))))

# Attach the integer indices to the dataframe as new columns.
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Sparse user-artist matrix (UAM): rows are users, columns are artists and
# the stored values are playcounts. Each row is scaled to unit L2 norm right
# away so that a plain dot product between two rows is their cosine similarity.
UAM = normalize(
    csr_matrix(
        (
            playcounts["playcount"],
            (playcounts["user_idx"], playcounts["artist_idx"]),
        ),
        shape=(len(user_id_to_idx), len(artist_name_to_idx)),
    ),
    norm='l2',
    axis=1,
)

# Transpose once up front (kept in CSR form) so the per-user similarity
# computation does not have to rebuild it on every iteration.
UAM_T = UAM.T.tocsr()

# Reverse lookups: position i holds the key that was mapped to index i.
# Note that `artist_ids` holds artist *names*, since names are the artist key here.
user_ids = np.array(list(user_id_to_idx))
artist_ids = np.array(list(artist_name_to_idx))

# Number of artists drawn at random for each seed user.
N_RECOMMENDED_ARTISTS = 5
# Fixed seed so the random artist draw is reproducible from run to run.
RANDOM_SEED = 42


def top_k_neighbors(sims_row, seed_idx, k):
    """Return the column indices of the ``k`` largest entries of a similarity row.

    Parameters
    ----------
    sims_row : scipy sparse matrix with a single row
        Similarities between the seed user and every user; only the stored
        (non-zero) entries are considered.
    seed_idx : int
        Column index of the seed user, which is excluded from the result.
    k : int
        Maximum number of neighbours to return.

    Returns
    -------
    numpy.ndarray
        Up to ``k`` column indices ordered by decreasing similarity. Empty
        when ``k <= 0`` or when no other user has a stored similarity.
    """
    if k <= 0:
        # Without this guard a slice of ``[-0:]`` would select every neighbour.
        return np.empty(0, dtype=np.intp)

    # COO format exposes the column indices and values as parallel arrays.
    sims_coo = sims_row.tocoo()
    keep = sims_coo.col != seed_idx  # drop the self-similarity entry
    neighbor_indices = sims_coo.col[keep]
    neighbor_scores = sims_coo.data[keep]

    if neighbor_scores.size > k:
        # argpartition finds the k largest scores without a full sort.
        top = np.argpartition(neighbor_scores, -k)[-k:]
    else:
        # Fewer than k candidates: keep them all.
        top = np.arange(neighbor_scores.size)

    # Order the selected neighbours by decreasing similarity.
    order = np.argsort(neighbor_scores[top])[::-1]
    return neighbor_indices[top][order]


def candidate_artists(uam, seed_row, neighbor_idx):
    """Return sorted artist column indices stored for any neighbour but not for the seed.

    Parameters
    ----------
    uam : scipy sparse matrix
        User-artist matrix (rows are users, columns are artists).
    seed_row : scipy CSR matrix with a single row
        The seed user's row of ``uam``; its stored columns are excluded.
    neighbor_idx : array-like of int
        Row indices of the neighbouring users.
    """
    # One fancy-indexing call gathers every neighbour row at once.
    neighbor_cols = np.unique(uam[neighbor_idx, :].tocoo().col)
    return np.setdiff1d(neighbor_cols, seed_row.indices)


def sample_artists(candidates, n, rng):
    """Draw up to ``n`` distinct artist indices from ``candidates`` using ``rng``.

    Returns a list of plain Python ints (empty if there are no candidates),
    so it prints identically regardless of the NumPy scalar repr in use.
    """
    pool = np.asarray(candidates).tolist()
    return rng.sample(pool, min(n, len(pool)))


if __name__ == '__main__':
    # A dedicated generator keeps the draw reproducible without touching the
    # global `random` state.
    rng = random.Random(RANDOM_SEED)
    # perf_counter is monotonic, so the elapsed time cannot be skewed by
    # wall-clock adjustments.
    start_time = time.perf_counter()
    num_users = UAM.shape[0]

    for u in range(num_users):
        print("Seed user-id:", user_ids[u])

        # Unit-normalised playcount row of the seed user.
        pc_vec = UAM.getrow(u)

        # Rows are unit vectors, so this dot product against the precomputed
        # transpose yields the cosine similarity to every user.
        sims = pc_vec.dot(UAM_T)

        top_neighbors = top_k_neighbors(sims, u, K)
        if top_neighbors.size == 0:
            print("No similar users found for", user_ids[u])
            print('-' * 80)
            continue

        # Translate neighbour indices back to user ids.
        recommended_user_ids = user_ids[top_neighbors]
        print(f"Nearest K={K} neighbors' user-ids:", recommended_user_ids)

        # Artists heard by the neighbours that are new to the seed user.
        recommended_artists_idx = candidate_artists(UAM, pc_vec, top_neighbors)

        random_elements = sample_artists(recommended_artists_idx, N_RECOMMENDED_ARTISTS, rng)
        if random_elements:
            print(f"Indices of {len(random_elements)} recommended artists:", random_elements)
            # .tolist() converts NumPy strings to plain str for clean printing.
            rec_artist_names = artist_ids[random_elements].tolist()
            print("Recommended artist names:", rec_artist_names)
        else:
            print("No new recommended artists found.")

        print('-' * 80)

    end_time = time.perf_counter()
    # Guard against an empty matrix, which would otherwise divide by zero.
    avg_time = (end_time - start_time) / num_users if num_users else 0.0
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists: [28341, 12357, 5495, 34731, 988]
Recommended artist names: ['Exile', 'Soul Coughing', 'Little Computer People', 'Wechsel Garland', 'Elbow']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists: [10909, 8624, 51082, 3038, 2906]
Recommended artist names: ['Sly & The Family Stone', 'Helmet', 'Charley Patton', 'Elastica', 'Chikinki']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids: ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Indices of 5 recommended artists: [59737, 94003, 3224, 7398, 15233]
Recommended artist names: ['Luk