In [7]:
!pip install scikit-learn

Collecting scikit-learn
  Downloading scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting joblib>=1.2.0 (from scikit-learn)
  Using cached joblib-1.4.2-py3-none-any.whl.metadata (5.4 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Downloading threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.6.1-cp39-cp39-macosx_12_0_arm64.whl (11.1 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m11.1/11.1 MB[0m [31m16.3 MB/s[0m eta [36m0:00:00[0ma [36m0:00:01[0m
[?25hUsing cached joblib-1.4.2-py3-none-any.whl (301 kB)
Downloading threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, joblib, scikit-learn
Successfully installed joblib-1.4.2 scikit-learn-1.6.1 threadpoolctl-3.6.0


In [5]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import random
import time 
from sklearn.preprocessing import normalize

In [3]:
# Number of nearest neighbours looked up for every seed user
K = 5

# Tab-separated listening log; the path is resolved relative to the working directory
file_path = "Downloads/lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"

# Read the raw interaction log. header=None tells pandas there is no header row,
# so the column names are supplied explicitly; unparsable rows are skipped.
interactions = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=[
        "user_id",
        "timestamp",
        "artist_id",
        "artist_name",
        "track_id",
        "track_name",
    ],
    on_bad_lines='skip',
)

In [8]:
# User-based k-NN recommendation, straightforward version.
#
# Builds an L2-normalised sparse user-artist matrix (UAM) from the interaction
# log, then for every user finds the K most similar users (cosine similarity)
# and prints up to 5 randomly chosen artists those neighbours listened to that
# the seed user has not. Finally reports the average wall-clock time per user.

# Count number of times each user listened to each artist
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

# Create mappings: user/artist to index
user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

# Map to index
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Build the sparse matrix (UAM): one row per user, one column per artist
UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

# Normalize rows to unit length so that a plain dot product between two rows
# equals their cosine similarity
UAM = normalize(UAM, norm='l2', axis=1)

# Create arrays for reverse lookup (matrix index -> user id / artist name)
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))


if __name__ == '__main__':
    start_time = time.time()
    for u in range(0, UAM.shape[0]):
        print("Seed user-id: " + str(user_ids[u]))

        # Get normalized playcount vector for current user
        pc_vec = UAM.getrow(u)

        # Compute cosine similarity between this user and all users
        # (the transpose is rebuilt on every iteration here)
        uU_sim = pc_vec.dot(UAM.transpose()).tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Remove self-similarity by zeroing the seed user's own entry
        uU_data[uU_user_idx == u] = 0

        # Eliminate zeros so the seed user can never be picked as a neighbor
        uU_sim.data = uU_data
        uU_sim = uU_sim.tocsr()
        uU_sim.eliminate_zeros()
        uU_sim = uU_sim.tocoo()

        # Re-assign user indices and scores
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Sort by similarity score (ascending, so the best matches are at the end)
        sort_index = np.argsort(uU_data)

        # Select top-K nearest neighbors (fewer if less than K users are similar)
        # Note that uU_user_idx indeed provides the indices for users in UAM
        recommended_user_idx = uU_user_idx[sort_index[-K:]]
        # Get user_ids corresponding to nearest neighbors
        recommended_user_ids = user_ids[recommended_user_idx]
        # Get similarity score for nearest neighbors
        recommended_user_scores = uU_data[sort_index[-K:]]

        print("Nearest K=" + str(K) + " neighbors' user-ids: ", recommended_user_ids.flatten())

        # Get all artists these similar users have listened to
        recommended_artists_idx = []
        for u_idx in recommended_user_idx:
            recommended_artists_idx.extend(list(UAM.getrow(u_idx).indices))

        # Remove duplicates and sort
        recommended_artists_idx = sorted(set(recommended_artists_idx))

        # Remove artists already known to seed user
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, pc_vec.indices)

        # Narrow down to at most 5 random artists out of all artists that similar
        # users listen to, for illustration purposes. The sample size is clamped to
        # the number of candidates: random.sample raises ValueError when asked for
        # more items than the population holds (e.g. a user without neighbors).
        num_to_select = min(5, len(recommended_artists_idx))
        random_indices = random.sample(range(len(recommended_artists_idx)), num_to_select)
        random_elements = [recommended_artists_idx[i] for i in random_indices]

        print("Indices of " + str(len(random_elements)) + " recommended artists: ", random_elements)
        print("Recommended artist names:", [artist_ids[i] for i in random_elements])
        print('-' * 80)

    end_time = time.time()
    total_users = UAM.shape[0]
    avg_time = (end_time - start_time) / total_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids:  ['user_000168' 'user_000844' 'user_000862' 'user_000629' 'user_000074']
Indices of 5 recommended artists:  [np.int32(29828), np.int32(12423), np.int32(2138), np.int32(1714), np.int32(10856)]
Recommended artist names: [np.str_('Audio Lotion'), np.str_('Team Sleep'), np.str_('Emerson, Lake & Palmer'), np.str_('The Kooks'), np.str_('Racoon')]
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids:  ['user_000513' 'user_000726' 'user_000143' 'user_000238' 'user_000673']
Indices of 5 recommended artists:  [np.int32(13470), np.int32(120605), np.int32(10615), np.int32(10578), np.int32(22845)]
Recommended artist names: [np.str_('The Bouncing Souls'), np.str_('Peter, Bjorn & John Feat Victoria Bergsman'), np.str_('Ex Pistols'), np.str_('Desmond Dekker'), np.str_('Download')]
-------------------------------------------------------------------------------

In [6]:
# User-based k-NN recommendation with the transpose precomputed and the
# neighbour selection / artist collection done with array operations.

# Aggregate the listening log: one row per (user, artist) pair with its number of plays
playcounts = (
    interactions
    .groupby(["user_id", "artist_name"])
    .size()
    .reset_index(name="playcount")
)

# Give every distinct user / artist a matrix position
user_id_to_idx = {name: pos for pos, name in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {name: pos for pos, name in enumerate(playcounts["artist_name"].unique())}

# Attach the matrix coordinates to each playcount row
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Sparse user-artist matrix (UAM): rows are users, columns are artists
UAM = csr_matrix(
    (
        playcounts["playcount"],
        (playcounts["user_idx"], playcounts["artist_idx"]),
    ),
    shape=(len(user_id_to_idx), len(artist_name_to_idx)),
)

# Scale every row to unit L2 norm so a dot product of two rows is their cosine similarity
UAM = normalize(UAM, norm='l2', axis=1)

# Transpose once up front so it is not recomputed inside the per-user loop
UAM_T = UAM.transpose().tocsr()

# Reverse lookup arrays: matrix index -> user id / artist name
user_ids = np.array(list(user_id_to_idx))
artist_ids = np.array(list(artist_name_to_idx))

if __name__ == '__main__':
    start_time = time.time()
    num_users = UAM.shape[0]

    for u in range(num_users):
        print("Seed user-id:", user_ids[u])

        # Unit-length playcount row of the seed user
        pc_vec = UAM.getrow(u)

        # Cosine similarity of the seed user against every user, in COO form
        # so that column indices and values are directly accessible
        sims_coo = pc_vec.dot(UAM_T).tocoo()

        # Drop the seed user's similarity with themselves
        not_self = sims_coo.col != u
        neighbor_indices = sims_coo.col[not_self]
        neighbor_scores = sims_coo.data[not_self]

        # Nothing to recommend from if no other user overlaps with the seed user
        if not neighbor_scores.size:
            print("No similar users found for", user_ids[u])
            print('-' * 80)
            continue

        # Pick the candidate positions: a partial partition when there are more
        # than K neighbours, otherwise simply all of them
        if neighbor_scores.size > K:
            candidate_pos = np.argpartition(neighbor_scores, -K)[-K:]
        else:
            candidate_pos = np.arange(neighbor_scores.size)

        # Order the candidates from most to least similar
        descending = np.argsort(neighbor_scores[candidate_pos])[::-1]
        top_neighbors = neighbor_indices[candidate_pos][descending]

        # Translate neighbour positions back into user ids
        recommended_user_ids = user_ids[top_neighbors]
        print(f"Nearest K={K} neighbors' user-ids:", recommended_user_ids.flatten())

        # Distinct artists played by any of the selected neighbours, taken from
        # the stacked neighbour rows in a single slice instead of a Python loop,
        # minus the artists the seed user already has
        neighbor_artists = UAM[top_neighbors, :].tocoo().col
        recommended_artists_idx = np.setdiff1d(np.unique(neighbor_artists), pc_vec.indices)

        if recommended_artists_idx.size == 0:
            print("No new recommended artists found.")
        else:
            # Show a random handful: 5 artists, or all of them if fewer are available
            num_to_select = min(5, recommended_artists_idx.size)
            random_elements = random.sample(list(recommended_artists_idx), num_to_select)
            print(f"Indices of {len(random_elements)} recommended artists:", random_elements)
            print("Recommended artist names:", [artist_ids[i] for i in random_elements])

        print('-' * 80)

    end_time = time.time()
    avg_time = (end_time - start_time) / num_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists: [np.int32(123814), np.int32(17426), np.int32(34351), np.int32(53763), np.int32(47752)]
Recommended artist names: [np.str_('The Vogado Projects Vol. 1'), np.str_('Pastora'), np.str_('Juiceboxxx And Dre Skull'), np.str_('Operator & Baffled'), np.str_('Vanja Lazarova')]
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists: [np.int32(72353), np.int32(109929), np.int32(180), np.int32(2003), np.int32(2743)]
Recommended artist names: [np.str_('Villancicos'), np.str_("D'Malicious"), np.str_('Fila Brazillia'), np.str_('Black Sabbath'), np.str_('Aqualung')]
--------------------------------------------------------------------------------
Seed use