In [1]:
!pip install scikit-learn

Collecting scikit-learn
  Using cached scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl.metadata (31 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Using cached scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl.metadata (61 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.6.0-py3-none-any.whl.metadata (13 kB)
Using cached scikit_learn-1.6.1-cp310-cp310-macosx_12_0_arm64.whl (11.1 MB)
Using cached scipy-1.15.2-cp310-cp310-macosx_14_0_arm64.whl (22.4 MB)
Using cached threadpoolctl-3.6.0-py3-none-any.whl (18 kB)
Installing collected packages: threadpoolctl, scipy, scikit-learn
Successfully installed scikit-learn-1.6.1 scipy-1.15.2 threadpoolctl-3.6.0


In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import random
import time 
from sklearn.preprocessing import normalize
from multiprocessing import Pool, cpu_count

In [4]:
K = 5

file_path = "lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"

# Load interaction log
interactions = pd.read_csv(file_path, sep="\t", header=None,
                           names=["user_id", "timestamp", "artist_id", "artist_name", "track_id", "track_name"], on_bad_lines='skip')

In [7]:
# Count how many times each user listened to each artist
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

# Create mapping dictionaries for user/artist to index
user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

# Map user/artist names to indices in the dataframe
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Build the user-artist matrix (UAM) in sparse format
UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

# Normalize rows (each row becomes a unit vector for cosine similarity)
UAM = normalize(UAM, norm='l2', axis=1)

# Precompute UAM transpose for efficient cosine similarity calculations
# This avoids repeatedly computing UAM.transpose() within the loop.
UAM_T = UAM.transpose().tocsr()

# Create lists for reverse lookup
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

if __name__ == '__main__':
    start_time = time.time()
    num_users = UAM.shape[0]
    
    for u in range(num_users):
        print("Seed user-id:", user_ids[u])
        
        # Retrieve the normalized playcount vector (row) for the current user
        pc_vec = UAM.getrow(u)
        
        # Compute cosine similarity between the seed user and all other users
        # Using the precomputed UAM_T speeds up the dot-product calculation.
        sims = pc_vec.dot(UAM_T)
        
        # Convert the similarity result to COO format to easily access indices and data.
        sims_coo = sims.tocoo()
        
        # Remove self-similarity by filtering out the element where the column index equals the seed user.
        mask = sims_coo.col != u
        neighbor_indices = sims_coo.col[mask]
        neighbor_scores = sims_coo.data[mask]
        
        # Proceed only if there is at least one similar user
        if neighbor_scores.size == 0:
            print("No similar users found for", user_ids[u])
            print('-' * 80)
            continue
        
        # Use argpartition to quickly find indices of top K similarities.
        # For cases where there are fewer than K neighbors, return all.
        if neighbor_scores.size > K:
            top_k_idx = np.argpartition(neighbor_scores, -K)[-K:]
            # Sort the top K neighbors in descending order by their similarity scores.
            sorted_order = np.argsort(neighbor_scores[top_k_idx])[::-1]
            top_neighbors = neighbor_indices[top_k_idx][sorted_order]
        else:
            sorted_order = np.argsort(neighbor_scores)[::-1]
            top_neighbors = neighbor_indices[sorted_order]
        
        # Retrieve the actual user IDs for the top K neighbors
        recommended_user_ids = user_ids[top_neighbors]
        print(f"Nearest K={K} neighbors' user-ids:", recommended_user_ids.flatten())
        
        # Get all artist indices listened to by the top K similar users in one vectorized operation.
        # This avoids looping over each neighbor.
        neighbor_rows = UAM[top_neighbors, :]
        neighbor_coo = neighbor_rows.tocoo()
        recommended_artists_idx = np.unique(neighbor_coo.col)
        
        # Remove artists already known to the seed user
        seed_artists = pc_vec.indices
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, seed_artists)
        
        # Randomly choose 5 artists (or fewer, if not enough exist) for recommendation
        if recommended_artists_idx.size > 0:
            num_to_select = min(5, recommended_artists_idx.size)
            random_elements = random.sample(list(recommended_artists_idx), num_to_select)
            print(f"Indices of {len(random_elements)} recommended artists:", random_elements)
            rec_artist_names = [artist_ids[i] for i in random_elements]
            print("Recommended artist names:", rec_artist_names)
        else:
            print("No new recommended artists found.")
            
        print('-' * 80)
    
    end_time = time.time()
    avg_time = (end_time - start_time) / num_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists: [5607, 14960, 9597, 6915, 17584]
Recommended artist names: ['Mitchell Akiyama', 'Fridge', 'The Dust Brothers', 'Girls Under Glass', 'The Sound']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists: [217, 122063, 6856, 72335, 13541]
Recommended artist names: ['Hefner', 'Ida Corr Vs Fedde Le Grand + Camilla Jones Vs Fedde Le Grand', 'Daughtry', 'Larrinkin Love', 'The Ponys']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids: ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Indices of 5 recommended artists: [115222, 50403, 1151

In [8]:
#Thread
from concurrent.futures import ThreadPoolExecutor, as_completed
MAX_WORKERS = 8
# Build User-Artist Matrix (UAM)
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

UAM = normalize(UAM, norm='l2', axis=1)

user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

UAM_T = UAM.transpose().tocsr()

num_users = UAM.shape[0]

# --- Define function to process a single user ---
def process_single_user(u):
    pc_vec = UAM.getrow(u)
    sims = pc_vec.dot(UAM_T)
    sims_coo = sims.tocoo()
    
    mask = sims_coo.col != u
    neighbor_indices = sims_coo.col[mask]
    neighbor_scores = sims_coo.data[mask]
    
    if neighbor_scores.size == 0:
        return (user_ids[u], None, None)

    if neighbor_scores.size > K:
        top_k_idx = np.argpartition(neighbor_scores, -K)[-K:]
        sorted_order = np.argsort(neighbor_scores[top_k_idx])[::-1]
        top_neighbors = neighbor_indices[top_k_idx][sorted_order]
    else:
        sorted_order = np.argsort(neighbor_scores)[::-1]
        top_neighbors = neighbor_indices[sorted_order]

    recommended_user_ids = user_ids[top_neighbors]

    neighbor_rows = UAM[top_neighbors, :]
    neighbor_coo = neighbor_rows.tocoo()
    recommended_artists_idx = np.unique(neighbor_coo.col)

    seed_artists = UAM.getrow(u).indices
    recommended_artists_idx = np.setdiff1d(recommended_artists_idx, seed_artists)

    if recommended_artists_idx.size > 0:
        num_to_select = min(5, recommended_artists_idx.size)
        random_elements = random.sample(list(recommended_artists_idx), num_to_select)
        rec_artist_names = [artist_ids[i] for i in random_elements]
        return (user_ids[u], recommended_user_ids, rec_artist_names)
    else:
        return (user_ids[u], recommended_user_ids, None)

# --- Main Program ---
if __name__ == '__main__':
    start_time = time.time()

    all_results = []

    # Use ThreadPoolExecutor to process all users in parallel
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_single_user, u) for u in range(num_users)]
        for future in as_completed(futures):
            res = future.result()
            all_results.append(res)

    # Print all results
    for res in all_results:
        seed_user, neighbors, rec_artists = res
        print("Seed user-id:", seed_user)
        if neighbors is None:
            print("No similar users found for", seed_user)
            print('-' * 80)
            continue
        print(f"Nearest K={K} neighbors' user-ids:", neighbors.flatten())
        if rec_artists is not None:
            print("Recommended artist names:", rec_artist_names)
        else:
            print("No new recommended artists found.")
        print('-' * 80)

    end_time = time.time()
    avg_time = (end_time - start_time) / num_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Recommended artist names: ['Jewel', 'Julieta Venegas', 'Reverend Horton Heat', 'Big & Rich', 'Boogie Down Productions']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Recommended artist names: ['Jewel', 'Julieta Venegas', 'Reverend Horton Heat', 'Big & Rich', 'Boogie Down Productions']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids: ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Recommended artist names: ['Jewel', 'Julieta Venegas', 'Reverend Horton Heat', 'Big & Rich', 'Boogie Down Productions']
--------------------------------------------------------------------------------
Seed