In [2]:
!pip install scikit-learn



In [4]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import random
import time 
from sklearn.preprocessing import normalize
from multiprocessing import Pool, cpu_count

In [6]:
K = 5

file_path = r'C:\MRS\lastfm-dataset-1K\lastfm-dataset-1K\userid-timestamp-artid-artname-traid-traname.tsv'

# Load interaction log
interactions = pd.read_csv(file_path, sep="\t", header=None,
                           names=["user_id", "timestamp", "artist_id", "artist_name", "track_id", "track_name"], on_bad_lines='skip')

In [5]:
# Count number of times each user listened to each artist
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

# Create mappings: user/artist to index
user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

# Map to index
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Build the sparse matrix (UAM)
UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

# normalize rows (for cosine similarity)

UAM = normalize(UAM, norm='l2', axis=1)

# Create lists for reverse lookup
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))


if __name__ == '__main__':
    start_time = time.time()
    for u in range(0, UAM.shape[0]):
        print("Seed user-id: " + str(user_ids[u]))

        # Get normalized playcount vector for current user
        pc_vec = UAM.getrow(u)

        # Compute cosine similarity between this user and all users
        uU_sim = pc_vec.dot(UAM.transpose()).tocoo()
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Remove self-similarity
        uU_data[uU_user_idx == u] = 0

        # Eliminate zeros
        uU_sim.data = uU_data
        uU_sim = uU_sim.tocsr()
        uU_sim.eliminate_zeros()
        uU_sim = uU_sim.tocoo()

        # Re-assign user indices and scores
        uU_user_idx = uU_sim.col
        uU_data = uU_sim.data

        # Sort by similarity score
        sort_index = np.argsort(uU_data)

        # Select top-K nearest neighbors
        # Note that uU_user_idx indeed provides the indices for users in UAM
        recommended_user_idx = uU_user_idx[sort_index[-K:]]
        # Get user_ids corresponding to nearest neighbors
        recommended_user_ids = user_ids[recommended_user_idx]
        # Get similarity score for nearest neighbors
        recommended_user_scores = uU_data[sort_index[-K:]]

        print("Nearest K=" + str(K) + " neighbors' user-ids: ", recommended_user_ids.flatten())

        # Get all artists these similar users have listened to
        recommended_artists_idx = []
        for u_idx in recommended_user_idx:
            recommended_artists_idx.extend(list(UAM.getrow(u_idx).indices))

        # Remove duplicates and sort
        recommended_artists_idx = sorted(set(recommended_artists_idx))

        # Remove artists already known to seed user
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, pc_vec.indices)

        # Narrow down to random 5 artist out of all artists that similar users listen to for illustration purpose
        random_indices = random.sample(range(len(recommended_artists_idx)), 5)
        random_elements = [recommended_artists_idx[i] for i in random_indices]

        print("Indices of " + str(len(random_elements)) + " recommended artists: ", random_elements)
        print("Recommended artist names:", [artist_ids[i] for i in random_elements])
        print('-' * 80)

    end_time = time.time()
    total_users = UAM.shape[0]
    avg_time = (end_time - start_time) / total_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids:  ['user_000168' 'user_000844' 'user_000862' 'user_000629' 'user_000074']
Indices of 5 recommended artists:  [160722, 9537, 33476, 3107, 8724]
Recommended artist names: ['Silvia Aprile', 'Telefuzz', 'Bryn Christopher', 'Frida Hyvönen', "Jim O'Rourke"]
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids:  ['user_000513' 'user_000726' 'user_000143' 'user_000238' 'user_000673']
Indices of 5 recommended artists:  [18821, 61158, 9651, 23604, 62401]
Recommended artist names: ['Maceo & The Macks', 'La Polla Records', 'The Mountain Goats', 'Chiara Mastroianni & Benjamin Biolay', 'La Quinta Estación']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids:  ['user_000392' 'user_000741' 'user_000400' 'user_000558' 'user_000813']
Indices of 5 recommended artists:  [7024, 

In [52]:
# Count how many times each user listened to each artist
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

# Create mapping dictionaries for user/artist to index
user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

# Map user/artist names to indices in the dataframe
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Build the user-artist matrix (UAM) in sparse format
UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

# Normalize rows (each row becomes a unit vector for cosine similarity)
UAM = normalize(UAM, norm='l2', axis=1)

# Precompute UAM transpose for efficient cosine similarity calculations
# This avoids repeatedly computing UAM.transpose() within the loop.
UAM_T = UAM.transpose().tocsr()

# Create lists for reverse lookup
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

if __name__ == '__main__':
    start_time = time.time()
    num_users = UAM.shape[0]
    
    for u in range(num_users):
        print("Seed user-id:", user_ids[u])
        
        # Retrieve the normalized playcount vector (row) for the current user
        pc_vec = UAM.getrow(u)
        
        # Compute cosine similarity between the seed user and all other users
        # Using the precomputed UAM_T speeds up the dot-product calculation.
        sims = pc_vec.dot(UAM_T)
        
        # Convert the similarity result to COO format to easily access indices and data.
        sims_coo = sims.tocoo()
        
        # Remove self-similarity by filtering out the element where the column index equals the seed user.
        mask = sims_coo.col != u
        neighbor_indices = sims_coo.col[mask]
        neighbor_scores = sims_coo.data[mask]
        
        # Proceed only if there is at least one similar user
        if neighbor_scores.size == 0:
            print("No similar users found for", user_ids[u])
            print('-' * 80)
            continue
        
        # Use argpartition to quickly find indices of top K similarities.
        # For cases where there are fewer than K neighbors, return all.
        if neighbor_scores.size > K:
            top_k_idx = np.argpartition(neighbor_scores, -K)[-K:]
            # Sort the top K neighbors in descending order by their similarity scores.
            sorted_order = np.argsort(neighbor_scores[top_k_idx])[::-1]
            top_neighbors = neighbor_indices[top_k_idx][sorted_order]
        else:
            sorted_order = np.argsort(neighbor_scores)[::-1]
            top_neighbors = neighbor_indices[sorted_order]
        
        # Retrieve the actual user IDs for the top K neighbors
        recommended_user_ids = user_ids[top_neighbors]
        print(f"Nearest K={K} neighbors' user-ids:", recommended_user_ids.flatten())
        
        # Get all artist indices listened to by the top K similar users in one vectorized operation.
        # This avoids looping over each neighbor.
        neighbor_rows = UAM[top_neighbors, :]
        neighbor_coo = neighbor_rows.tocoo()
        recommended_artists_idx = np.unique(neighbor_coo.col)
        
        # Remove artists already known to the seed user
        seed_artists = pc_vec.indices
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, seed_artists)
        
        # Randomly choose 5 artists (or fewer, if not enough exist) for recommendation
        if recommended_artists_idx.size > 0:
            num_to_select = min(5, recommended_artists_idx.size)
            random_elements = random.sample(list(recommended_artists_idx), num_to_select)
            print(f"Indices of {len(random_elements)} recommended artists:", random_elements)
            rec_artist_names = [artist_ids[i] for i in random_elements]
            print("Recommended artist names:", rec_artist_names)
        else:
            print("No new recommended artists found.")
            
        print('-' * 80)
    
    end_time = time.time()
    avg_time = (end_time - start_time) / num_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists: [5826, 72140, 50044, 9633, 160647]
Recommended artist names: ['Sami Koivikko', 'Houdini', 'Александр Ляпин', 'The Karminsky Experience Inc.', 'Giuann Shadai, Daker, Ill Mario']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists: [2447, 6940, 63, 36736, 3369]
Recommended artist names: ['Quiet Riot', 'Jack Off Jill', 'Björk', 'Cdoass', 'Lush']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids: ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Indices of 5 recommended artists: [1414, 93138, 11969, 93202, 115126]
Recommended art

In [54]:
#Batch Version
from concurrent.futures import ThreadPoolExecutor, as_completed
BATCH_SIZE = 8 
# Build User-Artist Matrix (UAM)
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

UAM = normalize(UAM, norm='l2', axis=1)

user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

UAM_T = UAM.transpose().tocsr()

num_users = UAM.shape[0]

# --- Function to process a single batch ---
def process_batch(batch_start, batch_end):
    batch_users = np.arange(batch_start, batch_end)

    pc_mat = UAM[batch_users, :]
    sim_mat = pc_mat.dot(UAM_T).toarray()

    batch_results = []

    for idx_in_batch, u in enumerate(batch_users):
        sim_vec = sim_mat[idx_in_batch]
        sim_vec[u] = 0  # Remove self-similarity

        neighbor_indices = np.where(sim_vec > 0)[0]
        neighbor_scores = sim_vec[neighbor_indices]

        if neighbor_scores.size == 0:
            batch_results.append((user_ids[u], None, None))
            continue

        if neighbor_scores.size > K:
            top_k_idx = np.argpartition(neighbor_scores, -K)[-K:]
            sorted_order = np.argsort(neighbor_scores[top_k_idx])[::-1]
            top_neighbors = neighbor_indices[top_k_idx][sorted_order]
        else:
            sorted_order = np.argsort(neighbor_scores)[::-1]
            top_neighbors = neighbor_indices[sorted_order]

        recommended_user_ids = user_ids[top_neighbors]

        neighbor_rows = UAM[top_neighbors, :]
        neighbor_coo = neighbor_rows.tocoo()
        recommended_artists_idx = np.unique(neighbor_coo.col)

        seed_artists = UAM.getrow(u).indices
        recommended_artists_idx = np.setdiff1d(recommended_artists_idx, seed_artists)

        if recommended_artists_idx.size > 0:
            num_to_select = min(5, recommended_artists_idx.size)
            random_elements = random.sample(list(recommended_artists_idx), num_to_select)
            rec_artist_names = [artist_ids[i] for i in random_elements]
            batch_results.append((user_ids[u], recommended_user_ids, rec_artist_names))
        else:
            batch_results.append((user_ids[u], recommended_user_ids, None))

    return batch_results

# --- Main Program ---
if __name__ == '__main__':
    start_time = time.time()

    # Create batches
    batches = [(i, min(i+BATCH_SIZE, num_users)) for i in range(0, num_users, BATCH_SIZE)]

    all_results = []

    # Sequentially process each batch
    for batch_start, batch_end in batches:
        batch_result = process_batch(batch_start, batch_end)
        all_results.extend(batch_result)

    # Print results
    for res in all_results:
        seed_user, neighbors, rec_artists = res
        print("Seed user-id:", seed_user)
        if neighbors is None:
            print("No similar users found for", seed_user)
            print('-' * 80)
            continue
        print(f"Nearest K={K} neighbors' user-ids:", neighbors.flatten())
        if rec_artists is not None:
            print("Recommended artist names:", rec_artist_names)
        else:
            print("No new recommended artists found.")
        print('-' * 80)

    end_time = time.time()
    avg_time = (end_time - start_time) / num_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Recommended artist names: ['Lionel Hampton', 'Cal Tjader', 'Julee Cruise', 'Владимир Давидович Ашкенази', 'Brian Setzer']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Recommended artist names: ['Lionel Hampton', 'Cal Tjader', 'Julee Cruise', 'Владимир Давидович Ашкенази', 'Brian Setzer']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids: ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Recommended artist names: ['Lionel Hampton', 'Cal Tjader', 'Julee Cruise', 'Владимир Давидович Ашкенази', 'Brian Setzer']
-------------------------------------------------------------------------------

In [60]:
#Thread
from concurrent.futures import ThreadPoolExecutor, as_completed
MAX_WORKERS = 8
# Build User-Artist Matrix (UAM)
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

UAM = normalize(UAM, norm='l2', axis=1)

user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

UAM_T = UAM.transpose().tocsr()

num_users = UAM.shape[0]

# --- Define function to process a single user ---
def process_single_user(u):
    pc_vec = UAM.getrow(u)
    sims = pc_vec.dot(UAM_T)
    sims_coo = sims.tocoo()
    
    mask = sims_coo.col != u
    neighbor_indices = sims_coo.col[mask]
    neighbor_scores = sims_coo.data[mask]
    
    if neighbor_scores.size == 0:
        return (user_ids[u], None, None)

    if neighbor_scores.size > K:
        top_k_idx = np.argpartition(neighbor_scores, -K)[-K:]
        sorted_order = np.argsort(neighbor_scores[top_k_idx])[::-1]
        top_neighbors = neighbor_indices[top_k_idx][sorted_order]
    else:
        sorted_order = np.argsort(neighbor_scores)[::-1]
        top_neighbors = neighbor_indices[sorted_order]

    recommended_user_ids = user_ids[top_neighbors]

    neighbor_rows = UAM[top_neighbors, :]
    neighbor_coo = neighbor_rows.tocoo()
    recommended_artists_idx = np.unique(neighbor_coo.col)

    seed_artists = UAM.getrow(u).indices
    recommended_artists_idx = np.setdiff1d(recommended_artists_idx, seed_artists)

    if recommended_artists_idx.size > 0:
        num_to_select = min(5, recommended_artists_idx.size)
        random_elements = random.sample(list(recommended_artists_idx), num_to_select)
        rec_artist_names = [artist_ids[i] for i in random_elements]
        return (user_ids[u], recommended_user_ids, rec_artist_names)
    else:
        return (user_ids[u], recommended_user_ids, None)

# --- Main Program ---
if __name__ == '__main__':
    start_time = time.time()

    all_results = []

    # Use ThreadPoolExecutor to process all users in parallel
    with ThreadPoolExecutor(max_workers=MAX_WORKERS) as executor:
        futures = [executor.submit(process_single_user, u) for u in range(num_users)]
        for future in as_completed(futures):
            res = future.result()
            all_results.append(res)

    # Print all results
    for res in all_results:
        seed_user, neighbors, rec_artists = res
        print("Seed user-id:", seed_user)
        if neighbors is None:
            print("No similar users found for", seed_user)
            print('-' * 80)
            continue
        print(f"Nearest K={K} neighbors' user-ids:", neighbors.flatten())
        if rec_artists is not None:
            print("Recommended artist names:", rec_artist_names)
        else:
            print("No new recommended artists found.")
        print('-' * 80)

    end_time = time.time()
    avg_time = (end_time - start_time) / num_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")

Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Recommended artist names: ['Lionel Hampton', 'Cal Tjader', 'Julee Cruise', 'Владимир Давидович Ашкенази', 'Brian Setzer']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids: ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Recommended artist names: ['Lionel Hampton', 'Cal Tjader', 'Julee Cruise', 'Владимир Давидович Ашкенази', 'Brian Setzer']
--------------------------------------------------------------------------------
Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Recommended artist names: ['Lionel Hampton', 'Cal Tjader', 'Julee Cruise', 'Владимир Давидович Ашкенази', 'Brian Setzer']
-------------------------------------------------------------------------------