In [2]:
!pip install annoy

Collecting annoy
  Downloading annoy-1.17.3.tar.gz (647 kB)
Building wheels for collected packages: annoy
  Building wheel for annoy (setup.py): started
  Building wheel for annoy (setup.py): finished with status 'done'
  Created wheel for annoy: filename=annoy-1.17.3-cp39-cp39-win_amd64.whl size=54004 sha256=1f0625e06a712524fd87e27dac841260663e1c8c7a7d000b7321b95364963f1e
  Stored in directory: c:\users\hahah\appdata\local\pip\cache\wheels\09\a9\54\37478e65995fe712f7da465749da9ddb21db6b1a599d591ac7
Successfully built annoy
Installing collected packages: annoy
Successfully installed annoy-1.17.3


In [2]:
import pandas as pd
from scipy.sparse import csr_matrix
import numpy as np
import random
import time
from sklearn.preprocessing import normalize
from annoy import AnnoyIndex  # Make sure to install Annoy: pip install annoy

In [7]:
# K: how many nearest-neighbour users are retrieved for every seed user
K = 5

# Read the raw listening log. The file is tab-separated and has no header row,
# so the column names are supplied here; lines pandas cannot parse are skipped.
file_path = "lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"
column_names = [
    "user_id",
    "timestamp",
    "artist_id",
    "artist_name",
    "track_id",
    "track_name",
]
interactions = pd.read_csv(
    file_path,
    sep="\t",
    header=None,
    names=column_names,
    on_bad_lines='skip',
)

In [8]:
# ------------------------
# Experiment settings
# ------------------------
RANDOM_SEED = 42        # seeds Annoy's tree construction and the artist sampling below
N_SAMPLED_ARTISTS = 5   # how many recommended artists are shown per seed user

# ------------------------
# Build the user-artist matrix
# ------------------------
# Count the number of times each user listened to each artist
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

# Create mappings: user_id and artist_name to a contiguous row / column index
user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

# Map original user and artist identifiers to their indices
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Build the sparse User-Artist Matrix (UAM): rows are users, columns are artists
UAM = csr_matrix((playcounts["playcount"],
                  (playcounts["user_idx"], playcounts["artist_idx"])),
                 shape=(len(user_id_to_idx), len(artist_name_to_idx)))

# L2-normalize every row so that the angular distance used below maps onto cosine similarity
UAM = normalize(UAM, norm='l2', axis=1)

# Arrays for reverse lookup (index -> original identifier)
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

# ------------------------
# Build Annoy index
# ------------------------
# Annoy needs dense vectors. Here, each user vector is of dimension equal to the number of artists.
f = UAM.shape[1]
annoy_index = AnnoyIndex(f, 'angular')  # Using angular metric corresponds to cosine similarity on normalized vectors.
# Annoy builds its trees from random splits; the seed has to be set before items are added.
annoy_index.set_seed(RANDOM_SEED)

# Add each user's normalized playcount vector to the index (one dense row at a time).
for i in range(UAM.shape[0]):
    annoy_index.add_item(i, UAM.getrow(i).toarray()[0])

# Build the Annoy index tree structure. The number of trees can be adjusted (more trees means higher precision at the cost of slower indexing).
n_trees = 10
annoy_index.build(n_trees)

# ------------------------
# Recommendation using Annoy
# ------------------------
# Dedicated generator: re-running this cell yields the same sampled artists and
# does not disturb the global `random` state.
rng = random.Random(RANDOM_SEED)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
# Note that the measured time includes the printing done inside the loop.
start_time = time.perf_counter()
for u in range(UAM.shape[0]):
    print("Seed user-id: " + str(user_ids[u]))

    # Query Annoy for K+1 neighbors (we request one extra because the query user appears as the nearest neighbor)
    neighbor_indices, neighbor_distances = annoy_index.get_nns_by_item(u, K + 1, include_distances=True)

    # Drop the seed user itself and keep at most the K closest remaining users
    top_neighbors = [(idx, dist) for idx, dist in zip(neighbor_indices, neighbor_distances) if idx != u][:K]
    recommended_user_idx = [idx for idx, _ in top_neighbors]

    # Convert Annoy's angular distance to cosine similarity. For normalized vectors:
    # cosine_similarity = 1 - (angular_distance^2 / 2)
    # (Not used by the sampling step below.)
    recommended_user_scores = [1 - (dist * dist / 2) for _, dist in top_neighbors]

    print("Nearest K=" + str(K) + " neighbors' user-ids: ", user_ids[recommended_user_idx])

    # Union of all artists the neighbors listened to ...
    neighbor_artists = set()
    for u_idx in recommended_user_idx:
        neighbor_artists.update(UAM.getrow(u_idx).indices.tolist())

    # ... minus the artists the seed user already knows. Sorting gives the seeded
    # sampling a deterministic candidate order.
    seed_artist_indices = set(UAM.getrow(u).indices.tolist())
    recommended_artists_idx = sorted(neighbor_artists - seed_artist_indices)

    # For illustration, randomly sample N_SAMPLED_ARTISTS candidates (if there are fewer, take all)
    if len(recommended_artists_idx) < N_SAMPLED_ARTISTS:
        random_elements = recommended_artists_idx
    else:
        random_elements = rng.sample(recommended_artists_idx, N_SAMPLED_ARTISTS)

    print("Indices of {} recommended artists: ".format(len(random_elements)), random_elements)
    # .tolist() yields plain Python strings, so the printed list does not depend on numpy's scalar repr
    print("Recommended artist names:", artist_ids[random_elements].tolist())
    print('-' * 80)

end_time = time.perf_counter()
total_users = UAM.shape[0]
avg_time = (end_time - start_time) / total_users
print(f"Average time per user recommendation: {avg_time:.4f} seconds")


Seed user-id: user_000001
Nearest K=5 neighbors' user-ids:  ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists:  [12649, 86741, 31788, 42929, 27598]
Recommended artist names: ['Yoga', 'Rune Lindbæk', 'Out From Animals', 'Jazz Juice', 'Fictional - Hangman']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids:  ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists:  [140022, 528, 30747, 8575, 23459]
Recommended artist names: ['Die Sekte', 'Squarepusher', 'Sailor Moon', 'Go Home Productions', 'Transwave']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids:  ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Indices of 5 recommended artists:  [50899, 1697, 4472, 30706, 11931]
Recommended artist

In [9]:
# ------------------------
# Repeat the experiment with fewer Annoy trees
# ------------------------
# Only the index depends on n_trees. The normalized user-artist matrix `UAM` and the
# lookup arrays `user_ids` / `artist_ids` are the ones built by the preceding cell
# (and `K` comes from the loading cell), so they are reused here instead of being
# recomputed from `interactions`.
RANDOM_SEED = 42        # seeds Annoy's tree construction and the artist sampling below
N_SAMPLED_ARTISTS = 5   # how many recommended artists are shown per seed user

# ------------------------
# Build Annoy index
# ------------------------
# Annoy needs dense vectors. Here, each user vector is of dimension equal to the number of artists.
f = UAM.shape[1]
annoy_index = AnnoyIndex(f, 'angular')  # Using angular metric corresponds to cosine similarity on normalized vectors.
# Annoy builds its trees from random splits; the seed has to be set before items are added.
annoy_index.set_seed(RANDOM_SEED)

# Add each user's normalized playcount vector to the index (one dense row at a time).
for i in range(UAM.shape[0]):
    annoy_index.add_item(i, UAM.getrow(i).toarray()[0])

# Build the Annoy index tree structure. The number of trees can be adjusted (more trees means higher precision at the cost of slower indexing).
n_trees = 5
annoy_index.build(n_trees)

# ------------------------
# Recommendation using Annoy
# ------------------------
# Dedicated generator: re-running this cell yields the same sampled artists and
# does not disturb the global `random` state.
rng = random.Random(RANDOM_SEED)

# perf_counter is a monotonic, high-resolution clock intended for measuring intervals.
# Note that the measured time includes the printing done inside the loop.
start_time = time.perf_counter()
for u in range(UAM.shape[0]):
    print("Seed user-id: " + str(user_ids[u]))

    # Query Annoy for K+1 neighbors (we request one extra because the query user appears as the nearest neighbor)
    neighbor_indices, neighbor_distances = annoy_index.get_nns_by_item(u, K + 1, include_distances=True)

    # Drop the seed user itself and keep at most the K closest remaining users
    top_neighbors = [(idx, dist) for idx, dist in zip(neighbor_indices, neighbor_distances) if idx != u][:K]
    recommended_user_idx = [idx for idx, _ in top_neighbors]

    # Convert Annoy's angular distance to cosine similarity. For normalized vectors:
    # cosine_similarity = 1 - (angular_distance^2 / 2)
    # (Not used by the sampling step below.)
    recommended_user_scores = [1 - (dist * dist / 2) for _, dist in top_neighbors]

    print("Nearest K=" + str(K) + " neighbors' user-ids: ", user_ids[recommended_user_idx])

    # Union of all artists the neighbors listened to ...
    neighbor_artists = set()
    for u_idx in recommended_user_idx:
        neighbor_artists.update(UAM.getrow(u_idx).indices.tolist())

    # ... minus the artists the seed user already knows. Sorting gives the seeded
    # sampling a deterministic candidate order.
    seed_artist_indices = set(UAM.getrow(u).indices.tolist())
    recommended_artists_idx = sorted(neighbor_artists - seed_artist_indices)

    # For illustration, randomly sample N_SAMPLED_ARTISTS candidates (if there are fewer, take all)
    if len(recommended_artists_idx) < N_SAMPLED_ARTISTS:
        random_elements = recommended_artists_idx
    else:
        random_elements = rng.sample(recommended_artists_idx, N_SAMPLED_ARTISTS)

    print("Indices of {} recommended artists: ".format(len(random_elements)), random_elements)
    # .tolist() yields plain Python strings, so the printed list does not depend on numpy's scalar repr
    print("Recommended artist names:", artist_ids[random_elements].tolist())
    print('-' * 80)

end_time = time.perf_counter()
total_users = UAM.shape[0]
avg_time = (end_time - start_time) / total_users
print(f"Average time per user recommendation: {avg_time:.4f} seconds")


Seed user-id: user_000001
Nearest K=5 neighbors' user-ids:  ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists:  [12920, 1665, 12923, 52765, 5653]
Recommended artist names: ['The Posies', 'The Cribs', 'The Seeds', "D'Sound", 'Nathan Fake']
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids:  ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists:  [2499, 11029, 10800, 9205, 30671]
Recommended artist names: ['Sneaker Pimps', 'The Milestone Corporation', 'Money Mark', 'Pet Shop Boys', 'Philippe Zdar']
--------------------------------------------------------------------------------
Seed user-id: user_000003
Nearest K=5 neighbors' user-ids:  ['user_000813' 'user_000558' 'user_000400' 'user_000741' 'user_000392']
Indices of 5 recommended artists:  [59730, 125706, 6799, 115172, 745]
Recommended artist n