# LFM-1b_recommend-CF-faiss-parallelized-cython

In [1]:
import pandas as pd
import numpy as np
import random
import time
import faiss
import concurrent.futures
import cython
import cProfile

In [2]:
# Set number of neighbors to retrieve (K)
K = 5

# Load interaction log
file_path = "lastfm-dataset-1K/userid-timestamp-artid-artname-traid-traname.tsv"
interactions = pd.read_csv(file_path, sep="\t", header=None,
                           names=["user_id", "timestamp", "artist_id", "artist_name", "track_id", "track_name"],
                           on_bad_lines='skip')

## cProfile from CF-faiss-parallelized version

In [7]:
# Load interaction log
interactions = pd.read_csv(file_path, sep="\t", header=None,
                           names=["user_id", "timestamp", "artist_id", "artist_name", "track_id", "track_name"],
                           on_bad_lines='skip')

# Count number of times each user listened to each artist
playcounts = interactions.groupby(["user_id", "artist_name"]).size().reset_index(name="playcount")

# Create mappings: user/artist to index
user_id_to_idx = {user: idx for idx, user in enumerate(playcounts["user_id"].unique())}
artist_name_to_idx = {artist: idx for idx, artist in enumerate(playcounts["artist_name"].unique())}

# Map to index
playcounts["user_idx"] = playcounts["user_id"].map(user_id_to_idx)
playcounts["artist_idx"] = playcounts["artist_name"].map(artist_name_to_idx)

# Build dense user-artist matrix (playcounts)
n_users = len(user_id_to_idx)
n_artists = len(artist_name_to_idx)
user_matrix = np.zeros((n_users, n_artists), dtype='float32')

for row in playcounts.itertuples(index=False):
    user_matrix[row.user_idx, row.artist_idx] += row.playcount

# Normalize rows to unit length for cosine similarity
faiss.normalize_L2(user_matrix)

# Create FAISS index for cosine similarity (inner product after normalization)
index = faiss.IndexFlatIP(n_artists)
index.add(user_matrix)  # Add all user vectors

# Reverse lookup
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

def recommend_user(u):
    neighbor_indices = neighbors[u][1:]  # skip self
    neighbor_scores = distances[u][1:]

    recommended_artists_idx = []
    for neighbor in neighbor_indices:
        neighbor_vector = user_matrix[neighbor]
        recommended_artists_idx.extend(np.where(neighbor_vector > 0)[0])

    known_artists = np.where(user_matrix[u] > 0)[0]
    recommended_artists_idx = list(set(recommended_artists_idx) - set(known_artists))

    if len(recommended_artists_idx) >= 5:
        random_indices = random.sample(recommended_artists_idx, 5)
    else:
        random_indices = recommended_artists_idx

    return {
        "user": user_ids[u],
        "neighbors": user_ids[neighbor_indices],
        "artist_indices": random_indices,
        "artist_names": [artist_ids[i] for i in random_indices]
    }

def run_recommendations():
    """Run FAISS search and print recommendation results."""
    start_time = time.time()
    distances, neighbors = index.search(user_matrix, K + 1)

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(recommend_user, range(n_users)))

    for res in results:
        print("Seed user-id:", res["user"])
        print(f"Nearest K={K} neighbors' user-ids:", res["neighbors"])
        print(f"Indices of {len(res['artist_indices'])} recommended artists:", res["artist_indices"])
        print("Recommended artist names:", res["artist_names"])
        print('-' * 80)

    end_time = time.time()
    avg_time = (end_time - start_time) / n_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")


if __name__ == "__main__":
    run_recommendations()


Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists: [np.int64(31621), np.int64(2997), np.int64(21022), np.int64(3137), np.int64(8970)]
Recommended artist names: [np.str_('Mc Frontalot'), np.str_('Depeche Mode'), np.str_('Lfo Vs. F.U.S.E.'), np.str_('Glen Hansard & Markéta Irglová'), np.str_('Marcia Griffiths')]
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists: [np.int64(18317), np.int64(587), np.int64(2129), np.int64(10673), np.int64(2900)]
Recommended artist names: [np.str_('Fito & Fitipaldis'), np.str_('Tiga'), np.str_('Eagles'), np.str_('Horace Andy'), np.str_('Charlotte Hatherley')]
--------------------------------------------------------------------------------
Seed user-id: us

Average time per user recommendation: 0.0015s

In [8]:
%prun -s cumulative run_recommendations()


Seed user-id: user_000001
Nearest K=5 neighbors' user-ids: ['user_000074' 'user_000629' 'user_000862' 'user_000844' 'user_000168']
Indices of 5 recommended artists: [np.int64(1773), np.int64(42297), np.int64(27425), np.int64(10698), np.int64(18469)]
Recommended artist names: [np.str_('The Shins'), np.str_('Fanfare Ciocarlia'), np.str_('And One - Sometimes (Radio Edit)'), np.str_('Japan'), np.str_('Hildegard Knef')]
--------------------------------------------------------------------------------
Seed user-id: user_000002
Nearest K=5 neighbors' user-ids: ['user_000673' 'user_000238' 'user_000143' 'user_000726' 'user_000513']
Indices of 5 recommended artists: [np.int64(2584), np.int64(2007), np.int64(2295), np.int64(2872), np.int64(42248)]
Recommended artist names: [np.str_('The Used'), np.str_('Bob Marley'), np.str_('Kraftwerk'), np.str_('Cansei De Ser Sexy'), np.str_('Alina Orlova')]
--------------------------------------------------------------------------------
Seed user-id: user_0000

         570362 function calls (565402 primitive calls) in 1.702 seconds

   Ordered by: cumulative time

   ncalls  tottime  percall  cumtime  percall filename:lineno(function)
        1    0.000    0.000    1.702    1.702 {built-in method builtins.exec}
        1    0.001    0.001    1.701    1.701 <string>:1(<module>)
        1    0.002    0.002    1.701    1.701 3631488482.py:60(run_recommendations)
      993    0.001    0.000    1.035    0.001 _base.py:612(result_iterator)
      992    0.001    0.000    1.034    0.001 _base.py:314(_result_or_cancel)
     1601    0.002    0.000    1.034    0.001 threading.py:295(wait)
      992    0.002    0.000    1.032    0.001 _base.py:428(result)
     5212    1.030    0.000    1.030    0.000 {method 'acquire' of '_thread.lock' objects}
        1    0.000    0.000    0.353    0.353 class_wrappers.py:300(replacement_search)
        1    0.000    0.000    0.352    0.352 swigfaiss.py:2367(search)
        1    0.352    0.352    0.352    0.352 {built

# Applying Cython

In [9]:
%%cython
import numpy as np
cimport numpy as np

def build_playcounts_cy(str filepath):
    cdef dict counts = {}
    cdef bytes line
    with open(filepath, 'rb') as f:
        for line in f:
            parts = line.rstrip(b'\n').split(b'\t')
            counts[(parts[0], parts[3])] = counts.get((parts[0], parts[3]), 0) + 1
    return counts                                       # Cython: replace pandas groupby

def fill_matrix_cy(dict counts,
                   np.ndarray[np.float32_t, ndim=2] M,
                   dict user_to_idx,
                   dict artist_to_idx):
    cdef tuple key
    cdef int u, a, cnt
    for key, cnt in counts.items():
        u = user_to_idx[key[0]]
        a = artist_to_idx[key[1]]
        M[u, a] += cnt                                    # Cython: replace itertuples loop

Content of stderr:
In file included from /Users/yujinkim/.cache/ipython/cython/_cython_magic_d39cbee5769cfe17ccdb36f1ac3ec0377793e9d0.c:1250:
In file included from /Users/yujinkim/Desktop/Efficient MRS/efficient_mrs/.venv/lib/python3.11/site-packages/numpy/_core/include/numpy/arrayobject.h:5:
In file included from /Users/yujinkim/Desktop/Efficient MRS/efficient_mrs/.venv/lib/python3.11/site-packages/numpy/_core/include/numpy/ndarrayobject.h:12:
In file included from /Users/yujinkim/Desktop/Efficient MRS/efficient_mrs/.venv/lib/python3.11/site-packages/numpy/_core/include/numpy/ndarraytypes.h:1913:
      |  ^
 7765 |                     CYTHON_FALLTHROUGH;
      |                     ^
/Users/yujinkim/.cache/ipython/cython/_cython_magic_d39cbee5769cfe17ccdb36f1ac3ec0377793e9d0.c:567:34: note: expanded from macro 'CYTHON_FALLTHROUGH'
  567 |       #define CYTHON_FALLTHROUGH __attribute__((fallthrough))
      |                                  ^
 7776 |                     CYTHON_FALLTHRO

In [11]:
# Count number of times each user listened to each artist
# playcounts = interactions.groupby(["user_id","artist_name"]).size().reset_index(name="playcount")
counts = build_playcounts_cy(file_path)              # Cython: replace pandas groupby

# Create mappings: user/artist to index
user_id_to_idx     = {u: i for i, u in enumerate({k[0] for k in counts.keys()})}   # Cython: from counts keys
artist_name_to_idx = {a: i for i, a in enumerate({k[1] for k in counts.keys()})}

# Build dense user-artist matrix (playcounts)
n_users   = len(user_id_to_idx)
n_artists = len(artist_name_to_idx)
user_matrix = np.zeros((n_users, n_artists), dtype=np.float32)

# for row in playcounts.itertuples(index=False):
#     user_matrix[row.user_idx, row.artist_idx] += row.playcount
fill_matrix_cy(counts, user_matrix, user_id_to_idx, artist_name_to_idx)  # Cython: replace itertuples loop

# Normalize rows to unit length for cosine similarity
faiss.normalize_L2(user_matrix)

# Create FAISS index for cosine similarity (inner product after normalization)
index = faiss.IndexFlatIP(n_artists)
index.add(user_matrix)  # Add all user vectors

# Reverse lookup
user_ids = np.array(list(user_id_to_idx.keys()))
artist_ids = np.array(list(artist_name_to_idx.keys()))

def recommend_user(u):
    neighbor_indices = neighbors[u][1:]  # skip self
    neighbor_scores  = distances[u][1:]

    recommended_artists_idx = []
    for neighbor in neighbor_indices:
        neighbor_vector = user_matrix[neighbor]
        recommended_artists_idx.extend(np.where(neighbor_vector > 0)[0])

    known_artists = np.where(user_matrix[u] > 0)[0]
    recommended_artists_idx = list(set(recommended_artists_idx) - set(known_artists))

    if len(recommended_artists_idx) >= K:
        random_indices = random.sample(recommended_artists_idx, K)
    else:
        random_indices = recommended_artists_idx

    return {
        "user": user_ids[u],
        "neighbors": user_ids[neighbor_indices],
        "artist_indices": random_indices,
        "artist_names": [artist_ids[i] for i in random_indices]
    }

def run_recommendations():
    """Run FAISS search and print recommendation results."""
    start_time = time.time()
    global distances, neighbors
    distances, neighbors = index.search(user_matrix, K + 1)

    with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
        results = list(executor.map(recommend_user, range(n_users)))

    for res in results:
        print("Seed user-id:", res["user"])
        print(f"Nearest K={K} neighbors' user-ids:", res["neighbors"])
        print(f"Indices of {len(res['artist_indices'])} recommended artists:", res["artist_indices"])
        print("Recommended artist names:", res["artist_names"])
        print('-' * 80)

    end_time = time.time()
    avg_time = (end_time - start_time) / n_users
    print(f"Average time per user recommendation: {avg_time:.4f} seconds")


In [12]:
if __name__ == "__main__":
    run_recommendations()

Seed user-id: b'user_000375'
Nearest K=5 neighbors' user-ids: [b'user_000959' b'user_000765' b'user_000016' b'user_000975'
 b'user_000905']
Indices of 5 recommended artists: [np.int64(28475), np.int64(148163), np.int64(151593), np.int64(106789), np.int64(1029)]
Recommended artist names: [np.bytes_(b'The Killers'), np.bytes_(b'Drive'), np.bytes_(b'Beyonc\xc3\xa9 & Shakira'), np.bytes_(b'Godsmack'), np.bytes_(b'Rupee')]
--------------------------------------------------------------------------------
Seed user-id: b'user_000942'
Nearest K=5 neighbors' user-ids: [b'user_000195' b'user_000230' b'user_000431' b'user_000912'
 b'user_000637']
Indices of 5 recommended artists: [np.int64(122830), np.int64(134569), np.int64(91377), np.int64(40629), np.int64(8840)]
Recommended artist names: [np.bytes_(b'Joe Strummer & The Mescaleros'), np.bytes_(b'Vertical Horizon'), np.bytes_(b'Brx'), np.bytes_(b'Total War'), np.bytes_(b'Jmfh')]
--------------------------------------------------------------------

After applying cython: 0.0013s