In [14]:
from scipy.sparse import dok_matrix
from sklearn.metrics.pairwise import pairwise_distances
import numpy as np
import pandas as pd

In [7]:
# Set random seed (for reproducibility)
np.random.seed(1000)

In [8]:
# Create a dummy user-item dataset
nb_users = 1000
nb_products = 2500
max_rating = 5
max_rated_products = 500

In [9]:
X_preferences = dok_matrix((nb_users, nb_products), dtype=np.uint8)

In [12]:
for i in range(nb_users):
    # Extract n random products
    n_products = np.random.randint(0, max_rated_products+1)
    products = np.random.randint(0, nb_products, size=n_products)
    
    # Populate preference sparse matrix
    for p in products:
        X_preferences[i, p] = np.random.randint(0, max_rating+1)

In [13]:
X_preferences

<1000x2500 sparse matrix of type '<class 'numpy.uint8'>'
	with 191178 stored elements in Dictionary Of Keys format>

In [16]:
# Compute pairwise distances
distance_matrix = pairwise_distances(X_preferences, metric='euclidean')

In [17]:
distance_matrix

array([[ 0.        , 64.26507605, 76.59634456, ..., 83.13242448,
        62.57795139, 66.9925369 ],
       [64.26507605,  0.        , 56.73623181, ..., 65.67343451,
        27.85677655, 37.17526059],
       [76.59634456, 56.73623181,  0.        , ..., 77.5370879 ,
        55.52476925, 59.9082632 ],
       ...,
       [83.13242448, 65.67343451, 77.5370879 , ...,  0.        ,
        64.56779383, 67.81592733],
       [62.57795139, 27.85677655, 55.52476925, ..., 64.56779383,
         0.        , 34.58323293],
       [66.9925369 , 37.17526059, 59.9082632 , ..., 67.81592733,
        34.58323293,  0.        ]])

In [18]:
# Sort distances
sorted_distances = np.argsort(distance_matrix, axis=1)

In [19]:
sorted_distances

array([[  0, 381, 739, ..., 268,  95, 924],
       [  1, 381, 614, ..., 924, 191, 368],
       [  2, 313, 381, ..., 409, 907, 191],
       ...,
       [997, 313, 446, ..., 155, 924, 588],
       [998, 616, 614, ..., 924, 368, 907],
       [999, 381, 614, ..., 588, 924, 368]])

In [20]:
test_user=500

# Take the top-10 simular users
for d in sorted_distances[test_user][::-1][0:10]:
    print(d)

368
907
268
924
588
191
688
849
95
558
