In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras

The data set is derived from the [MovieLens "ml-latest-small"](https://grouplens.org/datasets/movielens/latest/) dataset.   
[F. Maxwell Harper and Joseph A. Konstan. 2015. The MovieLens Datasets: History and Context. ACM Transactions on Interactive Intelligent Systems (TiiS) 5, 4: 19:1–19:19. <https://doi.org/10.1145/2827872>]

The original dataset has  9000 movies rated by 600 users. The dataset has been reduced in size to focus on movies from the years since 2000. This dataset consists of ratings on a scale of 0.5 to 5 in 0.5 step increments. The reduced dataset has $n_u = 443$ users, and $n_m= 4778$ movies. 

Below, you will load the movie dataset into the variables $Y$ and $R$.

The matrix $Y$ (a  $n_m \times n_u$ matrix) stores the ratings $y^{(i,j)}$. The matrix $R$ is an binary-valued indicator matrix, where $R(i,j) = 1$ if user $j$ gave a rating to movie $i$, and $R(i,j)=0$ otherwise. 

In [2]:
user_rated_movie = pd.read_csv('./data/small_movies_Y.csv', header=None)
user_rated_movie

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,433,434,435,436,437,438,439,440,441,442
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
1,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,2.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,4.0,3.0,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4773,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4774,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4775,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4776,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [3]:
rated_or_not = pd.read_csv('./data/small_movies_R.csv', header=None)
rated_or_not

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,433,434,435,436,437,438,439,440,441,442
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,1,0,0,0,0,0,0,0,0,0,...,0,0,0,0,1,0,0,1,1,1
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4773,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4774,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4775,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4776,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [4]:
n_movies, n_users = user_rated_movie.shape
n_features = 10  # randomly

# Create the arrays with the specified shapes and conditions
X = np.random.rand(n_movies, n_features)  # Shape (4778, 10), values in range [0, 1]
W = np.random.rand(n_users, n_features)   # Shape (443, 10), values in range [0, 1]
B = np.zeros(n_users)                     # Shape (443,), all values are 0

# Display the shapes and first few values
print("Shape of X:", X.shape)
print("Shape of W:", W.shape)
print("Shape of B:", B.shape)

Shape of X: (4778, 10)
Shape of W: (443, 10)
Shape of B: (443,)


In [5]:
Y = user_rated_movie.values
R = rated_or_not.values

Y.shape, R.shape

((4778, 443), (4778, 443))

# Collaborative Filtering Cost Function

$$J({\mathbf{x}^{(0)},...,\mathbf{x}^{(n_m-1)},\mathbf{w}^{(0)},b^{(0)},...,\mathbf{w}^{(n_u-1)},b^{(n_u-1)}})= \left[ \frac{1}{2}\sum_{(i,j):r(i,j)=1}(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2 \right]
+ \underbrace{\left[
\frac{\lambda}{2}
\sum_{j=0}^{n_u-1}\sum_{k=0}^{n-1}(\mathbf{w}^{(j)}_k)^2
+ \frac{\lambda}{2}\sum_{i=0}^{n_m-1}\sum_{k=0}^{n-1}(\mathbf{x}_k^{(i)})^2
\right]}_{regularization}$$

$$
= \left[ \frac{1}{2}\sum_{j=0}^{n_u-1} \sum_{i=0}^{n_m-1}r(i,j)*(\mathbf{w}^{(j)} \cdot \mathbf{x}^{(i)} + b^{(j)} - y^{(i,j)})^2 \right]
+\text{regularization}
$$

In [6]:
def cost_function(X, W, B, Y, R, lambda_):
    """
    Returns the cost for the content-based filtering
    Args:
      X (ndarray (num_movies, num_features)): matrix of item features
      W (ndarray (num_users, num_features)) : matrix of user parameters
      B (ndarray (num_users, )             : vector of user parameters
      Y (ndarray (num_movies, num_users)    : matrix of user ratings of movies
      R (ndarray (num_movies, num_users)    : matrix, where R(i, j) = 1 if the j-th user rated the i-th movie
      lambda_ (float): regularization parameter
    Returns:
      J (float): Cost
    """
    n_items, n_users = Y.shape
    
    J = 0
    
    for j in range(n_users):
        w = W[j, :]  # Pick parameters w of the user j
        b = B[j]     # Pick parameter b of the user j
        
        for i in range(n_items):
            x = X[i, :]  # Pick parameters of the movie i
            
            r = R[i, j]  # Whether user j rated movie i or not
            y = Y[i, j]  # Pick the rating of user j gave on the movie 
            
            J += (1/2) * np.square(r * (np.dot(w,x) + b - y))  # error
            
    J += (lambda_/2) * (np.sum(np.square(W)) + np.sum(np.square(X)))  # regulalization

    return J

In [7]:
def vectorized_cost_function(X, W, B, Y, R, lambda_):
    """
    Vectorization for speed. Uses tensorflow operations to be compatible with custom training loops.
    """
    # (n_movies, n_features) @ (n_features, n_users) = (n_movies, n_users)
    # (n_movies, n_users) + (1, n_users) - (n_movies, n_users) = (n_movies, n_users)
    # (n_movies, n_users) * (n_movies, n_users) = (n_movies, n_users)
    error = R * (tf.linalg.matmul(X, tf.transpose(W)) + B - Y)  # B must have the shape of (1, n_users)
    mean_squared_error = (1/2) * tf.reduce_sum(error**2)
                      
    regularization = (lambda_/2) * (tf.reduce_sum(X**2) + tf.reduce_sum(W**2))
                  
    J = mean_squared_error + regularization
    return J

In [8]:
import time

# Measure the time for the non-vectorized cost function
start_time = time.time()
J_non_vectorized = cost_function(X, W, B, Y, R, 1.5)
end_time = time.time()
print("Non-vectorized cost function time:", end_time - start_time, "seconds")

# Measure the time for the vectorized cost function
start_time = time.time()
B_reshaped = B.reshape(1, -1)
J_vectorized = vectorized_cost_function(X, W, B, Y, R, 1.5)
end_time = time.time()
print("Vectorized cost function time:", end_time - start_time, "seconds")

print()
print(J_non_vectorized)
print(J_vectorized)

Non-vectorized cost function time: 15.468174695968628 seconds
Vectorized cost function time: 0.14886045455932617 seconds

62756.91956691217
tf.Tensor(62756.91956691242, shape=(), dtype=float64)


# Movie Dataset

In [9]:
df_movie = pd.read_csv('./data/small_movie_list.csv', index_col=0)
df_movie

Unnamed: 0,mean rating,number of ratings,title
0,3.400000,5,"Yards, The (2000)"
1,3.250000,6,Next Friday (2000)
2,2.000000,4,Supernova (2000)
3,2.000000,4,Down to You (2000)
4,2.672414,29,Scream 3 (2000)
...,...,...,...
4773,3.500000,1,Jon Stewart Has Left the Building (2015)
4774,4.000000,1,Black Butler: Book of the Atlantic (2017)
4775,3.500000,1,No Game No Life: Zero (2017)
4776,3.500000,1,Flint (2017)


In [10]:
list_title = df_movie.title.to_list()
len(list_title)

4778

In [11]:
my_ratings = np.zeros(len(list_title))
my_ratings.shape

(4778,)

In [12]:
my_ratings[2700] = 5   # Toy Story 3 (2010)
my_ratings[2609] = 2   # Persuasion (2007)
my_ratings[929]  = 5   # Lord of the Rings: The Return of the King, The
my_ratings[246]  = 5   # Shrek (2001)
my_ratings[2716] = 3   # Inception
my_ratings[1150] = 5   # Incredibles, The (2004)
my_ratings[382]  = 2   # Amelie (Fabuleux destin d'Amélie Poulain, Le)
my_ratings[366]  = 5   # Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
my_ratings[622]  = 5   # Harry Potter and the Chamber of Secrets (2002)
my_ratings[988]  = 3   # Eternal Sunshine of the Spotless Mind (2004)
my_ratings[2925] = 1   # Louis Theroux: Law & Disorder (2008)
my_ratings[2937] = 1   # Nothing to Declare (Rien à déclarer)
my_ratings[793]  = 5   # Pirates of the Caribbean: The Curse of the Black Pearl (2003)

In [13]:
count = 0
for i in range(len(my_ratings)):
    if my_ratings[i] > 0 :
        count += 1
        print('Rated {} for {}'.format(my_ratings[i], list_title[i]))

print()
print('Number of Movies Rated: {}'.format(count))

Rated 5.0 for Shrek (2001)
Rated 5.0 for Harry Potter and the Sorcerer's Stone (a.k.a. Harry Potter and the Philosopher's Stone) (2001)
Rated 2.0 for Amelie (Fabuleux destin d'Amélie Poulain, Le) (2001)
Rated 5.0 for Harry Potter and the Chamber of Secrets (2002)
Rated 5.0 for Pirates of the Caribbean: The Curse of the Black Pearl (2003)
Rated 5.0 for Lord of the Rings: The Return of the King, The (2003)
Rated 3.0 for Eternal Sunshine of the Spotless Mind (2004)
Rated 5.0 for Incredibles, The (2004)
Rated 2.0 for Persuasion (2007)
Rated 5.0 for Toy Story 3 (2010)
Rated 3.0 for Inception (2010)
Rated 1.0 for Louis Theroux: Law & Disorder (2008)
Rated 1.0 for Nothing to Declare (Rien à déclarer) (2010)

Number of Movies Rated: 13


In [14]:
Y.shape, R.shape

((4778, 443), (4778, 443))

In [15]:
# np.c_ to concatenate horizontally
Y_new = np.c_[my_ratings, Y]  # Add new user ratings to Y, my_ratings is the first column
R_new = np.c_[(my_ratings != 0).astype(int), R]  # Add new user indicator matrix to R
Y_new.shape, R_new.shape

((4778, 444), (4778, 444))

In [16]:
(my_ratings != 0)  # create a boolean array from the original my_ratings array
# True if != 0, False if == 0

array([False, False, False, ..., False, False, False])

In [17]:
np.unique((my_ratings != 0).astype(int), return_counts=True)  # convert boo

(array([0, 1]), array([4765,   13], dtype=int64))

# Normalized Ratings

In [18]:
total_rating_each_movie = np.sum(R_new * Y_new, axis=1) # axis=1 means the sum is performed along each row
total_rating_each_movie.shape

(4778,)

In [19]:
n_viewers_each_movie = np.sum(R_new, axis=1) + 1e-12  # to avoid divided by 0
n_viewers_each_movie.shape

(4778,)

In [20]:
mean_rating_each_movie = total_rating_each_movie / n_viewers_each_movie
mean_rating_each_movie = mean_rating_each_movie.reshape(-1, 1)
mean_rating_each_movie.shape

(4778, 1)

In [21]:
ratings_nomalized = Y_new - (R_new * mean_rating_each_movie)
ratings_nomalized.shape

(4778, 444)

In [22]:
Y_mean = mean_rating_each_movie
Y_norm = ratings_nomalized

# Training

### Initializing Parameters

In [23]:
n_movies, n_users = Y_new.shape
n_features = 100

In [24]:
X = tf.Variable(
    tf.random.normal(
        (n_movies, n_features), 
        mean=0, stddev=1.0, 
        dtype=tf.float64
    ), 
    name='X'
)

In [25]:
W = tf.Variable(
    tf.random.normal(
        (n_users, n_features), 
        mean=0, stddev=1.0, 
        dtype=tf.float64
    ), 
    name='W'
)

In [26]:
B = tf.Variable(
    tf.random.normal(
        (1, n_users), 
        mean=0, stddev=1.0, 
        dtype=tf.float64
    ), 
    name='B'
)

In [27]:
X.shape, W.shape, B.shape

(TensorShape([4778, 100]), TensorShape([444, 100]), TensorShape([1, 444]))

### Updating Parameters

In [28]:
lambda_ = 1
optimizer = keras.optimizers.Adam(learning_rate=0.1)
epochs = 200

for i in range(0, epochs+1):
    
    # tf.GradientTape() records operations on tensors to compute gradients for backpropagation.
    with tf.GradientTape() as tape:
        
        # This function computes the loss (cost) based on the current parameters (X, W, B) and the input data (Y_norm, R_new)
        cost_value = vectorized_cost_function(X, W, B, Y_norm, R_new, lambda_)
        
    # This calculates the gradient of the cost_value with respect to the parameters (X, W, B)
    gradient = tape.gradient(cost_value, [X, W, B])
    
    # The gradients are applied to update the parameters using the chosen optimizer 
    optimizer.apply_gradients(zip(gradient, [X, W, B]))

    if i % 20 == 0:
        print('Loss at epoch {}: {:.02f}'.format(i, cost_value))

Loss at epoch 0: 2297025.03
Loss at epoch 20: 135393.78
Loss at epoch 40: 51187.14
Loss at epoch 60: 24189.26
Loss at epoch 80: 13373.16
Loss at epoch 100: 8322.91
Loss at epoch 120: 5697.33
Loss at epoch 140: 4233.76
Loss at epoch 160: 3377.40
Loss at epoch 180: 2857.33
Loss at epoch 200: 2531.19


In [29]:
len(gradient), gradient[0].shape, gradient[1].shape, gradient[2].shape

(3, TensorShape([4778, 100]), TensorShape([444, 100]), TensorShape([1, 444]))

# Prediction

In [30]:
X_np = X.numpy()
W_np = W.numpy()
B_np = B.numpy()

In [31]:
ratings_predicted = X_np @ W_np.T + B_np
ratings_predicted = ratings_predicted + Y_mean
ratings_predicted.shape

(4778, 444)

In [32]:
my_ratings_predicted = ratings_predicted[:, 0]
my_ratings_predicted.shape

(4778,)

In [33]:
my_ratings.shape

(4778,)

In [34]:
sr_my_ratings = pd.Series(my_ratings)
sr_my_ratings_predicted = pd.Series(my_ratings_predicted)

In [35]:
df_concated = pd.concat([sr_my_ratings_predicted, sr_my_ratings, df_movie], axis=1)
df_concated = df_concated.rename(columns={df_concated.columns[0]: 'predicted_rating',
                                          df_concated.columns[1]: 'true_rating'
                                         })
df_concated

Unnamed: 0,predicted_rating,true_rating,mean rating,number of ratings,title
0,2.758987,0.0,3.400000,5,"Yards, The (2000)"
1,2.543402,0.0,3.250000,6,Next Friday (2000)
2,1.426772,0.0,2.000000,4,Supernova (2000)
3,1.326955,0.0,2.000000,4,Down to You (2000)
4,2.484551,0.0,2.672414,29,Scream 3 (2000)
...,...,...,...,...,...
4773,2.943019,0.0,3.500000,1,Jon Stewart Has Left the Building (2015)
4774,3.443037,0.0,4.000000,1,Black Butler: Book of the Atlantic (2017)
4775,2.938096,0.0,3.500000,1,No Game No Life: Zero (2017)
4776,2.942987,0.0,3.500000,1,Flint (2017)


In [36]:
comparision = df_concated[df_concated.true_rating != 0].round(1)
comparision

Unnamed: 0,predicted_rating,true_rating,mean rating,number of ratings,title
246,4.9,5.0,3.9,170,Shrek (2001)
366,4.8,5.0,3.8,107,Harry Potter and the Sorcerer's Stone (a.k.a. ...
382,2.1,2.0,4.2,120,"Amelie (Fabuleux destin d'Amélie Poulain, Le) ..."
622,4.8,5.0,3.6,102,Harry Potter and the Chamber of Secrets (2002)
793,4.8,5.0,3.8,149,Pirates of the Caribbean: The Curse of the Bla...
929,4.9,5.0,4.1,185,"Lord of the Rings: The Return of the King, The..."
988,3.0,3.0,4.2,131,Eternal Sunshine of the Spotless Mind (2004)
1150,5.0,5.0,3.8,125,"Incredibles, The (2004)"
2609,2.1,2.0,3.3,3,Persuasion (2007)
2700,4.8,5.0,4.1,55,Toy Story 3 (2010)


# Recommendation

In [37]:
not_rated_movies = df_concated[df_concated.true_rating == 0]
recommendations = not_rated_movies.sort_values(by=['predicted_rating', 'mean rating'], ascending=False)
recommendations.head(10)

Unnamed: 0,predicted_rating,true_rating,mean rating,number of ratings,title
2573,4.913764,0.0,3.603093,97,Avatar (2009)
1051,4.556084,0.0,3.913978,93,Harry Potter and the Prisoner of Azkaban (2004)
3703,4.520423,0.0,5.0,1,Colourful (Karafuru) (2010)
3870,4.500723,0.0,5.0,1,Into the Forest of Fireflies' Light (2011)
3625,4.47535,0.0,5.0,1,Particle Fever (2013)
3742,4.472653,0.0,5.0,1,"One I Love, The (2014)"
3754,4.472365,0.0,5.0,1,Laggies (2014)
3924,4.47226,0.0,5.0,1,Delirium (2014)
49,4.466293,0.0,5.0,1,Bossa Nova (2000)
3062,4.462188,0.0,5.0,1,Into the Abyss (2011)


# Finding Similar Movies

### Computing Distance between Movies

In [38]:
X_np.shape

(4778, 100)

In [39]:
vector_squared_lengths = np.sum(np.square(X_np), axis=1)
vector_squared_lengths.shape

(4778,)

In [40]:
dot_products = np.dot(X_np, X_np.T)
dot_products.shape

(4778, 4778)

In [41]:
col = vector_squared_lengths[:, np.newaxis]
row = vector_squared_lengths[np.newaxis, :]

col.shape, row.shape

((4778, 1), (1, 4778))

In [42]:
squared_distance_matrix = col + row - 2 * dot_products
squared_distance_matrix.shape

(4778, 4778)

In [43]:
distance_matrix = np.sqrt(np.abs(squared_distance_matrix))
distance_matrix.shape

(4778, 4778)

In [44]:
distance_matrix = pd.DataFrame(distance_matrix, columns=None).round(2)
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4768,4769,4770,4771,4772,4773,4774,4775,4776,4777
0,0.00,0.93,0.99,1.34,1.74,2.62,0.75,1.84,1.05,1.12,...,0.70,0.71,0.70,0.70,0.70,0.70,0.70,0.72,0.70,0.70
1,0.93,0.00,0.85,1.25,1.71,2.47,0.64,1.89,1.00,1.14,...,0.63,0.63,0.63,0.63,0.63,0.63,0.63,0.64,0.63,0.63
2,0.99,0.85,0.00,1.44,1.65,2.61,0.62,1.92,1.01,1.10,...,0.63,0.63,0.63,0.63,0.63,0.63,0.63,0.64,0.63,0.63
3,1.34,1.25,1.44,0.00,2.12,2.68,1.15,2.10,1.17,1.49,...,1.14,1.15,1.14,1.14,1.14,1.14,1.14,1.16,1.14,1.14
4,1.74,1.71,1.65,2.12,0.00,2.79,1.57,2.09,1.88,1.83,...,1.59,1.59,1.59,1.59,1.59,1.59,1.59,1.58,1.59,1.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4773,0.70,0.63,0.63,1.14,1.59,2.45,0.09,1.74,0.78,0.90,...,0.00,0.04,0.00,0.00,0.00,0.00,0.00,0.10,0.00,0.00
4774,0.70,0.63,0.63,1.14,1.59,2.45,0.09,1.74,0.78,0.90,...,0.00,0.04,0.00,0.00,0.00,0.00,0.00,0.10,0.00,0.00
4775,0.72,0.64,0.64,1.16,1.58,2.45,0.12,1.73,0.79,0.91,...,0.10,0.09,0.10,0.10,0.10,0.10,0.10,0.00,0.10,0.10
4776,0.70,0.63,0.63,1.14,1.59,2.45,0.09,1.74,0.78,0.90,...,0.00,0.04,0.00,0.00,0.00,0.00,0.00,0.10,0.00,0.00


In [45]:
mask = np.tril(np.ones_like(distance_matrix, dtype=bool), k=0)
mask

array([[ True, False, False, ..., False, False, False],
       [ True,  True, False, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ...,  True, False, False],
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True,  True]])

In [46]:
distance_matrix[mask] = np.inf
distance_matrix = pd.DataFrame(distance_matrix, columns=None).round(2)
distance_matrix

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,4768,4769,4770,4771,4772,4773,4774,4775,4776,4777
0,inf,0.93,0.99,1.34,1.74,2.62,0.75,1.84,1.05,1.12,...,0.70,0.71,0.70,0.70,0.70,0.70,0.70,0.72,0.70,0.70
1,inf,inf,0.85,1.25,1.71,2.47,0.64,1.89,1.00,1.14,...,0.63,0.63,0.63,0.63,0.63,0.63,0.63,0.64,0.63,0.63
2,inf,inf,inf,1.44,1.65,2.61,0.62,1.92,1.01,1.10,...,0.63,0.63,0.63,0.63,0.63,0.63,0.63,0.64,0.63,0.63
3,inf,inf,inf,inf,2.12,2.68,1.15,2.10,1.17,1.49,...,1.14,1.15,1.14,1.14,1.14,1.14,1.14,1.16,1.14,1.14
4,inf,inf,inf,inf,inf,2.79,1.57,2.09,1.88,1.83,...,1.59,1.59,1.59,1.59,1.59,1.59,1.59,1.58,1.59,1.59
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4773,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,0.00,0.10,0.00,0.00
4774,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,0.10,0.00,0.00
4775,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,0.10,0.10
4776,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,0.00


### Finding Similar Items

In [47]:
n_movies = 10
n_similar_movies = 3

similar_item_ids = []
for i in range(n_movies):
    main_id = distance_matrix.index[i]
    
    sorted_distances = distance_matrix.iloc[i].sort_values(ascending=True)
    similar_ids = sorted_distances.head(n_similar_movies).index
    
    similar_item_ids.append((main_id, similar_ids.tolist()))

similar_item_ids

[(0, [308, 268, 954]),
 (1, [20, 1925, 3549]),
 (2, [272, 701, 563]),
 (3, [18, 102, 109]),
 (4, [11, 955, 169]),
 (5, [4075, 3427, 4282]),
 (6, [4037, 3101, 3112]),
 (7, [471, 196, 94]),
 (8, [206, 102, 19]),
 (9, [4478, 2064, 2987])]

In [48]:
df_movie.iloc[308]

mean rating                            2.6
number of ratings                        5
title                Ghosts of Mars (2001)
Name: 308, dtype: object

In [49]:
df_movie.iloc[308].title

'Ghosts of Mars (2001)'

In [50]:
rows = []

for main_id, similar_ids in similar_item_ids:
    
    title = df_movie.iloc[main_id].title

    row = {
        #'movie_ID': main_id,
        'title': title,
    }

    for loop, id in enumerate(similar_ids, start=1):
        similar_title = df_movie.iloc[id].title
        
        #row[f'similar_ID_{loop}'] = id
        row[f'similar_title_{loop}'] = similar_title

    rows.append(row)

In [51]:
df_similar_items = pd.DataFrame(rows)
df_similar_items.head(10)

Unnamed: 0,title,similar_title_1,similar_title_2,similar_title_3
0,"Yards, The (2000)",Ghosts of Mars (2001),"Closet, The (Placard, Le) (2001)","Perfect Score, The (2004)"
1,Next Friday (2000),What Planet Are You From? (2000),Death at a Funeral (2007),Delivery Man (2013)
2,Supernova (2000),Kiss of the Dragon (2001),Cradle 2 the Grave (2003),City by the Sea (2002)
3,Down to You (2000),Drowning Mona (2000),"Crew, The (2000)",Bait (2000)
4,Scream 3 (2000),Hanging Up (2000),You Got Served (2004),State and Main (2000)
5,"Boondock Saints, The (2000)",Trainwreck (2015),"Fast & Furious 6 (Fast and the Furious 6, The)...",Doctor Who: Planet of the Dead (2009)
6,Gun Shy (2000),Sword of Vengeance (2014),Albatross (2011),Seeking Justice (2011)
7,"Beach, The (2000)",Human Nature (2001),Sweet November (2001),Cecil B. DeMented (2000)
8,Snow Day (2000),Get Over It (2001),"Crew, The (2000)","Next Best Thing, The (2000)"
9,"Tigger Movie, The (2000)",Your Name. (2016),Like Stars on Earth (Taare Zameen Par) (2007),30 Minutes or Less (2011)
