In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
pd.set_option('display.max_colwidth', 9999999)

# View Data

### Training Movie Dataset

In [3]:
df_training_items = pd.read_csv('./data/movies_for_training.csv')
df_training_items.head(5)

Unnamed: 0,movie id,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874,2003,3.961832,1,0,0,0,0,1,0,0,0,0,0,0,0,1
1,8798,2004,3.761364,1,0,0,0,0,1,0,1,0,0,0,0,0,1
2,46970,2006,3.25,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3,48516,2006,4.252336,0,0,0,0,0,1,0,1,0,0,0,0,0,1
4,58559,2008,4.238255,1,0,0,0,0,1,0,1,0,0,0,0,0,0


In [4]:
df_training_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50884 entries, 0 to 50883
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie id     50884 non-null  int64  
 1   year         50884 non-null  int64  
 2   ave rating   50884 non-null  float64
 3   Action       50884 non-null  int64  
 4   Adventure    50884 non-null  int64  
 5   Animation    50884 non-null  int64  
 6   Children     50884 non-null  int64  
 7   Comedy       50884 non-null  int64  
 8   Crime        50884 non-null  int64  
 9   Documentary  50884 non-null  int64  
 10  Drama        50884 non-null  int64  
 11  Fantasy      50884 non-null  int64  
 12  Horror       50884 non-null  int64  
 13  Mystery      50884 non-null  int64  
 14  Romance      50884 non-null  int64  
 15  Sci-Fi       50884 non-null  int64  
 16  Thriller     50884 non-null  int64  
dtypes: float64(1), int64(16)
memory usage: 6.6 MB


In [5]:
df_training_items['movie id'].value_counts()

movie id
5669     1160
8464     1000
8622      740
4306      680
79132     572
         ... 
69784      10
62155      10
32029      10
55830      10
68793      10
Name: count, Length: 847, dtype: int64

**There are 50884 rows to match with the viewer dataset and the rating dataset but only 847 movies**

### Training Viewer Dataset

In [6]:
df_training_users = pd.read_csv('./data/viewers_for_training.csv')
df_training_users.sample(5)

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
30977,382,117,3.91,3.89,4.12,4.12,4.18,3.49,4.04,0.0,4.2,4.23,0.0,4.22,3.89,3.95,3.9
44096,551,82,3.63,3.72,3.61,3.83,4.3,3.69,3.76,4.0,3.55,3.04,4.0,4.0,4.0,3.75,3.68
44798,560,242,3.5,3.53,3.39,3.56,3.39,3.46,3.77,3.4,3.57,3.36,3.71,3.66,3.26,3.47,3.62
38463,477,178,3.94,3.81,3.72,4.05,3.6,3.98,4.04,4.17,4.15,3.73,3.89,4.3,4.2,3.99,3.87
6365,80,103,4.33,4.38,4.23,4.0,4.0,3.75,4.41,0.0,4.39,4.5,4.11,4.5,4.5,4.24,4.34


In [7]:
df_training_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50884 entries, 0 to 50883
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   user id       50884 non-null  int64  
 1   rating count  50884 non-null  int64  
 2   rating ave    50884 non-null  float64
 3   Action        50884 non-null  float64
 4   Adventure     50884 non-null  float64
 5   Animation     50884 non-null  float64
 6   Children      50884 non-null  float64
 7   Comedy        50884 non-null  float64
 8   Crime         50884 non-null  float64
 9   Documentary   50884 non-null  float64
 10  Drama         50884 non-null  float64
 11  Fantasy       50884 non-null  float64
 12  Horror        50884 non-null  float64
 13  Mystery       50884 non-null  float64
 14  Romance       50884 non-null  float64
 15  Sci-Fi        50884 non-null  float64
 16  Thriller      50884 non-null  float64
dtypes: float64(15), int64(2)
memory usage: 6.6 MB


In [8]:
df_training_users['user id'].value_counts()

user id
414    1245
68      967
610     947
249     920
232     861
       ... 
194       1
493       1
3         1
502       1
342       1
Name: count, Length: 397, dtype: int64

**There are 50884 rows to match with the movie dataset and the rating dataset but only 397 viewers**

### Rating Dataset

In [9]:
df_target = pd.read_csv('./data/ratings_for_training.csv', header=None)
df_target.columns = ['ratings']
df_target.sample(5)

Unnamed: 0,ratings
47878,2.5
31224,3.0
46855,5.0
5948,2.5
36659,4.5


*The first row of viewer dataset is the viewer who rated the film from the first row of movie dataset 
a point of ... which is the first row of rating dataset*

In [10]:
df_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50884 entries, 0 to 50883
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ratings  50884 non-null  float64
dtypes: float64(1)
memory usage: 397.7 KB


### Utility Matrix

In [11]:
movie_id = df_training_items['movie id']
user_id = df_training_users['user id']

user_rated_movie = pd.concat([movie_id, user_id, df_target], axis=1)
user_rated_movie.sample(5)

Unnamed: 0,movie id,user id,ratings
26580,5502,331,4.0
3608,48780,62,5.0
43327,54272,534,3.5
31099,89745,382,4.5
1907,5418,28,3.5


In [12]:
utility_matrix = user_rated_movie.pivot_table(index='movie id', columns='user id', values='ratings')
utility_matrix

user id,2,3,4,7,9,10,12,13,15,16,...,598,599,600,601,603,605,606,607,608,610
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4054,,,,,,,,,,,...,,,,,,,3.5,3.0,3.0,2.0
4069,,,,,,,,,,,...,,,2.5,,,,2.5,3.0,,
4148,,,,,,,,4.0,,,...,,2.5,,,2.0,,2.5,,4.5,3.5
4149,,,,,,,,,,,...,,2.5,,,,,,,,
4153,,,,,,,,,,,...,,,,,,,,,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,,,,,,,,,,,...,,4.0,,4.0,,,,,,
176371,,,,,,,,,,,...,,3.5,,4.0,,,,,,
177765,,,,,,,,,,,...,,,,4.5,,,,,,
179819,,,,,,,,,,,...,,,,,,,,,,


# Data Preprocessing - Scaling

In [13]:
scalerItem = StandardScaler()
items_scaled = scalerItem.fit_transform(df_training_items)
items_scaled = items_scaled[:, 3:]

scalerUser = StandardScaler()
users_scaled = scalerUser.fit_transform(df_training_users)
users_scaled = users_scaled[:, 3:]

scalerTarget = MinMaxScaler(feature_range=(-1, 1))
ratings_scaled = scalerTarget.fit_transform(df_target)

In [14]:
items_scaled.shape, users_scaled.shape, ratings_scaled.shape

((50884, 14), (50884, 14), (50884, 1))

In [15]:
item_train, item_test = train_test_split(items_scaled, train_size=0.80, shuffle=True, random_state=1)
user_train, user_test = train_test_split(users_scaled, train_size=0.80, shuffle=True, random_state=1)
y_train, y_test = train_test_split(ratings_scaled, train_size=0.80, shuffle=True, random_state=1)

item_train.shape, user_train.shape, y_train.shape

((40707, 14), (40707, 14), (40707, 1))

# Training

### Building Hidden Layers

In [16]:
n_outputs = 32

user_NN = keras.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(n_outputs)
])

item_NN = keras.Sequential([
    keras.layers.Dense(256, activation='relu'),
    keras.layers.Dense(128, activation='relu'),
    keras.layers.Dense(n_outputs)
])

### Building Input and Output Layers

In [17]:
n_item_features = item_train.shape[1]
n_user_features = user_train.shape[1]

input_item = keras.layers.Input(shape=(n_item_features, ))
output_item = item_NN(input_item)

input_user = keras.layers.Input(shape=(n_user_features, ))
output_user = user_NN(input_user)

# Dot Product
final_output = keras.layers.Dot(axes=1, normalize=True)([output_item, output_user])  
# If normalized (vector length = 1), then dot product = cosine similarity (ranges from -1 to 1)

### Complete Model

In [18]:
model = keras.models.Model(inputs=[input_item, input_user], outputs=final_output)
model.summary()

### Training

In [19]:
tf.random.set_seed(1)

model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.01),
              loss=keras.losses.MeanSquaredError()
             )

model.fit(x=[item_train, user_train], y=y_train, epochs=30)

Epoch 1/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m7s[0m 3ms/step - loss: 0.1440
Epoch 2/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1262
Epoch 3/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1202
Epoch 4/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1146
Epoch 5/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1114
Epoch 6/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1089
Epoch 7/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1069
Epoch 8/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 2ms/step - loss: 0.1050
Epoch 9/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3ms/step - loss: 0.1031
Epoch 10/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x2ad4c2b36d0>

In [20]:
model.evaluate([item_test, user_test], y_test)

[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1006


0.09894740581512451

# Recommend Movies to a New Viewer

### Movie Info Dataset

In [21]:
df_distinct_items = df_training_items.drop_duplicates(keep='first', ignore_index=True)
df_distinct_items.shape, df_distinct_items['movie id'].nunique()

((847, 17), 847)

In [22]:
df_item_info = pd.read_csv('./data/movie_info.csv')
df_item_info.head()

Unnamed: 0,movieId,title,genres
0,4054,Save the Last Dance (2001),Drama|Romance
1,4069,"Wedding Planner, The (2001)",Comedy|Romance
2,4148,Hannibal (2001),Horror|Thriller
3,4149,Saving Silverman (Evil Woman) (2001),Comedy|Romance
4,4153,Down to Earth (2001),Comedy|Fantasy|Romance


In [23]:
df_item_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  847 non-null    int64 
 1   title    847 non-null    object
 2   genres   847 non-null    object
dtypes: int64(1), object(2)
memory usage: 20.0+ KB


In [24]:
df_item_info.movieId.nunique()

847

**847 rows and 847 movies, no duplicates**

### Creating a new user

In [25]:
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 0.0
new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0

new_user = np.array([[new_user_id, new_rating_count, new_rating_ave,
                      new_action, new_adventure, new_animation, new_childrens,
                      new_comedy, new_crime, new_documentary,
                      new_drama, new_fantasy, new_horror, new_mystery,
                      new_romance, new_scifi, new_thriller]])
new_user.shape

(1, 17)

*The new user enjoys movies from the `adventure`, `fantasy` genres. Let's find the top-rated movies for the new user*

In [26]:
# Replicate the new user to match the number of new movies in the dataset.
new_user_replicated = np.tile(new_user, (len(df_distinct_items), 1))  
# Repeat the array 847 times along axis 0 and 1 times along axis 1
new_user_replicated.shape

(847, 17)

### Predicting

In [27]:
new_user_scaled = scalerUser.transform(new_user_replicated)
new_user_scaled = new_user_scaled[:, 3:]

distinct_items_scaled = scalerItem.transform(df_distinct_items)
distinct_items_scaled = distinct_items_scaled[:, 3:]

new_user_scaled.shape, distinct_items_scaled.shape
# warning because new_user_replicated doesn't have feature names



((847, 14), (847, 14))

In [28]:
ratings_predicted = model.predict([distinct_items_scaled, new_user_scaled])
ratings_predicted = scalerTarget.inverse_transform(ratings_predicted)
ratings_predicted.shape  # has the same order as distinct_items_scaled

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


(847, 1)

In [29]:
# must be converted into series to concat
ratings_series = pd.Series(ratings_predicted.reshape(-1), name='rating_predicted')
ratings_df = pd.concat([ratings_series, df_distinct_items], axis=1)

In [30]:
df_merged = pd.merge(df_item_info, ratings_df, left_on='movieId', right_on='movie id', how='inner')
df_merged = df_merged[['rating_predicted', 'ave rating', 'movie id', 'title', 'genres']]
df_merged.sort_values(by='rating_predicted', axis=0, ascending=False, inplace=True)
df_merged.head(10)

Unnamed: 0,rating_predicted,ave rating,movie id,title,genres
367,4.311049,3.816901,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller
717,4.107567,3.8125,98809,"Hobbit: An Unexpected Journey, The (2012)",Adventure|Fantasy
254,4.107567,3.913978,8368,Harry Potter and the Prisoner of Azkaban (2004),Adventure|Fantasy
133,4.107567,3.598039,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
749,4.107567,3.58,106489,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy
604,4.107567,2.875,74789,Alice in Wonderland (2010),Adventure|Fantasy
793,4.107567,3.416667,118696,The Hobbit: The Battle of the Five Armies (2014),Adventure|Fantasy
142,4.107567,4.021277,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
68,4.107567,4.106061,4993,"Lord of the Rings: The Fellowship of the Ring, The (2001)",Adventure|Fantasy
822,4.095731,3.636364,137857,The Jungle Book (2016),Adventure|Drama|Fantasy


In [31]:
df_merged['movie id'].nunique()

847

# Finding Similar Movies

### Computing Output Item Features

In [32]:
n_item_features = 14
# input layer
input_item = tf.keras.layers.Input(shape=(n_item_features,  ))
# use the pre-trained item_NN
output_item = item_NN(input_item)
# normalize the output
output_item = keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(output_item)

item_model = keras.models.Model(input_item, output_item)                                
item_model.summary()




In [33]:
output_item_vectors = item_model.predict(distinct_items_scaled)
output_item_vectors.shape

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 5ms/step


(847, 32)

### Computing Distance between Points / Movies

In [34]:
dot_products = np.dot(output_item_vectors, output_item_vectors.T)
dot_products.shape

(847, 847)

In [35]:
vector_squared_lengths = np.sum(np.square(output_item_vectors), axis=1)
vector_squared_lengths.shape

(847,)

In [36]:
col = vector_squared_lengths[:, np.newaxis]
row = vector_squared_lengths[np.newaxis, :]

col.shape, row.shape

((847, 1), (1, 847))

In [37]:
squared_distance_matrix = col + row - 2 * dot_products
distance_matrix = np.sqrt(np.abs(squared_distance_matrix))
distance_matrix.shape

(847, 847)

In [38]:
distance_matrix = pd.DataFrame(distance_matrix, columns=None).round(2).abs()
distance_matrix.columns = df_distinct_items['movie id']
distance_matrix.index = df_distinct_items['movie id']

distance_matrix

movie id,6874,8798,46970,48516,58559,60756,68157,71535,74458,77455,...,31221,36708,37380,46335,7150,51412,85510,93363,111364,5128
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6874,0.00,0.40,0.73,0.80,0.71,0.85,0.79,0.89,0.95,1.15,...,1.15,0.99,1.07,0.40,0.85,0.71,1.16,0.92,0.92,1.57
8798,0.40,0.00,0.83,0.68,0.74,0.85,0.79,1.02,0.83,1.18,...,1.06,1.03,1.02,0.00,0.85,0.58,1.20,0.91,0.91,1.60
46970,0.73,0.83,0.00,0.91,0.84,0.35,0.71,0.97,1.11,1.11,...,0.97,1.14,0.92,0.83,0.35,0.90,1.14,1.07,1.07,1.52
48516,0.80,0.68,0.91,0.00,0.51,0.83,0.56,0.97,0.79,0.99,...,1.28,1.08,1.03,0.68,0.83,0.84,1.42,1.07,1.07,1.60
58559,0.71,0.74,0.84,0.51,0.00,0.88,0.51,0.87,1.12,0.95,...,1.28,0.87,1.01,0.74,0.88,0.81,1.46,0.92,0.92,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51412,0.71,0.58,0.90,0.84,0.81,0.88,0.80,1.16,1.03,1.23,...,0.94,0.96,0.87,0.58,0.88,0.00,1.18,0.47,0.47,1.57
85510,1.16,1.20,1.14,1.42,1.46,1.23,1.33,1.18,1.35,1.45,...,1.12,1.64,1.05,1.20,1.23,1.18,0.00,1.19,1.19,1.03
93363,0.92,0.91,1.07,1.07,0.92,1.08,0.96,1.13,1.28,1.21,...,1.16,0.95,0.88,0.91,1.08,0.47,1.19,0.00,0.00,1.50
111364,0.92,0.91,1.07,1.07,0.92,1.08,0.96,1.13,1.28,1.21,...,1.16,0.95,0.88,0.91,1.08,0.47,1.19,0.00,0.00,1.50


In [39]:
# Create a mask
mask = np.tril(np.ones_like(distance_matrix, dtype=bool), k=0)
mask

array([[ True, False, False, ..., False, False, False],
       [ True,  True, False, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ...,  True, False, False],
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True,  True]])

In [40]:
distance_matrix[mask] = np.inf
distance_matrix

movie id,6874,8798,46970,48516,58559,60756,68157,71535,74458,77455,...,31221,36708,37380,46335,7150,51412,85510,93363,111364,5128
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6874,inf,0.4,0.73,0.80,0.71,0.85,0.79,0.89,0.95,1.15,...,1.15,0.99,1.07,0.40,0.85,0.71,1.16,0.92,0.92,1.57
8798,inf,inf,0.83,0.68,0.74,0.85,0.79,1.02,0.83,1.18,...,1.06,1.03,1.02,0.00,0.85,0.58,1.20,0.91,0.91,1.60
46970,inf,inf,inf,0.91,0.84,0.35,0.71,0.97,1.11,1.11,...,0.97,1.14,0.92,0.83,0.35,0.90,1.14,1.07,1.07,1.52
48516,inf,inf,inf,inf,0.51,0.83,0.56,0.97,0.79,0.99,...,1.28,1.08,1.03,0.68,0.83,0.84,1.42,1.07,1.07,1.60
58559,inf,inf,inf,inf,inf,0.88,0.51,0.87,1.12,0.95,...,1.28,0.87,1.01,0.74,0.88,0.81,1.46,0.92,0.92,1.60
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51412,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,1.18,0.47,0.47,1.57
85510,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,1.19,1.19,1.03
93363,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,0.00,1.50
111364,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,1.50


### Finding Similar Movies

In [46]:
n_movies = 20
n_similar_movies = 5

similar_item_ids = []
for i in range(n_movies):
    main_id = distance_matrix.index[i]
    
    sorted_distances = distance_matrix.iloc[i].sort_values(ascending=True)
    similar_ids = sorted_distances.head(n_similar_movies).index
    
    similar_item_ids.append((main_id, similar_ids.tolist()))

similar_item_ids

[(6874, [54286, 4369, 5507, 8665, 58295]),
 (8798, [46335, 41997, 59369, 50794, 4901]),
 (46970, [68793, 4161, 102123, 5220, 31878]),
 (48516, [4776, 27831, 115569, 44199, 51540]),
 (58559, [54997, 4344, 69481, 51935, 37727]),
 (60756, [45728, 88785, 4974, 44840, 6287]),
 (68157, [4958, 115210, 5152, 49651, 5010]),
 (71535, [119145, 51255, 77455, 8370, 51662]),
 (74458, [8783, 7371, 104879, 45447, 51086]),
 (77455, [93840, 8370, 51255, 56251, 119145]),
 (79132, [48780, 4878, 68237, 6003, 57504]),
 (80489, [55118, 27831, 115569, 6552, 88129]),
 (80906, [34072, 54881, 6331, 7156, 5669]),
 (89774, [7263, 106100, 4967, 4223, 166643]),
 (91529, [136020, 111781, 63113, 5872, 96079]),
 (91658, [46723, 5400, 48738, 81591, 5608]),
 (99114, [40278, 115210, 5010, 5152, 4448]),
 (106782, [6708, 5991, 5265, 61323, 109374]),
 (109487, [176371, 164179, 112623, 61323, 6708]),
 (112552, [63876, 89492, 166643, 47629, 106100])]

In [47]:
info = df_item_info[df_item_info['movieId'] == 119145]
info

Unnamed: 0,movieId,title,genres
795,119145,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime


In [48]:
info['title'].values

array(['Kingsman: The Secret Service (2015)'], dtype=object)

In [49]:
rows = []

for main_id, similar_ids in similar_item_ids:
    
    main_info = df_item_info[df_item_info['movieId'] == main_id]  # show values in DF format
    title = main_info['title'].values[0] if not main_info.empty else None  # take value in column 'title'
    genres = main_info['genres'].values[0] if not main_info.empty else None

    row = {
        #'movie_ID': main_id,
        'title': title,
        'genres': genres
    }

    for loop, id in enumerate(similar_ids, start=1):
        similar_info = df_item_info[df_item_info['movieId'] == id]
        similar_title = similar_info['title'].values[0] if not similar_info.empty else None
        similar_genres = similar_info['genres'].values[0] if not similar_info.empty else None
        
        #row[f'similar_ID_{loop}'] = id
        row[f'similar_title_{loop}'] = similar_title
        row[f'similar_genres_{loop}'] = similar_genres

    rows.append(row)

In [50]:
df_similar_items = pd.DataFrame(rows)
df_similar_items.head(20)

Unnamed: 0,title,genres,similar_title_1,similar_genres_1,similar_title_2,similar_genres_2,similar_title_3,similar_genres_3,similar_title_4,similar_genres_4,similar_title_5,similar_genres_5
0,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,"Bourne Ultimatum, The (2007)",Action|Crime|Thriller,"Fast and the Furious, The (2001)",Action|Crime|Thriller,xXx (2002),Action|Crime|Thriller,"Bourne Supremacy, The (2004)",Action|Crime|Thriller,"Bank Job, The (2008)",Action|Crime|Thriller
1,Collateral (2004),Action|Crime|Drama|Thriller,"Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)",Action|Crime|Drama|Thriller,Munich (2005),Action|Crime|Drama|Thriller,Taken (2008),Action|Crime|Drama|Thriller,Smokin' Aces (2006),Action|Crime|Drama|Thriller,Spy Game (2001),Action|Crime|Drama|Thriller
2,Talladega Nights: The Ballad of Ricky Bobby (2006),Action|Comedy,Night at the Museum: Battle of the Smithsonian (2009),Action|Comedy,"Mexican, The (2001)",Action|Comedy,This Is the End (2013),Action|Comedy,Showtime (2002),Action|Comedy,Kung Fu Hustle (Gong fu) (2004),Action|Comedy
3,"Departed, The (2006)",Crime|Drama|Thriller,Training Day (2001),Crime|Drama|Thriller,Layer Cake (2004),Crime|Drama|Thriller,Nightcrawler (2014),Crime|Drama|Thriller,Inside Man (2006),Crime|Drama|Thriller,Zodiac (2007),Crime|Drama|Thriller
4,"Dark Knight, The (2008)",Action|Crime|Drama,3:10 to Yuma (2007),Action|Crime|Drama,Swordfish (2001),Action|Crime|Drama,"Hurt Locker, The (2008)",Action|Drama|Thriller,Shooter (2007),Action|Drama|Thriller,Flightplan (2005),Action|Drama|Thriller
5,Step Brothers (2008),Comedy,Clerks II (2006),Comedy,"Change-Up, The (2011)",Comedy,Not Another Teen Movie (2001),Comedy,"Benchwarmers, The (2006)",Comedy,Anger Management (2003),Comedy
6,Inglourious Basterds (2009),Action|Drama,Behind Enemy Lines (2001),Action|Drama,Fury (2014),Action|Drama,We Were Soldiers (2002),Action|Drama,Rocky Balboa (2006),Action|Drama,Black Hawk Down (2001),Action|Drama
7,Zombieland (2009),Action|Comedy|Horror,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,Hot Fuzz (2007),Action|Comedy|Crime|Mystery,Exit Through the Gift Shop (2010),Comedy|Documentary,"Blind Swordsman: Zatoichi, The (Zatôichi) (2003)",Action|Comedy|Crime|Drama,300 (2007),Action|Fantasy
8,Shutter Island (2010),Drama|Mystery|Thriller,"Village, The (2004)",Drama|Mystery|Thriller,Dogville (2003),Drama|Mystery|Thriller,Prisoners (2013),Drama|Mystery|Thriller,"Da Vinci Code, The (2006)",Drama|Mystery|Thriller,"Number 23, The (2007)",Drama|Mystery|Thriller
9,Exit Through the Gift Shop (2010),Comedy|Documentary,"Cabin in the Woods, The (2012)",Comedy|Horror|Sci-Fi|Thriller,"Blind Swordsman: Zatoichi, The (Zatôichi) (2003)",Action|Comedy|Crime|Drama,Hot Fuzz (2007),Action|Comedy|Crime|Mystery,Futurama: Bender's Big Score (2007),Animation|Comedy|Sci-Fi,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime
