In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split

In [2]:
# Never truncate long text cells (e.g. movie titles) when rendering DataFrames.
# None is pandas' documented "no limit" value and replaces the arbitrary 9999999.
pd.set_option('display.max_colwidth', None)

# View Data

### Training Movie Dataset

In [3]:
# Load the movie-side training table from CSV and preview its first five rows.
df_training_items = pd.read_csv('./data/movies_for_training.csv')
df_training_items.head(5)

Unnamed: 0,movie id,year,ave rating,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
0,6874,2003,3.961832,1,0,0,0,0,1,0,0,0,0,0,0,0,1
1,8798,2004,3.761364,1,0,0,0,0,1,0,1,0,0,0,0,0,1
2,46970,2006,3.25,1,0,0,0,1,0,0,0,0,0,0,0,0,0
3,48516,2006,4.252336,0,0,0,0,0,1,0,1,0,0,0,0,0,1
4,58559,2008,4.238255,1,0,0,0,0,1,0,1,0,0,0,0,0,0


In [4]:
# Column names, dtypes and non-null counts of the movie table.
df_training_items.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50884 entries, 0 to 50883
Data columns (total 17 columns):
 #   Column       Non-Null Count  Dtype  
---  ------       --------------  -----  
 0   movie id     50884 non-null  int64  
 1   year         50884 non-null  int64  
 2   ave rating   50884 non-null  float64
 3   Action       50884 non-null  int64  
 4   Adventure    50884 non-null  int64  
 5   Animation    50884 non-null  int64  
 6   Children     50884 non-null  int64  
 7   Comedy       50884 non-null  int64  
 8   Crime        50884 non-null  int64  
 9   Documentary  50884 non-null  int64  
 10  Drama        50884 non-null  int64  
 11  Fantasy      50884 non-null  int64  
 12  Horror       50884 non-null  int64  
 13  Mystery      50884 non-null  int64  
 14  Romance      50884 non-null  int64  
 15  Sci-Fi       50884 non-null  int64  
 16  Thriller     50884 non-null  int64  
dtypes: float64(1), int64(16)
memory usage: 6.6 MB


In [5]:
# How many rows each movie id contributes (the same movie appears once per rating).
df_training_items['movie id'].value_counts()

movie id
5669     1160
8464     1000
8622      740
4306      680
79132     572
         ... 
69784      10
62155      10
32029      10
55830      10
68793      10
Name: count, Length: 847, dtype: int64

**The table has 50884 rows, one per rating, aligned row by row with the viewer and rating datasets, but it contains only 847 distinct movies.**

### Training Viewer Dataset

In [6]:
# Load the viewer-side training table from CSV and show five randomly sampled rows.
df_training_users = pd.read_csv('./data/viewers_for_training.csv')
df_training_users.sample(5)

Unnamed: 0,user id,rating count,rating ave,Action,Adventure,Animation,Children,Comedy,Crime,Documentary,Drama,Fantasy,Horror,Mystery,Romance,Sci-Fi,Thriller
42943,526,24,4.12,4.12,4.14,3.83,3.83,4.22,4.17,0.0,4.13,3.75,0.0,3.75,3.86,4.17,3.75
29213,365,157,2.78,2.98,2.8,2.36,2.53,3.04,2.91,1.25,2.5,2.91,2.56,2.25,2.34,2.77,2.52
1128,20,65,3.22,2.67,2.96,4.0,3.22,3.23,3.14,4.0,3.72,3.42,3.2,3.21,3.95,3.23,2.78
42148,522,86,3.89,4.07,3.92,4.0,3.5,3.6,4.17,4.0,3.82,3.45,4.0,4.32,3.31,4.19,4.12
10127,125,156,3.7,3.57,3.56,3.79,3.79,3.71,3.71,0.0,3.84,3.52,3.69,4.08,3.6,3.64,3.73


In [7]:
# Column names, dtypes and non-null counts of the viewer table.
df_training_users.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50884 entries, 0 to 50883
Data columns (total 17 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   user id       50884 non-null  int64  
 1   rating count  50884 non-null  int64  
 2   rating ave    50884 non-null  float64
 3   Action        50884 non-null  float64
 4   Adventure     50884 non-null  float64
 5   Animation     50884 non-null  float64
 6   Children      50884 non-null  float64
 7   Comedy        50884 non-null  float64
 8   Crime         50884 non-null  float64
 9   Documentary   50884 non-null  float64
 10  Drama         50884 non-null  float64
 11  Fantasy       50884 non-null  float64
 12  Horror        50884 non-null  float64
 13  Mystery       50884 non-null  float64
 14  Romance       50884 non-null  float64
 15  Sci-Fi        50884 non-null  float64
 16  Thriller      50884 non-null  float64
dtypes: float64(15), int64(2)
memory usage: 6.6 MB


In [8]:
# How many rows each user id contributes (one row per rating the user gave).
df_training_users['user id'].value_counts()

user id
414    1245
68      967
610     947
249     920
232     861
       ... 
194       1
493       1
3         1
502       1
342       1
Name: count, Length: 397, dtype: int64

**The table has 50884 rows, one per rating, aligned row by row with the movie and rating datasets, but it contains only 397 distinct viewers.**

### Rating Dataset

In [9]:
# The ratings file has no header row, so name its single column while reading.
df_target = pd.read_csv(
    './data/ratings_for_training.csv',
    header=None,
    names=['ratings'],
)
df_target.sample(5)

Unnamed: 0,ratings
42600,4.5
37632,4.0
40022,4.0
24260,3.5
46418,3.0


*The three datasets are aligned row by row: row i of the viewer dataset is the viewer who rated the movie in row i of the movie dataset,
and the rating they gave is row i of the rating dataset.*

In [10]:
# Column names, dtypes and non-null counts of the rating table.
df_target.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 50884 entries, 0 to 50883
Data columns (total 1 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   ratings  50884 non-null  float64
dtypes: float64(1)
memory usage: 397.7 KB


### Utility Matrix

In [11]:
# The three tables share the same row order, so the id columns and the rating
# column can simply be placed side by side to get (movie, user, rating) triples.
movie_id = df_training_items['movie id']
user_id = df_training_users['user id']

id_and_rating_columns = [movie_id, user_id, df_target]
user_rated_movie = pd.concat(objs=id_and_rating_columns, axis='columns')
user_rated_movie.sample(5)

Unnamed: 0,movie id,user id,ratings
12688,6708,177,4.5
16835,4446,232,2.5
1930,6281,28,1.5
27899,27773,351,3.0
31394,53322,387,3.5


In [12]:
# Movies as rows, users as columns, ratings as cells; pairs without a rating
# stay NaN (pivot_table's default aggregation is the mean).
utility_matrix = pd.pivot_table(
    user_rated_movie,
    values='ratings',
    index='movie id',
    columns='user id',
)
utility_matrix

user id,2,3,4,7,9,10,12,13,15,16,...,598,599,600,601,603,605,606,607,608,610
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
4054,,,,,,,,,,,...,,,,,,,3.5,3.0,3.0,2.0
4069,,,,,,,,,,,...,,,2.5,,,,2.5,3.0,,
4148,,,,,,,,4.0,,,...,,2.5,,,2.0,,2.5,,4.5,3.5
4149,,,,,,,,,,,...,,2.5,,,,,,,,
4153,,,,,,,,,,,...,,,,,,,,,,3.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
174055,,,,,,,,,,,...,,4.0,,4.0,,,,,,
176371,,,,,,,,,,,...,,3.5,,4.0,,,,,,
177765,,,,,,,,,,,...,,,,4.5,,,,,,
179819,,,,,,,,,,,...,,,,,,,,,,


# Data Preprocessing - Scaling

In [13]:
# Each scaler is fitted on the full 17-column table so it can later transform
# raw rows with the same layout; the first three columns (ids / counts /
# averages) are then dropped so only the 14 genre features remain.
# NOTE(review): the scalers are fitted before the train/test split further
# down, so test rows influence the scaling statistics.
scalerItem = StandardScaler().fit(df_training_items)
items_scaled = scalerItem.transform(df_training_items)[:, 3:]

scalerUser = StandardScaler().fit(df_training_users)
users_scaled = scalerUser.transform(df_training_users)[:, 3:]

# Ratings are squeezed into [-1, 1], the range of the cosine similarity that
# the model outputs.
scalerTarget = MinMaxScaler(feature_range=(-1, 1)).fit(df_target)
ratings_scaled = scalerTarget.transform(df_target)

In [14]:
# Sanity check: all three arrays must have the same number of rows.
items_scaled.shape, users_scaled.shape, ratings_scaled.shape

((50884, 14), (50884, 14), (50884, 1))

In [15]:
# Split all three row-aligned arrays in ONE call so the same shuffled indices
# are applied to each of them by construction. Three separate calls only stay
# aligned as long as every argument is copy-pasted identically.
(item_train, item_test,
 user_train, user_test,
 y_train, y_test) = train_test_split(
    items_scaled, users_scaled, ratings_scaled,
    train_size=0.80, shuffle=True, random_state=1,
)

item_train.shape, user_train.shape, y_train.shape

((40707, 14), (40707, 14), (40707, 1))

# Training

### Building Hidden Layers

In [16]:
# Seed Python, NumPy and the backend BEFORE any layer is created so the initial
# weights are reproducible on Restart & Run All. The tf.random.set_seed call in
# the training cell runs only after both towers already exist.
keras.utils.set_random_seed(1)

n_outputs = 32  # dimensionality of the user / item embedding vectors


def _build_tower(embedding_dim):
    """Return a fresh 256 -> 128 -> embedding_dim MLP.

    The last layer has no activation; its output is the raw embedding.
    """
    return keras.Sequential([
        keras.layers.Dense(256, activation='relu'),
        keras.layers.Dense(128, activation='relu'),
        keras.layers.Dense(embedding_dim)
    ])


# Two independent networks with identical architecture, one per side.
user_NN = _build_tower(n_outputs)
item_NN = _build_tower(n_outputs)

### Building Input and Output Layers

In [None]:
# Feature counts are read from the training arrays rather than hard-coded.
n_item_features = item_train.shape[1]
n_user_features = user_train.shape[1]

# Item side: input tensor -> item embedding.
input_item = keras.layers.Input(shape=(n_item_features,))
output_item = item_NN(input_item)

# User side: input tensor -> user embedding.
input_user = keras.layers.Input(shape=(n_user_features,))
output_user = user_NN(input_user)

# normalize=True L2-normalises both embeddings before the dot product, so the
# result is their cosine similarity and lies in [-1, 1].
cosine_similarity = keras.layers.Dot(axes=1, normalize=True)
final_output = cosine_similarity([output_item, output_user])

### Complete Model

In [18]:
# Two-input model: (item features, user features) -> predicted scaled rating.
model = keras.Model(
    inputs=[input_item, input_user],
    outputs=final_output,
)
model.summary()

### Training

In [19]:
# NOTE(review): the layers were created in earlier cells, so this seed
# presumably does not govern their initial weights; confirm.
tf.random.set_seed(1)

adam = keras.optimizers.Adam(learning_rate=0.01)
mse = keras.losses.MeanSquaredError()
model.compile(optimizer=adam, loss=mse)

# Input order must match the model definition: items first, then users.
model.fit(x=[item_train, user_train], y=y_train, epochs=30)

Epoch 1/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m9s[0m 4ms/step - loss: 0.1429
Epoch 2/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1279
Epoch 3/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m5s[0m 3ms/step - loss: 0.1207
Epoch 4/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1155
Epoch 5/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m6s[0m 3ms/step - loss: 0.1121
Epoch 6/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1094
Epoch 7/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1073
Epoch 8/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1055
Epoch 9/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m4s[0m 3ms/step - loss: 0.1039
Epoch 10/30
[1m1273/1273[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m

<keras.src.callbacks.history.History at 0x273fd447150>

In [20]:
# Mean squared error on the held-out 20 %, measured on the scaled ratings.
model.evaluate([item_test, user_test], y_test)

[1m319/319[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step - loss: 0.1018


0.09993626922369003

# Recommend Movies to a New Viewer

### Movie Info Dataset

In [27]:
# Keep exactly one feature row per movie. De-duplicating on the key column
# guarantees this; comparing whole rows only works if all 17 columns are
# identical for every occurrence of a movie.
df_distinct_items = df_training_items.drop_duplicates(
    subset='movie id', keep='first', ignore_index=True
)
# The row count and the number of distinct ids should agree.
df_distinct_items.shape, df_distinct_items['movie id'].nunique()

((847, 17), 847)

In [21]:
# Load the human-readable movie catalogue (id, title, genre string) and preview it.
df_item_info = pd.read_csv('./data/movie_info.csv')
df_item_info.head()

Unnamed: 0,movieId,title,genres
0,4054,Save the Last Dance (2001),Drama|Romance
1,4069,"Wedding Planner, The (2001)",Comedy|Romance
2,4148,Hannibal (2001),Horror|Thriller
3,4149,Saving Silverman (Evil Woman) (2001),Comedy|Romance
4,4153,Down to Earth (2001),Comedy|Fantasy|Romance


In [22]:
# Column names, dtypes and non-null counts of the movie catalogue.
df_item_info.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 847 entries, 0 to 846
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  847 non-null    int64 
 1   title    847 non-null    object
 2   genres   847 non-null    object
dtypes: int64(1), object(2)
memory usage: 20.0+ KB


In [23]:
# Number of distinct movie ids in the catalogue, to compare with its row count.
df_item_info.movieId.nunique()

847

**847 rows and 847 movies, no duplicates**

### Creating a new user

In [24]:
# Profile of a hypothetical viewer. The first three values are bookkeeping
# (id, number of ratings, overall average); the rest are per-genre average
# ratings in the same column order as the viewer training table.
new_user_id = 5000
new_rating_count = 3
new_rating_ave = 0.0

new_action = 0.0
new_adventure = 5.0
new_animation = 0.0
new_childrens = 0.0
new_comedy = 0.0
new_crime = 0.0
new_documentary = 0.0
new_drama = 0.0
new_fantasy = 5.0
new_horror = 0.0
new_mystery = 0.0
new_romance = 0.0
new_scifi = 0.0
new_thriller = 0.0

new_user_values = [
    new_user_id, new_rating_count, new_rating_ave,
    new_action, new_adventure, new_animation, new_childrens,
    new_comedy, new_crime, new_documentary,
    new_drama, new_fantasy, new_horror, new_mystery,
    new_romance, new_scifi, new_thriller,
]
# A single row: shape (1, 17).
new_user = np.array([new_user_values])
new_user.shape

(1, 17)

*The new user enjoys movies from the `adventure` and `fantasy` genres. Let's find the movies with the highest predicted ratings for this user.*

In [28]:
# Pair the single new-user row with every distinct movie: repeat it along
# axis 0 once per row of df_distinct_items (columns are left untouched).
n_distinct_movies = len(df_distinct_items)
new_user_replicated = np.repeat(new_user, n_distinct_movies, axis=0)
new_user_replicated.shape

(847, 17)

### Predicting

In [29]:
# scalerUser was fitted on the df_training_users DataFrame, so give it a frame
# with the same column labels. A bare ndarray makes scikit-learn warn that X
# has no valid feature names.
new_user_frame = pd.DataFrame(new_user_replicated, columns=df_training_users.columns)
new_user_scaled = scalerUser.transform(new_user_frame)
new_user_scaled = new_user_scaled[:, 3:]  # drop id / count / average columns

distinct_items_scaled = scalerItem.transform(df_distinct_items)
distinct_items_scaled = distinct_items_scaled[:, 3:]  # drop id / year / average columns

# Both arrays must have one row per distinct movie and 14 feature columns.
new_user_scaled.shape, distinct_items_scaled.shape



((847, 14), (847, 14))

In [30]:
# The model outputs values on the scaled [-1, 1] target range; map them back
# to the original rating scale with the fitted target scaler.
scaled_predictions = model.predict([distinct_items_scaled, new_user_scaled])
ratings_predicted = scalerTarget.inverse_transform(scaled_predictions)
ratings_predicted.shape  # rows are in the same order as distinct_items_scaled

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step


(847, 1)

In [31]:
# must be converted into series to concat
ratings_series = pd.Series(ratings_predicted.reshape(-1), name='rating_predicted')
ratings_df = pd.concat([ratings_series, df_distinct_items], axis=1)

In [32]:
# Attach titles and genre strings to the predictions, keep the columns of
# interest and rank the movies from highest to lowest predicted rating.
df_merged = (
    df_item_info
    .merge(ratings_df, left_on='movieId', right_on='movie id', how='inner')
    [['rating_predicted', 'ave rating', 'movie id', 'title', 'genres']]
    .sort_values(by='rating_predicted', ascending=False)
)
df_merged.head(10)

Unnamed: 0,rating_predicted,ave rating,movie id,title,genres
349,4.183578,3.708333,36708,Family Guy Presents Stewie Griffin: The Untold Story (2005),Adventure|Animation|Comedy
730,4.183578,3.875,103141,Monsters University (2013),Adventure|Animation|Comedy
833,4.183578,3.576923,157296,Finding Dory (2016),Adventure|Animation|Comedy
367,4.149903,3.816901,40815,Harry Potter and the Goblet of Fire (2005),Adventure|Fantasy|Thriller
521,4.030414,3.954545,59387,"Fall, The (2006)",Adventure|Drama|Fantasy
467,4.030414,3.862069,54001,Harry Potter and the Order of the Phoenix (2007),Adventure|Drama|Fantasy
822,4.030414,3.636364,137857,The Jungle Book (2016),Adventure|Drama|Fantasy
133,4.015323,3.598039,5816,Harry Potter and the Chamber of Secrets (2002),Adventure|Fantasy
142,4.015323,4.021277,5952,"Lord of the Rings: The Two Towers, The (2002)",Adventure|Fantasy
749,4.015323,3.58,106489,"Hobbit: The Desolation of Smaug, The (2013)",Adventure|Fantasy


In [33]:
# Number of distinct movies that survived the inner merge.
df_merged['movie id'].nunique()

847

# Finding Similar Movies

### Computing Output Item Features

In [34]:
# Take the feature count from the training data instead of hard-coding 14.
n_item_features = item_train.shape[1]

# Use dedicated names for this model's tensors. Re-binding `input_item` /
# `output_item` would leave the rating model's cell pointing at a disconnected
# input if it were re-run after this one.
item_input = keras.layers.Input(shape=(n_item_features,))
# Reuse the already-trained item tower.
item_embedding = item_NN(item_input)
# L2-normalise each embedding row to unit length.
item_embedding = keras.layers.Lambda(lambda x: tf.linalg.l2_normalize(x, axis=1))(item_embedding)

item_model = keras.models.Model(item_input, item_embedding)
item_model.summary()




In [35]:
# One normalised embedding vector per distinct movie.
output_item_vectors = item_model.predict(distinct_items_scaled)
output_item_vectors.shape

[1m27/27[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 6ms/step


(847, 32)

### Computing Distance between Points / Movies

In [36]:
# Gram matrix: entry (i, j) is the dot product of embeddings i and j.
dot_products = output_item_vectors @ output_item_vectors.T
dot_products.shape

(847, 847)

In [37]:
# Squared Euclidean length of every embedding (row-wise sum of squares).
squared_components = np.square(output_item_vectors)
vector_squared_lengths = squared_components.sum(axis=1)
vector_squared_lengths.shape

(847,)

In [38]:
# The same squared lengths as a column vector and as a row vector, so that
# broadcasting col + row yields |a|^2 + |b|^2 for every pair (a, b).
col = vector_squared_lengths.reshape(-1, 1)
row = vector_squared_lengths.reshape(1, -1)

col.shape, row.shape

((847, 1), (1, 847))

In [39]:
# |a - b|^2 = |a|^2 + |b|^2 - 2 a.b, evaluated for all pairs at once.
squared_distance_matrix = col + row - 2 * dot_products
# Floating-point cancellation can leave tiny negative values where the true
# distance is 0. Clamp them to 0 instead of taking abs(), which would turn
# the rounding error into a small positive distance.
distance_matrix = np.sqrt(np.maximum(squared_distance_matrix, 0.0))
distance_matrix.shape

(847, 847)

In [40]:
# Wrap the pairwise distances in a DataFrame rounded to two decimals and label
# both axes with the movie ids, in the row order of df_distinct_items.
movie_ids = df_distinct_items['movie id']

distance_matrix = pd.DataFrame(distance_matrix).round(2).abs()
distance_matrix.index = movie_ids
distance_matrix.columns = movie_ids

distance_matrix

movie id,6874,8798,46970,48516,58559,60756,68157,71535,74458,77455,...,31221,36708,37380,46335,7150,51412,85510,93363,111364,5128
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6874,0.00,0.42,0.82,0.85,0.66,0.89,0.75,0.90,1.14,1.21,...,1.18,1.01,0.97,0.42,0.89,0.76,1.45,0.89,0.89,1.66
8798,0.42,0.00,0.92,0.69,0.71,0.94,0.72,1.00,1.05,1.25,...,1.18,1.18,0.84,0.00,0.94,0.68,1.35,0.88,0.88,1.57
46970,0.82,0.92,0.00,0.89,0.79,0.32,0.70,0.96,1.13,1.12,...,0.77,1.08,0.77,0.92,0.32,0.81,1.17,0.98,0.98,1.80
48516,0.85,0.69,0.89,0.00,0.57,0.95,0.46,1.01,0.80,1.04,...,1.20,1.13,0.85,0.69,0.95,0.83,1.41,1.03,1.03,1.66
58559,0.66,0.71,0.79,0.57,0.00,0.93,0.34,0.91,1.10,1.01,...,1.11,0.87,0.88,0.71,0.93,0.79,1.52,0.96,0.96,1.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51412,0.76,0.68,0.81,0.83,0.79,0.86,0.81,1.18,1.06,1.28,...,1.11,0.92,0.81,0.68,0.86,0.00,1.05,0.46,0.46,1.74
85510,1.45,1.35,1.17,1.41,1.52,1.13,1.42,1.47,1.34,1.41,...,1.14,1.44,1.11,1.35,1.13,1.05,0.00,1.07,1.07,1.50
93363,0.89,0.88,0.98,1.03,0.96,1.02,1.00,1.07,1.22,1.21,...,1.26,0.90,0.86,0.88,1.02,0.46,1.07,0.00,0.00,1.71
111364,0.89,0.88,0.98,1.03,0.96,1.02,1.00,1.07,1.22,1.21,...,1.26,0.90,0.86,0.88,1.02,0.46,1.07,0.00,0.00,1.71


In [41]:
# Create a mask
mask = np.tril(np.ones_like(distance_matrix, dtype=bool), k=0)
mask

array([[ True, False, False, ..., False, False, False],
       [ True,  True, False, ..., False, False, False],
       [ True,  True,  True, ..., False, False, False],
       ...,
       [ True,  True,  True, ...,  True, False, False],
       [ True,  True,  True, ...,  True,  True, False],
       [ True,  True,  True, ...,  True,  True,  True]])

In [42]:
# Overwrite every masked cell with +inf so those pairs sort last when each row
# is ranked by ascending distance.
distance_matrix[mask] = np.inf
distance_matrix

movie id,6874,8798,46970,48516,58559,60756,68157,71535,74458,77455,...,31221,36708,37380,46335,7150,51412,85510,93363,111364,5128
movie id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
6874,inf,0.42,0.82,0.85,0.66,0.89,0.75,0.90,1.14,1.21,...,1.18,1.01,0.97,0.42,0.89,0.76,1.45,0.89,0.89,1.66
8798,inf,inf,0.92,0.69,0.71,0.94,0.72,1.00,1.05,1.25,...,1.18,1.18,0.84,0.00,0.94,0.68,1.35,0.88,0.88,1.57
46970,inf,inf,inf,0.89,0.79,0.32,0.70,0.96,1.13,1.12,...,0.77,1.08,0.77,0.92,0.32,0.81,1.17,0.98,0.98,1.80
48516,inf,inf,inf,inf,0.57,0.95,0.46,1.01,0.80,1.04,...,1.20,1.13,0.85,0.69,0.95,0.83,1.41,1.03,1.03,1.66
58559,inf,inf,inf,inf,inf,0.93,0.34,0.91,1.10,1.01,...,1.11,0.87,0.88,0.71,0.93,0.79,1.52,0.96,0.96,1.71
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
51412,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,1.05,0.46,0.46,1.74
85510,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,1.07,1.07,1.50
93363,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,0.00,1.71
111364,inf,inf,inf,inf,inf,inf,inf,inf,inf,inf,...,inf,inf,inf,inf,inf,inf,inf,inf,inf,1.71


### Finding Similar Movies

In [43]:
# For each of the first ten movies, pair its id with the ids of the three
# columns holding the smallest distances in its row.
similar_item_ids = [
    (
        distance_matrix.index[position],
        distance_matrix.iloc[position]
        .sort_values(ascending=True)
        .head(3)
        .index
        .tolist(),
    )
    for position in range(10)
]

similar_item_ids

[(6874, [8665, 6383, 5507]),
 (8798, [46335, 59369, 32029]),
 (46970, [68793, 31878, 102123]),
 (48516, [55118, 5329, 6552]),
 (58559, [54997, 4344, 40278]),
 (60756, [8641, 6586, 48385]),
 (68157, [40278, 115210, 4958]),
 (71535, [8370, 119145, 57669]),
 (74458, [8950, 104879, 51086]),
 (77455, [93840, 57669, 6003])]

In [44]:
# Spot check: look up one of the suggested ids in the movie catalogue.
is_requested_movie = df_item_info['movieId'].eq(119145)
info = df_item_info.loc[is_requested_movie]
info

Unnamed: 0,movieId,title,genres
795,119145,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime


In [45]:
# The title column of the lookup result as a plain NumPy array.
info['title'].values

array(['Kingsman: The Secret Service (2015)'], dtype=object)

In [46]:
# Build the id -> {'title', 'genres'} lookup once instead of filtering the
# whole catalogue for every id. drop_duplicates keeps the first occurrence,
# matching the previous `.values[0]` behaviour.
item_lookup = (
    df_item_info
    .drop_duplicates(subset='movieId', keep='first')
    .set_index('movieId')[['title', 'genres']]
    .to_dict('index')
)
# Used for ids that are absent from the catalogue.
missing_item = {'title': None, 'genres': None}

rows = []

for main_id, similar_ids in similar_item_ids:
    main_meta = item_lookup.get(main_id, missing_item)

    row = {
        'movie_ID': main_id,
        'title': main_meta['title'],
        'genres': main_meta['genres']
    }

    # One (id, title, genres) column triple per neighbour, numbered from 1.
    # `similar_id` replaces the old loop variable `id`, which shadowed the builtin.
    for rank, similar_id in enumerate(similar_ids, start=1):
        similar_meta = item_lookup.get(similar_id, missing_item)

        row[f'similar_ID_{rank}'] = similar_id
        row[f'similar_title_{rank}'] = similar_meta['title']
        row[f'similar_genres_{rank}'] = similar_meta['genres']

    rows.append(row)

In [47]:
# One table row per query movie, with its three nearest neighbours side by side.
df_similar_items = pd.DataFrame(rows)
df_similar_items

Unnamed: 0,movie_ID,title,genres,similar_ID_1,similar_title_1,similar_genres_1,similar_ID_2,similar_title_2,similar_genres_2,similar_ID_3,similar_title_3,similar_genres_3
0,6874,Kill Bill: Vol. 1 (2003),Action|Crime|Thriller,8665,"Bourne Supremacy, The (2004)",Action|Crime|Thriller,6383,"2 Fast 2 Furious (Fast and the Furious 2, The) (2003)",Action|Crime|Thriller,5507,xXx (2002),Action|Crime|Thriller
1,8798,Collateral (2004),Action|Crime|Drama|Thriller,46335,"Fast and the Furious: Tokyo Drift, The (Fast and the Furious 3, The) (2006)",Action|Crime|Drama|Thriller,59369,Taken (2008),Action|Crime|Drama|Thriller,32029,Hostage (2005),Action|Crime|Drama|Thriller
2,46970,Talladega Nights: The Ballad of Ricky Bobby (2006),Action|Comedy,68793,Night at the Museum: Battle of the Smithsonian (2009),Action|Comedy,31878,Kung Fu Hustle (Gong fu) (2004),Action|Comedy,102123,This Is the End (2013),Action|Comedy
3,48516,"Departed, The (2006)",Crime|Drama|Thriller,55118,Eastern Promises (2007),Crime|Drama|Thriller,5329,"Salton Sea, The (2002)",Crime|Drama|Thriller,6552,Dirty Pretty Things (2002),Crime|Drama|Thriller
4,58559,"Dark Knight, The (2008)",Action|Crime|Drama,54997,3:10 to Yuma (2007),Action|Crime|Drama,4344,Swordfish (2001),Action|Crime|Drama,40278,Jarhead (2005),Action|Drama
5,60756,Step Brothers (2008),Comedy,8641,Anchorman: The Legend of Ron Burgundy (2004),Comedy,6586,American Wedding (American Pie 3) (2003),Comedy,48385,Borat: Cultural Learnings of America for Make Benefit Glorious Nation of Kazakhstan (2006),Comedy
6,68157,Inglourious Basterds (2009),Action|Drama,40278,Jarhead (2005),Action|Drama,115210,Fury (2014),Action|Drama,4958,Behind Enemy Lines (2001),Action|Drama
7,71535,Zombieland (2009),Action|Comedy|Horror,8370,"Blind Swordsman: Zatoichi, The (Zatôichi) (2003)",Action|Comedy|Crime|Drama,119145,Kingsman: The Secret Service (2015),Action|Adventure|Comedy|Crime,57669,In Bruges (2008),Comedy|Crime|Drama|Thriller
8,74458,Shutter Island (2010),Drama|Mystery|Thriller,8950,The Machinist (2004),Drama|Mystery|Thriller,104879,Prisoners (2013),Drama|Mystery|Thriller,51086,"Number 23, The (2007)",Drama|Mystery|Thriller
9,77455,Exit Through the Gift Shop (2010),Comedy|Documentary,93840,"Cabin in the Woods, The (2012)",Comedy|Horror|Sci-Fi|Thriller,57669,In Bruges (2008),Comedy|Crime|Drama|Thriller,6003,Confessions of a Dangerous Mind (2002),Comedy|Crime|Drama|Thriller
