In [149]:
import pandas as pd

import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras import layers
from tensorflow.keras import backend as K
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras import regularizers

In [150]:
ratings_df = pd.read_csv('movielens_matrix.csv')
ratings_df.head()

Unnamed: 0,user id,1,2,3,4,5,6,7,8,9,...,1673,1674,1675,1676,1677,1678,1679,1680,1681,1682
0,1,5.0,3.0,4.0,3.0,3.0,5.0,4.0,1.0,5.0,...,,,,,,,,,,
1,2,4.0,,,,,,,,,...,,,,,,,,,,
2,3,,,,,,,,,,...,,,,,,,,,,
3,4,,,,,,,,,,...,,,,,,,,,,
4,5,4.0,3.0,,,,,,,,...,,,,,,,,,,


In [151]:
ratings_df.fillna(0, inplace=True)
#ratings_df = ratings_df.astype(int)
ratings_matrix = ratings_df.drop('user id',axis=1).values
#normalize data
ratings_matrix = ratings_matrix/5.0
ratings_matrix.shape

(943, 1682)

In [152]:
#custom loss to not penalize on 0 values in reconstruction
def masked_mse(y_true, y_pred):
    mask = tf.cast(tf.not_equal(y_true, 0), tf.float32)  # 1 where y_true != 0
    squared_error = tf.square(y_true - y_pred)
    masked_se = mask * squared_error
    return tf.reduce_sum(masked_se) / tf.reduce_sum(mask)  # average over non-zero entries

In [203]:
original_dim = ratings_matrix.shape[1]
encoding_dim = 64

# Input: user's interaction vector (sparse or dense)
input_layer = layers.Input(shape=(original_dim,))

# Encoder
encoded = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.1))(input_layer)
#encoded = layers.Dropout(0.5)(encoded)
#encoded = layers.Dense(264, activation='relu')(encoded)
#encoded = layers.Dropout(0.5)(encoded)
encoded = layers.Dense(encoding_dim, activation='relu', activity_regularizer=regularizers.l1(0.00001))(encoded)

# Decoder
#decoded = layers.Dense(264, activation='relu')(encoded)
decoded = layers.Dense(512, activation='relu', kernel_regularizer=regularizers.l2(0.1))(encoded)
#decoded = layers.Dropout(0.5)(decoded)
output_layer = layers.Dense(original_dim, activation='linear')(decoded)  # use sigmoid if input is binary; linear if ratings

# Model
autoencoder = Model(inputs=input_layer, outputs=output_layer)

#compile the model
autoencoder.compile(optimizer='adam', loss= masked_mse)

In [204]:
#train the model
#callbacks=[early_stopping]
early_stopping = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

autoencoder.fit(ratings_matrix, ratings_matrix,
                epochs=100,
                batch_size=128,
                shuffle=True,
                validation_split=0.2,
                callbacks=[early_stopping])

Epoch 1/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 41ms/step - loss: 83.7866 - val_loss: 64.3128
Epoch 2/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 59.2547 - val_loss: 44.4468
Epoch 3/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step - loss: 40.7640 - val_loss: 29.9813
Epoch 4/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step - loss: 27.3449 - val_loss: 19.7647
Epoch 5/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 17.9629 - val_loss: 12.8242
Epoch 6/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 30ms/step - loss: 11.6256 - val_loss: 8.2483
Epoch 7/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 27ms/step - loss: 7.4800 - val_loss: 5.3266
Epoch 8/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step - loss: 4.8445 - val_loss: 3.4974
Epoch 9/100
[1m6/6[0m [32m━━━━━━━━━━━━━━━━

<keras.src.callbacks.history.History at 0x7fb174467ce0>

In [205]:
reconstruction_matrix = autoencoder.predict(ratings_matrix)
#scale back
reconstruction_matrix = reconstruction_matrix * 5

[1m30/30[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [206]:
print(reconstruction_matrix.shape)
print(reconstruction_matrix[0:5,:])

(943, 1682)
[[3.8865983  3.148137   3.138447   ... 0.14600912 0.29098308 0.26322863]
 [3.886599   3.1481373  3.1384468  ... 0.14600894 0.29098302 0.2632287 ]
 [3.8865995  3.1481376  3.1384473  ... 0.14600879 0.29098257 0.26322883]
 [3.8865995  3.1481376  3.1384478  ... 0.14600876 0.29098257 0.26322883]
 [3.886599   3.148137   3.138447   ... 0.14600903 0.290983   0.26322865]]


### Make recommendations
Certain items are recommended alot

In [157]:
#number of items to recommend
top_k = 10
#zero out previously rated items, so they are not recommended
rated_indices = np.argwhere(ratings_matrix)
rows, columns = rated_indices.T
reconstruction_matrix[rows,columns] = 0
#get random user
user = reconstruction_matrix[np.random.randint(reconstruction_matrix.shape[0],size=1),:]
#make recommendations
recs = np.argsort(-user)[0,0:top_k]
print(recs)
print(f'\npredicted ratings: {user[0,recs]}')

[1535 1590 1430 1462 1448 1598 1641 1267 1428 1232]

predicted ratings: [7.529709  7.3549767 6.6967754 6.5149755 6.4491777 6.445409  6.3638744
 6.2875032 6.128447  6.095021 ]


### Find items with highest predicted ratings

In [147]:
top_k=20
average_scores = reconstruction_matrix.mean(axis=0)
recs = np.argsort(-average_scores)[0:top_k]
print(f'Recommended Items: {recs}')
print(f'Predicted Ratings: {average_scores[recs]}')

Recommended Items: [1652 1641 1535 1598 1448 1079 1366 1190 1644 1462  850 1124  118 1635
 1188 1627  813 1649 1650 1397]
Predicted Ratings: [5.469575  5.2373295 4.954247  4.9409657 4.6577344 4.638872  4.5999055
 4.555565  4.5339255 4.528101  4.421934  4.4193916 4.405479  4.403879
 4.391745  4.372519  4.365604  4.3639364 4.274907  4.259183 ]
