In [13]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [14]:
# reading the ratings data
ratings = pd.read_csv('ml-1m/ratings.dat',\
          sep="::", header = None, engine='python')
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [15]:
# Lets pivot the data to get it at a user level
data_df = pd.pivot_table(ratings[[0,1,2]],\
          values=2, index=0, columns=1 ).fillna(0)
data_df.head()

1,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [6]:
'''# Function to generate synthetic user-item interaction data
def generate_synthetic_data(num_users, num_items):
    np.random.seed(42)  # Set the seed for reproducibility
    data = np.random.rand(num_users, num_items)  # Generate random data
    df = pd.DataFrame(data, columns=[f'item_{i}' for i in range(num_items)])
    return df

# Generate synthetic data for 1000 users and 500 items
num_users = 1000
num_items = 500
data_df = generate_synthetic_data(num_users, num_items)

data_df'''

Unnamed: 0,item_0,item_1,item_2,item_3,item_4,item_5,item_6,item_7,item_8,item_9,...,item_490,item_491,item_492,item_493,item_494,item_495,item_496,item_497,item_498,item_499
0,0.374540,0.950714,0.731994,0.598658,0.156019,0.155995,0.058084,0.866176,0.601115,0.708073,...,0.455657,0.620133,0.277381,0.188121,0.463698,0.353352,0.583656,0.077735,0.974395,0.986211
1,0.698162,0.536096,0.309528,0.813795,0.684731,0.162617,0.910927,0.822537,0.949800,0.725720,...,0.799416,0.694696,0.272145,0.590231,0.360974,0.091582,0.917314,0.136819,0.950237,0.446006
2,0.185133,0.541901,0.872946,0.732225,0.806561,0.658783,0.692277,0.849196,0.249668,0.489425,...,0.237544,0.373252,0.227270,0.073196,0.603449,0.668213,0.619490,0.463494,0.379786,0.863334
3,0.519082,0.479182,0.025642,0.341248,0.380196,0.398823,0.580172,0.533603,0.607905,0.764883,...,0.765513,0.158908,0.610225,0.135354,0.751375,0.656955,0.956615,0.068958,0.057055,0.282187
4,0.261706,0.246979,0.906255,0.249546,0.271950,0.759398,0.449740,0.776711,0.065366,0.487571,...,0.285784,0.203223,0.761798,0.386541,0.511275,0.492325,0.577279,0.865577,0.980739,0.407584
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
995,0.119716,0.301410,0.163965,0.261691,0.126138,0.059495,0.674850,0.809887,0.954444,0.385121,...,0.185042,0.907333,0.383251,0.112915,0.954184,0.291942,0.036909,0.966182,0.954838,0.694163
996,0.267215,0.176615,0.835887,0.404111,0.481017,0.344779,0.678286,0.271478,0.062304,0.042079,...,0.394545,0.786379,0.362930,0.054260,0.530885,0.136017,0.751096,0.735454,0.667888,0.468211
997,0.258813,0.174448,0.836308,0.940249,0.216489,0.314816,0.747976,0.407415,0.339505,0.910091,...,0.808109,0.734749,0.723624,0.536112,0.824388,0.300727,0.867356,0.626426,0.484527,0.580526
998,0.578663,0.989142,0.329159,0.855647,0.026532,0.862262,0.667397,0.247983,0.028812,0.905783,...,0.703674,0.206627,0.564155,0.330331,0.421515,0.442666,0.879735,0.009206,0.818791,0.620621


In [16]:
# Convert DataFrame to NumPy array
data = data_df.to_numpy()

data

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [17]:
# Normalize the data using StandardScaler
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Split the data into training and test sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [18]:
# Define the autoencoder model with input, hidden, encoding, hidden, and output layers
input_dim = data.shape[1]  # Number of items
encoding_dim = 64  # Number of neurons in the bottleneck layer
hidden_dim = 128   # Number of neurons in the hidden layers

# Input layer
input_layer = Input(shape=(input_dim,))

# First hidden layer
hidden_layer1 = Dense(hidden_dim, activation='relu')(input_layer)

# Encoding layer (bottleneck layer)
encoder = Dense(encoding_dim, activation='relu')(hidden_layer1)

# Second hidden layer
hidden_layer2 = Dense(hidden_dim, activation='relu')(encoder)

# Output layer
decoder = Dense(input_dim, activation='sigmoid')(hidden_layer2)

In [19]:
# Build the autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile the model with Adam optimizer and Mean Squared Error loss
autoencoder.compile(optimizer='adam', loss='mse')

# Train the model
autoencoder.fit(train_data, train_data,
                epochs=50,  # Number of epochs
                batch_size=256,  # Batch size
                shuffle=True,  # Shuffle the data
                validation_data=(test_data, test_data))  # Use test data for validation

# Extract the encoder part of the autoencoder to get the encoded representations
encoder_model = Model(inputs=input_layer, outputs=encoder)
encoded_data = encoder_model.predict(data)

# Define the decoder model
encoded_input = Input(shape=(encoding_dim,))
decoder_layer1 = autoencoder.layers[-2](encoded_input)  # Second hidden layer
decoder_output = autoencoder.layers[-1](decoder_layer1)  # Output layer
decoder_model = Model(inputs=encoded_input, outputs=decoder_output)

Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 22ms/step - loss: 1.1725 - val_loss: 0.8879
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 15ms/step - loss: 0.9951 - val_loss: 0.8772
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step - loss: 0.9977 - val_loss: 0.8641
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 1.0124 - val_loss: 0.8528
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 16ms/step - loss: 0.9204 - val_loss: 0.8421
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 17ms/step - loss: 0.9337 - val_loss: 0.8354
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 14ms/step - loss: 0.9363 - val_loss: 0.8304
Epoch 8/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 18ms/step - loss: 0.9110 - val_loss: 0.8268
Epoch 9/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━

In [20]:
# Function to recommend items for a given user based on encoded data
def recommend_items(user_id, num_recommendations=5):
    user_vector = encoded_data[user_id].reshape(1, -1)  # Get the encoded vector for the user
    reconstructed_user = decoder_model.predict(user_vector)  # Reconstruct the user vector using the decoder
    # Sort the items by predicted score in descending order and get the top recommendations
    recommended_items = np.argsort(reconstructed_user[0])[::-1][:num_recommendations]
    return recommended_items

In [26]:
# Example: Get recommendations for user with ID 
user_id = 10
recommendations = recommend_items(user_id)
print(f"Recommended movies for user {user_id}: {recommendations}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 25ms/step
Recommended movies for user 10: [3029  101 1398 1743  174]
