In [1]:
import pandas as pd
import numpy as np
from tensorflow.keras.layers import Input, Dense
from tensorflow.keras.models import Model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

In [2]:
# reading the ratings data
# columns: user_id, movie_id, rating, timestamp
ratings = pd.read_csv('ml-1m/ratings.dat',\
          sep="::", header = None, engine='python')
ratings.head()

Unnamed: 0,0,1,2,3
0,1,1193,5,978300760
1,1,661,3,978302109
2,1,914,3,978301968
3,1,3408,4,978300275
4,1,2355,5,978824291


In [3]:
# Lets pivot the data to get it at a user level
data_df = pd.pivot_table(ratings[[0,1,2]],\
          values=2, index=0, columns=1 ).fillna(0)
data_df.head()

1,1,2,3,4,5,6,7,8,9,10,...,3943,3944,3945,3946,3947,3948,3949,3950,3951,3952
0,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
5,0.0,0.0,0.0,0.0,0.0,2.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0


In [4]:
'''# Function to generate synthetic user-item interaction data
def generate_synthetic_data(num_users, num_items):
    np.random.seed(42)  # Set the seed for reproducibility
    data = np.random.rand(num_users, num_items)  # Generate random data
    df = pd.DataFrame(data, columns=[f'item_{i}' for i in range(num_items)])
    return df

# Generate synthetic data for 1000 users and 500 items
num_users = 1000
num_items = 500
data_df = generate_synthetic_data(num_users, num_items)

data_df'''

"# Function to generate synthetic user-item interaction data\ndef generate_synthetic_data(num_users, num_items):\n    np.random.seed(42)  # Set the seed for reproducibility\n    data = np.random.rand(num_users, num_items)  # Generate random data\n    df = pd.DataFrame(data, columns=[f'item_{i}' for i in range(num_items)])\n    return df\n\n# Generate synthetic data for 1000 users and 500 items\nnum_users = 1000\nnum_items = 500\ndata_df = generate_synthetic_data(num_users, num_items)\n\ndata_df"

In [5]:
# Convert DataFrame to NumPy array
data = data_df.to_numpy()

data

array([[5., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [3., 0., 0., ..., 0., 0., 0.]])

In [6]:
# Normalize the data using StandardScaler
scaler = StandardScaler()
data = scaler.fit_transform(data)

# Split the data into training and test sets (80% train, 20% test)
train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)

In [7]:
# Define the autoencoder model with input, hidden, encoding, hidden, and output layers
input_dim = data.shape[1]  # Number of items
encoding_dim = 64  # Number of neurons in the bottleneck layer
hidden_dim = 128   # Number of neurons in the hidden layers

# Input layer
input_layer = Input(shape=(input_dim,))

# First hidden layer
hidden_layer1 = Dense(hidden_dim, activation='relu')(input_layer)

# Encoding layer (bottleneck layer)
encoder = Dense(encoding_dim, activation='relu')(hidden_layer1)

# Second hidden layer
hidden_layer2 = Dense(hidden_dim, activation='relu')(encoder)

# Output layer
decoder = Dense(input_dim, activation='sigmoid')(hidden_layer2)

In [8]:
# Build the autoencoder model
autoencoder = Model(inputs=input_layer, outputs=decoder)

# Compile the model with Adam optimizer and Mean Squared Error loss
autoencoder.compile(optimizer='adam', loss='mse')

# Train the model
autoencoder.fit(train_data, train_data,
                epochs=50,  # Number of epochs
                batch_size=256,  # Batch size
                shuffle=True,  # Shuffle the data
                validation_data=(test_data, test_data))  # Use test data for validation

# Extract the encoder part of the autoencoder to get the encoded representations
encoder_model = Model(inputs=input_layer, outputs=encoder)
encoded_data = encoder_model.predict(data)

# Define the decoder model
encoded_input = Input(shape=(encoding_dim,))
decoder_layer1 = autoencoder.layers[-2](encoded_input)  # Second hidden layer
decoder_output = autoencoder.layers[-1](decoder_layer1)  # Output layer
decoder_model = Model(inputs=encoded_input, outputs=decoder_output)

Epoch 1/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 34ms/step - loss: 1.1732 - val_loss: 0.8891
Epoch 2/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 0.9863 - val_loss: 0.8774
Epoch 3/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 0.9675 - val_loss: 0.8700
Epoch 4/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 26ms/step - loss: 0.9955 - val_loss: 0.8555
Epoch 5/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.9220 - val_loss: 0.8446
Epoch 6/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 24ms/step - loss: 0.9411 - val_loss: 0.8378
Epoch 7/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 21ms/step - loss: 0.9492 - val_loss: 0.8317
Epoch 8/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 27ms/step - loss: 0.9134 - val_loss: 0.8273
Epoch 9/50
[1m19/19[0m [32m━━━━━━━━━━━━━━━━━━

In [9]:
# Function to recommend items for a given user based on encoded data
def recommend_items(user_id, num_recommendations=5):
    user_vector = encoded_data[user_id].reshape(1, -1)  # Get the encoded vector for the user
    reconstructed_user = decoder_model.predict(user_vector)  # Reconstruct the user vector using the decoder
    # Sort the items by predicted score in descending order and get the top recommendations
    recommended_items = np.argsort(reconstructed_user[0])[::-1][:num_recommendations]
    return recommended_items

In [20]:
# Example: Get recommendations for user with ID 
user_id = 10
recommendations = recommend_items(user_id)
print(f"Recommended movies for user {user_id}: {recommendations}")

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 102ms/step
Recommended movies for user 10: [1743  427 3029  992 2838]
