In [22]:
import pandas as pd
import tensorflow as tf
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

ratings = pd.read_csv('ratings.csv')
movies = pd.read_csv('movies.csv')

# Splitting data into train and test
train, test = train_test_split(ratings, test_size=0.2, stratify=ratings['userId'], random_state=42)

# Encoding user and movie IDs after splitting
user_enc = LabelEncoder()
train['userId'] = user_enc.fit_transform(train['userId'].values)
test['userId'] = user_enc.transform(test['userId'].values)

item_enc = LabelEncoder()
train['movieId'] = item_enc.fit_transform(train['movieId'].values)

# Filter out test set movieIds not in training set
test = test[test['movieId'].isin(item_enc.classes_)]
test['movieId'] = item_enc.transform(test['movieId'].values)

# Deep Learning model
embedding_size = 50
n_users = train['userId'].nunique()
n_movies = train['movieId'].nunique()

# User and Movie input layers
user_input = tf.keras.layers.Input(shape=(1,), name='user_input', dtype='int64')
movie_input = tf.keras.layers.Input(shape=(1,), name='movie_input', dtype='int64')

# User and Movie Embeddings
user_embedding = tf.keras.layers.Embedding(input_dim=n_users, output_dim=embedding_size, name='user_embedding')(user_input)
movie_embedding = tf.keras.layers.Embedding(input_dim=n_movies, output_dim=embedding_size, name='movie_embedding')(movie_input)

# Assuming movie_genres is a DataFrame with one-hot encoded genres for each movieId
movie_genres = pd.get_dummies(movies['genres'], drop_first=True)
movie_genres['movieId'] = movies['movieId']

# Make sure all movieIds in train and test are also present in movie_genres
missing_movieIds_train = set(train['movieId']) - set(movie_genres['movieId'])
missing_movieIds_test = set(test['movieId']) - set(movie_genres['movieId'])

missing_rows_train = pd.DataFrame({'movieId': list(missing_movieIds_train)})
for genre in movie_genres.columns:
    if genre != 'movieId':
        missing_rows_train[genre] = 0

missing_rows_test = pd.DataFrame({'movieId': list(missing_movieIds_test)})
for genre in movie_genres.columns:
    if genre != 'movieId':
        missing_rows_test[genre] = 0

movie_genres = movie_genres.append(missing_rows_train, ignore_index=True)
movie_genres = movie_genres.append(missing_rows_test, ignore_index=True)

# Now, when you merge, there shouldn't be any NaN values for genres
train = pd.merge(train, movie_genres, on='movieId', how='left')
test = pd.merge(test, movie_genres, on='movieId', how='left')

# Double-check for NaN values
assert not train.isnull().values.any(), "Train data contains NaN values"
assert not test.isnull().values.any(), "Test data contains NaN values"

# Metadata input: genres
genre_input = tf.keras.layers.Input(shape=(movie_genres.shape[1]-1,), name='GenreInput')

# Flatten the embeddings
user_vector = tf.keras.layers.Flatten()(user_embedding)
movie_vector = tf.keras.layers.Flatten()(movie_embedding)

# Concatenate the flattened embeddings with genre features
x = tf.keras.layers.Concatenate()([user_vector, movie_vector, genre_input])

# Pass through the neural network using he_normal initialization
x = tf.keras.layers.Dense(128, activation='relu', kernel_initializer='he_normal')(x)
x = tf.keras.layers.Dropout(0.2)(x)
x = tf.keras.layers.Dense(64, activation='relu', kernel_initializer='he_normal')(x)
x = tf.keras.layers.Dense(1, activation='linear', kernel_initializer='he_normal')(x)

# Create the model
model = tf.keras.models.Model(inputs=[user_input, movie_input, genre_input], outputs=x)

# Reduce learning rate and compile the model
optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)  # Reduced learning rate
model.compile(optimizer=optimizer, loss='mean_squared_error')

# Train the model
history = model.fit([train['userId'], train['movieId'], train[movie_genres.columns[:-1]]], train['rating'], 
                    epochs=5, batch_size=32, validation_data=([test['userId'], test['movieId'], test[movie_genres.columns[:-1]]], test['rating']))


  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missin

  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missing_rows_train[genre] = 0
  missin

  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missin

  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missing_rows_test[genre] = 0
  missin

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5
