In [4]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import keras
from keras.models import Model
from keras import layers
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

# Load and preprocess data
def load_and_preprocess_data():
    movies = pd.read_csv('dataset/movies.csv')
    ratings = pd.read_csv('dataset/ratings.csv')

    user_ids = ratings["userId"].unique().tolist()
    userencoded = {x: i for i, x in enumerate(user_ids)}
    user_rev = {i: x for i, x in enumerate(user_ids)}

    movie_ids = ratings['movieId'].unique().tolist()
    moviecoded = {x: i for i, x in enumerate(movie_ids)}
    movie_rev = {i: x for i, x in enumerate(movie_ids)}

    ratings['user'] = ratings['userId'].map(userencoded)
    ratings['movie'] = ratings['movieId'].map(moviecoded)

    ratings['rating'] = (ratings['rating'] - ratings['rating'].mean()) / ratings['rating'].std()
    max_rating = max(ratings['rating'])
    min_rating = min(ratings['rating'])
    ratings['rating'] = ratings['rating'].apply(lambda x: (x - min_rating) / (max_rating - min_rating))

    # One-hot encode the genres
    genres = set()
    for genre_list in movies['genres']:
        genres.update(genre_list.split('|'))
    genres = list(genres)

    for genre in genres:
        movies[genre] = movies['genres'].apply(lambda x: 1 if genre in x else 0)

    x = ratings[['user', 'movie']].values
    y = ratings['rating'].values
    x_train, x_val, y_train, y_val = train_test_split(x, y, test_size=0.1, random_state=42)

    return movies, ratings, x_train, x_val, y_train, y_val, userencoded, user_rev, moviecoded, movie_rev, genres

movies, ratings, x_train, x_val, y_train, y_val, userencoded, user_rev, moviecoded, movie_rev, genres = load_and_preprocess_data()


In [5]:
def build_model(num_users, num_movies, embedding_size=100):
    user_layer = layers.Input(shape=[1])
    user_embedding = layers.Embedding(num_users, embedding_size, embeddings_initializer="he_normal",
                                       embeddings_regularizer=keras.regularizers.l2(1e-6))(user_layer)
    user_vector = layers.Flatten()(user_embedding)

    movie_layer = layers.Input(shape=[1])
    movie_embedding = layers.Embedding(num_movies, embedding_size, embeddings_initializer="he_normal",
                                        embeddings_regularizer=keras.regularizers.l2(1e-6))(movie_layer)
    movie_vector = layers.Flatten()(movie_embedding)

    prod = layers.dot(inputs=[user_vector, movie_vector], axes=1)
    dense1 = layers.Dense(200, activation='relu')(prod)
    dense2 = layers.Dense(100, activation='relu')(dense1)
    dropout = layers.Dropout(0.5)(dense2)
    dense3 = layers.Dense(1, activation='relu')(dropout)

    model = Model([user_layer, movie_layer], dense3)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss="mean_squared_error", metrics=["mean_absolute_error"])
    return model

model = build_model(len(userencoded), len(moviecoded))


In [6]:
model.fit([x_train[:, 0], x_train[:, 1]], y_train, validation_data=([x_val[:, 0], x_val[:, 1]], y_val), 
          batch_size=64, epochs=30, verbose=1)


Epoch 1/30
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 18ms/step - loss: 0.0817 - mean_absolute_error: 0.2218 - val_loss: 0.0550 - val_mean_absolute_error: 0.1858
Epoch 2/30
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 16ms/step - loss: 0.0511 - mean_absolute_error: 0.1755 - val_loss: 0.0425 - val_mean_absolute_error: 0.1576
Epoch 3/30
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 16ms/step - loss: 0.0197 - mean_absolute_error: 0.1028 - val_loss: 0.0452 - val_mean_absolute_error: 0.1603
Epoch 4/30
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m25s[0m 18ms/step - loss: 0.0118 - mean_absolute_error: 0.0756 - val_loss: 0.0460 - val_mean_absolute_error: 0.1609
Epoch 5/30
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 17ms/step - loss: 0.0107 - mean_absolute_error: 0.0684 - val_loss: 0.0464 - val_mean_absolute_error: 0.1625
Epoch 6/30
[1m1418/1418[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[

<keras.src.callbacks.history.History at 0x25556f4f2f0>

In [7]:
# Predict on validation set
y_pred = model.predict([x_val[:, 0], x_val[:, 1]])

# Compute performance metrics
mse = mean_squared_error(y_val, y_pred)
mae = mean_absolute_error(y_val, y_pred)
r2 = r2_score(y_val, y_pred)

print(f"Mean Squared Error (MSE): {mse}")
print(f"Mean Absolute Error (MAE): {mae}")
print(f"R2 Score: {r2}")


[1m316/316[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 2ms/step
Mean Squared Error (MSE): 0.04404742292757607
Mean Absolute Error (MAE): 0.16068166438512316
R2 Score: 0.1918258202698746


In [8]:
model.save("movie_recommendation_model.h5")




In [1]:
def recommend_movies(user_id, selected_genres, top_n=10):
    if user_id not in userencoded:
        return {"error": f"User ID {user_id} not found."}

    # Map the user ID to its encoded value
    user_encoder = userencoded[user_id]
    
    # Get movies the user has already watched
    movies_watched = ratings[ratings['user'] == user_encoder][['movieId', 'rating']]
    
    # Get movies that the user has liked (rating > 0.8)
    liked_movies = movies_watched[movies_watched['rating'] > 0.8]
    
    # Get all movies that the user has not yet rated
    movies_not_watched = movies[~movies["movieId"].isin(movies_watched['movieId'])]["movieId"]
    movies_not_watched = list(set(movies_not_watched).intersection(set(moviecoded.keys())))
    
    # Prepare user-movie pairs for predictions
    user_movie_array = np.hstack(([[user_encoder]] * len(movies_not_watched), [[moviecoded[x]] for x in movies_not_watched]))

    # Predict ratings for the movies that the user hasn't watched yet
    predicted_ratings = model.predict([user_movie_array[:, 0], user_movie_array[:, 1]]).flatten()
    
    # Create a dictionary of predicted ratings for movies the user has not watched
    predicted_ratings_dict = {movie_id: rating for movie_id, rating in zip(movies_not_watched, predicted_ratings)}

    # Filter movies by selected genres (context-based filtering)
    filtered_movies = movies[movies[selected_genres].sum(axis=1) > 0]
    filtered_movie_ids = filtered_movies['movieId'].values
    
    # Create a dictionary of filtered predicted ratings
    filtered_predicted_ratings = {movie_id: predicted_ratings_dict.get(movie_id, 0) for movie_id in filtered_movie_ids}

    # Sort the filtered movies by predicted ratings in descending order
    top_movies = sorted(filtered_predicted_ratings.items(), key=lambda x: x[1], reverse=True)[:top_n]

    # Prepare the list of recommended movies
    recommended = [{"title": movies[movies["movieId"] == movie_id]["title"].values[0],
                    "genres": movies[movies["movieId"] == movie_id]["genres"].values[0]}
                   for movie_id, _ in top_movies]

    return recommended


In [3]:
import pickle
pickle.dump(userencoded, open('userencoded.pkl', 'wb'))
pickle.dump(moviecoded, open('moviecoded.pkl', 'wb'))

