In [1]:
import pandas as pd
from surprise import Reader, Dataset, SVD, accuracy
from surprise.model_selection import  train_test_split, GridSearchCV
import time
import mlflow

In [2]:
# Raw ratings sample; the timestamp-derived columns are discarded right away,
# leaving one row per (user, movie, rating).
ratings = pd.read_csv(
    "s3://ns-data-resources-bucket/netflix-recommendation/sample_data_2005_10_12.csv"
).drop(columns=['timestamp', 'year', 'month'])

# Movie metadata, read from the same bucket.
movies = pd.read_csv(
    "s3://ns-data-resources-bucket/netflix-recommendation/movie_titles_clean.csv"
)

In [3]:
# Keep only the most active users and the most rated movies: an id survives
# when its number of ratings is strictly above the given quantile of the
# per-id rating counts.
# NOTE: this cell overwrites `ratings` and `movies`; re-running it on its own
# filters the already-filtered frames again. Re-run the loading cell first.
COUNT_QUANTILE = 0.85

# Both counts are taken on the unfiltered ratings, before any row is removed.
movie_id_count = ratings["movie_id"].value_counts()
user_id_count = ratings["user_id"].value_counts()

# Compute each id set once and reuse it for every filter below.
active_user_ids = user_id_count[user_id_count > user_id_count.quantile(COUNT_QUANTILE)].index
popular_movie_ids = movie_id_count[movie_id_count > movie_id_count.quantile(COUNT_QUANTILE)].index

ratings = ratings[
    ratings['user_id'].isin(active_user_ids)
    & ratings['movie_id'].isin(popular_movie_ids)
]
movies = movies[movies['movie_id'].isin(popular_movie_ids)]


In [4]:
# Persist the filtered movie list; the resulting current_movies.csv file is
# what will be used later to make predictions for the user.
movies.to_csv(path_or_buf='current_movies.csv', index=False)

In [7]:
# DataFrame.info() writes its summary to stdout and returns None, so it must
# not be wrapped in print() (that only adds a stray "None" line).
ratings.info()
movies.info()

<class 'pandas.core.frame.DataFrame'>
Index: 2406468 entries, 0 to 6640933
Data columns (total 3 columns):
 #   Column       Dtype  
---  ------       -----  
 0   user_id      int64  
 1   movie_id     int64  
 2   user_rating  float64
dtypes: float64(1), int64(2)
memory usage: 73.4 MB
None
<class 'pandas.core.frame.DataFrame'>
Index: 799 entries, 29 to 17763
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   movie_id      799 non-null    int64  
 1   release_year  799 non-null    float64
 2   movie_title   799 non-null    object 
dtypes: float64(1), int64(1), object(1)
memory usage: 25.0+ KB
None


In [8]:
# Size of the filtered dataset. The three values are kept as variables
# because they are logged to MLflow together with the model later on.
unique_user_ids = ratings["user_id"].unique()
unique_movie_ids = ratings["movie_id"].unique()
total_ratings = ratings.shape[0]

for label_template, value in (
    ("Number of unique users: {}", len(unique_user_ids)),
    ("Number of unique movies: {}", len(unique_movie_ids)),
    ("Number of total ratings: {}", total_ratings),
):
    print(label_template.format(value))

Number of unique users: 16875
Number of unique movies: 799
Number of total ratings: 2406468


In [9]:
# The reader is configured for ratings on a 1-5 scale.
reader = Reader(rating_scale=(1, 5))
# Surprise expects the columns in (user, item, rating) order.
ratings_df = Dataset.load_from_df(ratings[['user_id', 'movie_id', 'user_rating']], reader)
# 80/20 hold-out split. The seed is fixed so that the test set, and therefore
# the reported RMSE, is the same on every run of the notebook.
data_train, data_test = train_test_split(ratings_df, test_size=0.2, random_state=42)
print("Data loaded.")

Data loaded.


In [10]:
# Select the MLflow experiment that the training run below is recorded under.
print("Setting mlflow experiment...")
mlflow.set_experiment(experiment_name='netflix_recommendation_project')

# The run name carries the current Unix time (whole seconds) as a suffix.
time_stamp = int(time.time())
run_name = f'svd_surprise_{time_stamp}'
class PyFuncSurprise(mlflow.pyfunc.PythonModel):
    """MLflow pyfunc wrapper that scores (user, movie) pairs with a fitted model.

    The wrapped object only needs a ``predict(uid=..., iid=...)`` method whose
    result exposes the estimated rating as ``.est``.
    """

    def __init__(self, model):
        """Store the fitted model that will produce the estimates."""
        self.model = model

    def predict(self, context, inputs):
        """Return ``inputs`` with a ``prediction`` column, best rating first.

        Args:
            context: MLflow model context; not used here.
            inputs: DataFrame with ``user_id`` and ``movie_id`` columns.

        Returns:
            A new DataFrame holding the input rows plus a ``prediction``
            column, sorted by ``prediction`` in descending order. The
            DataFrame passed in is left untouched.
        """
        # Work on a copy: adding a column and re-ordering the rows of the
        # caller's frame would be a surprising side effect of a prediction.
        scored = inputs.copy()
        # Iterate over the two id columns directly rather than row by row, so
        # ids keep their own column dtype regardless of other columns present.
        scored['prediction'] = [
            self.model.predict(uid=user_id, iid=movie_id).est
            for user_id, movie_id in zip(scored['user_id'], scored['movie_id'])
        ]
        return scored.sort_values(by=['prediction'], ascending=False)
    
with mlflow.start_run(run_name=run_name) as run:
    # Grid search parameters for SVD
    # TODO: the grid is deliberately small; a more extensive search is needed.
    param_grid = {
        "n_epochs": [20, 30],
        "n_factors": [100, 200],
    }
    print("Starting training...")
    # NOTE(review): the search cross-validates on the full `ratings_df`, which
    # also contains the rows held out in `data_test`, so the hyper-parameters
    # are selected with knowledge of the test data. Searching on the training
    # portion only would give an unbiased test RMSE.
    gs = GridSearchCV(SVD, param_grid, measures=["rmse"], n_jobs=-2, cv=5, joblib_verbose=2)
    gs.fit(ratings_df)

    # Train the model with the best parameters
    params = gs.best_params["rmse"]

    # Fixed seed so the final fit (and its RMSE) can be reproduced.
    random_state = 42
    model = SVD(**params, random_state=random_state)
    print("Training the model...")
    # Time only the final fit, not the grid search.
    start_time = time.time()
    model.fit(data_train)
    end_time = time.time()
    training_time = end_time - start_time
    print("Training completed.")
    print(f"Total time: {training_time} seconds")

    # Evaluate the model on the held-out split
    print("Evaluating the model...")
    predictions = model.test(data_test)
    # verbose=False: accuracy.rmse would otherwise print the RMSE itself and
    # the value would show up twice in the cell output.
    rmse = accuracy.rmse(predictions, verbose=False)
    print("RMSE: ", rmse)

    # Create the pyfunc model
    pyfunc_model = PyFuncSurprise(model)

    mlflow.log_metric("rmse", rmse)
    mlflow.log_metric("nb_unique_users", len(unique_user_ids))
    mlflow.log_metric("nb_unique_movies", len(unique_movie_ids))
    mlflow.log_metric("total_ratings", total_ratings)
    mlflow.log_metric("training_time", training_time)
    # Kept for compatibility with earlier runs: the whole dict as one param.
    mlflow.log_param("params", params)
    # Also log each hyper-parameter under its own key so that runs can be
    # filtered and compared on individual values.
    mlflow.log_params(params)
    mlflow.log_param("random_state", random_state)
    mlflow.pyfunc.log_model(python_model=pyfunc_model, artifact_path="surprise-model", registered_model_name="svd_surprise")

2023/07/11 15:29:28 INFO mlflow.tracking.fluent: Experiment with name 'netflix_recommendation_project' does not exist. Creating a new experiment.


Setting mlflow experiment...
Starting training...


[Parallel(n_jobs=-2)]: Using backend LokyBackend with 7 concurrent workers.
[Parallel(n_jobs=-2)]: Done  18 out of  20 | elapsed:  1.0min remaining:    6.9s
[Parallel(n_jobs=-2)]: Done  20 out of  20 | elapsed:  1.1min finished


Training the model...
Training completed.
Total time: 16.09574294090271 seconds
Evaluating the model...
RMSE: 0.8263
RMSE:  0.8262569189964807


Successfully registered model 'svd_surprise'.
2023/07/11 15:31:13 INFO mlflow.tracking._model_registry.client: Waiting up to 300 seconds for model version to finish creation. Model name: svd_surprise, version 1
Created version '1' of model 'svd_surprise'.


In [23]:
# Testing the model: load the newest registered version and score a few pairs.
client = mlflow.MlflowClient()
model_name = "svd_surprise"
# `latest_versions` lists the latest version of each stage, so its first
# element is not necessarily the newest version overall; take the highest
# version number instead.
registered_model = client.get_registered_model(name=model_name)
model_version = str(max(int(mv.version) for mv in registered_model.latest_versions))
loaded_model = mlflow.pyfunc.load_model(model_uri=f"models:/{model_name}/{model_version}")
# Make a prediction
test_data = pd.DataFrame([(30878, 14550), (30878, 585), (30878, 2410)], columns=['user_id', 'movie_id'])
loaded_model.predict(test_data)

 - mlflow (current: 2.4.2, required: mlflow==2.4)
To fix the mismatches, call `mlflow.pyfunc.get_model_dependencies(model_uri)` to fetch the model's environment and install dependencies using the resulting environment file.


Unnamed: 0,user_id,movie_id,prediction
0,30878,14550,4.428463
1,30878,585,3.732719
2,30878,2410,3.732719
