# Step 1. Introduction

## 1.1 Problem Statement

Streaming platforms contain thousands of movies, making it difficult for users to decide what to watch. A recommender system helps predict which movies a user may like based on historical ratings.

## 1.2 Objective

To build a movie recommendation system using Non-negative Matrix Factorization (NMF) based on user-movie rating data.

## 1.3 Dataset

Netflix Movie Rating Dataset (from the Netflix Prize competition)

Movies file:
- Movie_ID
- Year
- Name

Ratings file:
- User_ID
- Rating (1-5)
- Movie_ID

# Step 2. Import Libraries

In [1]:
import os
import numpy as np
import pandas as pd
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import ndcg_score

# Step 3. Download Dataset

In [2]:
import kagglehub

In [3]:
# Download the Netflix rating dataset from Kaggle and print the local directory
# that kagglehub reports for it.
dataset_handle = "rishitjavia/netflix-movie-rating-dataset"
path = kagglehub.dataset_download(dataset_handle)
print("Path dataset:", path)

Downloading from https://www.kaggle.com/api/v1/datasets/download/rishitjavia/netflix-movie-rating-dataset?dataset_version_number=1...


100%|██████████| 74.7M/74.7M [00:00<00:00, 80.9MB/s]

Extracting files...





Path dataset: /root/.cache/kagglehub/datasets/rishitjavia/netflix-movie-rating-dataset/versions/1


# Step 4. Load CSV Files

## 4.1 List all CSV files in the downloaded folder

In [4]:
# Names of the CSV files sitting directly inside the download directory.
files = [entry for entry in os.listdir(path) if entry.endswith(".csv")]

# Stop early when the download contains no CSV file at all.
if len(files) == 0:
    raise FileNotFoundError("File is not found.")

print("File is found:", files)

File is found: ['Netflix_Dataset_Rating.csv', 'Netflix_Dataset_Movie.csv']


## 4.2 Load ratings and movies

In [5]:
# Resolve both CSV files relative to the download directory.
ratings_path, movies_path = (
    os.path.join(path, csv_name)
    for csv_name in ("Netflix_Dataset_Rating.csv", "Netflix_Dataset_Movie.csv")
)

# Read each file into its own DataFrame.
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# Preview the first rows of the ratings table.
ratings.head()

Unnamed: 0,User_ID,Rating,Movie_ID
0,712664,5,3
1,1331154,4,3
2,2632461,3,3
3,44937,5,3
4,656399,4,3


# Step 5. Explore the Dataset

## 5.1 Check data structure

In [6]:
# Print the column dtypes and memory footprint of the ratings table, then
# display per-column summary statistics as the cell output.
ratings.info()
ratings.describe()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17337458 entries, 0 to 17337457
Data columns (total 3 columns):
 #   Column    Dtype
---  ------    -----
 0   User_ID   int64
 1   Rating    int64
 2   Movie_ID  int64
dtypes: int64(3)
memory usage: 396.8 MB


Unnamed: 0,User_ID,Rating,Movie_ID
count,17337460.0,17337460.0,17337460.0
mean,1321158.0,3.59057,2302.783
std,764692.3,1.062665,1303.458
min,6.0,1.0,3.0
25%,660228.0,3.0,1176.0
50%,1316775.0,4.0,2342.0
75%,1983213.0,4.0,3433.0
max,2649429.0,5.0,4496.0


## 5.2 Count unique users and movies

In [7]:
# Count the distinct users and distinct movies that appear in the ratings table.
n_users = ratings['User_ID'].nunique()
n_movies = ratings['Movie_ID'].nunique()

print("Number of users:", n_users)
print("Number of movies:", n_movies)

Number of users: 143458
Number of movies: 1350


# Step 6. Create the User-Item Rating Matrix

## 6.1 Convert the rating table into a matrix

In [8]:
# Reshape the long ratings table into a user x movie matrix: one row per
# User_ID, one column per Movie_ID, cell = that user's Rating. Pairs with no
# rating become NaN; no aggfunc is passed, so pandas' default aggregation applies.
rating_matrix = pd.pivot_table(
    ratings,
    values='Rating',
    index='User_ID',
    columns='Movie_ID',
)

## 6.2 Handle missing values (required for NMF)

In [9]:
# Replace every NaN (unrated user/movie pair) with 0 so the matrix can be passed to NMF.
# NOTE(review): the model fitted on this matrix has no way to tell these zeros
# apart from real ratings, so unrated cells act as ratings of 0 during training —
# confirm this is intended.
rating_matrix_filled = rating_matrix.fillna(0)

# Step 7. Train the NMF Model

In [10]:
# Factorisation settings: 20 latent components, random initialisation with a
# fixed seed, and at most 200 solver iterations.
nmf_params = {
    'n_components': 20,
    'init': 'random',
    'random_state': 42,
    'max_iter': 200,
}
nmf_model = NMF(**nmf_params)

# Fit on the zero-filled matrix. W is the transformed input (one row of latent
# features per user); H is the fitted component matrix (latent features per movie).
W = nmf_model.fit_transform(rating_matrix_filled)
H = nmf_model.components_



Where:
- W = user latent features
- H = movie latent features

# Step 8. Predict Ratings and Evaluate Model

## 8.1 Reconstruct the rating matrix

In [11]:
# Reconstruct the full user x movie score matrix as the product of the user
# factors (W) and the movie factors (H).
R_pred = W @ H

## 8.2 Evaluate using RMSE & MAE

In [12]:
# Compare the reconstruction with the ratings users actually gave. The model
# was fitted on a matrix derived from these same ratings, so both numbers are
# in-sample errors.
R_true = rating_matrix.values
R_hat = R_pred

# True where a rating exists; NaN cells (unrated movies) are left out of the metrics.
mask = ~np.isnan(R_true)
observed_true = R_true[mask]
observed_pred = R_hat[mask]

rmse = np.sqrt(mean_squared_error(observed_true, observed_pred))
mae = mean_absolute_error(observed_true, observed_pred)

print("RMSE:", rmse)
print("MAE :", mae)

RMSE: 2.346811602066299
MAE : 2.0185851482351165


## 8.3 Evaluate using NDCG@K

In [13]:
def ndcg_at_k(rating_matrix, R_pred, k=10):
    """Mean NDCG@k over users, ranking only the movies each user actually rated.

    Parameters
    ----------
    rating_matrix : pandas.DataFrame
        One row per user and one column per movie; NaN marks an unrated movie.
    R_pred : array-like
        Predicted scores. Row ``i`` must correspond to row ``i`` of
        ``rating_matrix`` and the columns must be in the same order.
    k : int, default 10
        Rank cutoff forwarded to ``sklearn.metrics.ndcg_score``.

    Returns
    -------
    float
        Average of the per-user NDCG@k values, or NaN when no user has at
        least two rated movies.
    """
    # Convert once up front instead of doing a label-based row lookup per user.
    true_matrix = rating_matrix.to_numpy()
    pred_matrix = np.asarray(R_pred)

    ndcg_scores = []
    for i, true_ratings in enumerate(true_matrix):
        pred_ratings = pred_matrix[i]

        mask = ~np.isnan(true_ratings)
        # A ranking needs at least two items: users with zero or one rated
        # movie carry no ordering information, and recent scikit-learn
        # releases reject single-document input to ndcg_score.
        if mask.sum() < 2:
            continue

        ndcg = ndcg_score(
            y_true=[true_ratings[mask]],
            y_score=[pred_ratings[mask]],
            k=k
        )
        ndcg_scores.append(ndcg)

    # Avoid numpy's "mean of empty slice" warning when nobody qualified.
    if not ndcg_scores:
        return float("nan")

    return np.mean(ndcg_scores)

# Score the reconstructed matrix against the observed ratings with a rank cutoff of 10.
ndcg_10 = ndcg_at_k(rating_matrix, R_pred, k=10)
print("NDCG@10:", ndcg_10)

NDCG@10: 0.8726368846704292


# Step 9. Build the Recommendation Function

In [14]:
def recommend_movies(user_id, n_recommendations=5):
    """Return the highest-predicted movies that a user has not rated yet.

    Reads the notebook-level ``rating_matrix``, ``R_pred`` and ``movies`` objects.

    Parameters
    ----------
    user_id
        A label present in ``rating_matrix.index``.
    n_recommendations : int, default 5
        Maximum number of movies to return.

    Returns
    -------
    pandas.DataFrame
        One row per recommended movie, highest predicted rating first, with
        columns ``Movie_ID`` and ``predicted_rating`` followed by the remaining
        columns of ``movies``.

    Raises
    ------
    ValueError
        If ``user_id`` is not in ``rating_matrix.index``.
    """
    if user_id not in rating_matrix.index:
        raise ValueError("User ID not found.")

    # Row position of the user, used to pick the matching row of R_pred.
    user_index = rating_matrix.index.get_loc(user_id)
    user_predictions = R_pred[user_index]

    recommendations = pd.DataFrame({
        'Movie_ID': rating_matrix.columns,
        'predicted_rating': user_predictions
    }).set_index('Movie_ID')

    # Keep only movies the user has not rated (NaN in the rating matrix).
    not_yet_rated = rating_matrix.loc[user_id].isna()
    recommendations = recommendations[not_yet_rated]

    top_recommendations = recommendations.sort_values(
        by='predicted_rating',
        ascending=False
    ).head(n_recommendations)

    # Left join: a recommended movie with no row in `movies` is kept (with
    # missing metadata) instead of being silently dropped, so the result does
    # not shrink below the requested size because of gaps in the movies table.
    return top_recommendations.reset_index().merge(
        movies,
        on='Movie_ID',
        how='left'
    )

# Step 10. Generate Recommendation

In [15]:
# Show the five highest-predicted movies that user 6 has not rated yet.
recommend_movies(user_id=6, n_recommendations=5)

Unnamed: 0,Movie_ID,predicted_rating,Year,Name
0,571,2.671499,1999,American Beauty
1,798,2.588083,1975,Jaws
2,3282,2.060839,2004,Sideways
3,2430,2.001406,1979,Alien: Collector's Edition
4,1470,1.937342,2002,Bend It Like Beckham
