In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.decomposition import NMF
from sklearn.metrics import mean_squared_error

In [2]:
# -------------------------------
# Section 1: Load Data & Prepare
# -------------------------------

# Reader options shared by the three MovieLens .dat files: fields are
# delimited by '::', parsed with the python engine, and decoded as latin-1
# so special characters are handled. Column names are supplied explicitly.
dat_options = {'sep': '::', 'engine': 'python', 'encoding': 'latin-1'}

ratings = pd.read_csv(
    'ratings.dat',
    names=['userId', 'movieId', 'rating', 'timestamp'],
    **dat_options
)
movies = pd.read_csv(
    'movies.dat',
    names=['movieId', 'title', 'genres'],
    **dat_options
)
users = pd.read_csv(
    'users.dat',
    names=['userId', 'gender', 'age', 'occupation', 'zip'],
    **dat_options
)

# Hold out 20% of the rating rows for evaluation; the fixed seed keeps the
# split reproducible across runs.
train_data, test_data = train_test_split(ratings, test_size=0.2, random_state=42)

# User x movie rating matrix built from the training rows only.
# Pairs without a training rating come out as NaN and are replaced by 0.
train_matrix = pd.pivot_table(
    train_data,
    values='rating',
    index='userId',
    columns='movieId'
).fillna(0)

# Lookup tables from raw ids to row / column positions in train_matrix.
user_ids = train_matrix.index.values
movie_ids = train_matrix.columns.values
user_id_to_idx = dict(zip(user_ids, range(len(user_ids))))
movie_id_to_idx = dict(zip(movie_ids, range(len(movie_ids))))


In [3]:
# -------------------------------
# Section 2: Fit NMF & Reconstruct
# -------------------------------

# Number of latent factors shared by the user and movie representations.
n_components = 20

nmf_model = NMF(
    n_components=n_components,
    init='nndsvda',   # NNDSVD-based start instead of a purely random one
    max_iter=500,     # iteration cap, raised above scikit-learn's default of 200
    tol=1e-4,         # stopping tolerance; 1e-4 is also scikit-learn's default
    random_state=0,   # fixed seed so the factorization is reproducible
    verbose=1,        # print solver progress while fitting
)

# Factorize train_matrix ~ W * H: fit_transform returns the user-factor
# matrix W, and the fitted model exposes the movie-factor matrix H as
# components_.
W = nmf_model.fit_transform(train_matrix)
H = nmf_model.components_

# Dense matrix of predicted ratings for every (user, movie) cell of
# train_matrix.
reconstructed = W @ H


violation: 1.0
violation: 0.20651913002111752
violation: 0.12194206742023926
violation: 0.08302814578855364
violation: 0.060436729914764016
violation: 0.048865752176420016
violation: 0.04127655545925941
violation: 0.03474207967499024
violation: 0.028956618608798123
violation: 0.024589349010759824
violation: 0.0213822791534582
violation: 0.01904998746197676
violation: 0.017295435998739898
violation: 0.015933351637782485
violation: 0.014847155522897235
violation: 0.013838772211607064
violation: 0.012868422844478181
violation: 0.011893607854451456
violation: 0.010940325451838463
violation: 0.01006984085620822
violation: 0.009354114055294346
violation: 0.008870972557608712
violation: 0.008547968450531008
violation: 0.008376345664007875
violation: 0.008263522758721243
violation: 0.00817722858773413
violation: 0.008069662141585584
violation: 0.007918628179237677
violation: 0.007729104852538392
violation: 0.00749528578605573
violation: 0.007244741413993416
violation: 0.006975122165177568
viol

In [4]:

# -------------------------------
# Section 3: Evaluate on Test
# -------------------------------

# Only (user, movie) pairs whose user AND movie both occur in the training
# matrix can be scored: anything else has no row/column in `reconstructed`.
# Membership is tested against the keys of the id -> index dicts.
mask = (
    test_data['userId'].isin(user_id_to_idx) &
    test_data['movieId'].isin(movie_id_to_idx)
)
test_filtered = test_data[mask]

# Translate ids to matrix positions for the whole column at once instead of
# looping row-by-row with iterrows(). After filtering, every id has a
# mapping, so the results can be cast to integer indices safely.
row_idx = test_filtered['userId'].map(user_id_to_idx).to_numpy(dtype=np.intp)
col_idx = test_filtered['movieId'].map(movie_id_to_idx).to_numpy(dtype=np.intp)

# Paired integer-array indexing picks reconstructed[row_idx[k], col_idx[k]]
# for each test rating k, in the same order as test_filtered.
y_true = test_filtered['rating'].to_numpy()
y_pred = reconstructed[row_idx, col_idx]

rmse = np.sqrt(mean_squared_error(y_true, y_pred))
print(f"Test RMSE: {rmse:.4f}")


Test RMSE: 2.7408


# Section 1: Load Data, Fit NMF & Compute RMSE

1. **Load the MovieLens data**  
   - Read `ratings.dat`, `movies.dat`, `users.dat` (with `encoding='latin-1'`) into DataFrames.

2. **Train/Test split**  
   - Split ratings into 80% train / 20% test.

3. **Build user–item matrix**  
   - Pivot `train_data` into a matrix of shape (n_users×n_movies), filling missing with 0.

4. **Fit non-negative matrix factorization**  
   - Use `sklearn.decomposition.NMF` with `n_components=20` (or tuned value), appropriate `init`, `tol`, and `max_iter`.

5. **Reconstruct full rating matrix**  
   - Compute `reconstructed = W.dot(H)` to get predicted ratings for every pair.

6. **Evaluate on the held-out test set**  
   - Look up each `(userId, movieId)` in `reconstructed` and compute  
     ```
     RMSE = sqrt(mean_squared_error(y_true, y_pred))
     ```

---

**RMSE analysis:**  
> An RMSE of **2.7408** on a 1–5 scale means our predictions are typically off by nearly **3 stars** (in the root-mean-square sense)—indicating that zero-imputation and the raw NMF model without biases are not capturing user/movie effects well.


# Section 2: Discussion of Results & Fixes

# 1. Why RMSE ≈ 2.74 is so poor

## Zero-imputation bias
We treated every missing rating as 0. On a 1–5 scale, that drags all latent factors toward predicting very low scores—so almost every prediction is way off.

## No user/item bias modeling
Some users habitually give high ratings; some movies are almost always rated generously. Our NMF model only learns the interaction term $w_u^\top h_i$; it never learns a global mean $\mu$ or per-user/per-item biases $b_u$, $b_i$.

## Sparse data & uniform weighting
MovieLens is extremely sparse: most user–item cells are missing. By filling them with zeros and giving those cells the same weight as observed ratings, we overwhelm the signal from the true ratings.

## Default hyperparameters
We chose 20 components and default regularization. Those may be far from optimal: too many factors, too little penalty, too small a convergence tolerance, etc.

# 2. Limitations of `sklearn.decomposition.NMF` for recommender systems
## Cannot natively handle missing values
You must impute (e.g. with zeros) before fitting, which biases the solution.

## No built-in bias terms
It only learns the non-negative factors $W$ and $H$, not global/user/item offsets.

## Single‐objective fit
Minimizes reconstruction error over all entries (including imputed ones), not just the observed ratings.

## Sensitivity to initialization & hyperparameters
Initialization strategy (random vs. `nndsvd*`), `n_components`, regularization strength (`alpha_W`/`alpha_H`; `alpha` in older scikit-learn releases), `tol`, and `max_iter` all dramatically affect convergence and fit quality.

# 3. Concrete suggestions to fix/improve

## Smarter imputation
Pre-fill missing entries with the item average or global mean rather than zero; alternatively, use an iterative imputation scheme that only “fills in” during optimization.

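## Incorporate bias terms
Predict $\hat{r}_{ui} = \mu + b_u + b_i + w_u^\top h_i$, where $\mu$ is the global mean rating and $b_u$, $b_i$ are user and item offsets, so the latent factors only have to explain the residual interaction.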

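## Hyperparameter tuning
Search over `n_components`, the regularization strength, and `max_iter`/`tol` using a validation split (or cross-validation) instead of relying on a single hand-picked setting.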

## Use a dedicated recommender library
Libraries like Surprise (for SVD/ALS with biases) or implicit (for confidence-weighted factorization) handle missing data, biases, and regularization more gracefully.

## Baseline & similarity hybrids
Compare against or even combine with user-based or item-based KNN (cosine or Pearson). Sometimes a simple KNN baseline outperforms raw NMF on sparse, skewed datasets.
