# SVD vs ALS (NMF) — Recommendation System Comparison

## 1. Import Libraries

In [1]:
import pandas as pd
import json
import os
import numpy as np
from collections import defaultdict

from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split
from scipy.sparse import csr_matrix

from surprise import SVD, NMF, Dataset, Reader, accuracy, Trainset
from surprise.model_selection import train_test_split as surprise_split

---
## 2. Load & Explore Data

In [2]:
def load_json_lines(filepath):
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return pd.DataFrame(records)

FILE_PATH = 'reco_dataset.json'

df = load_json_lines(FILE_PATH)
print(f"Loaded {len(df):,} reviews")
print(f"Shape: {df.shape}")

Loaded 551,682 reviews
Shape: (551682, 9)


In [3]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,APYOBQE6M18AA,615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013"
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014"
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013"
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""","[14, 18]",This book is a must have if you get a Zoku (wh...,5.0,Creative Combos,1312416000,"08 4, 2011"
4,AHAI85T5C2DH3,615391206,PugLover,"[0, 0]",This cookbook is great. I have really enjoyed...,4.0,A must own if you own the Zoku maker...,1402099200,"06 7, 2014"


In [4]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 551682 entries, 0 to 551681
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewerID      551682 non-null  str    
 1   asin            551682 non-null  str    
 2   reviewerName    546729 non-null  str    
 3   helpful         551682 non-null  object 
 4   reviewText      551682 non-null  str    
 5   overall         551682 non-null  float64
 6   summary         551682 non-null  str    
 7   unixReviewTime  551682 non-null  int64  
 8   reviewTime      551682 non-null  str    
dtypes: float64(1), int64(1), object(1), str(6)
memory usage: 37.9+ MB


In [5]:
df['overall'].describe()

count    551682.000000
mean          4.316655
std           1.110749
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: overall, dtype: float64

---
## 3. Data Cleaning

### 3.1 Select Relevant Columns

In [6]:
df_cf = df[['reviewerID', 'asin', 'overall']].copy()

print(f"Users  : {df_cf['reviewerID'].nunique():,}")
print(f"Items  : {df_cf['asin'].nunique():,}")
print(f"Ratings: {len(df_cf):,}")

Users  : 66,519
Items  : 28,237
Ratings: 551,682


### 3.2 Check for Missing Values

In [7]:
print("Missing values per column:")
print(df_cf.isnull().sum())

Missing values per column:
reviewerID    0
asin          0
overall       0
dtype: int64


### 3.3 Remove Duplicates

In [8]:
print(f"Before dedup: {len(df_cf):,} rows")

df_cf = df_cf.drop_duplicates(subset=['reviewerID', 'asin'], keep='last')

print(f"After dedup : {len(df_cf):,} rows")

Before dedup: 551,682 rows
After dedup : 551,682 rows


### 3.4 Drop Missing Values

In [9]:
df_cf = df_cf.dropna(subset=['reviewerID', 'asin', 'overall'])

print(f"After dropping nulls: {len(df_cf):,} rows")


After dropping nulls: 551,682 rows


### 3.5 Validate Rating Range

In [10]:
print("Rating distribution before filter:")
print(df_cf['overall'].value_counts().sort_index())

# Keep only valid ratings
df_cf = df_cf[df_cf['overall'].between(1.0, 5.0)]

print(f"\nAfter rating filter: {len(df_cf):,} rows")

Rating distribution before filter:
overall
1.0     27106
2.0     24313
3.0     45059
4.0    105508
5.0    349696
Name: count, dtype: int64

After rating filter: 551,682 rows


### 3.6 Filter Cold-Start Users & Items

This is the most important cleaning step for matrix factorization. 
Users with only 1 or 2 ratings give the model nothing to learn from.

In [11]:
MIN_USER_RATINGS = 5   # user must have rated at least 5 items
MIN_ITEM_RATINGS = 5   # item must have been rated at least 5 times

# Filter users
user_counts = df_cf['reviewerID'].value_counts()
valid_users = user_counts[user_counts >= MIN_USER_RATINGS].index
df_cf = df_cf[df_cf['reviewerID'].isin(valid_users)]

# Filter items
item_counts = df_cf['asin'].value_counts()
valid_items = item_counts[item_counts >= MIN_ITEM_RATINGS].index
df_cf = df_cf[df_cf['asin'].isin(valid_items)]

print(f"After cold start filter:")
print(f"  Users  : {df_cf['reviewerID'].nunique():,}")
print(f"  Items  : {df_cf['asin'].nunique():,}")
print(f"  Ratings: {len(df_cf):,}")

After cold start filter:
  Users  : 66,519
  Items  : 28,237
  Ratings: 551,682


### 3.7 Final Sanity Check

In [12]:
print("=== Final Clean Dataset ===")
print(f"Shape       : {df_cf.shape}")
print(f"Users       : {df_cf['reviewerID'].nunique():,}")
print(f"Items       : {df_cf['asin'].nunique():,}")
print(f"Ratings     : {len(df_cf):,}")
print(f"Missing vals: {df_cf.isnull().sum().sum()}")
print(f"Duplicates  : {df_cf.duplicated().sum()}")
print("\nRating distribution:")
print(df_cf['overall'].value_counts().sort_index())

df_cf.head()

=== Final Clean Dataset ===
Shape       : (551682, 3)
Users       : 66,519
Items       : 28,237
Ratings     : 551,682
Missing vals: 0
Duplicates  : 0

Rating distribution:
overall
1.0     27106
2.0     24313
3.0     45059
4.0    105508
5.0    349696
Name: count, dtype: int64


Unnamed: 0,reviewerID,asin,overall
0,APYOBQE6M18AA,615391206,5.0
1,A1JVQTAGHYOL7F,615391206,5.0
2,A3UPYGJKZ0XTU4,615391206,4.0
3,A2MHCTX43MIMDZ,615391206,5.0
4,AHAI85T5C2DH3,615391206,4.0


---
## 4. Feature Engineering

### 4.1 Encode User & Item IDs to Integers

In [13]:
user_enc = LabelEncoder()
item_enc = LabelEncoder()

df_cf['user_id'] = user_enc.fit_transform(df_cf['reviewerID'])
df_cf['item_id'] = item_enc.fit_transform(df_cf['asin'])

n_users = df_cf['user_id'].nunique()
n_items = df_cf['item_id'].nunique()

print(f"Number of users : {n_users:,}")
print(f"Number of items : {n_items:,}")
print(f"\nSample encoding:")
df_cf[['reviewerID', 'user_id', 'asin', 'item_id', 'overall']].head()

Number of users : 66,519
Number of items : 28,237

Sample encoding:


Unnamed: 0,reviewerID,user_id,asin,item_id,overall
0,APYOBQE6M18AA,61571,615391206,0,5.0
1,A1JVQTAGHYOL7F,9619,615391206,0,5.0
2,A3UPYGJKZ0XTU4,50097,615391206,0,4.0
3,A2MHCTX43MIMDZ,28494,615391206,0,5.0
4,AHAI85T5C2DH3,57379,615391206,0,4.0


### 4.2 Build the Sparse User-Item Matrix

In [14]:
sparse_matrix = csr_matrix(
    (df_cf['overall'].astype(float),
     (df_cf['user_id'], df_cf['item_id'])),
    shape=(n_users, n_items)
)

# Check sparsity
total_cells = n_users * n_items
filled_cells = len(df_cf)
sparsity = 1 - (filled_cells / total_cells)

print(f"Matrix shape : {sparse_matrix.shape}")
print(f"Filled cells : {filled_cells:,}")
print(f"Total cells  : {total_cells:,}")
print(f"Sparsity     : {sparsity:.4%}")

Matrix shape : (66519, 28237)
Filled cells : 551,682
Total cells  : 1,878,297,003
Sparsity     : 99.9706%


### 4.3 Save Lookup Dictionaries

In [15]:
# Map integer → original ID
user_id_to_reviewer = {i: label for i, label in enumerate(user_enc.classes_)}
item_id_to_asin     = {i: label for i, label in enumerate(item_enc.classes_)}

# Map original ID → integer
reviewer_to_user_id = {v: k for k, v in user_id_to_reviewer.items()}
asin_to_item_id     = {v: k for k, v in item_id_to_asin.items()}

print(f"Sample user mapping: {list(user_id_to_reviewer.items())[:3]}")
print(f"Sample item mapping: {list(item_id_to_asin.items())[:3]}")

Sample user mapping: [(0, 'A0002382258OFJJ2UYNTR'), (1, 'A0010876CNE3ILIM9HV0'), (2, 'A00473363TJ8YSZ3YAGG9')]
Sample item mapping: [(0, '0615391206'), (1, '0689027818'), (2, '0912696591')]


### 4.4 Feature Engineering Summary

In [16]:
print("=== Feature Engineering Summary ===")
print(f"df_cf columns     : {list(df_cf.columns)}")
print(f"Sparse matrix type: {type(sparse_matrix)}")
print(f"Matrix shape      : {sparse_matrix.shape}")
print(f"Stored values     : {sparse_matrix.nnz:,}")
df_cf.head()

=== Feature Engineering Summary ===
df_cf columns     : ['reviewerID', 'asin', 'overall', 'user_id', 'item_id']
Sparse matrix type: <class 'scipy.sparse._csr.csr_matrix'>
Matrix shape      : (66519, 28237)
Stored values     : 551,682


Unnamed: 0,reviewerID,asin,overall,user_id,item_id
0,APYOBQE6M18AA,615391206,5.0,61571,0
1,A1JVQTAGHYOL7F,615391206,5.0,9619,0
2,A3UPYGJKZ0XTU4,615391206,4.0,50097,0
3,A2MHCTX43MIMDZ,615391206,5.0,28494,0
4,AHAI85T5C2DH3,615391206,4.0,57379,0


---
## 5. Shared Train/Test Split

Both models (SVD and NMF) will be evaluated on the **exact same** 
train/test split for a fair comparison.

### 5.1 Sklearn Split (for sparse matrices)

In [17]:
train_data, test_data = train_test_split(
    df_cf,
    test_size=0.2,
    random_state=42,
    stratify=df_cf['user_id']
)

print(f"Train size : {len(train_data):,} ratings")
print(f"Test size  : {len(test_data):,} ratings")
print(f"Train users: {train_data['user_id'].nunique():,}")
print(f"Test users : {test_data['user_id'].nunique():,}")

Train size : 441,345 ratings
Test size  : 110,337 ratings
Train users: 66,519
Test users : 66,519


### 5.2 Build Sparse Matrices

In [18]:
train_matrix = csr_matrix(
    (train_data['overall'].astype(float),
     (train_data['user_id'], train_data['item_id'])),
    shape=(n_users, n_items)
)

test_matrix = csr_matrix(
    (test_data['overall'].astype(float),
     (test_data['user_id'], test_data['item_id'])),
    shape=(n_users, n_items)
)

print(f"Train matrix shape : {train_matrix.shape}")
print(f"Train matrix nnz   : {train_matrix.nnz:,}")
print(f"Test matrix shape  : {test_matrix.shape}")
print(f"Test matrix nnz    : {test_matrix.nnz:,}")

Train matrix shape : (66519, 28237)
Train matrix nnz   : 441,345
Test matrix shape  : (66519, 28237)
Test matrix nnz    : 110,337


### 5.3 Sanity Check — No Overlap

In [19]:
train_pairs = set(zip(train_data['user_id'], train_data['item_id']))
test_pairs  = set(zip(test_data['user_id'],  test_data['item_id']))

overlap = train_pairs & test_pairs
print(f"Overlapping user-item pairs: {len(overlap)}")

Overlapping user-item pairs: 0


### 5.4 Convert to Surprise Format

Both SVD and NMF (from the Surprise library) need data in Surprise's internal format. 
We convert the **same** train/test split so both models see identical data.

In [20]:
reader = Reader(rating_scale=(1, 5))

# Build Surprise trainset from our train_data
surprise_full = Dataset.load_from_df(
    df_cf[['reviewerID', 'asin', 'overall']], reader
)

# Build trainset using the same indices
train_surprise = Dataset.load_from_df(
    train_data[['reviewerID', 'asin', 'overall']], reader
)
trainset = train_surprise.build_full_trainset()

# Build testset from test_data (list of tuples)
testset = list(test_data[['reviewerID', 'asin', 'overall']].itertuples(index=False, name=None))

print(f"Surprise trainset: {trainset.n_ratings:,} ratings, {trainset.n_users:,} users, {trainset.n_items:,} items")
print(f"Surprise testset : {len(testset):,} ratings")

Surprise trainset: 441,345 ratings, 66,519 users, 28,237 items
Surprise testset : 110,337 ratings


---
## 6. Evaluation Helpers

In [21]:
def precision_recall_at_k(predictions, k=10, threshold=4.0):
    """Compute Precision@K and Recall@K from Surprise predictions."""
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))

    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_relevant = sum(1 for (_, true_r) in user_ratings if true_r >= threshold)
        n_hits     = sum(1 for (_, true_r) in user_ratings[:k] if true_r >= threshold)
        precisions[uid] = n_hits / k
        recalls[uid]    = n_hits / n_relevant if n_relevant > 0 else 0

    avg_precision = sum(precisions.values()) / len(precisions)
    avg_recall    = sum(recalls.values()) / len(recalls)
    return avg_precision, avg_recall

print("✅ Evaluation helper defined.")

✅ Evaluation helper defined.


---
## 7. Model 1 — SVD (Explicit Rating Prediction)

SVD (Singular Value Decomposition) from the Surprise library predicts explicit ratings.
It learns user and item latent factor vectors directly from 1–5 star ratings.

In [22]:
model_svd = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.1,
    random_state=42
)

print("Training SVD model...")
model_svd.fit(trainset)
print("✅ SVD model trained successfully!")

Training SVD model...
✅ SVD model trained successfully!


### 7.1 SVD Evaluation

In [23]:
# Predict on the test set
svd_predictions = model_svd.test(testset)

# RMSE & MAE
svd_rmse = accuracy.rmse(svd_predictions, verbose=True)
svd_mae  = accuracy.mae(svd_predictions, verbose=True)

# Precision@10 & Recall@10
svd_precision, svd_recall = precision_recall_at_k(svd_predictions, k=10, threshold=4.0)

print(f"\nSVD Precision@10 : {svd_precision:.4f} ({svd_precision*100:.2f}%)")
print(f"SVD Recall@10    : {svd_recall:.4f} ({svd_recall*100:.2f}%)")

RMSE: 1.0283
MAE:  0.7568

SVD Precision@10 : 0.1352 (13.52%)
SVD Recall@10    : 0.8693 (86.93%)


---
## 8. Model 2 — NMF (ALS-Style Matrix Factorization)

NMF (Non-negative Matrix Factorization) from Surprise is an ALS-style algorithm.
Like ALS, it decomposes the user-item matrix into non-negative latent factors.
It uses the **same** train/test split as SVD for a fair comparison.

In [24]:
model_nmf = NMF(
    n_factors=50,
    n_epochs=20,
    reg_pu=0.1,       # user factor regularization
    reg_qi=0.1,       # item factor regularization
    random_state=42
)

print("Training NMF (ALS-style) model...")
model_nmf.fit(trainset)
print("✅ NMF model trained successfully!")

Training NMF (ALS-style) model...
✅ NMF model trained successfully!


### 8.1 NMF Evaluation

In [25]:
# Predict on the test set
nmf_predictions = model_nmf.test(testset)

# RMSE & MAE
nmf_rmse = accuracy.rmse(nmf_predictions, verbose=True)
nmf_mae  = accuracy.mae(nmf_predictions, verbose=True)

# Precision@10 & Recall@10
nmf_precision, nmf_recall = precision_recall_at_k(nmf_predictions, k=10, threshold=4.0)

print(f"\nNMF Precision@10 : {nmf_precision:.4f} ({nmf_precision*100:.2f}%)")
print(f"NMF Recall@10    : {nmf_recall:.4f} ({nmf_recall*100:.2f}%)")

RMSE: 1.1871
MAE:  0.6526

NMF Precision@10 : 0.1351 (13.51%)
NMF Recall@10    : 0.8692 (86.92%)


---
## 9. Model Comparison — SVD vs NMF (ALS)

In [26]:
print("=" * 55)
print("   ALGORITHM COMPARISON ON SAME DATASET")
print("=" * 55)
print(f"{'Metric':<20} {'SVD':>12} {'NMF (ALS)':>12}")
print("-" * 55)
print(f"{'RMSE':<20} {svd_rmse:>12.4f} {nmf_rmse:>12.4f}")
print(f"{'MAE':<20} {svd_mae:>12.4f} {nmf_mae:>12.4f}")
print(f"{'Precision@10':<20} {svd_precision:>12.4f} {nmf_precision:>12.4f}")
print(f"{'Recall@10':<20} {svd_recall:>12.4f} {nmf_recall:>12.4f}")
print("=" * 55)
print()
print("NOTES:")
print("  Lower RMSE/MAE  = better rating prediction accuracy")
print("  Higher Prec/Rec = better at ranking relevant items")
print("  Both models trained on the SAME data split for fair comparison")

   ALGORITHM COMPARISON ON SAME DATASET
Metric                        SVD    NMF (ALS)
-------------------------------------------------------
RMSE                       1.0283       1.1871
MAE                        0.7568       0.6526
Precision@10               0.1352       0.1351
Recall@10                  0.8693       0.8692

NOTES:
  Lower RMSE/MAE  = better rating prediction accuracy
  Higher Prec/Rec = better at ranking relevant items
  Both models trained on the SAME data split for fair comparison
