 # Import Libraries

In [1]:
import pandas as pd
import json
import os

# Load the Data Efficiently

In [2]:
def load_json_lines(filepath):
    records = []
    with open(filepath, 'r', encoding='utf-8') as f:
        for line in f:
            line = line.strip()
            if line: 
                records.append(json.loads(line))
    return pd.DataFrame(records)

FILE_PATH = 'reco_dataset.json'

df = load_json_lines(FILE_PATH)
print(f"Loaded {len(df):,} reviews")
print(f"Shape: {df.shape}")

Loaded 551,682 reviews
Shape: (551682, 9)


# Checking the data

In [3]:
df.head()

Unnamed: 0,reviewerID,asin,reviewerName,helpful,reviewText,overall,summary,unixReviewTime,reviewTime
0,APYOBQE6M18AA,615391206,Martin Schwartz,"[0, 0]",My daughter wanted this book and the price on ...,5.0,Best Price,1382140800,"10 19, 2013"
1,A1JVQTAGHYOL7F,615391206,Michelle Dinh,"[0, 0]",I bought this zoku quick pop for my daughterr ...,5.0,zoku,1403049600,"06 18, 2014"
2,A3UPYGJKZ0XTU4,615391206,mirasreviews,"[26, 27]",There is no shortage of pop recipes available ...,4.0,"Excels at Sweet Dessert Pops, but Falls Short ...",1367712000,"05 5, 2013"
3,A2MHCTX43MIMDZ,615391206,"M. Johnson ""Tea Lover""","[14, 18]",This book is a must have if you get a Zoku (wh...,5.0,Creative Combos,1312416000,"08 4, 2011"
4,AHAI85T5C2DH3,615391206,PugLover,"[0, 0]",This cookbook is great. I have really enjoyed...,4.0,A must own if you own the Zoku maker...,1402099200,"06 7, 2014"


In [4]:
df.info()

<class 'pandas.DataFrame'>
RangeIndex: 551682 entries, 0 to 551681
Data columns (total 9 columns):
 #   Column          Non-Null Count   Dtype  
---  ------          --------------   -----  
 0   reviewerID      551682 non-null  str    
 1   asin            551682 non-null  str    
 2   reviewerName    546729 non-null  str    
 3   helpful         551682 non-null  object 
 4   reviewText      551682 non-null  str    
 5   overall         551682 non-null  float64
 6   summary         551682 non-null  str    
 7   unixReviewTime  551682 non-null  int64  
 8   reviewTime      551682 non-null  str    
dtypes: float64(1), int64(1), object(1), str(6)
memory usage: 37.9+ MB


In [5]:
df['overall'].describe()

count    551682.000000
mean          4.316655
std           1.110749
min           1.000000
25%           4.000000
50%           5.000000
75%           5.000000
max           5.000000
Name: overall, dtype: float64

# Preparing Data

In [6]:
df_cf = df[['reviewerID', 'asin', 'overall']].copy()

In [7]:
print(f"Users  : {df_cf['reviewerID'].nunique():,}")
print(f"Items  : {df_cf['asin'].nunique():,}")
print(f"Ratings: {len(df_cf):,}")

Users  : 66,519
Items  : 28,237
Ratings: 551,682


## Check for Missing Values

In [8]:
print("Missing values per column:")
print(df_cf.isnull().sum())

Missing values per column:
reviewerID    0
asin          0
overall       0
dtype: int64


In [9]:
df_cf.head()

Unnamed: 0,reviewerID,asin,overall
0,APYOBQE6M18AA,615391206,5.0
1,A1JVQTAGHYOL7F,615391206,5.0
2,A3UPYGJKZ0XTU4,615391206,4.0
3,A2MHCTX43MIMDZ,615391206,5.0
4,AHAI85T5C2DH3,615391206,4.0


## Clean the data

### Check for Duplicates

In [10]:
print(f"Before dedup: {len(df_cf):,} rows")

df_cf = df_cf.drop_duplicates(subset=['reviewerID', 'asin'], keep='last')

print(f"After dedup : {len(df_cf):,} rows")

Before dedup: 551,682 rows
After dedup : 551,682 rows


### Drop Missing Values

In [11]:
df_cf = df_cf.dropna(subset=['reviewerID', 'asin', 'overall'])

print(f"After dropping nulls: {len(df_cf):,} rows")


After dropping nulls: 551,682 rows


### Validate the Rating Range

In [12]:
print("Rating distribution before filter:")
print(df_cf['overall'].value_counts().sort_index())

# Keep only valid ratings
df_cf = df_cf[df_cf['overall'].between(1.0, 5.0)]

print(f"\nAfter rating filter: {len(df_cf):,} rows")

Rating distribution before filter:
overall
1.0     27106
2.0     24313
3.0     45059
4.0    105508
5.0    349696
Name: count, dtype: int64

After rating filter: 551,682 rows


###  Filter Cold Start Users & Items 

This is the most important cleaning step for ALS. Users with only 1 or 2 ratings give the model nothing to learn from

In [13]:
MIN_USER_RATINGS = 5   # user must have rated at least 5 items
MIN_ITEM_RATINGS = 5   # item must have been rated at least 5 times

# Filter users
user_counts = df_cf['reviewerID'].value_counts()
valid_users = user_counts[user_counts >= MIN_USER_RATINGS].index
df_cf = df_cf[df_cf['reviewerID'].isin(valid_users)]

# Filter items
item_counts = df_cf['asin'].value_counts()
valid_items = item_counts[item_counts >= MIN_ITEM_RATINGS].index
df_cf = df_cf[df_cf['asin'].isin(valid_items)]

print(f"After cold start filter:")
print(f"  Users  : {df_cf['reviewerID'].nunique():,}")
print(f"  Items  : {df_cf['asin'].nunique():,}")
print(f"  Ratings: {len(df_cf):,}")

After cold start filter:
  Users  : 66,519
  Items  : 28,237
  Ratings: 551,682


###  Final Sanity Check

In [14]:
print("=== Final Clean Dataset ===")
print(f"Shape       : {df_cf.shape}")
print(f"Users       : {df_cf['reviewerID'].nunique():,}")
print(f"Items       : {df_cf['asin'].nunique():,}")
print(f"Ratings     : {len(df_cf):,}")
print(f"Missing vals: {df_cf.isnull().sum().sum()}")
print(f"Duplicates  : {df_cf.duplicated().sum()}")
print("\nRating distribution:")
print(df_cf['overall'].value_counts().sort_index())

df_cf.head()

=== Final Clean Dataset ===
Shape       : (551682, 3)
Users       : 66,519
Items       : 28,237
Ratings     : 551,682
Missing vals: 0
Duplicates  : 0

Rating distribution:
overall
1.0     27106
2.0     24313
3.0     45059
4.0    105508
5.0    349696
Name: count, dtype: int64


Unnamed: 0,reviewerID,asin,overall
0,APYOBQE6M18AA,615391206,5.0
1,A1JVQTAGHYOL7F,615391206,5.0
2,A3UPYGJKZ0XTU4,615391206,4.0
3,A2MHCTX43MIMDZ,615391206,5.0
4,AHAI85T5C2DH3,615391206,4.0


##  Feature Engineering

### Encode User & Item IDs to Integers

In [15]:
from sklearn.preprocessing import LabelEncoder

user_enc = LabelEncoder()
item_enc = LabelEncoder()

df_cf['user_id'] = user_enc.fit_transform(df_cf['reviewerID'])
df_cf['item_id'] = item_enc.fit_transform(df_cf['asin'])

n_users = df_cf['user_id'].nunique()
n_items = df_cf['item_id'].nunique()

print(f"Number of users : {n_users:,}")
print(f"Number of items : {n_items:,}")
print(f"\nSample encoding:")
df_cf[['reviewerID', 'user_id', 'asin', 'item_id', 'overall']].head()

Number of users : 66,519
Number of items : 28,237

Sample encoding:


Unnamed: 0,reviewerID,user_id,asin,item_id,overall
0,APYOBQE6M18AA,61571,615391206,0,5.0
1,A1JVQTAGHYOL7F,9619,615391206,0,5.0
2,A3UPYGJKZ0XTU4,50097,615391206,0,4.0
3,A2MHCTX43MIMDZ,28494,615391206,0,5.0
4,AHAI85T5C2DH3,57379,615391206,0,4.0


In [16]:
print(f"Unique item_ids : {df_cf['item_id'].nunique():,}")
print(f"Unique asins    : {df_cf['asin'].nunique():,}")

Unique item_ids : 28,237
Unique asins    : 28,237


###  Build the Sparse User-Item Matrix

In [17]:
from scipy.sparse import csr_matrix

sparse_matrix = csr_matrix(
    (df_cf['overall'].astype(float),
     (df_cf['user_id'], df_cf['item_id'])),
    shape=(n_users, n_items)
)

# Check sparsity
total_cells = n_users * n_items
filled_cells = len(df_cf)
sparsity = 1 - (filled_cells / total_cells)

print(f"Matrix shape : {sparse_matrix.shape}")
print(f"Filled cells : {filled_cells:,}")
print(f"Total cells  : {total_cells:,}")
print(f"Sparsity     : {sparsity:.4%}")

Matrix shape : (66519, 28237)
Filled cells : 551,682
Total cells  : 1,878,297,003
Sparsity     : 99.9706%


### Save Lookup Dictionaries

In [18]:
# Map integer â†’ original ID
user_id_to_reviewer = {i: label for i, label in enumerate(user_enc.classes_)}
item_id_to_asin     = {i: label for i, label in enumerate(item_enc.classes_)}

# Map original ID â†’ integer
reviewer_to_user_id = {v: k for k, v in user_id_to_reviewer.items()}
asin_to_item_id     = {v: k for k, v in item_id_to_asin.items()}

print(f"Sample user mapping: {list(user_id_to_reviewer.items())[:3]}")
print(f"Sample item mapping: {list(item_id_to_asin.items())[:3]}")

Sample user mapping: [(0, 'A0002382258OFJJ2UYNTR'), (1, 'A0010876CNE3ILIM9HV0'), (2, 'A00473363TJ8YSZ3YAGG9')]
Sample item mapping: [(0, '0615391206'), (1, '0689027818'), (2, '0912696591')]


### Final Check

In [19]:
print("=== Feature Engineering Summary ===")
print(f"df_cf columns     : {list(df_cf.columns)}")
print(f"Sparse matrix type: {type(sparse_matrix)}")
print(f"Matrix shape      : {sparse_matrix.shape}")
print(f"Stored values     : {sparse_matrix.nnz:,}")
df_cf.head()

=== Feature Engineering Summary ===
df_cf columns     : ['reviewerID', 'asin', 'overall', 'user_id', 'item_id']
Sparse matrix type: <class 'scipy.sparse._csr.csr_matrix'>
Matrix shape      : (66519, 28237)
Stored values     : 551,682


Unnamed: 0,reviewerID,asin,overall,user_id,item_id
0,APYOBQE6M18AA,615391206,5.0,61571,0
1,A1JVQTAGHYOL7F,615391206,5.0,9619,0
2,A3UPYGJKZ0XTU4,615391206,4.0,50097,0
3,A2MHCTX43MIMDZ,615391206,5.0,28494,0
4,AHAI85T5C2DH3,615391206,4.0,57379,0


##  Train/Test Split

###  User-Aware Train/Test Split

In [20]:
from sklearn.model_selection import train_test_split

train_data, test_data = train_test_split(
    df_cf,
    test_size=0.2,
    random_state=42,
    stratify=df_cf['user_id']  
)

print(f"Train size : {len(train_data):,} ratings")
print(f"Test size  : {len(test_data):,} ratings")
print(f"Train users: {train_data['user_id'].nunique():,}")
print(f"Test users : {test_data['user_id'].nunique():,}")

Train size : 441,345 ratings
Test size  : 110,337 ratings
Train users: 66,519
Test users : 66,519


### Build Sparse Matrices for Train and Test

In [21]:
from scipy.sparse import csr_matrix

train_matrix = csr_matrix(
    (train_data['overall'].astype(float),
     (train_data['user_id'], train_data['item_id'])),
    shape=(n_users, n_items)
)

test_matrix = csr_matrix(
    (test_data['overall'].astype(float),
     (test_data['user_id'], test_data['item_id'])),
    shape=(n_users, n_items)
)

print(f"Train matrix shape : {train_matrix.shape}")
print(f"Train matrix nnz   : {train_matrix.nnz:,}")
print(f"Test matrix shape  : {test_matrix.shape}")
print(f"Test matrix nnz    : {test_matrix.nnz:,}")

Train matrix shape : (66519, 28237)
Train matrix nnz   : 441,345
Test matrix shape  : (66519, 28237)
Test matrix nnz    : 110,337


###  Sanity Check

In [22]:
train_pairs = set(zip(train_data['user_id'], train_data['item_id']))
test_pairs  = set(zip(test_data['user_id'],  test_data['item_id']))

overlap = train_pairs & test_pairs
print(f"Overlapping user-item pairs: {len(overlap)}")

Overlapping user-item pairs: 0


In [24]:
model = implicit.als.AlternatingLeastSquares(
    factors=50,          # number of latent factors (dimensions)
    regularization=0.1,  # lambda â€” prevents overfitting
    iterations=20,       # number of ALS iterations
    random_state=42,
    num_threads=1
)

  check_blas_config()


In [25]:
# implicit expects (item x user) matrix
train_matrix_T = train_matrix.T.tocsr()

print("Training ALS model...")
model.fit(train_matrix_T)
print("Training complete!")


Training ALS model...


  0%|          | 0/20 [00:00<?, ?it/s]

Training complete!


In [26]:
test_user_id = 0

item_ids, scores = model.recommend(
    test_user_id,
    train_matrix[test_user_id],
    N=10,                          # ask for 10 to guarantee 5 valid ones
    filter_already_liked_items=True
)

print(f"Top 5 recommendations for user {test_user_id}:")
count = 0
for item, score in zip(item_ids, scores):
    item_int = int(item)
    if item_int < n_items:         # only accept valid item indices
        count += 1
        asin = item_id_to_asin[item_int]
        print(f"  {count}. item_id={item_int} | asin={asin} | score={score:.4f}")
    if count == 5:
        break

Top 5 recommendations for user 0:
  1. item_id=3859 | asin=B00032Q0S6 | score=0.0191
  2. item_id=19179 | asin=B0049DUGOO | score=0.0181
  3. item_id=20074 | asin=B004NYAY8M | score=0.0154
  4. item_id=18635 | asin=B00421ATJK | score=0.0127
  5. item_id=11826 | asin=B001CHIWWS | score=0.0118


## Model Training with SVD (Surprise)

In [27]:
from surprise import SVD, Dataset, Reader, accuracy
from surprise.model_selection import train_test_split as surprise_split

reader = Reader(rating_scale=(1, 5))
data = Dataset.load_from_df(df_cf[['reviewerID', 'asin', 'overall']], reader)
print("Data loaded into surprise format")

Data loaded into surprise format


## Train/Test Split:

In [28]:
trainset, testset = surprise_split(data, test_size=0.2, random_state=42)

print(f"Train ratings : {trainset.n_ratings:,}")
print(f"Test ratings  : {len(testset):,}")
print(f"Users         : {trainset.n_users:,}")
print(f"Items         : {trainset.n_items:,}")

Train ratings : 441,345
Test ratings  : 110,337
Users         : 66,506
Items         : 28,237


## Train SVD Model

In [30]:
model_svd = SVD(
    n_factors=50,
    n_epochs=20,
    lr_all=0.005,
    reg_all=0.1,
    random_state=42
)

model_svd.fit(trainset)
print("âœ… Model trained successfully!")

âœ… Model trained successfully!


#  Evaluation (RMSE, Precision@K, Recall@K)

## RMSE:

In [31]:
predictions = model_svd.test(testset)

rmse = accuracy.rmse(predictions)
mae  = accuracy.mae(predictions)

RMSE: 1.0406
MAE:  0.7678


## Precision@K and Recall@K

In [32]:
from collections import defaultdict

def precision_recall_at_k(predictions, k=10, threshold=4.0):
    user_est_true = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        user_est_true[uid].append((est, true_r))
    
    precisions, recalls = {}, {}
    for uid, user_ratings in user_est_true.items():
        user_ratings.sort(key=lambda x: x[0], reverse=True)
        n_relevant = sum(1 for (_, true_r) in user_ratings if true_r >= threshold)
        n_hits     = sum(1 for (_, true_r) in user_ratings[:k] if true_r >= threshold)
        precisions[uid] = n_hits / k
        recalls[uid]    = n_hits / n_relevant if n_relevant > 0 else 0
    
    return sum(precisions.values())/len(precisions), sum(recalls.values())/len(recalls)

precision, recall = precision_recall_at_k(predictions, k=10, threshold=4.0)
print(f"Precision@10 : {precision:.4f} ({precision*100:.2f}%)")
print(f"Recall@10    : {recall:.4f} ({recall*100:.2f}%)")

Precision@10 : 0.1721 (17.21%)
Recall@10    : 0.8991 (89.91%)


## Summary

In [33]:
print("=" * 40)
print("   EVALUATION RESULTS SUMMARY")
print("=" * 40)
print(f"  Precision@10 : {precision:.4f} ({precision*100:.2f}%)")
print(f"  Recall@10    : {recall:.4f} ({recall*100:.2f}%)")
print(f"  RMSE         : {rmse:.4f}")
print("=" * 40)

   EVALUATION RESULTS SUMMARY
  Precision@10 : 0.1721 (17.21%)
  Recall@10    : 0.8991 (89.91%)
  RMSE         : 1.0406


# Algorithm Comparison: SVD vs ALS On the same dataset

In [34]:
from implicit.als import AlternatingLeastSquares
from scipy.sparse import csr_matrix
import numpy as np

# Convert to confidence scores for ALS
train_data_implicit = train_data.copy()
train_data_implicit['confidence'] = 1 + 40 * (train_data_implicit['overall'] / 5.0)

train_matrix_als = csr_matrix(
    (train_data_implicit['confidence'].astype(float),
     (train_data_implicit['user_id'], train_data_implicit['item_id'])),
    shape=(n_users, n_items)
)

model_als = AlternatingLeastSquares(
    factors=50, regularization=0.1,
    iterations=20, random_state=42, num_threads=1
)
model_als.fit(train_matrix_als.T.tocsr())
print("âœ… ALS model trained")

  0%|          | 0/20 [00:00<?, ?it/s]

âœ… ALS model trained


In [35]:
from tqdm import tqdm

def evaluate_als(model, train_matrix, test_matrix, n_items, K=10, n_users=500):
    precisions, recalls = [], []
    test_users = np.where(np.diff(test_matrix.indptr) > 0)[0][:n_users]
    
    for user_id in tqdm(test_users, desc="Evaluating ALS"):
        test_items = test_matrix[user_id].indices
        test_ratings = test_matrix[user_id].data
        relevant = set(int(i) for i, r in zip(test_items, test_ratings) if r >= 4.0)
        
        if not relevant:
            continue
        
        user_vec = model.user_factors[user_id]
        all_scores = model.item_factors[:n_items] @ user_vec
        train_items = set(int(i) for i in train_matrix[user_id].indices)
        all_scores[list(train_items)] = -np.inf
        
        top_k = set(np.argpartition(all_scores, -K)[-K:])
        hits = len(top_k & relevant)
        precisions.append(hits / K)
        recalls.append(hits / len(relevant))
    
    return np.mean(precisions), np.mean(recalls)

als_precision, als_recall = evaluate_als(
    model_als, train_matrix_als, test_matrix, n_items, K=10
)
print(f"ALS Precision@10 : {als_precision:.4f} ({als_precision*100:.2f}%)")
print(f"ALS Recall@10    : {als_recall:.4f} ({als_recall*100:.2f}%)")

Evaluating ALS: 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 500/500 [00:01<00:00, 339.54it/s]

ALS Precision@10 : 0.0000 (0.00%)
ALS Recall@10    : 0.0000 (0.00%)





In [36]:
print("=" * 50)
print("   ALGORITHM COMPARISON ON SAME DATASET")
print("=" * 50)
print(f"{'Metric':<20} {'SVD':>10} {'ALS':>10}")
print("-" * 50)
print(f"{'Precision@10':<20} {precision:>10.4f} {als_precision:>10.4f}")
print(f"{'Recall@10':<20} {recall:>10.4f} {als_recall:>10.4f}")
print(f"{'RMSE':<20} {rmse:>10.4f} {'N/A':>10}")
print("=" * 50)
print("\nðŸ“Œ Note: SVD is designed for explicit ratings (1-5 stars)")
print("         ALS is designed for implicit feedback (converted)")

   ALGORITHM COMPARISON ON SAME DATASET
Metric                      SVD        ALS
--------------------------------------------------
Precision@10             0.1721     0.0000
Recall@10                0.8991     0.0000
RMSE                     1.0406        N/A

ðŸ“Œ Note: SVD is designed for explicit ratings (1-5 stars)
         ALS is designed for implicit feedback (converted)
