# NOTEBOOK 6: MATRIX FACTORIZATION (SVD)
## Latent Factor Models for Recommendations

This notebook implements matrix factorization using Singular Value Decomposition (SVD) to uncover latent features in the user-item interaction matrix.
It includes training SVD models with different dimensions, generating recommendations, evaluating performance, visualizing the latent space, and comparing results with content-based and collaborative filtering methods.

In [9]:

# IMPORT LIBRARIES

print("=" * 80)
print("SETUP: IMPORTING LIBRARIES")
print("=" * 80)


import numpy as np              
import pandas as pd            
import matplotlib.pyplot as plt 
import seaborn as sns          
import pickle                
import os                     
import time                    
from datetime import datetime  

# Sparse matrix operations
from scipy.sparse import csr_matrix, load_npz  

# Machine Learning
from sklearn.decomposition import TruncatedSVD  
from sklearn.preprocessing import normalize     
# Suppress warnings for cleaner output
import warnings
warnings.filterwarnings('ignore')

print(" All libraries imported successfully")

 
# VISUALIZATION SETTINGS


# Set consistent, professional plot style
plt.style.use('seaborn-v0_8-darkgrid')
sns.set_palette("husl")

# Default figure settings
plt.rcParams['figure.figsize'] = (12, 6)
plt.rcParams['figure.dpi'] = 100
plt.rcParams['savefig.dpi'] = 300
plt.rcParams['savefig.bbox'] = 'tight'

print(" Plot settings configured")


# DIRECTORY STRUCTURE 

print("\n" + "=" * 80)
print("DIRECTORY SETUP")
print("=" * 80)


import sys
PROJECT_ROOT = os.path.dirname(os.getcwd())  

print(f" Current directory: {os.getcwd()}")
print(f" Project root: {PROJECT_ROOT}")

# Define project directories relative to PROJECT_ROOT
DATA_DIR = os.path.join(PROJECT_ROOT, 'data')
PROCESSED_DIR = os.path.join(PROJECT_ROOT, 'data', 'processed')
MODELS_DIR = os.path.join(PROJECT_ROOT, 'models')
RESULTS_DIR = os.path.join(PROJECT_ROOT, 'reports', 'results')

# Create directories if they don't exist
for directory in [MODELS_DIR, RESULTS_DIR]:
    os.makedirs(directory, exist_ok=True)
    
print(f"\n Directory structure:")
print(f"   • Data: {DATA_DIR}")
print(f"   • Processed: {PROCESSED_DIR}")
print(f"   • Models: {MODELS_DIR}")
print(f"   • Results: {RESULTS_DIR}")

# Verify directories exist
all_exist = all(os.path.exists(d) for d in [DATA_DIR, MODELS_DIR])
if all_exist:
    print(f" All directories ready")
else:
    print(f"  Some directories missing - will try to continue")

# LOAD PREVIOUS RESULTS


print("\n" + "=" * 80)
print("LOADING DATA FROM PREVIOUS NOTEBOOKS")
print("=" * 80)

# Load User-Item Matrix

print("\n Loading user-item interaction matrix...")

matrix_path = os.path.join(MODELS_DIR, 'user_item_matrix.npz')

print(f"   Looking for: {matrix_path}")

# Check if file exists
if not os.path.exists(matrix_path):
    print(f"    Matrix not found!")
    print(f"\n  SOLUTION: We need to generate the matrix from Notebook 5 data")
    print(f"   I'll help you create it now...\n")
    
    # Try to load from training data
    train_path = os.path.join(DATA_DIR, 'ratings_train.csv')
    
    if os.path.exists(train_path):
        print("   ✅ Found training data - will regenerate matrix")
        print("   This will take a few minutes...")
        
        # Load training data
        train = pd.read_csv(train_path)
        print(f"   Loaded: {len(train):,} training ratings")
        
        # Create mappings
        unique_users = train['userId'].unique()
        unique_movies = train['movieId'].unique()
        
        user_to_idx = {user_id: idx for idx, user_id in enumerate(unique_users)}
        idx_to_user = {idx: user_id for user_id, idx in user_to_idx.items()}
        movie_to_idx = {movie_id: idx for idx, movie_id in enumerate(unique_movies)}
        idx_to_movie = {idx: movie_id for movie_id, idx in movie_to_idx.items()}
        
        print(f"   Created mappings: {len(user_to_idx):,} users, {len(movie_to_idx):,} movies")
        
        # Build matrix
        n_users = len(user_to_idx)
        n_movies = len(movie_to_idx)
        
        user_indices = train['userId'].map(user_to_idx).values
        movie_indices = train['movieId'].map(movie_to_idx).values
        ratings = train['rating'].values
        
        user_item_matrix = csr_matrix(
            (ratings, (user_indices, movie_indices)),
            shape=(n_users, n_movies)
        )
        
        print(f"   Matrix created: {user_item_matrix.shape}")
        
        # Save for next time
        from scipy.sparse import save_npz
        save_npz(matrix_path, user_item_matrix)
        print(f"    Saved matrix for future use")
        
        # Save mappings
        mappings = {
            'user_to_idx': user_to_idx,
            'idx_to_user': idx_to_user,
            'movie_to_idx': movie_to_idx,
            'idx_to_movie': idx_to_movie
        }
        mappings_path = os.path.join(MODELS_DIR, 'matrix_mappings.pkl')
        with open(mappings_path, 'wb') as f:
            pickle.dump(mappings, f, protocol=pickle.HIGHEST_PROTOCOL)
        print(f"    Saved mappings for future use")
        
    else:
        raise FileNotFoundError(
            f"Cannot find training data at {train_path}. "
            "Please ensure Notebook 5 has been run and data files exist."
        )
else:
    # Load existing matrix
    user_item_matrix = load_npz(matrix_path)
    print(f"    Matrix loaded from file")

# Calculate sparsity
sparsity = (1 - user_item_matrix.nnz / 
            (user_item_matrix.shape[0] * user_item_matrix.shape[1])) * 100

print(f"\n Matrix ready:")
print(f"   Shape: {user_item_matrix.shape[0]:,} users × "
      f"{user_item_matrix.shape[1]:,} movies")
print(f"   Non-zero entries: {user_item_matrix.nnz:,}")
print(f"   Sparsity: {sparsity:.2f}%")
print(f"   Memory: {user_item_matrix.data.nbytes / (1024**2):.1f} MB")

#
# Load ID Mappings

print("\n  Loading user/movie ID mappings...")

mappings_path = os.path.join(MODELS_DIR, 'matrix_mappings.pkl')

if not os.path.exists(mappings_path):
    print(f"     Mappings not found - using ones created above")
    # Already created above when regenerating matrix
else:
    with open(mappings_path, 'rb') as f:
        mappings = pickle.load(f)
        user_to_idx = mappings['user_to_idx']
        idx_to_user = mappings['idx_to_user']
        movie_to_idx = mappings['movie_to_idx']
        idx_to_movie = mappings['idx_to_movie']
    print(f"    Mappings loaded from file")

print(f"   Users: {len(user_to_idx):,}")
print(f"   Movies: {len(movie_to_idx):,}")

# Load Evaluation Setup 

print("\n Loading evaluation setup...")

cf_results_path = os.path.join(MODELS_DIR, 'evaluation_results_cf.pkl')

if os.path.exists(cf_results_path):
    with open(cf_results_path, 'rb') as f:
        cf_results = pickle.load(f)
        eval_user_ids = cf_results['eval_user_ids']
        ground_truth = cf_results['ground_truth']
        ub_metrics = cf_results['user_based_metrics']
        ib_metrics = cf_results['item_based_metrics']
    
    print(f" Evaluation setup loaded:")
    print(f"   Evaluation users: {len(eval_user_ids)}")
    print(f"   Ground truth available: {len(ground_truth)} users")
    
    print(f"\n Previous CF Results (for comparison):")
    print(f"   User-Based CF:")
    print(f"      • Precision@10: {ub_metrics['Precision@10']:.2f}%")
    print(f"      • Hit Rate@10:  {ub_metrics['Hit Rate@10']:.2f}%")
    print(f"   Item-Based CF:")
    print(f"      • Precision@10: {ib_metrics['Precision@10']:.2f}%")
    print(f"      • Hit Rate@10:  {ib_metrics['Hit Rate@10']:.2f}%")
else:
    print(f"     CF results not found")
    print(f"   We'll create evaluation setup from test data...")
    
    # Load test data
    test_path = os.path.join(DATA_DIR, 'ratings_test.csv')
    if os.path.exists(test_path):
        test = pd.read_csv(test_path)
        
        # Sample 100 users who exist in training
        test_users = test[test['userId'].isin(user_to_idx.keys())]['userId'].unique()
        np.random.seed(42)
        eval_user_ids = np.random.choice(test_users, size=min(100, len(test_users)), replace=False)
        
        # Create ground truth
        ground_truth = {}
        for user_id in eval_user_ids:
            user_test = test[test['userId'] == user_id]
            relevant = user_test[user_test['rating'] >= 4.0]['movieId'].tolist()
            ground_truth[user_id] = relevant
        
        print(f"    Created evaluation setup: {len(eval_user_ids)} users")
        
        # No previous CF metrics
        ub_metrics = None
        ib_metrics = None
    else:
        print(f"    Cannot create evaluation - test data not found")
        eval_user_ids = []
        ground_truth = {}
        ub_metrics = None
        ib_metrics = None


# Load Movies Data

print("\n Loading movies data...")

movies_path = os.path.join(PROCESSED_DIR, 'movies_features.csv')
if not os.path.exists(movies_path):
    movies_path = os.path.join(DATA_DIR, 'movies_filtered.csv')
if not os.path.exists(movies_path):
    movies_path = os.path.join(DATA_DIR, 'movies.csv')

if os.path.exists(movies_path):
    movies = pd.read_csv(movies_path)
    print(f" Movies loaded: {len(movies):,} movies")
else:
    print(f"  Movies file not found - will work without titles")
    movies = None



SETUP: IMPORTING LIBRARIES
 All libraries imported successfully
 Plot settings configured

DIRECTORY SETUP
 Current directory: c:\Users\mhfou\Documents\MovieRecommenderSystem\notebooks
 Project root: c:\Users\mhfou\Documents\MovieRecommenderSystem

 Directory structure:
   • Data: c:\Users\mhfou\Documents\MovieRecommenderSystem\data
   • Processed: c:\Users\mhfou\Documents\MovieRecommenderSystem\data\processed
   • Models: c:\Users\mhfou\Documents\MovieRecommenderSystem\models
   • Results: c:\Users\mhfou\Documents\MovieRecommenderSystem\reports\results
 All directories ready

LOADING DATA FROM PREVIOUS NOTEBOOKS

 Loading user-item interaction matrix...
   Looking for: c:\Users\mhfou\Documents\MovieRecommenderSystem\models\user_item_matrix.npz
    Matrix loaded from file

 Matrix ready:
   Shape: 137,883 users × 34,461 movies
   Non-zero entries: 20,000,076
   Sparsity: 99.58%
   Memory: 152.6 MB

  Loading user/movie ID mappings...
    Mappings loaded from file
   Users: 137,883
   M