# SVD Grid Search User Review Model

In [1]:
!pip install scikit-surprise



In [3]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")


Authenticated successfully!


In [4]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.accuracy import rmse

from google.cloud import bigquery
import pandas as pd

# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Define your project and dataset
project_id = "film-wizard-453315"
ratings_table = "film-wizard-453315.Grouplens.raw_grouplens_ratings"


# Query to fetch user ratings data
query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
LIMIT 100000
"""
ratings_df = client.query(query).to_dataframe()

# Define the Surprise reader with rating scale (assuming 1-5 scale)
reader = Reader(rating_scale=(1, 5))

# Load dataset into Surprise format
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Split data (80% train, 20% test)
trainset, testset = train_test_split(data, test_size=0.2, random_state=42)

### **1️⃣ Perform Grid Search to Tune SVD Hyperparameters**
param_grid = {
    "n_factors": [7, 10, 12],  # Number of latent factors
    "reg_all": [0.05, 0.1, 0.15]  # Regularization strength
}

gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(data)

# Print Best Parameters
print("Best RMSE:", gs.best_score["rmse"])
print("Best Parameters:", gs.best_params["rmse"])

# Train Final Model with Best Parameters
best_algo = SVD(n_factors=gs.best_params["rmse"]["n_factors"], reg_all=gs.best_params["rmse"]["reg_all"])
best_algo.fit(trainset)

### **2️⃣ Evaluate Best Model**
# Predict on test set
predictions = best_algo.test(testset)

# Evaluate RMSE (lower is better)
print("Final RMSE on test set:", rmse(predictions))

# Perform cross-validation
cv_results = cross_validate(best_algo, data, cv=5, verbose=True)

# Show cross-validation results
print(cv_results)


Best RMSE: 0.7110597059295376
Best Parameters: {'n_factors': 7, 'reg_all': 0.05}
RMSE: 0.7093
Final RMSE on test set: 0.7093247905769704
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.7078  0.7065  0.7042  0.7114  0.7007  0.7061  0.0036  
MAE (testset)     0.5783  0.5761  0.5725  0.5798  0.5725  0.5758  0.0029  
Fit time          0.16    0.15    0.16    0.15    0.16    0.16    0.01    
Test time         0.09    0.09    0.10    0.04    0.04    0.07    0.03    
{'test_rmse': array([0.70780888, 0.70653273, 0.70421051, 0.71138289, 0.70074015]), 'test_mae': array([0.57826046, 0.57614165, 0.57250998, 0.57976567, 0.57254495]), 'fit_time': (0.16230201721191406, 0.14548516273498535, 0.15945792198181152, 0.1543889045715332, 0.16369295120239258), 'test_time': (0.09481596946716309, 0.09401488304138184, 0.09547901153564453, 0.0391993522644043, 0.03848981857299805)}
