# SVD Grid Search User Review Model

In [1]:
!pip install scikit-surprise



In [2]:
import os
from google.cloud import bigquery

# Initialize the BigQuery client for the project that every query in this notebook reads from.
# Passing the project explicitly keeps this cell consistent with the later cells and avoids
# depending on whichever default project the local environment has configured.
# NOTE(review): no query is issued here, so the message below presumably only confirms that
# credentials were located, not that they grant access — confirm.
client = bigquery.Client(project="film-wizard-453315")
print("Authenticated successfully!")


Authenticated successfully!


In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.accuracy import rmse
from google.cloud import bigquery
import pandas as pd

# Paging configuration for reading the ratings table
TOTAL_ROWS = 32_000_000  # rough size of the ratings table
BATCH_SIZE = 1_000_000  # rows requested per page

# Ratings are read on a 0.5 to 5 scale
reader = Reader(rating_scale=(0.5, 5))

# BigQuery client bound to the project whose Grouplens tables are queried below
client = bigquery.Client(project="film-wizard-453315")

### **1️⃣ Grid search over SVD hyperparameters on a ~1% sample**
# Candidate values tried for each SVD hyperparameter
param_grid = dict(n_factors=[10, 20], reg_all=[0.05, 0.1])

# Sample of the ratings table used only for tuning
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
TABLESAMPLE SYSTEM (1 PERCENT)
"""
sample_job = client.query(sample_query)
sample_df = sample_job.to_dataframe()
data = Dataset.load_from_df(sample_df[['userId', 'movieId', 'rating']], reader)

# 3-fold grid search scored by RMSE
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)

# Parameter combination with the best RMSE
best_params = gs.best_params["rmse"]
print("Best Params:", best_params)

# **2️⃣ 5-fold cross-validation of the tuned configuration on the sample**
# best_params holds the tuned keyword arguments from the grid (n_factors, reg_all), so it is
# unpacked straight into the constructor.
best_algo = SVD(**best_params)
cv_results = cross_validate(best_algo, data, cv=5, verbose=True)
print(cv_results)

# **3️⃣ Train on Full Dataset (downloaded in batches, fitted once)**
# Calling `fit()` again does not continue training: Surprise re-initialises the biases and
# latent factors on every call, so fitting inside the download loop would leave a model that
# only reflects the final batch. The pages are therefore collected first and the model is
# fitted a single time on their concatenation. This requires the whole ratings table to fit
# in memory.
#
# The loop runs until the table is exhausted instead of stopping at the approximate
# TOTAL_ROWS, so an under-estimate cannot silently drop the tail of the table.
batch_frames = []
rows_fetched = 0
offset = 0
while True:
    # Ordering by (userId, movieId) rather than userId alone gives the pages a more stable
    # order between queries; with ties left unordered, LIMIT/OFFSET pages can overlap or
    # skip rows. NOTE(review): assumes (userId, movieId) is unique in this table — confirm.
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
    ORDER BY userId, movieId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """

    batch_df = client.query(batch_query).to_dataframe()

    if batch_df.empty:
        break  # Stop when there are no more rows

    batch_frames.append(batch_df[['userId', 'movieId', 'rating']])
    rows_fetched += len(batch_df)  # count real rows so a short final page is not over-reported
    print(f"✅ Processed {rows_fetched} rows")

    if len(batch_df) < BATCH_SIZE:
        break  # A short page means the table is exhausted
    offset += BATCH_SIZE

if not batch_frames:
    raise RuntimeError("No ratings were returned from BigQuery; nothing to train on.")

# Single fit on every downloaded rating
full_ratings_df = pd.concat(batch_frames, ignore_index=True)
dataset = Dataset.load_from_df(full_ratings_df, reader)
trainset = dataset.build_full_trainset()

best_algo.fit(trainset)
print(f"✅ Trained on {len(full_ratings_df)} rows")

### **4️⃣ Score the fitted model on a 5% sample of the ratings table**
# NOTE(review): this sample is drawn from the same table that the training step reads, so it
# may overlap with the training rows — confirm whether a true hold-out set is intended.
test_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
TABLESAMPLE SYSTEM (5 PERCENT)
"""
test_job = client.query(test_query)
test_df = test_job.to_dataframe()

# Load the sample, then convert it into the test-set form that `test()` consumes
test_ratings = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader)
testset = test_ratings.build_full_trainset().build_testset()

predictions = best_algo.test(testset)
print("Final RMSE on test set:", rmse(predictions))




Best Params: {'n_factors': 10, 'reg_all': 0.05}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9397  0.9410  0.9373  0.9373  0.9378  0.9386  0.0015  
MAE (testset)     0.7224  0.7226  0.7219  0.7219  0.7206  0.7219  0.0007  
Fit time          2.20    2.08    2.28    2.21    2.16    2.19    0.07    
Test time         0.33    0.35    0.32    7.60    0.35    1.79    2.90    
{'test_rmse': array([0.93973984, 0.9410432 , 0.93731174, 0.93731422, 0.93781788]), 'test_mae': array([0.7223674 , 0.72264216, 0.72189225, 0.72185187, 0.72056979]), 'fit_time': (2.203209161758423, 2.0799431800842285, 2.281464099884033, 2.209975004196167, 2.164191961288452), 'test_time': (0.32975316047668457, 0.34781718254089355, 0.3220970630645752, 7.598642110824585, 0.35052013397216797)}




✅ Processed 1000000 rows




✅ Processed 2000000 rows




✅ Processed 3000000 rows




✅ Processed 4000000 rows




✅ Processed 5000000 rows




✅ Processed 6000000 rows




✅ Processed 7000000 rows




✅ Processed 8000000 rows




✅ Processed 9000000 rows




✅ Processed 10000000 rows




✅ Processed 11000000 rows




✅ Processed 12000000 rows


In [None]:
import os

import pandas as pd

# Path to the Letterboxd ratings export. Set the LETTERBOXD_RATINGS_CSV environment variable
# to run this notebook on another machine; the original location is kept as the fallback.
csv_path = os.environ.get(
    "LETTERBOXD_RATINGS_CSV",
    "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/letterboxd-oliverramsay-2025-03-13-15-05-utc/ratings.csv",
)

# Load the CSV into a DataFrame
movies_df = pd.read_csv(csv_path)

# Parse the 'Date' column as datetimes so it can be compared and sorted chronologically
movies_df['Date'] = pd.to_datetime(movies_df['Date'])

# Keep only rows whose 'Date' is before 1 October 2023
movies_df = movies_df[movies_df['Date'] < '2023-10-01']

# Filter out movies that Grouplens later has an issue matching with
# Exclude specific movies by title
movies_to_exclude = ["Sing", "Living", 'Fury']  # Replace with actual movie titles
movies_df = movies_df[~movies_df['Name'].isin(movies_to_exclude)]

##### Half the ratings to be out of 5 - only for Adam #####
# movies_df['Rating'] = movies_df['Rating']/2

# Sort by 'Date' in descending order and keep the 10 most recent entries
oliver_recent_movies = movies_df.sort_values(by='Date', ascending=False).head(10)

# Show the most recent 10 entries and their ratings
display(oliver_recent_movies)

In [None]:
from fuzzywuzzy import fuzz, process

# BigQuery client for the film-wizard project
client = bigquery.Client(project="film-wizard-453315")

# movieId / title pairs from the GroupLens movies table
query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
# Run the query and materialise the result as a DataFrame
movies_job = client.query(query)
grouplens_movies_df = movies_job.to_dataframe()

# Label first, then a preview of the first rows
display('grouplens_movies_df:', grouplens_movies_df.head())

# Fuzzy-match helper used to pair each 'Name' in 'oliver_recent_movies' with a 'title' in 'grouplens_movies_df'
def get_best_match(title, choices, scorer=fuzz.ratio):
    """Return the entry of ``choices`` that best matches ``title``.

    Args:
        title: The string to look up.
        choices: The candidate strings handed to ``process.extractOne``.
        scorer: Similarity function passed through to ``process.extractOne``;
            defaults to ``fuzz.ratio``.

    Returns:
        The first element of the result of ``process.extractOne``, or ``None``
        when that result is empty.
    """
    best = process.extractOne(title, choices, scorer=scorer)
    if not best:
        return None
    return best[0]

# For every Letterboxd 'Name', look up the closest GroupLens title
grouplens_titles = grouplens_movies_df['title']
oliver_recent_movies['matched_title'] = oliver_recent_movies['Name'].map(
    lambda name: get_best_match(name, grouplens_titles)
)

# Left-join on the matched title so each row picks up the GroupLens 'movieId'
test_movies_with_ids = oliver_recent_movies.merge(
    grouplens_movies_df[['title', 'movieId']],
    how='left',
    left_on='matched_title',
    right_on='title',
)

##### Drop the 'matched_title' column and any other unnecessary columns - didn't use with Adam's csv
# test_movies_with_ids = test_movies_with_ids.drop(columns=['matched_title', 'Letterboxd URI', 'title', 'Date'])

In [None]:
# Remove rows that are exact duplicates, rebinding the name rather than mutating in place
test_movies_with_ids = test_movies_with_ids.drop_duplicates()

# Render the de-duplicated frame
display(test_movies_with_ids)

In [None]:
# Number of recommendations to keep
TOP_N = 10
# Raw user id the predictions are made for.
# NOTE(review): this is user id 1 of the training data, while the movies excluded below come
# from test_movies_with_ids — confirm this is the intended user.
TARGET_USER_ID = 1

# Candidate pool: every movie the fitted model was trained on. It is taken from the model
# itself so that this cell depends only on objects defined in this notebook.
fitted_trainset = best_algo.trainset
all_movie_ids = [
    fitted_trainset.to_raw_iid(inner_iid) for inner_iid in fitted_trainset.all_items()
]

# Movies to leave out: those already present in test_movies_with_ids. A set makes each
# membership test constant-time; dropna() discards rows that got no movieId from the left merge.
already_rated_ids = set(test_movies_with_ids['movieId'].dropna())
unrated_movie_ids = [
    movie_id for movie_id in all_movie_ids if movie_id not in already_rated_ids
]

# Predict a rating for each remaining movie
predictions = [best_algo.predict(TARGET_USER_ID, movie_id) for movie_id in unrated_movie_ids]

# Highest predicted rating first
predictions.sort(key=lambda pred: pred.est, reverse=True)

# Keep the TOP_N recommendations with their predicted ratings
recommended_movies = [(pred.iid, pred.est) for pred in predictions[:TOP_N]]

# Convert to DataFrame
recommended_df = pd.DataFrame(recommended_movies, columns=["Movie ID", "Predicted Rating"])
display(recommended_df)

In [None]:
# DataFrame.info() prints its summary itself and returns None, so it is called directly;
# wrapping it in display() would additionally render a stray "None".
recommended_df.info()
grouplens_movies_df.info()

# Bring back titles: left-join the recommendations to the GroupLens movie list on the movie id
final_recommendation = pd.merge(
    recommended_df,
    grouplens_movies_df[['title', 'movieId']],
    left_on='Movie ID',
    right_on='movieId',
    how='left'
)

final_recommendation