# SVD Grid Search User Review Model

In [1]:
!pip install scikit-surprise



In [2]:
import os
from google.cloud import bigquery

# Build a BigQuery client without passing a project or credentials explicitly.
# NOTE(review): presumably both are resolved from the environment (e.g.
# application-default credentials) — confirm for your setup.
client = bigquery.Client()
# Reached only if the constructor above did not raise. No query is issued in
# this cell, so the message reflects client construction only.
print("Authenticated successfully!")


Authenticated successfully!


In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, train_test_split, GridSearchCV
from surprise.accuracy import rmse

from google.cloud import bigquery
import pandas as pd

# Single source of truth for the project and the ratings table; the client and
# the query below are both derived from these two names.
project_id = "film-wizard-453315"
ratings_table = f"{project_id}.Grouplens.raw_grouplens_ratings"

# BigQuery client bound to the project above
client = bigquery.Client(project=project_id)

# Fetch (userId, movieId, rating) triples from roughly half of the table.
# TABLESAMPLE SYSTEM samples storage blocks, so the rows returned can differ
# from one run to the next; add a LIMIT clause while prototyping if needed.
query = f"""
SELECT userId, movieId, rating
FROM `{ratings_table}`
TABLESAMPLE SYSTEM (50 PERCENT)
"""
ratings_df = client.query(query).to_dataframe()

# Ratings are read on a 0.5-5 scale (lower bound 0.5, upper bound 5).
reader = Reader(rating_scale=(0.5, 5))

# Surprise expects exactly three columns, in the order user, item, rating.
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Hold out 20% of the ratings *before* tuning. The grid search below must only
# ever see the remaining 80%; tuning on the full dataset would let the test
# rows influence the chosen hyperparameters and bias the final test RMSE.
# (drop() by index relies on ratings_df having a unique index.)
rating_columns = ratings_df[['userId', 'movieId', 'rating']]
test_df = rating_columns.sample(frac=0.2, random_state=42)
train_df = rating_columns.drop(test_df.index)

# Training part as a Surprise dataset (for the grid search) and as a full
# trainset (for fitting the final model).
train_data = Dataset.load_from_df(train_df, reader)
trainset = train_data.build_full_trainset()

# A Surprise test set is a plain list of (raw user id, raw item id, rating).
testset = [
    (user_id, movie_id, float(rating))
    for user_id, movie_id, rating in test_df.itertuples(index=False, name=None)
]

### **1️⃣ Perform Grid Search to Tune SVD Hyperparameters**
param_grid = {
    "n_factors": [5, 10, 15, 20],  # Number of latent factors
    "reg_all": [0.01, 0.05, 0.1, 0.15, 0.2]  # Regularization strength
}

# 3-fold cross-validated search over the training ratings only
gs = GridSearchCV(SVD, param_grid, measures=["rmse", "mae"], cv=3)
gs.fit(train_data)

# Print Best Parameters
print("Best RMSE:", gs.best_score["rmse"])
print("Best Parameters:", gs.best_params["rmse"])

# Train the final model on the whole training part with the RMSE-optimal
# parameters; the fixed seed makes the fitted factors reproducible.
best_algo = SVD(**gs.best_params["rmse"], random_state=42)
best_algo.fit(trainset)

### **2️⃣ Evaluate Best Model**
# Predict on the held-out test set
predictions = best_algo.test(testset)

# RMSE on the test set (lower is better). verbose=False stops rmse() from
# printing the value itself, so it appears once instead of twice.
print("Final RMSE on test set:", rmse(predictions, verbose=False))

# cross_validate() calls fit() on the estimator it receives for every fold.
# Passing best_algo would leave it trained on the last fold's data, and later
# cells call best_algo.predict(); cross-validate a fresh SVD with the same
# tuned parameters instead so best_algo stays as fitted above.
cv_algo = SVD(**gs.best_params["rmse"])
cv_results = cross_validate(cv_algo, data, cv=5, verbose=True)

# Show cross-validation results
print(cv_results)



In [None]:
import pandas as pd

# Location of the Letterboxd ratings export, resolved relative to the current
# user's home directory instead of a hard-coded absolute /Users/... path.
csv_path = os.path.expanduser(
    "~/code/oliverramsaygray/film_wizard/raw_data/"
    "letterboxd-oliverramsay-2025-03-13-15-05-utc/ratings.csv"
)

# Titles that fail to match cleanly against the GroupLens catalogue later on
movies_to_exclude = ["Sing", "Living", 'Fury']  # Replace with actual movie titles

# Load the CSV, parse 'Date', then keep rows dated before 2023-10-01 whose
# title is not on the exclusion list.
# NOTE(review): 'Date' is presumably the date the rating was logged rather than
# the film's release date — confirm against the export format.
movies_df = (
    pd.read_csv(csv_path)
    .assign(Date=lambda frame: pd.to_datetime(frame['Date']))
    .loc[lambda frame: (frame['Date'] < '2023-10-01')
         & ~frame['Name'].isin(movies_to_exclude)]
)

##### Half the ratings to be out of 5 - only for Adam #####
# movies_df['Rating'] = movies_df['Rating']/2

# Ten most recent entries by 'Date'. The explicit copy makes this an
# independent frame, so adding columns to it later does not touch a slice of
# movies_df.
oliver_recent_movies = (
    movies_df.sort_values(by='Date', ascending=False).head(10).copy()
)

# Show the most recent 10 movies and their ratings
display(oliver_recent_movies)

In [None]:
from fuzzywuzzy import fuzz, process

# BigQuery client for the film-wizard project
client = bigquery.Client(project="film-wizard-453315")

# Pull the id -> title lookup for every movie in the GroupLens catalogue
query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
# Run the query and materialise the result as a DataFrame
movies_query_job = client.query(query)
grouplens_movies_df = movies_query_job.to_dataframe()

# Preview: a label followed by the first few rows
display('grouplens_movies_df:', grouplens_movies_df.head())

# Perform fuzzy matching between 'title' in 'oliver_recent_movies' and 'title' in 'grouplens_movies_df'
def get_best_match(title, choices, scorer=fuzz.ratio, score_cutoff=0):
    """Return the entry of ``choices`` that best fuzzy-matches ``title``.

    Args:
        title: The string to look up. Non-string values (e.g. NaN from a
            missing cell) and blank strings have nothing to match on.
        choices: Candidate strings to search (list or pandas Series).
        scorer: fuzzywuzzy scoring function; defaults to ``fuzz.ratio``.
        score_cutoff: Minimum score a candidate must reach to be accepted.
            The default of 0 accepts the best candidate however weak it is;
            raise it to reject poor matches.

    Returns:
        The best-matching choice, or ``None`` when ``title`` is not a usable
        string or no candidate reaches ``score_cutoff``.
    """
    # Guard: the matcher expects text; skip missing or empty titles.
    if not isinstance(title, str) or not title.strip():
        return None
    match = process.extractOne(
        title, choices, scorer=scorer, score_cutoff=score_cutoff
    )
    # extractOne returns a tuple whose first element is the matched choice,
    # or None when nothing qualifies.
    return match[0] if match else None

# Attach the best GroupLens title for every Letterboxd title. assign() builds a
# new frame instead of writing a column into a frame derived from a slice,
# which avoids SettingWithCopyWarning and keeps the cell safe to re-run.
oliver_recent_movies = oliver_recent_movies.assign(
    matched_title=oliver_recent_movies['Name'].apply(
        get_best_match, args=(grouplens_movies_df['title'],)
    )
)

# Look up the movieId for each matched title. The left join keeps every
# Letterboxd row; rows with no matching title get a missing movieId.
test_movies_with_ids = pd.merge(
    oliver_recent_movies,
    grouplens_movies_df[['title', 'movieId']],
    left_on='matched_title',
    right_on='title',
    how='left'
)

##### Drop the 'matched_title' column and any other unnecessary columns - didn't use with Adam's csv
# test_movies_with_ids = test_movies_with_ids.drop(columns=['matched_title', 'Letterboxd URI', 'title', 'Date'])

In [None]:
# Collapse fully identical rows, rebinding the name rather than mutating in place
test_movies_with_ids = test_movies_with_ids.drop_duplicates()

# Show the de-duplicated frame
display(test_movies_with_ids)

In [None]:
# Raw user id to score and number of recommendations to keep.
# NOTE(review): predictions below are made for raw user id 1 of the ratings
# table. The Letterboxd ratings in test_movies_with_ids are used here only as
# an exclusion list; as far as this notebook shows they are never fed to
# best_algo, so the scores are not personalised to that Letterboxd user —
# confirm this is intended.
target_user_id = 1
top_n = 10

# Every movie id present in the sampled ratings
all_movie_ids = ratings_df['movieId'].unique()

# Ids to exclude: the movies matched from the Letterboxd export. Built once as
# a set (missing ids from unmatched titles dropped) for constant-time lookups.
seen_movie_ids = set(test_movies_with_ids['movieId'].dropna())
unrated_movie_ids = [
    movie_id for movie_id in all_movie_ids if movie_id not in seen_movie_ids
]

# Make predictions for the remaining movies
predictions = [
    best_algo.predict(target_user_id, movie_id) for movie_id in unrated_movie_ids
]

# Sort the predictions by predicted rating (descending order)
predictions.sort(key=lambda pred: pred.est, reverse=True)

# Keep the top_n movies with their predicted ratings
recommended_movies = [(pred.iid, pred.est) for pred in predictions[:top_n]]

# Convert to DataFrame
recommended_df = pd.DataFrame(recommended_movies, columns=["Movie ID", "Predicted Rating"])
display(recommended_df)

In [None]:
# DataFrame.info() prints its summary and returns None, so call it directly;
# wrapping it in display() would render an extra "None".
# These summaries are a dtype sanity check for the merge keys below.
recommended_df.info()
grouplens_movies_df.info()

# Bring back titles: look up each recommended 'Movie ID' in the catalogue.
# The left join keeps every recommendation even if its title is missing.
final_recommendation = pd.merge(
    recommended_df,
    grouplens_movies_df[['title', 'movieId']],
    left_on='Movie ID',
    right_on='movieId',
    how='left'
)

final_recommendation