# SVD Grid Search User Review Model

In [1]:
!pip install scikit-surprise



In [2]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")

Authenticated successfully!


In [4]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.accuracy import rmse
from google.cloud import bigquery
import pandas as pd

# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Define batch size & dataset properties
BATCH_SIZE = 1_000_000  # 1M rows per batch
TOTAL_ROWS = 32_000_000  # Approximate total dataset size
reader = Reader(rating_scale=(0.5, 5))

### **1️⃣ Perform Grid Search on Small Sample**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
ORDER BY RAND()
LIMIT 320000  -- Adjust for ~1% of 32M rows
"""
sample_df = client.query(sample_query).to_dataframe()
data = Dataset.load_from_df(sample_df[['userId', 'movieId', 'rating']], reader)

# Hyperparameter tuning
param_grid = {
    "n_factors": [2, 5, 10],  
    "reg_all": [0.001, 0.01, 0.05]  
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)

best_params = gs.best_params["rmse"]
print("Best Params:", best_params)

# **2️⃣ Cross-Validation Before Batch Training**
best_algo = SVD(n_factors=best_params["n_factors"], reg_all=best_params["reg_all"])
cv_results = cross_validate(best_algo, data, cv=5, verbose=True)
print(cv_results)

# **3️⃣ Train on Full Dataset in Batches**
for offset in range(0, TOTAL_ROWS, BATCH_SIZE):
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
    ORDER BY userId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    
    batch_df = client.query(batch_query).to_dataframe()
    
    if batch_df.empty:
        break  # Stop when there are no more rows
    
    dataset = Dataset.load_from_df(batch_df[['userId', 'movieId', 'rating']], reader)
    trainset = dataset.build_full_trainset()
    
    best_algo.fit(trainset)
    print(f"✅ Processed {offset + BATCH_SIZE} rows")

### **4️⃣ Evaluate Best Model on a Fresh Test Set**
test_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
ORDER BY RAND()
LIMIT 1500000  -- Adjust for ~5% of 32M rows
"""
test_df = client.query(test_query).to_dataframe()
testset = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader).build_full_trainset().build_testset()

predictions = best_algo.test(testset)
print("Final RMSE on test set:", rmse(predictions))




Best Params: {'n_factors': 10, 'reg_all': 0.05}
Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9397  0.9410  0.9373  0.9373  0.9378  0.9386  0.0015  
MAE (testset)     0.7224  0.7226  0.7219  0.7219  0.7206  0.7219  0.0007  
Fit time          2.20    2.08    2.28    2.21    2.16    2.19    0.07    
Test time         0.33    0.35    0.32    7.60    0.35    1.79    2.90    
{'test_rmse': array([0.93973984, 0.9410432 , 0.93731174, 0.93731422, 0.93781788]), 'test_mae': array([0.7223674 , 0.72264216, 0.72189225, 0.72185187, 0.72056979]), 'fit_time': (2.203209161758423, 2.0799431800842285, 2.281464099884033, 2.209975004196167, 2.164191961288452), 'test_time': (0.32975316047668457, 0.34781718254089355, 0.3220970630645752, 7.598642110824585, 0.35052013397216797)}




✅ Processed 1000000 rows




✅ Processed 2000000 rows




✅ Processed 3000000 rows




✅ Processed 4000000 rows




✅ Processed 5000000 rows




✅ Processed 6000000 rows




✅ Processed 7000000 rows




✅ Processed 8000000 rows




✅ Processed 9000000 rows




✅ Processed 10000000 rows




✅ Processed 11000000 rows




✅ Processed 12000000 rows




✅ Processed 13000000 rows




✅ Processed 14000000 rows




✅ Processed 15000000 rows




✅ Processed 16000000 rows




✅ Processed 17000000 rows




✅ Processed 18000000 rows




✅ Processed 19000000 rows




✅ Processed 20000000 rows




✅ Processed 21000000 rows




✅ Processed 22000000 rows




✅ Processed 23000000 rows




✅ Processed 24000000 rows




✅ Processed 25000000 rows




✅ Processed 26000000 rows




✅ Processed 27000000 rows




✅ Processed 28000000 rows




✅ Processed 29000000 rows




✅ Processed 30000000 rows




✅ Processed 31000000 rows




✅ Processed 32000000 rows




RMSE: 0.9673
Final RMSE on test set: 0.9672514163083512


In [12]:
# Path to your CSV file
csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/Adam films/9a884b4e-8993-4800-925a-bea11dcce39e.csv"

# Load the CSV into a DataFrame
movies_df = pd.read_csv(csv_path)

# Convert the 'date' column to datetime if it's not already
movies_df['Date'] = pd.to_datetime(movies_df['Date'])

# Filter out movies released in or after October 2023
movies_df = movies_df[movies_df['Date'] < '2023-10-01']

#Filter out movies that Grouplens later has an issue matching with
# Exclude specific movies by title
movies_to_exclude = ["Herod's Law", "Spirited Away", "Sing", "Living", "Fury"]  # Replace with actual movie titles
movies_df = movies_df[~movies_df['Name'].isin(movies_to_exclude)]

##### Half the ratings to be out of 5 - only for Adam #####
movies_df['Rating'] = movies_df['Rating']/2

# Sort by 'date' in descending order to get the most recent entries
oliver_recent_movies = movies_df.sort_values(by='Date', ascending=False).head(10)

# Show the most recent 10 movies and their ratings
display(oliver_recent_movies)

  movies_df['Date'] = pd.to_datetime(movies_df['Date'])


Unnamed: 0,Rating,Date,Name,Year,Release Date
1,5.0,2023-07-09,Cinema Paradiso,1988.0,23/02/1990
9,4.5,2022-02-05,Sing 2,2021.0,28/01/2022
2,5.0,2021-03-22,The Shawshank Redemption,1994.0,17/02/1995
96,2.5,2021-01-16,This Beautiful Fantastic,2016.0,19/02/2018
10,4.5,2020-04-07,"Planes, Trains & Automobiles",1987.0,12/02/1988
105,1.5,2019-08-16,The Hateful Eight,2015.0,08/01/2016
37,4.0,2019-06-06,Baby Driver,2017.0,28/06/2017
3,5.0,2019-03-01,GoodFellas,1990.0,26/10/1990
38,4.0,2019-01-07,Django Unchained,2012.0,18/01/2013
78,3.5,2019-01-07,Zero Dark Thirty,2012.0,25/01/2013


In [14]:
from fuzzywuzzy import fuzz, process

# Set up BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Query to fetch the relevant data from BigQuery
query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
# Fetch data from BigQuery and load it into a DataFrame
grouplens_movies_df = client.query(query).to_dataframe()

# Show the first few rows of the DataFrame
display('grouplens_movies_df:')
display(grouplens_movies_df.head())

# Perform fuzzy matching between 'title' in 'oliver_recent_movies' and 'title' in 'grouplens_movies_df'
def get_best_match(title, choices, scorer=fuzz.ratio):
    """Fuzzy match using fuzz.ratio and return the best match."""
    match = process.extractOne(title, choices, scorer=scorer)
    return match[0] if match else None

# Apply fuzzy matching to the titles in 'oliver_recent_movies'
oliver_recent_movies['matched_title'] = oliver_recent_movies['Name'].apply(
    get_best_match, args=(grouplens_movies_df['title'],)
)

# Merge the original 'oliver_recent_movies' DataFrame with 'grouplens_movies_df' based on the 'matched_title'
test_movies_with_ids = pd.merge(
    oliver_recent_movies, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='matched_title', 
    right_on='title', 
    how='left'
)

##### Drop the 'matched_title' column and any other unnecessary columns - didn't use with Adam's csv
# test_movies_with_ids = test_movies_with_ids.drop(columns=['matched_title', 'Letterboxd URI', 'title', 'Date'])



'grouplens_movies_df:'

Unnamed: 0,movieId,title
0,181685,به نام پدر
1,139596,Danielův svět
2,151307,The Lovers and the Despot
3,178223,Injecting Aluminum
4,189345,The Doctor From India


In [15]:
test_movies_with_ids.drop_duplicates(inplace=True)

# Show the final DataFrame
display(test_movies_with_ids)

Unnamed: 0,Rating,Date,Name,Year,Release Date,matched_title,title,movieId
0,5.0,2023-07-09,Cinema Paradiso,1988.0,23/02/1990,Facciamo paradiso,Facciamo paradiso,146966
1,4.5,2022-02-05,Sing 2,2021.0,28/01/2022,Sing 2,Sing 2,265550
2,5.0,2021-03-22,The Shawshank Redemption,1994.0,17/02/1995,"Shawshank Redemption, The","Shawshank Redemption, The",318
3,2.5,2021-01-16,This Beautiful Fantastic,2016.0,19/02/2018,This Beautiful Fantastic,This Beautiful Fantastic,166940
4,4.5,2020-04-07,"Planes, Trains & Automobiles",1987.0,12/02/1988,"Planes, Trains & Automobiles","Planes, Trains & Automobiles",4002
5,1.5,2019-08-16,The Hateful Eight,2015.0,08/01/2016,The Hateful Eight,The Hateful Eight,128360
6,4.0,2019-06-06,Baby Driver,2017.0,28/06/2017,Baby Driver,Baby Driver,171763
7,5.0,2019-03-01,GoodFellas,1990.0,26/10/1990,Goodfellas,Goodfellas,1213
8,4.0,2019-01-07,Django Unchained,2012.0,18/01/2013,Django Unchained,Django Unchained,99114
9,3.5,2019-01-07,Zero Dark Thirty,2012.0,25/01/2013,Zero Dark Thirty,Zero Dark Thirty,98961


In [29]:
# Get all movie IDs in the dataset (this could be all movies in the system or a smaller list)
all_movie_ids = ratings_df['movieId'].unique()

# Find unwatched movies for user 1 (exclude movies that user 1 has already rated)
unwatched_movie_ids = [i for i in all_movie_ids if i not in test_movies_with_ids['movieId'].values]

# Make predictions for the unwatched movies
predictions = [best_algo.predict(100, movie_id) for movie_id in unwatched_movie_ids]

# Sort the predictions by predicted rating (descending order)
predictions.sort(key=lambda x: x.est, reverse=True)

# Extract top X recommended movies with their predicted ratings
recommended_movies = [(pred.iid, pred.est) for pred in predictions[:10]]

# Convert to DataFrame
recommended_df = pd.DataFrame(recommended_movies, columns=["Movie ID", "Predicted Rating"])
display(recommended_df)

Unnamed: 0,Movie ID,Predicted Rating
0,26587,4.629156
1,159817,4.6105
2,171011,4.562839
3,6981,4.554737
4,7926,4.534336
5,5056,4.533719
6,2920,4.518831
7,670,4.517054
8,6669,4.513213
9,26082,4.507518


In [30]:
display(recommended_df.info())
display(grouplens_movies_df.info())

# Bring back titles
final_recommendation = pd.merge(
    recommended_df, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='Movie ID', 
    right_on='movieId', 
    how='left'
)

final_recommendation

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10 entries, 0 to 9
Data columns (total 2 columns):
 #   Column            Non-Null Count  Dtype  
---  ------            --------------  -----  
 0   Movie ID          10 non-null     int64  
 1   Predicted Rating  10 non-null     float64
dtypes: float64(1), int64(1)
memory usage: 288.0 bytes


None

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 87585 entries, 0 to 87584
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   movieId  87585 non-null  Int64 
 1   title    87585 non-null  object
dtypes: Int64(1), object(1)
memory usage: 1.4+ MB


None

Unnamed: 0,Movie ID,Predicted Rating,title,movieId
0,26587,4.629156,"Decalogue, The (Dekalog)",26587
1,159817,4.6105,Planet Earth,159817
2,171011,4.562839,Planet Earth II,171011
3,6981,4.554737,"Ordet (Word, The)",6981
4,7926,4.534336,High and Low (Tengoku to jigoku),7926
5,5056,4.533719,"Enigma of Kaspar Hauser, The (a.k.a. Mystery o...",5056
6,2920,4.518831,Children of Paradise (Les enfants du paradis),2920
7,670,4.517054,"World of Apu, The (Apur Sansar)",670
8,6669,4.513213,Ikiru,6669
9,26082,4.507518,Harakiri (Seppuku),26082


## Understanding the Grouplens dataset

In [31]:
# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Pull table from bq
sample_query = """
SELECT *
FROM `film-wizard-453315.Grouplens.grouplens_movies`
ORDER BY RAND()
LIMIT 1000  -- Adjust for ~1% of 32M rows
"""

test_df = client.query(sample_query).to_dataframe()
test_df.head(50)



Unnamed: 0,movieId,title,year,genres,original_title,clean_extraction
0,147012,1989,2014,(no genres listed),1989 (2014),True
1,181213,Aztec Rex,2007,Action|Adventure,Aztec Rex (2007),True
2,222505,Just Let Go,2015,(no genres listed),Just Let Go (2015),True
3,127449,Hippie Masala - Forever in India,2006,Documentary,Hippie Masala - Forever in India (2006),True
4,161988,Medusa,1998,Fantasy|Thriller,Medusa (1998),True
5,135210,Binta and the Great Idea,2004,Children|Drama,Binta and the Great Idea (2004),True
6,283367,The Girl in the Fountain,2021,Documentary,The Girl in the Fountain (2021),True
7,121779,Skateland,2010,Drama,Skateland (2010),True
8,215409,Free In Deed,2015,Drama,Free In Deed (2015),True
9,180593,The Dawning,1988,Drama|Thriller,The Dawning (1988),True


## Saving model locally so I don't have to retrain every time

In [23]:
import pickle

with open('SVD_film_model.pkl', 'wb') as file: 
    pickle.dump(best_algo, file)

In [24]:
import pickle

with open('SVD_film_model.pkl', 'rb') as file:
    best_algo_test = pickle.load(file)

In [32]:
# Make predictions for the unwatched movies
predictions = [best_algo_test.predict(100, movie_id) for movie_id in unwatched_movie_ids]

# Sort the predictions by predicted rating (descending order)
predictions.sort(key=lambda x: x.est, reverse=True)

In [34]:
predictions

[Prediction(uid=100, iid=26587, r_ui=None, est=4.62915551009108, details={'was_impossible': False}),
 Prediction(uid=100, iid=159817, r_ui=None, est=4.6105004726919105, details={'was_impossible': False}),
 Prediction(uid=100, iid=171011, r_ui=None, est=4.562839439168177, details={'was_impossible': False}),
 Prediction(uid=100, iid=6981, r_ui=None, est=4.554736967496806, details={'was_impossible': False}),
 Prediction(uid=100, iid=7926, r_ui=None, est=4.534335836565334, details={'was_impossible': False}),
 Prediction(uid=100, iid=5056, r_ui=None, est=4.533718984210778, details={'was_impossible': False}),
 Prediction(uid=100, iid=2920, r_ui=None, est=4.518830610014069, details={'was_impossible': False}),
 Prediction(uid=100, iid=670, r_ui=None, est=4.517053531507208, details={'was_impossible': False}),
 Prediction(uid=100, iid=6669, r_ui=None, est=4.513212671627404, details={'was_impossible': False}),
 Prediction(uid=100, iid=26082, r_ui=None, est=4.507518100638054, details={'was_impossi