## SVD Grid Search User Review Model

In [1]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")

Authenticated successfully!


In [2]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.accuracy import rmse
from google.cloud import bigquery
import pandas as pd

# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Define batch size & dataset properties
BATCH_SIZE = 100_000  # 100k rows per batch
TOTAL_ROWS = 500_000  # Approximate total dataset size
reader = Reader(rating_scale=(0.5, 5))

### **GL df with  500k randomly sampled ratings**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.500k_ratings`
ORDER BY RAND()
"""
sample_gl_df = client.query(sample_query).to_dataframe()
sample_gl_df



Unnamed: 0,userId,movieId,rating
0,104833,2470,3.0
1,161697,551,4.0
2,170972,239,3.0
3,147342,1222,4.0
4,68377,106920,3.0
...,...,...,...
499995,5496,4886,4.5
499996,183343,788,2.5
499997,66054,1214,4.0
499998,88500,98836,4.0


In [3]:
### **1️⃣ Perform Grid Search on Small Sample**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.500k_ratings`
ORDER BY RAND()
LIMIT 100000  -- Adjust for ~20% of 500k rows
"""
grid_search_df = client.query(sample_query).to_dataframe()
data = Dataset.load_from_df(grid_search_df[['userId', 'movieId', 'rating']], reader)

# Hyperparameter tuning
param_grid = {
    "n_factors": [10, 20, 50],  
    "reg_all": [0.001, 0.01, 0.05]  
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)

best_params = gs.best_params["rmse"]
print("Best Params:", best_params)



Best Params: {'n_factors': 10, 'reg_all': 0.05}


In [4]:
# **2️⃣ Cross-Validation Before Batch Training**
best_algo = SVD(n_factors=best_params["n_factors"], reg_all=best_params["reg_all"])
cv_results = cross_validate(best_algo, data, cv=5, verbose=True)
print(cv_results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9761  0.9824  0.9737  0.9766  0.9722  0.9762  0.0035  
MAE (testset)     0.7543  0.7612  0.7536  0.7537  0.7512  0.7548  0.0034  
Fit time          0.55    0.54    0.59    0.58    0.58    0.57    0.02    
Test time         0.09    0.18    0.10    0.10    0.09    0.11    0.03    
{'test_rmse': array([0.97608594, 0.98239046, 0.97367309, 0.97655714, 0.97222009]), 'test_mae': array([0.75425928, 0.76122421, 0.75359025, 0.75365902, 0.75120742]), 'fit_time': (0.5457689762115479, 0.5390288829803467, 0.5860280990600586, 0.5803048610687256, 0.5773000717163086), 'test_time': (0.09341812133789062, 0.1780092716217041, 0.09600687026977539, 0.0960841178894043, 0.09351706504821777)}


In [5]:
# **3️⃣ Train on Full Dataset in Batches**
for offset in range(0, TOTAL_ROWS, BATCH_SIZE):
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM `film-wizard-453315.Grouplens.500k_ratings`
    ORDER BY userId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    
    batch_df = client.query(batch_query).to_dataframe()
    
    if batch_df.empty:
        break  # Stop when there are no more rows
    
    dataset = Dataset.load_from_df(batch_df[['userId', 'movieId', 'rating']], reader)
    trainset = dataset.build_full_trainset()
    
    best_algo.fit(trainset)
    print(f"✅ Processed {offset + BATCH_SIZE} rows")



✅ Processed 100000 rows




✅ Processed 200000 rows




✅ Processed 300000 rows




✅ Processed 400000 rows




✅ Processed 500000 rows


In [6]:
### **4️⃣ Evaluate Best Model on a Fresh Test Set**
test_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.500k_ratings`
ORDER BY RAND()
LIMIT 1500000  -- Adjust for ~5% of 32M rows
"""
test_df = client.query(test_query).to_dataframe()
testset = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader).build_full_trainset().build_testset()

predictions = best_algo.test(testset)
print("Final RMSE on test set:", rmse(predictions))



RMSE: 0.9608
Final RMSE on test set: 0.9607709766743512


In [7]:
predictions = best_algo.predict(10531, 100)
predictions

Prediction(uid=10531, iid=100, r_ui=None, est=3.401492923710541, details={'was_impossible': False})

## Lenskit

In [38]:
import os
from google.cloud import bigquery
import pandas as pd
from lenskit import crossfold
from lenskit.als import BiasedMFScorer # https://lkpy.lenskit.org/stable/guide/gettingstarted
from lenskit import util

# Initialize BigQuery client
client = bigquery.Client(project="film-wizard-453315")
print("Authenticated successfully!")

# Define batch size & dataset properties
BATCH_SIZE = 100_000  # 100k rows per batch
TOTAL_ROWS = 500_000  # Approximate total dataset size

# Fetch 500k ratings from BigQuery randomly (for the entire dataset)
sample_query = """
SELECT userId, movieId, rating
FROM film-wizard-453315.Grouplens.500k_ratings
ORDER BY RAND()
"""
sample_gl_df = client.query(sample_query).to_dataframe()

# Rename columns to match LensKit's expected column names
sample_gl_df = sample_gl_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
sample_gl_df

ModuleNotFoundError: No module named 'lenskit.als'

In [32]:
# I ned to define cv, hyperparam grid and then do a gridsearch

# Train the model
model = als.BiasedMF(features=3, reg=0.01, rng_spec=42)

# Fit the model with your data
model.fit(sample_gl_df[['user_id', 'item_id', 'rating']])

# Create a scorer from the model
scorer = model.scorer()

# # ---------------------------

# # Prepare data for training
# train_data = sample_gl_df[['user', 'item', 'rating']]

# # Define cross-validation with LensKit
# folds = crossfold.sample_users(train_data, 5, size=0.2, method='random')  # 5-fold cross-validation

# # Define a function for ALS model training
# def train_als_model(train_data, factors=10, reg=0.01):
#     algo = als.ALS(factors=factors, reg=reg, random_state=42)
#     algo.fit(train_data)
#     return algo

# # Example hyperparameter grid (number of factors and regularization)
# param_grid = {
#     "factors": [5, 10, 50],
#     "reg": [0.001, 0.01, 0.05]
# }

# # Manually grid search
# best_rmse = float('inf')
# best_params = {}

# for factors in param_grid["factors"]:
#     for reg in param_grid["reg"]:
#         fold_rmse = []
        
#         for train, test in folds:
#             model = train_als_model(train, factors=factors, reg=reg)
            
#             # Ensure the test data is in the form of a list of tuples (user, item)
#             user_item_pairs = list(test[['user', 'item']].itertuples(index=False, name=None))
            
#             # Make predictions for these user-item pairs
#             pred = model.predict(user_item_pairs)
            
#             # Calculate RMSE
#             rmse_val = util.rmse(pred, test['rating'])
#             fold_rmse.append(rmse_val)
        
#         mean_rmse = sum(fold_rmse) / len(fold_rmse)
#         print(f"Factors: {factors}, Reg: {reg}, RMSE: {mean_rmse}")
        
#         if mean_rmse < best_rmse:
#             best_rmse = mean_rmse
#             best_params = {'factors': factors, 'reg': reg}

# print(f"Best Params: {best_params}")

KeyError: 'item'

In [None]:
# Train the model using best hyperparameters from grid search
best_algo = train_als_model(train_data, best_params['factors'], best_params['reg'])

# Perform cross-validation to evaluate performance
fold_rmse = []

for train, test in folds:
    model = train_als_model(train, best_params['factors'], best_params['reg'])
    pred = model.predict(test[['userId', 'movieId']])
    rmse_val = util.rmse(pred, test['rating'])
    fold_rmse.append(rmse_val)

print(f"Cross-validation RMSE: {sum(fold_rmse)/len(fold_rmse)}")

In [None]:
# Train on full dataset in batches
for offset in range(0, TOTAL_ROWS, BATCH_SIZE):
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM film-wizard-453315.Grouplens.500k_ratings
    ORDER BY userId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    
    batch_df = client.query(batch_query).to_dataframe()
    
    if batch_df.empty:
        break  # Stop when there are no more rows
    
    train_data = batch_df[['userId', 'movieId', 'rating']]
    
    # Train the model on this batch
    best_algo.fit(train_data)
    print(f"✅ Processed {offset + BATCH_SIZE} rows")
    

In [None]:
# Predict rating for a specific user and movie
user_id = 10531
movie_id = 100

# Make prediction using the trained ALS model
prediction = best_algo.predict(user_id, movie_id)
print(f"Predicted rating for user {user_id} and movie {movie_id}: {prediction}")

## Notes from original SVD jn

In [None]:
# Path to your CSV file
csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/Adam films/9a884b4e-8993-4800-925a-bea11dcce39e.csv"

# Load the CSV into a DataFrame
movies_df = pd.read_csv(csv_path)

# Convert the 'date' column to datetime if it's not already
movies_df['Date'] = pd.to_datetime(movies_df['Date'])

# Filter out movies released in or after October 2023
movies_df = movies_df[movies_df['Date'] < '2023-10-01']

#Filter out movies that Grouplens later has an issue matching with
# Exclude specific movies by title
movies_to_exclude = ["Herod's Law", "Spirited Away", "Sing", "Living", "Fury"]  # Replace with actual movie titles
movies_df = movies_df[~movies_df['Name'].isin(movies_to_exclude)]

##### Half the ratings to be out of 5 - only for Adam #####
movies_df['Rating'] = movies_df['Rating']/2

# Sort by 'date' in descending order to get the most recent entries
oliver_recent_movies = movies_df.sort_values(by='Date', ascending=False).head(10)

# Show the most recent 10 movies and their ratings
display(oliver_recent_movies)

In [None]:
from fuzzywuzzy import fuzz, process

# Set up BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Query to fetch the relevant data from BigQuery
query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
# Fetch data from BigQuery and load it into a DataFrame
grouplens_movies_df = client.query(query).to_dataframe()

# Show the first few rows of the DataFrame
display('grouplens_movies_df:')
display(grouplens_movies_df.head())

# Perform fuzzy matching between 'title' in 'oliver_recent_movies' and 'title' in 'grouplens_movies_df'
def get_best_match(title, choices, scorer=fuzz.ratio):
    """Fuzzy match using fuzz.ratio and return the best match."""
    match = process.extractOne(title, choices, scorer=scorer)
    return match[0] if match else None

# Apply fuzzy matching to the titles in 'oliver_recent_movies'
oliver_recent_movies['matched_title'] = oliver_recent_movies['Name'].apply(
    get_best_match, args=(grouplens_movies_df['title'],)
)

# Merge the original 'oliver_recent_movies' DataFrame with 'grouplens_movies_df' based on the 'matched_title'
test_movies_with_ids = pd.merge(
    oliver_recent_movies, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='matched_title', 
    right_on='title', 
    how='left'
)

##### Drop the 'matched_title' column and any other unnecessary columns - didn't use with Adam's csv
# test_movies_with_ids = test_movies_with_ids.drop(columns=['matched_title', 'Letterboxd URI', 'title', 'Date'])

In [None]:
test_movies_with_ids.drop_duplicates(inplace=True)

# Show the final DataFrame
display(test_movies_with_ids)

In [None]:
# Get all movie IDs in the GL dataset (this could be all movies in the system or a smaller list)
all_movie_ids = full_gl_df['movieId'].unique()

# Find unwatched movies for user 1 (exclude movies that user 1 has already rated)
unwatched_movie_ids = [i for i in all_movie_ids if i not in test_movies_with_ids['movieId'].values]

# Make predictions for the unwatched movies
predictions = [best_algo.predict(1, movie_id) for movie_id in unwatched_movie_ids]

# Sort the predictions by predicted rating (descending order)
predictions.sort(key=lambda x: x.est, reverse=True)

# Extract top X recommended movies with their predicted ratings
recommended_movies = [(pred.iid, pred.est) for pred in predictions[:10]]

# Convert to DataFrame
recommended_df = pd.DataFrame(recommended_movies, columns=["Movie ID", "Predicted Rating"])
display(recommended_df)

In [None]:
display(recommended_df.info())
display(grouplens_movies_df.info())

# Bring back titles
final_recommendation = pd.merge(
    recommended_df, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='Movie ID', 
    right_on='movieId', 
    how='left'
)

final_recommendation

## Understanding the Grouplens dataset

In [None]:
# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Pull table from bq
sample_query = """
SELECT *
FROM `film-wizard-453315.Grouplens.grouplens_movies`
ORDER BY RAND()
LIMIT 1000  -- Adjust for ~1% of 32M rows
"""

test_df = client.query(sample_query).to_dataframe()
test_df.head(10)

In [None]:
import pickle

with open('SVD_film_model.pkl', 'rb') as file:
    best_algo_test = pickle.load(file)

In [None]:
# Make predictions for the unwatched movies
predictions = best_algo.predict(157707, 1)
predictions

# Sort the predictions by predicted rating (descending order)
# predictions.sort(key=lambda x: x.est, reverse=True)

In [None]:
reader.