## Final version - simplified SVD

In [1]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client(project="film-wizard-453315")
print("Authenticated successfully!")

Authenticated successfully!


In [2]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.accuracy import rmse
import pandas as pd

# Define batch size & dataset properties
BATCH_SIZE = 1_000_000  # 100k rows per batch
TOTAL_ROWS = 32_000_000  # Approximate total dataset size
reader = Reader(rating_scale=(0.5, 5))

# Fetch GL ratings data from BigQuery - dataset is basis for SVD calcs
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
ORDER BY RAND()
LIMIT 1000000
"""
gl_df = client.query(sample_query).to_dataframe()
print("Loaded data from BigQuery:", gl_df.shape)
gl_df



Loaded data from BigQuery: (1000000, 3)


Unnamed: 0,userId,movieId,rating
0,36920,1136,3.5
1,154279,3624,3.5
2,173018,597,4.0
3,184006,1917,3.0
4,177369,206845,2.5
...,...,...,...
999995,106098,106920,5.0
999996,138221,909,4.5
999997,75230,53435,3.5
999998,107783,6350,4.0


In [3]:
# CSV input from new user
path_to_csv = '/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/letterboxd-oliverramsay-2025-03-13-15-05-utc/new_user_ratings_v3.csv'

new_user_ratings = pd.read_csv(path_to_csv)

new_user_ratings = new_user_ratings[new_user_ratings['Year'] <= 2022] # Drops movies which were released in 2023 or later
new_user_ratings = new_user_ratings[['Name', 'Rating']]
new_user_ratings.dropna(inplace=True)
if (new_user_ratings['Rating'] >= 6).any():
    new_user_ratings['Rating'] = new_user_ratings['Rating'] / 2
new_user_ratings.head(10)

Unnamed: 0,Name,Rating
0,The Big Lebowski,3.5
1,Casablanca,5.0
2,Living,2.0
3,Cinema Paradiso,5.0
4,Sing 2,4.5
5,Sing,4.0
6,The Shawshank Redemption,5.0
7,This Beautiful Fantastic,2.5
8,"Planes, Trains & Automobiles",4.5
9,The Hateful Eight,1.5


In [4]:
# Fuzzy matching
from fuzzywuzzy import fuzz, process
import pandas as pd

sample_query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
grouplens_movies = client.query(sample_query).to_dataframe()
print("Loaded data from BigQuery:", gl_df.shape)
display(grouplens_movies.head(5))



Loaded data from BigQuery: (1000000, 3)


Unnamed: 0,movieId,title
0,181685,به نام پدر
1,139596,Danielův svět
2,151307,The Lovers and the Despot
3,178223,Injecting Aluminum
4,189345,The Doctor From India


In [5]:
def fuzzy_match(df1, col1, df2, col2, threshold=80):
    matched_data = []
    choices = df2[col2].tolist()  # Convert column to a list

    for _, row in df1.iterrows():
        name = row[col1]
        rating = row['Rating']  # Retain 'Rating' column
        result = process.extractOne(name, choices, scorer=fuzz.ratio)
        
        if result:  
            match, score = result[:2]  # Extract match title and score
            
            if score >= threshold:
                movie_id = df2.loc[df2[col2] == match, 'movieId'].values
                movie_id = movie_id[0] if len(movie_id) > 0 else None
            else:
                match, movie_id = None, None
        else:
            match, score, movie_id = None, 0, None

        matched_data.append((movie_id, name, match, rating, score))  # Order: movieId, Name, Matched_Title, rating, FuzzyScore

    return pd.DataFrame(matched_data, columns=['movieId', col1, 'Matched_Title', 'rating', 'Fuzzy_Score'])

In [6]:
# Example usage
matches_df = fuzzy_match(new_user_ratings, 'Name', grouplens_movies, 'title')

In [7]:
matches_df.dropna(inplace=True)
matches_df.head(10)

Unnamed: 0,movieId,Name,Matched_Title,rating,Fuzzy_Score
1,912.0,Casablanca,Casablanca,5.0,100
2,173407.0,Living,Living,2.0,100
4,265550.0,Sing 2,Sing 2,4.5,100
5,155923.0,Sing,Sing,4.0,100
6,318.0,The Shawshank Redemption,"Shawshank Redemption, The",5.0,82
7,166940.0,This Beautiful Fantastic,This Beautiful Fantastic,2.5,100
8,4002.0,"Planes, Trains & Automobiles","Planes, Trains & Automobiles",4.5,100
9,128360.0,The Hateful Eight,The Hateful Eight,1.5,100
10,171763.0,Baby Driver,Baby Driver,4.0,100
11,115210.0,Fury,Fury,2.0,100


In [8]:
# Set a unique userId for the new user's films and ratings
new_user_id = gl_df['userId'].max() + 1
matches_df['userId'] = new_user_id  # Assign new userId to all rows
display(matches_df)
display(gl_df.shape)

# # Append matches_df to gl_df
gl_df = pd.concat([gl_df, matches_df[['userId', 'movieId', 'rating']]], ignore_index=True)

gl_df

Unnamed: 0,movieId,Name,Matched_Title,rating,Fuzzy_Score,userId
1,912.0,Casablanca,Casablanca,5.0,100,200949
2,173407.0,Living,Living,2.0,100,200949
4,265550.0,Sing 2,Sing 2,4.5,100,200949
5,155923.0,Sing,Sing,4.0,100,200949
6,318.0,The Shawshank Redemption,"Shawshank Redemption, The",5.0,82,200949
...,...,...,...,...,...,...
99,25.0,Leaving Las Vegas,Leaving Las Vegas,3.5,100,200949
100,260277.0,The Kite Runner,The River Runner,3.5,84,200949
101,125475.0,The Painted Veil,The Painted Hills,3.5,85,200949
103,72011.0,Up in the Air,Up in the Air,3.0,100,200949


(1000000, 3)

Unnamed: 0,userId,movieId,rating
0,36920,1136.0,3.5
1,154279,3624.0,3.5
2,173018,597.0,4.0
3,184006,1917.0,3.0
4,177369,206845.0,2.5
...,...,...,...
1000091,200949,25.0,3.5
1000092,200949,260277.0,3.5
1000093,200949,125475.0,3.5
1000094,200949,72011.0,3.0


In [9]:
data = Dataset.load_from_df(gl_df, reader)

# Split the data into a training and test set
trainset, testset = train_test_split(data, test_size=0.2)

# Create the SVD model and train it
model = SVD()
model.fit(trainset)

# Define a function to get top 5 recommendations for a user
def get_top_n(predictions, n):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for each user and return the top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

In [10]:
# **2️⃣ Cross-Validation Before Batch Training**
cv_results = cross_validate(model, data, cv=5, verbose=True)
print(cv_results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9216  0.9193  0.9192  0.9191  0.9204  0.9199  0.0010  
MAE (testset)     0.7078  0.7067  0.7064  0.7056  0.7066  0.7066  0.0007  
Fit time          10.09   10.40   10.37   10.43   10.46   10.35   0.13    
Test time         2.24    1.68    1.67    1.66    1.68    1.79    0.23    
{'test_rmse': array([0.92156951, 0.91925061, 0.91919475, 0.919053  , 0.92044447]), 'test_mae': array([0.70782438, 0.70673628, 0.70636037, 0.70562853, 0.70662499]), 'fit_time': (10.086143016815186, 10.397918224334717, 10.367759943008423, 10.429032802581787, 10.458298206329346), 'test_time': (2.2388858795166016, 1.6809520721435547, 1.6712260246276855, 1.6644840240478516, 1.677549123764038)}


In [11]:
# Extract list of all movieIds
all_movie_ids = grouplens_movies['movieId']
all_movie_ids = all_movie_ids.unique()
all_movie_ids.shape

(87585,)

In [12]:
# Predictions for selected user for all movieIds
new_user_id = new_user_id
predictions_list = []

# Loop through all movieIds and make predictions
for movieId in all_movie_ids:
    prediction = model.predict(new_user_id, movieId)
    # Append the movieId and predicted rating (rename 'prediction' to 'estimated rating')
    predictions_list.append({'movieId': movieId, 'estimated rating': prediction.est})

# Create a DataFrame from the collected data with columns ['movieId', 'estimated rating']
df_predictions = pd.DataFrame(predictions_list)
df_predictions_sorted = df_predictions.sort_values(by='estimated rating', ascending=False)
df_predictions_sorted

Unnamed: 0,movieId,estimated rating
53462,160289,4.768609
340,171495,4.761653
59417,1193,4.695697
74342,202749,4.667964
37327,202439,4.665305
...,...,...
15584,5094,1.989602
18032,31698,1.931812
30119,61348,1.892170
10344,181,1.868727


In [13]:
df_predictions = df_predictions_sorted.merge(grouplens_movies[['movieId', 'title']], on='movieId', how='inner')
df_predictions.head(20)

Unnamed: 0,movieId,estimated rating,title
0,160289,4.768609,O.J.: Made in America
1,171495,4.761653,Cosmos
2,1193,4.695697,One Flew Over the Cuckoo's Nest
3,202749,4.667964,Portrait of a Lady on Fire
4,202439,4.665305,Parasite
5,318,4.654675,"Shawshank Redemption, The"
6,5971,4.653455,My Neighbor Totoro (Tonari no Totoro)
7,908,4.652485,North by Northwest
8,1221,4.628084,"Godfather: Part II, The"
9,136449,4.622116,Ghost in the Shell 2.0


## Adam's multiple users addition

In [14]:
# ==== :two: Load CSV Files ====
user1_csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/Adam films/9a884b4e-8993-4800-925a-bea11dcce39e.csv"
user2_csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/letterboxd-oliverramsay-2025-03-13-15-05-utc/oliver.jn_adam_test.csv"
user3_csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/letterboxd-oliverramsay-2025-03-13-15-05-utc/oliver.jn_adam_test.csv"

# Load the CSVs
user1_movies = pd.read_csv(user1_csv_path)
user2_movies = pd.read_csv(user2_csv_path)
user3_movies = pd.read_csv(user3_csv_path)

# Strip leading/trailing spaces from column names
user1_movies.columns = user1_movies.columns.str.strip()
user2_movies.columns = user2_movies.columns.str.strip()
user3_movies.columns = user3_movies.columns.str.strip()

# Ensure required columns exist
required_columns = {"title", "rating"}

if not required_columns.issubset(user1_movies.columns):
    raise ValueError("User 1 CSV files must contain 'title' and 'rating' columns.")

if not required_columns.issubset(user2_movies.columns):
    raise ValueError("User 2 CSV files must contain 'title' and 'rating' columns.")

if not required_columns.issubset(user3_movies.columns):
    raise ValueError("User 3 CSV files must contain 'title' and 'rating' columns.")

In [17]:
# ==== :three: Convert Titles to Movie IDs ====
def get_movie_ids(movie_df, movie_data):
    """Convert movie titles to movie IDs."""
    if 'title' not in movie_df.columns:
        raise KeyError("The 'title' column is missing in the movie dataframe.")
    
    matched_movies = movie_data[movie_data['title'].isin(movie_df['title'])]
    return matched_movies[['movieId']].merge(movie_df, on='title', how='left')

# Assuming grouplens_movies is already defined elsewhere:
# Map movies to their IDs - potential to improve by using FuzzyMatching
user1_rated = get_movie_ids(user1_movies, grouplens_movies)
# user2_rated = get_movie_ids(user2_movies, grouplens_movies)
# user3_rated = get_movie_ids(user3_movies, grouplens_movies)

KeyError: 'title'

In [None]:
# ==== :four: Set new users Train model on new users ====
# Assuming gl_df (GroupLens DataFrame) is already defined:
# Get the maximum existing userId
max_user_id = gl_df['userId'].max()

# Assign unique user IDs to new users
user1_rated['userId'] = max_user_id + 1
user2_rated['userId'] = max_user_id + 2
user3_rated['userId'] = max_user_id + 3

# Display the results
print(user1_rated.head())  # Or whatever you need to do with the DataFrame

In [None]:
# ==== :four: Find Unseen Movies ====
watched_movies = set(user1_rated['movieId']).union(set(user2_rated['movieId']))
all_movie_ids = set(gl_df['movieId'].unique())  # Get all available movies
unseen_movies = list(all_movie_ids - watched_movies)  # Movies neither user has seen


# ==== :five: Predict Ratings for Unseen Movies ====
predictions_list = []
for movieId in unseen_movies:
    pred_user1 = model.predict(999999, movieId).est  # Fake user ID for prediction
    pred_user2 = model.predict(888888, movieId).est
    avg_rating = (pred_user1 + pred_user2) / 2  # :white_check_mark: Averaging both predictions
    predictions_list.append({'movieId': movieId, 'avg_rating': avg_rating})

# Convert predictions to a DataFrame and sort
df_recommendations = pd.DataFrame(predictions_list).sort_values(by='avg_rating', ascending=False)

# Merge with movie titles
df_recommendations = df_recommendations.merge(grouplens_movies[['movieId', 'title']], on='movieId', how='inner')

# Display top 5 recommendations
print("\nTop 5 Movie Recommendations for Both Users:")
print(df_recommendations[['title', 'avg_rating']].head(5))