## Edited SVD- minus Fuzzy

In [1]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")

Authenticated successfully!


In [118]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.accuracy import rmse
import pandas as pd

# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Define batch size & dataset properties
BATCH_SIZE = 1_000_000  # 100k rows per batch
TOTAL_ROWS = 32_000_000  # Approximate total dataset size
reader = Reader(rating_scale=(0.5, 5))

movie_metadata_query = """
SELECT movieId, title, imdbId, tmdbId
FROM `film-wizard-453315.Grouplens.movies_with_imdb`
"""

# Fetch metadata from BigQuery
movie_metadata_df = client.query(movie_metadata_query).to_dataframe()
print("Loaded movie metadata:", movie_metadata_df.shape)

# Convert movie metadata into a dictionary for fast lookup
movie_metadata = movie_metadata_df.set_index("movieId")[["title", "imdbId", "tmdbId"]]


### **GL df**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
ORDER BY RAND()
LIMIT 2000000
"""

# Fetch data from BigQuery for Surprise 3 columns limit
gl_df = client.query(sample_query).to_dataframe()
print("Loaded data from BigQuery:", gl_df.shape)
display(gl_df.head(5))

data = Dataset.load_from_df(gl_df, reader)

Loaded movie metadata: (87461, 4)
Loaded data from BigQuery: (2000000, 3)


Unnamed: 0,userId,movieId,rating
0,169430,25,2.5
1,30484,136235,2.5
2,89021,1584,5.0
3,20980,4285,3.0
4,7858,93126,1.0


In [119]:
movie_metadata_df["movieId"] = movie_metadata_df["movieId"].astype(int)  # Ensure movieId is integer

# Correctly convert to a dictionary
movie_metadata = movie_metadata_df.set_index("movieId")[["title", "imdbId", "tmdbId"]].to_dict(orient="index")

# Check sample output
print(list(movie_metadata.items())[:5])  # Should print first 5 movieId mappings

print(list(movie_metadata.items())[:5])  # Print first 5 movieId-title pairs


[(155121, {'title': 'Kshanam', 'imdbId': 5504168, 'tmdbId': 386427}), (168730, {'title': 'Mumford & Sons: We Wrote This Yesterday', 'imdbId': 6268930, 'tmdbId': 439121}), (157787, {'title': '.hack Liminality: In the Case of Mai Minase', 'imdbId': 371501, 'tmdbId': 93734}), (183935, {'title': 'Made in Italy', 'imdbId': 6917242, 'tmdbId': 500268}), (213620, {'title': 'Playtime with Destiny', 'imdbId': 11898442, 'tmdbId': 680842})]
[(155121, {'title': 'Kshanam', 'imdbId': 5504168, 'tmdbId': 386427}), (168730, {'title': 'Mumford & Sons: We Wrote This Yesterday', 'imdbId': 6268930, 'tmdbId': 439121}), (157787, {'title': '.hack Liminality: In the Case of Mai Minase', 'imdbId': 371501, 'tmdbId': 93734}), (183935, {'title': 'Made in Italy', 'imdbId': 6917242, 'tmdbId': 500268}), (213620, {'title': 'Playtime with Destiny', 'imdbId': 11898442, 'tmdbId': 680842})]


In [129]:
#Accept new user input...via csv for now

# gl_df['userId'].max()+1

new_user_ratings = pd.read_csv('/Users/adamdyerson/Downloads/updated_ollie_ratings2.csv')


In [130]:
# Assign a User ID to the new input

new_user_id = gl_df['userId'].max() + 1
new_user_ratings['userId'] = new_user_id  # Assign new userId to all rows
new_user_ratings

Unnamed: 0,Date,Name,Year,Letterboxd URI,Rating,MovieID,userId
0,09/05/2021,"Three Billboards Outside Ebbing, Missouri",2017,https://boxd.it/ceBS,4.5,177593.0,200949
1,16/05/2021,Nomadland,2020,https://boxd.it/lnRy,4.0,225145.0,200949
2,17/05/2021,Lady Bird,2017,https://boxd.it/dGNE,3.0,177615.0,200949
3,03/07/2021,Yesterday,2019,https://boxd.it/iF7M,2.5,201811.0,200949
4,03/07/2021,Coco,2017,https://boxd.it/bYJQ,3.5,177765.0,200949
5,05/07/2021,mid90s,2018,https://boxd.it/fxIa,4.0,,200949
6,05/07/2021,The Street,2019,https://boxd.it/ndKQ,4.0,263705.0,200949
7,21/07/2021,There Will Be Blood,2007,https://boxd.it/20Z2,3.5,56782.0,200949
8,21/07/2021,Pain and Glory,2019,https://boxd.it/iOBQ,3.0,202237.0,200949
9,22/07/2021,Senna,2010,https://boxd.it/kfE,3.5,85774.0,200949


In [131]:
trainset, testset = train_test_split(data, test_size=0.25)

# Create the SVD model and train it
model = SVD()
model.fit(trainset)

def get_top_n(predictions, n, movie_metadata):
    """Returns top N movie recommendations for each user with metadata."""
    top_n = {}

    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        
        # Fetch movie details from metadata dictionary
        movie_details = movie_metadata.get(iid, {"title": "Unknown Title", "imdbId": "N/A", "tmdbId": "N/A"})
        
        # Append movie details instead of just movieId
        top_n[uid].append((movie_details["title"], movie_details["imdbId"], movie_details["tmdbId"], round(est, 2)))

    # Sort the predictions for each user and return the top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[3], reverse=True)  # Sort by predicted rating
        top_n[uid] = user_ratings[:n]
    
    return top_n


In [128]:


# Perform 5-fold cross-validation
cv_results = cross_validate(model, data, cv=5, verbose=True)

# Compute average values for each metric
avg_fit_time = sum(cv_results['fit_time']) / len(cv_results['fit_time'])
avg_test_time = sum(cv_results['test_time']) / len(cv_results['test_time'])
avg_rmse = sum(cv_results['test_rmse']) / len(cv_results['test_rmse'])
avg_mae = sum(cv_results['test_mae']) / len(cv_results['test_mae'])

# Display results
print("\n===== Cross-Validation Results =====")
print(f"Average Fit Time: {avg_fit_time:.2f} sec")
print(f"Average Test Time: {avg_test_time:.2f} sec")
print(f"Average RMSE: {avg_rmse:.4f}")
print(f"Average MAE: {avg_mae:.4f}")


Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9050  0.9044  0.9064  0.9038  0.9064  0.9052  0.0011  
MAE (testset)     0.6931  0.6934  0.6943  0.6925  0.6944  0.6935  0.0007  
Fit time          10.04   9.69    10.66   9.53    10.25   10.04   0.40    
Test time         0.89    0.96    1.98    1.91    2.15    1.58    0.54    

===== Cross-Validation Results =====
Average Fit Time: 10.04 sec
Average Test Time: 1.58 sec
Average RMSE: 0.9052
Average MAE: 0.6935


In [132]:
# Get predictions for the testset
predictions = model.test(testset)

# Get the top 10 recommendations for each user
top_n = get_top_n(predictions, n=10, movie_metadata=movie_metadata)

# Get recommendations for a specific user (example: user ID 11157)
user_recommendations = top_n.get(11157, [])

# Display recommendations
print(f"Top 10 Recommendations for User 11157:")
for title, imdb_id, tmdb_id, rating in user_recommendations:
    print(f"- {title} (IMDb: {imdb_id}, TMDb: {tmdb_id}) -> Predicted Rating: {rating}")


Top 10 Recommendations for User 11157:


In [60]:
# top_n.get(200949)

In [109]:
gl_df.movieId.unique()

<IntegerArray>
[110975,   1517,   4734,   1639,   2248,   1282,  93326, 180297,   7899,
   6870,
 ...
 120789, 262879, 127220, 133255, 113080,   4818, 112741, 287229,  80812,
 187283]
Length: 27576, dtype: Int64

In [133]:
model.predict(200949, gl_df.movieId.unique()[0])[3]

3.764545220263573

In [134]:


result = pd.DataFrame()
for ind, i in enumerate(gl_df.movieId.unique()):
    rating = model.predict(200949, i)[3]
    result.loc[ind, 'movieId'] = int(i)
    result.loc[ind, 'Rating'] = rating
    # Lookup title from movie_metadata dictionary
    result.loc[ind, 'Title'] = movie_metadata.get(i, {}).get("title", "Unknown")
# Display the result
print(result)


        movieId    Rating                     Title
0          25.0  3.764545         Leaving Las Vegas
1      136235.0  3.364946           Hercules Reborn
2        1584.0  3.714225                   Contact
3        4285.0  3.447259        Frankie and Johnny
4       93126.0  3.361751              Bag of Bones
...         ...       ...                       ...
36039  117320.0  3.540448             The Last Word
36040  134763.0  3.522730             To Trap A Spy
36041  206111.0  3.434485                Baby Steps
36042   26314.0  3.523063  Cars That Ate Paris, The
36043  111119.0  3.455658     Man in the Wilderness

[36044 rows x 3 columns]


In [135]:
result.sort_values(by='Rating', ascending=False).head(10)

Unnamed: 0,movieId,Rating,Title
5549,159817.0,4.540486,Planet Earth
2756,171011.0,4.494497,Planet Earth II
2150,170705.0,4.466337,Band of Brothers
1614,46855.0,4.44016,Army of Shadows (L'armée des ombres)
12508,8684.0,4.422617,"Man Escaped, A (Un condamné à mort s'est écha..."
41,318.0,4.420636,"Shawshank Redemption, The"
4457,142115.0,4.407571,The Blue Planet
4402,26082.0,4.379367,Harakiri (Seppuku)
154,858.0,4.363542,"Godfather, The"
7375,7505.0,4.357391,"Kingdom, The (Riget)"


# Fuzzy Logic Code if needed

In [None]:
# ### Fuzzy matching ###
# ### **GL df**
# sample_query = """
# SELECT movieId, title
# FROM `film-wizard-453315.Grouplens.grouplens_movies`
# """

# # Fetch data from BigQuery
# grouplens_movies = client.query(sample_query).to_dataframe()
# print("Loaded data from BigQuery:", gl_df.shape)
# display(grouplens_movies.head(5))

In [None]:
# from fuzzywuzzy import fuzz, process
# import pandas as pd

# def fuzzy_match(df1, col1, df2, col2, threshold=80):
#     matched_data = []
#     choices = df2[col2].tolist()  # Convert column to a list to avoid unpacking issues

#     for name in df1[col1]:
#         result = process.extractOne(name, choices, scorer=fuzz.ratio)
#         match, score = result if result else ("", 0)  # Ensure safe unpacking
#         matched_data.append((name, match if score >= threshold else None, score))

#     return pd.DataFrame(matched_data, columns=[col1, 'Matched_Title', 'Score'])

In [None]:
# # Example usage
# matches_df = fuzzy_match(new_user_ratings, 'Name', grouplens_movies, 'title')
# display(matches_df)

In [None]:
# # Split the data into a training and test set
# trainset, testset = train_test_split(data, test_size=0.25)

# # Create the SVD model and train it
# model = SVD()
# model.fit(trainset)

# # Define a function to get top 5 recommendations for a user
# def get_top_n(predictions, n):
#     top_n = {}
#     for uid, iid, true_r, est, _ in predictions:
#         if uid not in top_n:
#             top_n[uid] = []
#         top_n[uid].append((iid, round(est, 2)))

#     # Sort the predictions for each user and return the top n
#     for uid, user_ratings in top_n.items():
#         user_ratings.sort(key=lambda x: x[1], reverse=True)
#         top_n[uid] = user_ratings[:n]
    
#     return top_n

# # Load your data from BigQuery (assuming you have already loaded gl_df)
# # Here's an example:
# # gl_df = pd.read_csv('your_data.csv')

## Lenskit

In [None]:
import os
from google.cloud import bigquery
import pandas as pd
from lenskit import crossfold
from lenskit.als import BiasedMFScorer # https://lkpy.lenskit.org/stable/guide/gettingstarted
from lenskit import util

# Initialize BigQuery client
client = bigquery.Client(project="film-wizard-453315")
print("Authenticated successfully!")

# Define batch size & dataset properties
BATCH_SIZE = 100_000  # 100k rows per batch
TOTAL_ROWS = 500_000  # Approximate total dataset size

# Fetch 500k ratings from BigQuery randomly (for the entire dataset)
sample_query = """
SELECT userId, movieId, rating
FROM film-wizard-453315.Grouplens.500k_ratings
ORDER BY RAND()
"""
sample_gl_df = client.query(sample_query).to_dataframe()

# Rename columns to match LensKit's expected column names
sample_gl_df = sample_gl_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
sample_gl_df

In [None]:
# I ned to define cv, hyperparam grid and then do a gridsearch

# Train the model
model = als.BiasedMF(features=3, reg=0.01, rng_spec=42)

# Fit the model with your data
model.fit(sample_gl_df[['user_id', 'item_id', 'rating']])

# Create a scorer from the model
scorer = model.scorer()

# # ---------------------------

# # Prepare data for training
# train_data = sample_gl_df[['user', 'item', 'rating']]

# # Define cross-validation with LensKit
# folds = crossfold.sample_users(train_data, 5, size=0.2, method='random')  # 5-fold cross-validation

# # Define a function for ALS model training
# def train_als_model(train_data, factors=10, reg=0.01):
#     algo = als.ALS(factors=factors, reg=reg, random_state=42)
#     algo.fit(train_data)
#     return algo

# # Example hyperparameter grid (number of factors and regularization)
# param_grid = {
#     "factors": [5, 10, 50],
#     "reg": [0.001, 0.01, 0.05]
# }

# # Manually grid search
# best_rmse = float('inf')
# best_params = {}

# for factors in param_grid["factors"]:
#     for reg in param_grid["reg"]:
#         fold_rmse = []
        
#         for train, test in folds:
#             model = train_als_model(train, factors=factors, reg=reg)
            
#             # Ensure the test data is in the form of a list of tuples (user, item)
#             user_item_pairs = list(test[['user', 'item']].itertuples(index=False, name=None))
            
#             # Make predictions for these user-item pairs
#             pred = model.predict(user_item_pairs)
            
#             # Calculate RMSE
#             rmse_val = util.rmse(pred, test['rating'])
#             fold_rmse.append(rmse_val)
        
#         mean_rmse = sum(fold_rmse) / len(fold_rmse)
#         print(f"Factors: {factors}, Reg: {reg}, RMSE: {mean_rmse}")
        
#         if mean_rmse < best_rmse:
#             best_rmse = mean_rmse
#             best_params = {'factors': factors, 'reg': reg}

# print(f"Best Params: {best_params}")

In [None]:
# Train the model using best hyperparameters from grid search
best_algo = train_als_model(train_data, best_params['factors'], best_params['reg'])

# Perform cross-validation to evaluate performance
fold_rmse = []

for train, test in folds:
    model = train_als_model(train, best_params['factors'], best_params['reg'])
    pred = model.predict(test[['userId', 'movieId']])
    rmse_val = util.rmse(pred, test['rating'])
    fold_rmse.append(rmse_val)

print(f"Cross-validation RMSE: {sum(fold_rmse)/len(fold_rmse)}")

In [None]:
# Train on full dataset in batches
for offset in range(0, TOTAL_ROWS, BATCH_SIZE):
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM film-wizard-453315.Grouplens.500k_ratings
    ORDER BY userId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    
    batch_df = client.query(batch_query).to_dataframe()
    
    if batch_df.empty:
        break  # Stop when there are no more rows
    
    train_data = batch_df[['userId', 'movieId', 'rating']]
    
    # Train the model on this batch
    best_algo.fit(train_data)
    print(f"✅ Processed {offset + BATCH_SIZE} rows")
    

In [None]:
# Predict rating for a specific user and movie
user_id = 10531
movie_id = 100

# Make prediction using the trained ALS model
prediction = best_algo.predict(user_id, movie_id)
print(f"Predicted rating for user {user_id} and movie {movie_id}: {prediction}")

## Notes from original SVD jn

In [None]:
# Path to your CSV file
csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/Adam films/9a884b4e-8993-4800-925a-bea11dcce39e.csv"

# Load the CSV into a DataFrame
movies_df = pd.read_csv(csv_path)

# Convert the 'date' column to datetime if it's not already
movies_df['Date'] = pd.to_datetime(movies_df['Date'])

# Filter out movies released in or after October 2023
movies_df = movies_df[movies_df['Date'] < '2023-10-01']

#Filter out movies that Grouplens later has an issue matching with
# Exclude specific movies by title
movies_to_exclude = ["Herod's Law", "Spirited Away", "Sing", "Living", "Fury"]  # Replace with actual movie titles
movies_df = movies_df[~movies_df['Name'].isin(movies_to_exclude)]

##### Half the ratings to be out of 5 - only for Adam #####
movies_df['Rating'] = movies_df['Rating']/2

# Sort by 'date' in descending order to get the most recent entries
oliver_recent_movies = movies_df.sort_values(by='Date', ascending=False).head(10)

# Show the most recent 10 movies and their ratings
display(oliver_recent_movies)

In [None]:
from fuzzywuzzy import fuzz, process

# Set up BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Query to fetch the relevant data from BigQuery
query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
# Fetch data from BigQuery and load it into a DataFrame
grouplens_movies_df = client.query(query).to_dataframe()

# Show the first few rows of the DataFrame
display('grouplens_movies_df:')
display(grouplens_movies_df.head())

# Perform fuzzy matching between 'title' in 'oliver_recent_movies' and 'title' in 'grouplens_movies_df'
def get_best_match(title, choices, scorer=fuzz.ratio):
    """Fuzzy match using fuzz.ratio and return the best match."""
    match = process.extractOne(title, choices, scorer=scorer)
    return match[0] if match else None

# Apply fuzzy matching to the titles in 'oliver_recent_movies'
oliver_recent_movies['matched_title'] = oliver_recent_movies['Name'].apply(
    get_best_match, args=(grouplens_movies_df['title'],)
)

# Merge the original 'oliver_recent_movies' DataFrame with 'grouplens_movies_df' based on the 'matched_title'
test_movies_with_ids = pd.merge(
    oliver_recent_movies, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='matched_title', 
    right_on='title', 
    how='left'
)

##### Drop the 'matched_title' column and any other unnecessary columns - didn't use with Adam's csv
# test_movies_with_ids = test_movies_with_ids.drop(columns=['matched_title', 'Letterboxd URI', 'title', 'Date'])

In [None]:
test_movies_with_ids.drop_duplicates(inplace=True)

# Show the final DataFrame
display(test_movies_with_ids)

In [None]:
# Get all movie IDs in the GL dataset (this could be all movies in the system or a smaller list)
all_movie_ids = full_gl_df['movieId'].unique()

# Find unwatched movies for user 1 (exclude movies that user 1 has already rated)
unwatched_movie_ids = [i for i in all_movie_ids if i not in test_movies_with_ids['movieId'].values]

# Make predictions for the unwatched movies
predictions = [best_algo.predict(1, movie_id) for movie_id in unwatched_movie_ids]

# Sort the predictions by predicted rating (descending order)
predictions.sort(key=lambda x: x.est, reverse=True)

# Extract top X recommended movies with their predicted ratings
recommended_movies = [(pred.iid, pred.est) for pred in predictions[:10]]

# Convert to DataFrame
recommended_df = pd.DataFrame(recommended_movies, columns=["Movie ID", "Predicted Rating"])
display(recommended_df)

In [None]:
display(recommended_df.info())
display(grouplens_movies_df.info())

# Bring back titles
final_recommendation = pd.merge(
    recommended_df, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='Movie ID', 
    right_on='movieId', 
    how='left'
)

final_recommendation

## Understanding the Grouplens dataset

In [None]:
# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Pull table from bq
sample_query = """
SELECT *
FROM `film-wizard-453315.Grouplens.grouplens_movies`
ORDER BY RAND()
LIMIT 1000  -- Adjust for ~1% of 32M rows
"""

test_df = client.query(sample_query).to_dataframe()
test_df.head(10)

In [None]:
import pickle

with open('SVD_film_model.pkl', 'rb') as file:
    best_algo_test = pickle.load(file)

In [None]:
# Make predictions for the unwatched movies
predictions = best_algo.predict(157707, 1)
predictions

# Sort the predictions by predicted rating (descending order)
# predictions.sort(key=lambda x: x.est, reverse=True)

In [None]:
reader.

## Online example of Surprise

##### https://medium.com/@ckucewicz21/building-a-simple-movie-recommendation-system-with-surprise-6e61479e1e73