## Final version - simplified SVD

In [6]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")

Authenticated successfully!


In [7]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import train_test_split, cross_validate, GridSearchCV
from surprise.accuracy import rmse
import pandas as pd

# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Define batch size & dataset properties
BATCH_SIZE = 1_000_000  # 100k rows per batch
TOTAL_ROWS = 32_000_000  # Approximate total dataset size
reader = Reader(rating_scale=(0.5, 5))

### **GL df**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
ORDER BY RAND()
LIMIT 1000000
"""

# Fetch data from BigQuery
gl_df = client.query(sample_query).to_dataframe()
print("Loaded data from BigQuery:", gl_df.shape)
display(gl_df.head(5))

data = Dataset.load_from_df(gl_df, reader)



Loaded data from BigQuery: (1000000, 3)


Unnamed: 0,userId,movieId,rating
0,29022,6548,4.0
1,38843,260,4.0
2,3035,8678,4.0
3,195624,5459,1.5
4,87064,36,4.0


In [8]:
gl_df['userId'].max()+1

new_user_ratings = pd.read_csv('/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/letterboxd-oliverramsay-2025-03-13-15-05-utc/new_user_ratings.csv')
new_user_ratings = new_user_ratings[new_user_ratings['Year'] <= 2022]

In [9]:
new_user_ratings.drop(columns=['Date', 'Year', 'Letterboxd URI'], inplace=True)
new_user_ratings.dropna(inplace=True)

In [10]:
new_user_ratings

Unnamed: 0,Name,Rating
0,"Three Billboards Outside Ebbing, Missouri",4.5
1,Nomadland,4.0
2,Lady Bird,3.0
3,Yesterday,2.5
4,Coco,3.5
5,mid90s,4.0
6,The Street,4.0
7,There Will Be Blood,3.5
8,Pain and Glory,3.0
9,Senna,3.5


In [11]:
### Fuzzy matching ###
### **GL df**
sample_query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""

# Fetch data from BigQuery
grouplens_movies = client.query(sample_query).to_dataframe()
print("Loaded data from BigQuery:", gl_df.shape)
display(grouplens_movies.head(5))



Loaded data from BigQuery: (1000000, 3)


Unnamed: 0,movieId,title
0,181685,به نام پدر
1,139596,Danielův svět
2,151307,The Lovers and the Despot
3,178223,Injecting Aluminum
4,189345,The Doctor From India


In [16]:
from fuzzywuzzy import fuzz, process
import pandas as pd

def fuzzy_match(df1, col1, df2, col2, threshold=80):
    matched_data = []
    choices = df2[col2].tolist()  # Convert column to a list to avoid unpacking issues

    for name in df1[col1]:
        result = process.extractOne(name, choices, scorer=fuzz.ratio)
        match, score = result if result else ("", 0)  # Ensure safe unpacking
        matched_data.append((name, match if score >= threshold else None, score))

    return pd.DataFrame(matched_data, columns=[col1, 'Matched_Title', 'Score'])

In [17]:
# Example usage
matches_df = fuzzy_match(new_user_ratings, 'Name', grouplens_movies, 'title')
display(matches_df)

Unnamed: 0,Name,Matched_Title,Score
0,"Three Billboards Outside Ebbing, Missouri","Three Billboards Outside Ebbing, Missouri",100
1,Nomadland,Nomadland,100
2,Lady Bird,Lady Bird,100
3,Yesterday,Yesterday,100
4,Coco,Coco,100
5,mid90s,Mid90s,100
6,The Street,The Street,100
7,There Will Be Blood,There Will Be Blood,100
8,Pain and Glory,Pain and Glory,100
9,Senna,Senna,100


In [14]:
# Split the data into a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

# Create the SVD model and train it
model = SVD()
model.fit(trainset)

# Define a function to get top 5 recommendations for a user
def get_top_n(predictions, n):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for each user and return the top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

# Load your data from BigQuery (assuming you have already loaded gl_df)
# Here's an example:
# gl_df = pd.read_csv('your_data.csv')

In [15]:
# **2️⃣ Cross-Validation Before Batch Training**
cv_results = cross_validate(model, data, cv=5, verbose=True)
print(cv_results)

Evaluating RMSE, MAE of algorithm SVD on 5 split(s).

                  Fold 1  Fold 2  Fold 3  Fold 4  Fold 5  Mean    Std     
RMSE (testset)    0.9203  0.9204  0.9196  0.9200  0.9184  0.9197  0.0007  
MAE (testset)     0.7069  0.7075  0.7060  0.7063  0.7062  0.7066  0.0005  
Fit time          10.16   10.31   10.79   10.54   10.75   10.51   0.25    
Test time         1.15    1.88    1.88    1.81    1.83    1.71    0.28    
{'test_rmse': array([0.92025165, 0.92044244, 0.91963211, 0.91995995, 0.91844325]), 'test_mae': array([0.70689903, 0.70745444, 0.70601073, 0.70631366, 0.70616308]), 'fit_time': (10.158315181732178, 10.31357216835022, 10.790318012237549, 10.543659925460815, 10.752399921417236), 'test_time': (1.1525812149047852, 1.8755269050598145, 1.88232421875, 1.8144030570983887, 1.825963020324707)}


In [23]:
# Get predictions for the testset
predictions = model.test(testset)

# Get the top 10 recommendations for each user
top_n = get_top_n(predictions, n=10)
top_n.get(11157)

[(1193, 4.204463778449138), (858, 4.125093975694352)]

In [None]:
model.predict()

In [None]:
test = {
    132721: [(3030, 4.343930472863401),
  (3147, 4.173104100455896),
  (8014, 4.10402787200236)],
 134273: [(69481, 3.2115050041528552),
  (2917, 3.147696080876573),
  (1227, 3.14304677816368)],
 163615: [(1196, 4.082660659362825),
  (7099, 4.038083503624332),
  (608, 3.769995671477647)]
}

In [None]:
test.items()

In [None]:
# Convert user recommendations into a single DataFrame
data = []
for user_id, movies in top_n.items():
    for movie_id, rating in movies:
        data.append((user_id, movie_id, rating))

df = pd.DataFrame(data, columns=["userId", "movieId", "rating"])
df

In [None]:
# Count occurrences of each movieId
movie_counts = df['movieId'].value_counts()

# Filter rows where movieId appears more than once
duplicated_movies_df = df[df['movieId'].isin(movie_counts[movie_counts > 1].index)]

print(duplicated_movies_df)

In [None]:
df[df['movieId'] == 55820]

In [None]:
test_dict = {
    'userId': [132721],
    'movieId': [3030],
    'rating': [4.343930472863401]
}

test_df = pd.DataFrame(data=test_dict)
test_df

In [None]:
# Initialize an empty dictionary to store DataFrames
user_dfs = {}

# Loop through each userId and create a DataFrame
for user_id in top_n:
    movie_data = top_n[user_id]  # Get movie list for the user
    movie_ids = []
    predicted_ratings = []
    
    # Extract movieId and predicted_rating
    for movie in movie_data:
        movie_ids.append(movie[0])
        predicted_ratings.append(movie[1])
    
    # Create DataFrame
    df = pd.DataFrame({"movieId": movie_ids, "predicted_rating": predicted_ratings})
    
    # Store in dictionary
    user_dfs[user_id] = df

# Display example output
for user_id in user_dfs:
    print(f"User {user_id} recommendations:")
    print(user_dfs[user_id], "\n")

## SVD Grid Search User Review Model

In [None]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")

In [None]:
from surprise import SVD, Dataset, Reader
from surprise.model_selection import cross_validate, GridSearchCV
from surprise.accuracy import rmse
from google.cloud import bigquery
import pandas as pd

# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Define batch size & dataset properties
BATCH_SIZE = 100_000  # 100k rows per batch
TOTAL_ROWS = 500_000  # Approximate total dataset size
reader = Reader(rating_scale=(0.5, 5))

### **GL df with  500k randomly sampled ratings**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.500k_ratings`
ORDER BY RAND()
"""
sample_gl_df = client.query(sample_query).to_dataframe()
sample_gl_df.head(5)

In [None]:
reader = Reader(rating_scale=(0.5, 5))
data = Dataset.load_from_df(sample_gl_df, reader)

# Split the data into a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

# Create the SVD model and train it
model = SVD()
model.fit(trainset)

# Define a function to get top 5 recommendations for a user
def get_top_n(predictions, n):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for each user and return the top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

# **2️⃣ Cross-Validation Before Batch Training**
cv_results = cross_validate(model, data, cv=5, verbose=True)
print(cv_results)

# Get predictions for the testset
predictions = model.test(testset)

# Get the top 5 recommendations for each user
top_n = get_top_n(predictions, n=50)
top_n

In [None]:
import seaborn as sns

grouped_rating_df = sample_gl_df.groupby(by='movieId').count().sort_values(by='movieId', ascending=True)
display(grouped_rating_df)

# sns.histplot(data=grouped_rating_df, x='movieId')

# sample_gl_df['quartile'] = pd.qcut(sample_gl_df['rating'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
# sample_gl_df

# sample_gl_df.groupby(by='quartile').count()

In [None]:
grouped_df = sample_gl_df.groupby('movieId').count().reset_index(names='num_reviews')
grouped_df

# grouped_df['quartile'] = pd.qcut(grouped_df['num_reviews'], q=4, labels=['Q1', 'Q2', 'Q3', 'Q4'])
# group_df['quartile']

In [None]:
### **1️⃣ Perform Grid Search on Small Sample**
sample_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.500k_ratings`
ORDER BY RAND()
LIMIT 100000  -- Adjust for ~20% of 500k rows
"""
grid_search_df = client.query(sample_query).to_dataframe()
data = Dataset.load_from_df(grid_search_df[['userId', 'movieId', 'rating']], reader)

# Hyperparameter tuning
param_grid = {
    "n_factors": [10, 20, 50],  
    "reg_all": [0.01, 0.03, 0.05]  
}
gs = GridSearchCV(SVD, param_grid, measures=["rmse"], cv=3)
gs.fit(data)

best_params = gs.best_params["rmse"]
print("Best Params:", best_params)

In [None]:
# **2️⃣ Cross-Validation Before Batch Training**
best_algo = SVD(n_factors=best_params["n_factors"], reg_all=best_params["reg_all"])
cv_results = cross_validate(best_algo, data, cv=5, verbose=True)
print(cv_results)

In [None]:
# **3️⃣ Train on Full Dataset in Batches**
for offset in range(0, TOTAL_ROWS, BATCH_SIZE):
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM `film-wizard-453315.Grouplens.500k_ratings`
    ORDER BY userId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    
    batch_df = client.query(batch_query).to_dataframe()
    
    if batch_df.empty:
        break  # Stop when there are no more rows
    
    dataset = Dataset.load_from_df(batch_df[['userId', 'movieId', 'rating']], reader)
    trainset = dataset.build_full_trainset()
    
    best_algo.fit(trainset)
    print(f"✅ Processed {offset + BATCH_SIZE} rows")

In [None]:
### **4️⃣ Evaluate Best Model on a Fresh Test Set**
test_query = """
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.500k_ratings`
ORDER BY RAND()
LIMIT 1500000  -- Adjust for ~5% of 32M rows
"""
test_df = client.query(test_query).to_dataframe()
testset = Dataset.load_from_df(test_df[['userId', 'movieId', 'rating']], reader).build_full_trainset().build_testset()

predictions = best_algo.test(testset)
print("Final RMSE on test set:", rmse(predictions))

In [None]:
movieId_list = sample_gl_df['movieId']

predictions_list = []

for x in movieId_list:
    predictions = best_algo.predict(2000, x)
    predictions_list.append(predictions)
    
predictions_list

In [None]:
import matplotlib.pyplot as plt

# Extract movie IDs (iid) and predicted ratings (est)
movie_ids = [pred.iid for pred in predictions_list]
est_values = [pred.est for pred in predictions_list]

# Plot the predictions
plt.figure(figsize=(12, 6))
plt.scatter(movie_ids, est_values, alpha=0.5)
plt.xlabel("Movie ID")
plt.ylabel("Predicted Rating (est)")
plt.title("Predicted Ratings for Each Movie")
plt.show()

## Lenskit

In [None]:
import os
from google.cloud import bigquery
import pandas as pd
from lenskit import crossfold
from lenskit.als import BiasedMFScorer # https://lkpy.lenskit.org/stable/guide/gettingstarted
from lenskit import util

# Initialize BigQuery client
client = bigquery.Client(project="film-wizard-453315")
print("Authenticated successfully!")

# Define batch size & dataset properties
BATCH_SIZE = 100_000  # 100k rows per batch
TOTAL_ROWS = 500_000  # Approximate total dataset size

# Fetch 500k ratings from BigQuery randomly (for the entire dataset)
sample_query = """
SELECT userId, movieId, rating
FROM film-wizard-453315.Grouplens.500k_ratings
ORDER BY RAND()
"""
sample_gl_df = client.query(sample_query).to_dataframe()

# Rename columns to match LensKit's expected column names
sample_gl_df = sample_gl_df.rename(columns={'userId': 'user_id', 'movieId': 'item_id'})
sample_gl_df

In [None]:
# I ned to define cv, hyperparam grid and then do a gridsearch

# Train the model
model = als.BiasedMF(features=3, reg=0.01, rng_spec=42)

# Fit the model with your data
model.fit(sample_gl_df[['user_id', 'item_id', 'rating']])

# Create a scorer from the model
scorer = model.scorer()

# # ---------------------------

# # Prepare data for training
# train_data = sample_gl_df[['user', 'item', 'rating']]

# # Define cross-validation with LensKit
# folds = crossfold.sample_users(train_data, 5, size=0.2, method='random')  # 5-fold cross-validation

# # Define a function for ALS model training
# def train_als_model(train_data, factors=10, reg=0.01):
#     algo = als.ALS(factors=factors, reg=reg, random_state=42)
#     algo.fit(train_data)
#     return algo

# # Example hyperparameter grid (number of factors and regularization)
# param_grid = {
#     "factors": [5, 10, 50],
#     "reg": [0.001, 0.01, 0.05]
# }

# # Manually grid search
# best_rmse = float('inf')
# best_params = {}

# for factors in param_grid["factors"]:
#     for reg in param_grid["reg"]:
#         fold_rmse = []
        
#         for train, test in folds:
#             model = train_als_model(train, factors=factors, reg=reg)
            
#             # Ensure the test data is in the form of a list of tuples (user, item)
#             user_item_pairs = list(test[['user', 'item']].itertuples(index=False, name=None))
            
#             # Make predictions for these user-item pairs
#             pred = model.predict(user_item_pairs)
            
#             # Calculate RMSE
#             rmse_val = util.rmse(pred, test['rating'])
#             fold_rmse.append(rmse_val)
        
#         mean_rmse = sum(fold_rmse) / len(fold_rmse)
#         print(f"Factors: {factors}, Reg: {reg}, RMSE: {mean_rmse}")
        
#         if mean_rmse < best_rmse:
#             best_rmse = mean_rmse
#             best_params = {'factors': factors, 'reg': reg}

# print(f"Best Params: {best_params}")

In [None]:
# Train the model using best hyperparameters from grid search
best_algo = train_als_model(train_data, best_params['factors'], best_params['reg'])

# Perform cross-validation to evaluate performance
fold_rmse = []

for train, test in folds:
    model = train_als_model(train, best_params['factors'], best_params['reg'])
    pred = model.predict(test[['userId', 'movieId']])
    rmse_val = util.rmse(pred, test['rating'])
    fold_rmse.append(rmse_val)

print(f"Cross-validation RMSE: {sum(fold_rmse)/len(fold_rmse)}")

In [None]:
# Train on full dataset in batches
for offset in range(0, TOTAL_ROWS, BATCH_SIZE):
    batch_query = f"""
    SELECT userId, movieId, rating
    FROM film-wizard-453315.Grouplens.500k_ratings
    ORDER BY userId
    LIMIT {BATCH_SIZE} OFFSET {offset}
    """
    
    batch_df = client.query(batch_query).to_dataframe()
    
    if batch_df.empty:
        break  # Stop when there are no more rows
    
    train_data = batch_df[['userId', 'movieId', 'rating']]
    
    # Train the model on this batch
    best_algo.fit(train_data)
    print(f"✅ Processed {offset + BATCH_SIZE} rows")
    

In [None]:
# Predict rating for a specific user and movie
user_id = 10531
movie_id = 100

# Make prediction using the trained ALS model
prediction = best_algo.predict(user_id, movie_id)
print(f"Predicted rating for user {user_id} and movie {movie_id}: {prediction}")

## Notes from original SVD jn

In [None]:
# Path to your CSV file
csv_path = "/Users/oliverramsaygray/code/oliverramsaygray/film_wizard/raw_data/Adam films/9a884b4e-8993-4800-925a-bea11dcce39e.csv"

# Load the CSV into a DataFrame
movies_df = pd.read_csv(csv_path)

# Convert the 'date' column to datetime if it's not already
movies_df['Date'] = pd.to_datetime(movies_df['Date'])

# Filter out movies released in or after October 2023
movies_df = movies_df[movies_df['Date'] < '2023-10-01']

#Filter out movies that Grouplens later has an issue matching with
# Exclude specific movies by title
movies_to_exclude = ["Herod's Law", "Spirited Away", "Sing", "Living", "Fury"]  # Replace with actual movie titles
movies_df = movies_df[~movies_df['Name'].isin(movies_to_exclude)]

##### Half the ratings to be out of 5 - only for Adam #####
movies_df['Rating'] = movies_df['Rating']/2

# Sort by 'date' in descending order to get the most recent entries
oliver_recent_movies = movies_df.sort_values(by='Date', ascending=False).head(10)

# Show the most recent 10 movies and their ratings
display(oliver_recent_movies)

In [None]:
from fuzzywuzzy import fuzz, process

# Set up BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Query to fetch the relevant data from BigQuery
query = """
SELECT movieId, title
FROM `film-wizard-453315.Grouplens.grouplens_movies`
"""
# Fetch data from BigQuery and load it into a DataFrame
grouplens_movies_df = client.query(query).to_dataframe()

# Show the first few rows of the DataFrame
display('grouplens_movies_df:')
display(grouplens_movies_df.head())

# Perform fuzzy matching between 'title' in 'oliver_recent_movies' and 'title' in 'grouplens_movies_df'
def get_best_match(title, choices, scorer=fuzz.ratio):
    """Fuzzy match using fuzz.ratio and return the best match."""
    match = process.extractOne(title, choices, scorer=scorer)
    return match[0] if match else None

# Apply fuzzy matching to the titles in 'oliver_recent_movies'
oliver_recent_movies['matched_title'] = oliver_recent_movies['Name'].apply(
    get_best_match, args=(grouplens_movies_df['title'],)
)

# Merge the original 'oliver_recent_movies' DataFrame with 'grouplens_movies_df' based on the 'matched_title'
test_movies_with_ids = pd.merge(
    oliver_recent_movies, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='matched_title', 
    right_on='title', 
    how='left'
)

##### Drop the 'matched_title' column and any other unnecessary columns - didn't use with Adam's csv
# test_movies_with_ids = test_movies_with_ids.drop(columns=['matched_title', 'Letterboxd URI', 'title', 'Date'])

In [None]:
test_movies_with_ids.drop_duplicates(inplace=True)

# Show the final DataFrame
display(test_movies_with_ids)

In [None]:
# Get all movie IDs in the GL dataset (this could be all movies in the system or a smaller list)
all_movie_ids = full_gl_df['movieId'].unique()

# Find unwatched movies for user 1 (exclude movies that user 1 has already rated)
unwatched_movie_ids = [i for i in all_movie_ids if i not in test_movies_with_ids['movieId'].values]

# Make predictions for the unwatched movies
predictions = [best_algo.predict(1, movie_id) for movie_id in unwatched_movie_ids]

# Sort the predictions by predicted rating (descending order)
predictions.sort(key=lambda x: x.est, reverse=True)

# Extract top X recommended movies with their predicted ratings
recommended_movies = [(pred.iid, pred.est) for pred in predictions[:10]]

# Convert to DataFrame
recommended_df = pd.DataFrame(recommended_movies, columns=["Movie ID", "Predicted Rating"])
display(recommended_df)

In [None]:
display(recommended_df.info())
display(grouplens_movies_df.info())

# Bring back titles
final_recommendation = pd.merge(
    recommended_df, 
    grouplens_movies_df[['title', 'movieId']], 
    left_on='Movie ID', 
    right_on='movieId', 
    how='left'
)

final_recommendation

## Understanding the Grouplens dataset

In [None]:
# Define BigQuery client
client = bigquery.Client(project="film-wizard-453315")

# Pull table from bq
sample_query = """
SELECT *
FROM `film-wizard-453315.Grouplens.grouplens_movies`
ORDER BY RAND()
LIMIT 1000  -- Adjust for ~1% of 32M rows
"""

test_df = client.query(sample_query).to_dataframe()
test_df.head(10)

In [None]:
import pickle

with open('SVD_film_model.pkl', 'rb') as file:
    best_algo_test = pickle.load(file)

In [None]:
# Make predictions for the unwatched movies
predictions = best_algo.predict(157707, 1)
predictions

# Sort the predictions by predicted rating (descending order)
# predictions.sort(key=lambda x: x.est, reverse=True)

In [None]:
reader.

## Online example of Surprise

##### https://medium.com/@ckucewicz21/building-a-simple-movie-recommendation-system-with-surprise-6e61479e1e73

In [None]:
from surprise import Dataset, Reader
from surprise import SVD
from surprise.model_selection import train_test_split
from surprise import accuracy

# Load the MovieLens 100k dataset (this is built into Surprise)
data = Dataset.load_builtin('ml-100k')

In [None]:
# Split the data into a training and test set
trainset, testset = train_test_split(data, test_size=0.25)

# Create the SVD model and train it
model = SVD()
model.fit(trainset)

In [None]:
# Define a function to get top 5 recommendations for a user
def get_top_n(predictions, n):
    top_n = {}
    for uid, iid, true_r, est, _ in predictions:
        if uid not in top_n:
            top_n[uid] = []
        top_n[uid].append((iid, est))

    # Sort the predictions for each user and return the top n
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    
    return top_n

# Get predictions for the testset
predictions = model.test(testset)

# Get the top 5 recommendations for each user
top_n = get_top_n(predictions, n=50)

#-------------------------------

# Get top recommendations for a specific user (e.g., user ID = 1)
user_id = '825'
user_top_n = top_n.get(user_id, [])

# Convert to DataFrame
df_user_top_825 = pd.DataFrame(user_top_n, columns=['Item ID', 'Estimated Rating'])

# Get top recommendations for a specific user (e.g., user ID = 1)
user_id = '253'
user_top_n = top_n.get(user_id, [])

# Convert to DataFrame
df_user_top_253 = pd.DataFrame(user_top_n, columns=['Item ID', 'Estimated Rating'])

# Find common Item IDs between the two DataFrames
common_item_ids = df_user_top_825[df_user_top_825['Item ID'].isin(df_user_top_253['Item ID'])]

# Display the common Item IDs
display(common_item_ids)

# Display the DataFrame
display(df_user_top_825, df_user_top_253)

# # Print the top 5 recommendations for a specific user (user_id = 1)
# print(f"Top 5 movie recommendations for user 825: {top_n.get('825')}")

In [None]:
display(top_n)

In [None]:
# Print the top 5 recommendations for a specific user (user_id = 1)
print(f"Top 5 movie recommendations for user 622: {top_n.get('622')}")