In [13]:
# Updated Code with Areas for Consideration

# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from google.cloud import bigquery
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score
from surprise import Dataset, Reader

# Initialize BigQuery client
PROJECT_ID = "film-wizard-453315"
DATASET_ID = "Grouplens"
client = bigquery.Client(project=PROJECT_ID)

# Fetch data in batches from BigQuery to avoid memory issues
# Scalability: This prevents excessive memory usage

def fetch_data(batch_size=250000):
    query = f'''
    SELECT userId, movieId, rating 
    FROM `{PROJECT_ID}.{DATASET_ID}.raw_grouplens_ratings`
    LIMIT {batch_size}
    '''
    return client.query(query).to_dataframe()

# Load data
ratings_df = fetch_data()
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Create User & Movie ID mappings
user_to_idx = {user: i for i, user in enumerate(ratings_df['userId'].unique())}
movie_to_idx = {movie: i for i, movie in enumerate(ratings_df['movieId'].unique())}

ratings_df['user_idx'] = ratings_df['userId'].map(user_to_idx)
ratings_df['movie_idx'] = ratings_df['movieId'].map(movie_to_idx)

# Define Data Generator for Training

def data_generator(data, batch_size=1024):
    while True:
        for i in range(0, len(data), batch_size):
            batch = data.iloc[i:i+batch_size]
            X_batch = (batch['user_idx'].values, batch['movie_idx'].values)
            y_batch = batch['rating'].values
            yield X_batch, y_batch  # Ensure correct tuple format

# Define Deep Learning Model
num_users = len(user_to_idx)
num_movies = len(movie_to_idx)
embedding_size = 150  # Increased for better learning

user_input = keras.Input(shape=(1,))
movie_input = keras.Input(shape=(1,))

# Added L2 regularization to embeddings
user_embedding = keras.layers.Embedding(input_dim=num_users, output_dim=embedding_size,
                                        embeddings_regularizer=keras.regularizers.l2(0.001))(user_input)
movie_embedding = keras.layers.Embedding(input_dim=num_movies, output_dim=embedding_size,
                                         embeddings_regularizer=keras.regularizers.l2(0.001))(movie_input)

user_vec = keras.layers.Flatten()(user_embedding)
movie_vec = keras.layers.Flatten()(movie_embedding)

concat = keras.layers.Concatenate()([user_vec, movie_vec])
dense1 = keras.layers.Dense(128, activation='relu')(concat)
dropout1 = keras.layers.Dropout(0.4)(dense1)  # Increased dropout for regularization
dense2 = keras.layers.Dense(64, activation='relu')(dropout1)
dropout2 = keras.layers.Dropout(0.4)(dense2)  # Increased dropout for better generalization
output = keras.layers.Dense(1, activation='linear')(dropout2)

model = keras.Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.0001), loss=tf.keras.losses.Huber(), metrics=['mae'])  # Lower learning rate

# Add Early Stopping
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)

# Train Model with Batch Processing
batch_size = 1024
train_steps = len(ratings_df) // batch_size

# Split data into training (90%) and validation (10%)
train_data = ratings_df.sample(frac=0.9, random_state=42)
val_data = ratings_df.drop(train_data.index)

model.fit(
    data_generator(train_data, batch_size),
    steps_per_epoch=len(train_data) // batch_size,
    validation_data=data_generator(val_data, batch_size),
    validation_steps=len(val_data) // batch_size,
    epochs=20,
    callbacks=[early_stopping]
)

# Additional Evaluation Metrics
# 1. Precision & Recall
# 2. Diversity & Serendipity - To ensure recommendations are not too homogeneous

from collections import defaultdict

def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Generate predictions using batch processing
val_users = val_data['user_idx'].values
val_movies = val_data['movie_idx'].values

# Predict in batches to reduce excessive print logs
predicted_ratings = model.predict([val_users, val_movies], batch_size=1024).flatten()

# Ensure y_pred has the correct shape
y_true = val_data['rating'].values
y_pred = predicted_ratings.reshape(-1)  # Flatten in case of incorrect shape

rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # Compute RMSE manually
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

# Combine predictions with actual values
predictions = list(zip(val_data['userId'], val_data['movieId'], val_data['rating'], y_pred, [None]*len(val_data)))

# Compute top recommendations
top_n = get_top_n(predictions, n=10)

# Compute diversity: Measure how varied recommendations are
unique_movies = set()
for uid, user_ratings in top_n.items():
    unique_movies.update([iid for iid, _ in user_ratings])
diversity_score = len(unique_movies) / len(set(ratings_df['movieId']))

# Print evaluation results
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R2 Score: {r2:.4f}")
print(f"Diversity Score: {diversity_score:.4f}")

# Explanation Component: Justifying recommendations
# This helps users understand why a particular recommendation was made

def explain_recommendation(user_id, top_n, ratings_df):
    if user_id not in top_n:
        print(f"No recommendations available for User {user_id}")
        return
    print(f"User {user_id} recommendations:")
    for movie_id, score in top_n[user_id]:
        movie_title = ratings_df.loc[ratings_df['movieId'] == movie_id, 'title'].values
        print(f"- {movie_title[0] if len(movie_title) > 0 else movie_id} (Score: {score:.2f})")

# Example explanation for a user
explain_recommendation(123, top_n, ratings_df)


Epoch 1/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 127ms/step - loss: 12.9067 - mae: 3.0387 - val_loss: 4.9650 - val_mae: 1.8916
Epoch 2/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 106ms/step - loss: 3.5224 - mae: 1.3307 - val_loss: 1.3583 - val_mae: 0.7941
Epoch 3/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m22s[0m 101ms/step - loss: 1.1761 - mae: 0.8426 - val_loss: 0.7163 - val_mae: 0.7561
Epoch 4/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 108ms/step - loss: 0.6803 - mae: 0.7827 - val_loss: 0.5637 - val_mae: 0.7479
Epoch 5/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m24s[0m 110ms/step - loss: 0.5412 - mae: 0.7490 - val_loss: 0.5218 - val_mae: 0.7548
Epoch 6/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m23s[0m 105ms/step - loss: 0.4845 - mae: 0.7276 - val_loss: 0.5069 - val_mae: 0.7680
Epoch 7/20
[1m219/219[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1

In [16]:
# 🚀 Print Summary of Results
print("\n🎯 **Final Model Evaluation Summary** 🎯")
# print(f"Test MSE: {mse:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test MAE: {mae:.4f}")
print(f"R² Score: {r2:.4f}")


# Compute the average predicted rating
avg_pred_rating = np.mean(predicted_ratings)
print(f"\n📊 **Average Predicted Rating:** {avg_pred_rating:.2f}")



🎯 **Final Model Evaluation Summary** 🎯
Test RMSE: 0.8711
Test MAE: 0.6775
R² Score: 0.0904

📊 **Average Predicted Rating:** 3.10


In [19]:
#Check my ratings vs predicted ratings

# Initialize BigQuery client
PROJECT_ID = "film-wizard-453315"
DATASET_ID = "Grouplens"
client = bigquery.Client(project=PROJECT_ID)

# 🚀 **STEP 2: TEST THE MODEL ON SEPARATE IMDb CSV**

# Load IMDb Ratings CSV
my_csv = '/Users/adamdyerson/Downloads/IMDB My Ratings.csv'
your_ratings = pd.read_csv(my_csv)

# Rename columns to match expected format (if necessary)
your_ratings.rename(columns={'Your Rating': 'rating', 'Const': 'imdbId'}, inplace=True)

# Fetch `movies_with_imdb` to map IMDb to MovieLens IDs
query_movies = '''
SELECT movieId, title, imdbId, tmdbId 
FROM `film-wizard-453315.Grouplens.movies_with_imdb`
'''
movies_with_imdb = client.query(query_movies).to_dataframe()

# Ensure IMDb IDs are formatted properly
your_ratings['imdbId'] = your_ratings['imdbId'].astype(str).str.zfill(7)
movies_with_imdb['imdbId'] = movies_with_imdb['imdbId'].astype(str).str.zfill(7)

# Match IMDb Ratings to MovieLens IDs (Only for Testing)
test_ratings = your_ratings.merge(movies_with_imdb, on="imdbId", how="inner")
test_ratings = test_ratings[['movieId', 'title', 'rating']]

# Convert IMDb Test Data to Model Format
test_ratings['user_idx'] = 0  # Dummy user ID
test_ratings['movie_idx'] = test_ratings['movieId'].map(movie_to_idx)

# Remove any movies not in training data
test_ratings = test_ratings.dropna(subset=['movie_idx'])

# Ensure test set is not empty before making predictions
if test_ratings.empty:
    print("⚠️ No movies found in the test dataset after filtering. Check IMDb ID matching.")
else:
    # Make Predictions for IMDb Movies
    X_test = [test_ratings['user_idx'].values, test_ratings['movie_idx'].values]
    predicted_ratings = model.predict(X_test, batch_size=1024).flatten()

    # Attach Predictions & Sort
    test_ratings['predicted_rating'] = predicted_ratings
    test_ratings = test_ratings.sort_values(by='predicted_rating', ascending=False)

    print("🎬 **Predicted Ratings for Your IMDb Movies** 🎬")
    print(test_ratings[['title', 'rating', 'predicted_rating']])

⚠️ No movies found in the test dataset after filtering. Check IMDb ID matching.


In [21]:
print("Your Ratings IMDb IDs:", your_ratings['imdbId'].unique()[:10])
print("Movies with IMDb IDs:", movies_with_imdb['imdbId'].unique()[:10])


Your Ratings IMDb IDs: ['tt7671070' 'tt0118715' 'tt0034583' 'tt29940008' 'tt7203552' 'tt11691774'
 'tt9051908' 'tt0095765' 'tt13406094' 'tt3470600']
Movies with IMDb IDs: ['5504168' '6268930' '0371501' '6917242' '11898442' '6756498' '0986232'
 '1754898' '0000192' '0352691']


In [24]:


# Initialize BigQuery client
PROJECT_ID = "film-wizard-453315"
DATASET_ID = "Grouplens"
client = bigquery.Client(project=PROJECT_ID)

# 🚀 **STEP 2: TEST THE MODEL ON SEPARATE IMDb CSV**

# Load IMDb Ratings CSV
my_csv = '/Users/adamdyerson/Downloads/IMDB My Ratings.csv'
your_ratings = pd.read_csv(my_csv)

# Rename columns to match expected format (if necessary)
your_ratings.rename(columns={'Your Rating': 'rating', 'Const': 'imdbId'}, inplace=True)

# Fetch `movies_with_imdb` to map IMDb to MovieLens IDs
query_movies = '''
SELECT movieId, title, imdbId, tmdbId 
FROM `film-wizard-453315.Grouplens.movies_with_imdb`
'''
movies_with_imdb = client.query(query_movies).to_dataframe()

# Ensure IMDb IDs are formatted properly
your_ratings['imdbId'] = your_ratings['imdbId'].astype(str).str.zfill(7).str.replace('tt', '')
movies_with_imdb['imdbId'] = movies_with_imdb['imdbId'].astype(str).str.zfill(7)

# Match IMDb Ratings to MovieLens IDs (Only for Testing)
test_ratings = your_ratings.merge(movies_with_imdb, on="imdbId", how="inner")
test_ratings = test_ratings[['movieId', 'title', 'rating']]

# Convert IMDb ratings from 10-point scale to 5-point scale
test_ratings['rating'] = test_ratings['rating'] / 2

# Convert IMDb Test Data to Model Format
test_ratings['user_idx'] = 0  # Dummy user ID
test_ratings['movie_idx'] = test_ratings['movieId'].map(movie_to_idx)

# Remove any movies not in training data
test_ratings = test_ratings.dropna(subset=['movie_idx'])

# Ensure test set is not empty before making predictions
if test_ratings.empty:
    print("⚠️ No movies found in the test dataset after filtering. Check IMDb ID matching.")
else:
    # Make Predictions for IMDb Movies
    X_test = [test_ratings['user_idx'].values, test_ratings['movie_idx'].values]
    predicted_ratings = model.predict(X_test, batch_size=1024).flatten()

    # Attach Predictions & Sort
    test_ratings['predicted_rating'] = predicted_ratings
    test_ratings = test_ratings.sort_values(by='predicted_rating', ascending=False)

    print("🎬 **Predicted Ratings for Your IMDb Movies** 🎬")
    print(test_ratings[['title', 'rating', 'predicted_rating']])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 13ms/step
🎬 **Predicted Ratings for Your IMDb Movies** 🎬
                                                title  rating  \
35  For a Few Dollars More (Per qualche dollaro in...     4.0   
99                    One Flew Over the Cuckoo's Nest     5.0   
6                           Shawshank Redemption, The     5.0   
61                                     Godfather, The     5.0   
12                                         Goodfellas     5.0   
..                                                ...     ...   
13                               Mary Poppins Returns     3.5   
28                                           The Game     2.5   
80                                         Uncle Buck     4.0   
52                                     Coogan's Bluff     3.0   
37                                      Sudden Impact     3.5   

    predicted_rating  
35          4.007880  
99          4.000143  
6           3.984360  
61      

In [34]:
# 🚀 **STEP 3: RECOMMEND NEW MOVIES BASED ON SIMILAR USERS**

# Fetch user-movie interactions from the dataset
query_ratings = '''
SELECT userId, movieId, rating 
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
'''
ratings_data = client.query(query_ratings).to_dataframe()

# Identify similar users based on past ratings
similar_users = ratings_data[ratings_data['movieId'].isin(test_ratings['movieId'])]
similar_users = similar_users.groupby('userId').filter(lambda x: len(x) > 6)  # Reduce threshold to find more similar users  # Users who rated at least 5 of the same movies

# Get movie recommendations from these users
recommended_movies = ratings_data[ratings_data['userId'].isin(similar_users['userId'])]
recommended_movies = recommended_movies.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).reset_index()

# Merge with movie titles
recommended_movies = recommended_movies.merge(movies_with_imdb, on='movieId', how='left')
recommended_movies = recommended_movies[['title', 'rating', 'userId']]

# Filter out low-rated recommendations
#This gives rating that similar user gave it, not IMDB rating
recommended_movies = recommended_movies[recommended_movies['rating'] >= 4].sort_values(by='rating', ascending=False).head(10)

# Add explanation for recommendations
recommended_movies.rename(columns={'userId': 'num_users_rated'}, inplace=True)
recommended_movies['reason'] = recommended_movies.apply(lambda row: f"Rated highly ({row['rating']:.2f}) by {row['num_users_rated']} users similar to you.", axis=1)

print("🎥 **Recommended Movies Based on Similar Users** 🎥")
print(recommended_movies[['title', 'rating', 'reason']])


🎥 **Recommended Movies Based on Similar Users** 🎥
                                                   title  rating  \
57824                                     Carry on Jatta     5.0   
72674                                   Peg of Old Drury     5.0   
29274                                    The Unfaithfuls     5.0   
72612                                 ¡Se armó el belén!     5.0   
29240                                Jailbait Babysitter     5.0   
29235                                        Blue Summer     5.0   
72650  Music with Roots in the Aether: Opera for Tele...     5.0   
62980  The Incredible Adventure of Jojo (And His Anno...     5.0   
62871                                           Ciao Ni!     5.0   
64207                           AC/DC: Live At Donington     5.0   

                                               reason  
57824  Rated highly (5.00) by 2 users similar to you.  
72674  Rated highly (5.00) by 1 users similar to you.  
29274  Rated highly (5.00) by 1 u