In [1]:
!pip install surprise google-cloud-bigquery
!pip install google-cloud-bigquery google-auth google-auth-oauthlib google-auth-httplib2


Collecting google-auth-httplib2
  Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl.metadata (2.2 kB)


Collecting httplib2>=0.19.0 (from google-auth-httplib2)
  Downloading httplib2-0.22.0-py3-none-any.whl.metadata (2.6 kB)
Downloading google_auth_httplib2-0.2.0-py2.py3-none-any.whl (9.3 kB)
Downloading httplib2-0.22.0-py3-none-any.whl (96 kB)
Installing collected packages: httplib2, google-auth-httplib2
Successfully installed google-auth-httplib2-0.2.0 httplib2-0.22.0


In [4]:
import os
from google.cloud import bigquery

# Initialize BigQuery client
client = bigquery.Client()
print("Authenticated successfully!")


Authenticated successfully!


In [5]:


# Import necessary libraries
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow import keras
from google.cloud import bigquery
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, precision_score, recall_score
from surprise import Dataset, Reader

# Initialize BigQuery client
PROJECT_ID = "film-wizard-453315"
DATASET_ID = "Grouplens"
client = bigquery.Client(project=PROJECT_ID)

# Fetch data in batches from BigQuery to avoid memory issues
# Scalability: This prevents excessive memory usage

#Increase data size here
def fetch_data(batch_size=1_000_000):
    query = f'''
    SELECT userId, movieId, rating
    FROM `{PROJECT_ID}.{DATASET_ID}.raw_grouplens_ratings`
    LIMIT {batch_size}
    '''
    return client.query(query).to_dataframe()

# Load data
ratings_df = fetch_data()
reader = Reader(rating_scale=(0.5, 5.0))
data = Dataset.load_from_df(ratings_df[['userId', 'movieId', 'rating']], reader)

# Create User & Movie ID mappings
user_to_idx = {user: i for i, user in enumerate(ratings_df['userId'].unique())}
movie_to_idx = {movie: i for i, movie in enumerate(ratings_df['movieId'].unique())}

ratings_df['user_idx'] = ratings_df['userId'].map(user_to_idx)
ratings_df['movie_idx'] = ratings_df['movieId'].map(movie_to_idx)

# Define Data Generator for Training

def data_generator(data, batch_size=1024):
    while True:
        for i in range(0, len(data), batch_size):
            batch = data.iloc[i:i+batch_size]
            X_batch = (batch['user_idx'].values, batch['movie_idx'].values)
            y_batch = batch['rating'].values
            yield X_batch, y_batch  # Ensure correct tuple format

# Define Deep Learning Model
num_users = len(user_to_idx)
num_movies = len(movie_to_idx)
embedding_size = 64  # Increased for better learning

user_input = keras.Input(shape=(1,))
movie_input = keras.Input(shape=(1,))

# Added L2 regularization to embeddings
user_embedding = keras.layers.Embedding(input_dim=num_users, output_dim=embedding_size,
                                        embeddings_regularizer=keras.regularizers.l2(0.001))(user_input)
movie_embedding = keras.layers.Embedding(input_dim=num_movies, output_dim=embedding_size,
                                         embeddings_regularizer=keras.regularizers.l2(0.001))(movie_input)

user_vec = keras.layers.Flatten()(user_embedding)
movie_vec = keras.layers.Flatten()(movie_embedding)

concat = keras.layers.Concatenate()([user_vec, movie_vec])
dense1 = keras.layers.Dense(128, activation='relu')(concat)
dropout1 = keras.layers.Dropout(0.4)(dense1)  # Increased dropout for regularization
dense2 = keras.layers.Dense(64, activation='relu')(dropout1)
dropout2 = keras.layers.Dropout(0.4)(dense2)  # Increased dropout for better generalization
output = keras.layers.Dense(1, activation='linear')(dropout2)

model = keras.Model(inputs=[user_input, movie_input], outputs=output)
model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001), loss=tf.keras.losses.Huber(), metrics=['mae'])  # Lower learning rate

# Add Early Stopping
early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=3, restore_best_weights=True)

# Train Model with Batch Processing
batch_size = 1024
train_steps = len(ratings_df) // batch_size

# Split data into training (80%) and validation (20%)
train_data = ratings_df.sample(frac=0.8, random_state=42)
val_data = ratings_df.drop(train_data.index)

model.fit(
    data_generator(train_data, batch_size),
    steps_per_epoch=len(train_data) // batch_size,
    validation_data=data_generator(val_data, batch_size),
    validation_steps=len(val_data) // batch_size,
    epochs=20,
    callbacks=[early_stopping]
)






Epoch 1/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 35ms/step - loss: 1.9188 - mae: 1.3543 - val_loss: 0.5418 - val_mae: 0.7854
Epoch 2/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m26s[0m 33ms/step - loss: 0.6014 - mae: 0.8691 - val_loss: 0.5215 - val_mae: 0.7754
Epoch 3/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0m 36ms/step - loss: 0.5629 - mae: 0.8296 - val_loss: 0.5068 - val_mae: 0.7623
Epoch 4/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m27s[0m 35ms/step - loss: 0.5385 - mae: 0.8027 - val_loss: 0.4933 - val_mae: 0.7455
Epoch 5/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - loss: 0.5179 - mae: 0.7811 - val_loss: 0.4788 - val_mae: 0.7355
Epoch 6/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 37ms/step - loss: 0.4997 - mae: 0.7641 - val_loss: 0.4876 - val_mae: 0.7448
Epoch 7/20
[1m781/781[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m28s[0

<keras.src.callbacks.history.History at 0x322a39b40>

In [8]:
from collections import defaultdict


def get_top_n(predictions, n=10):
    top_n = defaultdict(list)
    for uid, iid, true_r, est, _ in predictions:
        top_n[uid].append((iid, est))
    for uid, user_ratings in top_n.items():
        user_ratings.sort(key=lambda x: x[1], reverse=True)
        top_n[uid] = user_ratings[:n]
    return top_n

# Generate predictions using batch processing
val_users = val_data['user_idx'].values
val_movies = val_data['movie_idx'].values

# Predict in batches to reduce excessive print logs
predicted_ratings = model.predict([val_users, val_movies], batch_size=1024).flatten()

# Ensure y_pred has the correct shape
y_true = val_data['rating'].values
y_pred = predicted_ratings.reshape(-1)  # Flatten in case of incorrect shape

rmse = np.sqrt(mean_squared_error(y_true, y_pred))  # Compute RMSE manually
mae = mean_absolute_error(y_true, y_pred)
r2 = r2_score(y_true, y_pred)

# Combine predictions with actual values
predictions = list(zip(val_data['userId'], val_data['movieId'], val_data['rating'], y_pred, [None]*len(val_data)))

# Compute top recommendations
top_n = get_top_n(predictions, n=10)

# Compute diversity: Measure how varied recommendations are
unique_movies = set()
for uid, user_ratings in top_n.items():
    unique_movies.update([iid for iid, _ in user_ratings])
diversity_score = len(unique_movies) / len(set(ratings_df['movieId']))

avg_pred_rating = np.mean(predicted_ratings)
print(f"\n📊 **Average Predicted Rating:** {avg_pred_rating:.2f}")
# Print evaluation results
print(f"RMSE: {rmse:.4f}")
print(f"MAE: {mae:.4f}")
print(f"R2 Score: {r2:.4f}")
print(f"Diversity Score: {diversity_score:.4f}")

[1m196/196[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 682us/step

📊 **Average Predicted Rating:** 3.52
RMSE: 0.9643
MAE: 0.7387
R2 Score: 0.2536
Diversity Score: 0.4803


In [22]:


# Initialize BigQuery client
PROJECT_ID = "film-wizard-453315"
DATASET_ID = "Grouplens"
client = bigquery.Client(project=PROJECT_ID)

# 🚀 **STEP 2: TEST THE MODEL ON SEPARATE IMDb CSV**

# Load IMDb Ratings CSV
my_csv = '/Users/adamdyerson/Downloads/test_movie_data_2.csv'
your_ratings = pd.read_csv(my_csv)

# Rename columns to match expected format (if necessary)
your_ratings.rename(columns={'Your Rating': 'rating', 'Const': 'imdbId'}, inplace=True)

# Fetch `movies_with_imdb` to map IMDb to MovieLens IDs
query_movies = '''
SELECT movieId, title, imdbId, tmdbId
FROM `film-wizard-453315.Grouplens.movies_with_imdb`
'''
movies_with_imdb = client.query(query_movies).to_dataframe()

# Ensure IMDb IDs are formatted properly
your_ratings['imdbId'] = your_ratings['imdbId'].astype(str).str.zfill(7).str.replace('tt', '')
movies_with_imdb['imdbId'] = movies_with_imdb['imdbId'].astype(str).str.zfill(7)

# Match IMDb Ratings to MovieLens IDs (Only for Testing)
test_ratings = your_ratings.merge(movies_with_imdb, on="imdbId", how="inner")
test_ratings = test_ratings[['movieId', 'title', 'rating']]

# Convert IMDb ratings from 10-point scale to 5-point scale
# test_ratings['rating'] = test_ratings['rating']

# Convert IMDb Test Data to Model Format
test_ratings['user_idx'] = 0  # Dummy user ID
test_ratings['movie_idx'] = test_ratings['movieId'].map(movie_to_idx)

# Remove any movies not in training data
test_ratings = test_ratings.dropna(subset=['movie_idx'])

# Ensure test set is not empty before making predictions
if test_ratings.empty:
    print("⚠️ No movies found in the test dataset after filtering. Check IMDb ID matching.")
else:
    # Make Predictions for IMDb Movies
    X_test = [test_ratings['user_idx'].values, test_ratings['movie_idx'].values]
    predicted_ratings = model.predict(X_test, batch_size=1024).flatten()

    # Attach Predictions & Sort
    test_ratings['predicted_rating'] = predicted_ratings
    test_ratings = test_ratings.sort_values(by='predicted_rating', ascending=False)

    print("🎬 **Predicted Ratings for Your IMDb Movies** 🎬")
    print(test_ratings[['title', 'rating', 'predicted_rating']])


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 77ms/step
🎬 **Predicted Ratings for Your IMDb Movies** 🎬
                                                title  rating  \
10                                There Will Be Blood     4.5   
0   Good, the Bad and the Ugly, The (Buono, il bru...     4.5   
11          Three Billboards Outside Ebbing, Missouri     4.5   
7                             Searching for Sugar Man     3.0   
1                                   The Hateful Eight     4.0   
4                                  King's Speech, The     4.5   
2                                  The Imitation Game     4.0   
3                             Kids Are All Right, The     4.0   
8                                               Senna     3.5   
5                                     Minority Report     3.0   
12                                          Yesterday     2.5   
6                                           Nomadland     4.0   
9                                   

In [32]:
# #Unweighted Suggestions gives obscure suggestions

# # 🚀 **STEP 3: RECOMMEND NEW MOVIES BASED ON SIMILAR USERS**

# # Fetch user-movie interactions from the dataset
# query_ratings = '''
# SELECT userId, movieId, rating
# FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
# '''
# ratings_data = client.query(query_ratings).to_dataframe()

# # Identify similar users based on past ratings
# similar_users = ratings_data[ratings_data['movieId'].isin(test_ratings['movieId'])]
# similar_users = similar_users.groupby('userId').filter(lambda x: len(x) > 3)  # Reduce threshold to find more similar users  # Users who rated at least 3 of the same movies

# # Get movie recommendations from these users
# recommended_movies = ratings_data[ratings_data['userId'].isin(similar_users['userId'])]
# recommended_movies = recommended_movies.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).reset_index()

# # Merge with movie titles
# recommended_movies = recommended_movies.merge(movies_with_imdb, on='movieId', how='left')
# recommended_movies = recommended_movies[['title', 'rating', 'userId']].sort_values(by='rating', ascending=False).head(10)

# # Add explanation for recommendations
# recommended_movies.rename(columns={'userId': 'num_users_rated'}, inplace=True)
# recommended_movies['reason'] = recommended_movies.apply(lambda row: f"Rated highly ({row['rating']:.2f}) by {row['num_users_rated']} users similar to you.", axis=1)

# print("🎥 **Recommended Movies Based on Similar Users** 🎥")
# print(recommended_movies[['title', 'rating', 'reason']])


In [28]:
#Blend of suggestions of 5 popular and 5 lesser known based on user profile

# Fetch user-movie interactions from the dataset
query_ratings = '''
SELECT userId, movieId, rating
FROM `film-wizard-453315.Grouplens.raw_grouplens_ratings`
'''
ratings_data = client.query(query_ratings).to_dataframe()

# Identify similar users based on past ratings
similar_users = ratings_data[ratings_data['movieId'].isin(test_ratings['movieId'])]
similar_users = similar_users.groupby('userId').filter(lambda x: len(x) > 5)  # Require 5+ overlapping movies

# Get movie recommendations from these users
recommended_movies = ratings_data[ratings_data['userId'].isin(similar_users['userId'])]
recommended_movies = recommended_movies.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).reset_index()

# Set a threshold to filter out movies with very few ratings
recommended_movies = recommended_movies[recommended_movies['userId'] >= 10]  # Only consider movies rated by 10+ users

# Merge with movie titles
recommended_movies = recommended_movies.merge(movies_with_imdb, on='movieId', how='left')
recommended_movies.rename(columns={'userId': 'num_users_rated'}, inplace=True)

#weighted ranking to affect popular movie suggestions
recommended_movies['adjusted_score'] = (
    recommended_movies['rating'] * (np.log(1 + recommended_movies['num_users_rated']) ** 0.2)  # Square root dampens the effect
)
recommended_movies = recommended_movies.sort_values(by='adjusted_score', ascending=False)

# Select top 7 highest-ranked movies
top_ranked = recommended_movies.head(7)

# Select 3 diverse, random movies from the remaining pool
# Filter movies with a rating > 3.5 and exclude the top 7 ranked movies
exploratory_pool = recommended_movies.iloc[7:]  # Exclude top 7
exploratory_pool = exploratory_pool[exploratory_pool['rating'] > 3.5]  # Only movies with rating > 3.5

# Ensure we have at least 3 movies to sample from
if len(exploratory_pool) >= 3:
    exploratory = exploratory_pool.sample(n=3, random_state=42)
else:
    exploratory = exploratory_pool  # Take whatever is available if fewer than 3 remain

# Merge them to create the final recommendation list (before adding popular movies)
final_recommendations = pd.concat([top_ranked, exploratory]).drop_duplicates()

# Get popular movies separately
popular_movies = ratings_data.groupby('movieId').agg({'rating': 'mean', 'userId': 'count'}).reset_index()
popular_movies = popular_movies[popular_movies['userId'] > 3]  # Movies rated by at least 3 users
popular_movies = popular_movies.sort_values(by='rating', ascending=False).head(5)

# Merge popular movies with metadata
popular_movies = popular_movies.merge(movies_with_imdb, on='movieId', how='left')

# Now, blend popular movies with recommendations without overriding diversity
final_recommendations = pd.concat([final_recommendations, popular_movies]).drop_duplicates(subset=['title']).head(10)

# Add explanation for recommendations
final_recommendations['reason'] = final_recommendations.apply(
    lambda row: f"Rated highly ({row['rating']:.2f}) by {row.get('num_users_rated', 'N/A')} users similar to you.",
    axis=1
)

# Display final recommendations
print("🎥 **Recommended Movies Based on Similar Users** 🎥")
print(final_recommendations[['title', 'rating', 'reason']])


🎥 **Recommended Movies Based on Similar Users** 🎥
                                                title    rating  \
260                                      Pulp Fiction  4.376303   
281                         Shawshank Redemption, The  4.360481   
695                                    Godfather, The  4.345608   
966                                      12 Angry Men  4.317167   
2449                                       Fight Club  4.269252   
18224                                Band of Brothers  4.406667   
17423                                    Planet Earth  4.399561   
8559          Bittersweet Life, A (Dalkomhan insaeng)  3.657371   
6017   Lord of the Rings: The Return of the King, The  4.065732   
11705                                     Toy Story 3  3.785892   

                                                  reason  
260    Rated highly (4.38) by 3646 users similar to you.  
281    Rated highly (4.36) by 3573 users similar to you.  
695    Rated highly (4.35) by 3313 