# ANN Model: Z-score

# Claude Improvements and Corrections:

In [13]:
import pandas as pd
import numpy as np
import time
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score
from google.cloud import bigquery
from scipy.sparse import csr_matrix, coo_matrix
import faiss  # Approximate Nearest Neighbors

# Initialize BigQuery client
PROJECT_ID = "film-wizard-453315"
DATASET_ID = "Grouplens"
client = bigquery.Client(project=PROJECT_ID)

# Fetch data in batches from BigQuery
def fetch_data(batch_size=100000, max_rows=2000000):
    try:
        offset = 0
        all_data = []
        
        while offset < max_rows:
            query = f'''
            SELECT userId, movieId, rating 
            FROM `{PROJECT_ID}.{DATASET_ID}.raw_grouplens_ratings`
            LIMIT {batch_size} OFFSET {offset}
            '''
            df = client.query(query).to_dataframe()
            if df.empty:
                break
                
            all_data.append(df)
            offset += batch_size
            # Only print every 100k rows
            if offset % 100000 == 0:
                print(f"Fetched {offset} rows")
            
        return pd.concat(all_data, ignore_index=True) if all_data else pd.DataFrame()
    except Exception as e:
        print(f"Error fetching data: {e}")
        return pd.DataFrame()

# Function to normalize ratings using Z-score (only on training data)
def normalize_ratings(df):
    user_means = df.groupby('userId')['rating'].mean()
    user_stds = df.groupby('userId')['rating'].std().fillna(1.0)  # Handle users with constant ratings
    
    # Merge means and stds back to df
    df = df.join(user_means.rename('user_mean'), on='userId')
    df = df.join(user_stds.rename('user_std'), on='userId')
    
    # Calculate z_score
    df['z_score'] = (df['rating'] - df['user_mean']) / df['user_std']
    
    return df

# Apply user means and stds from training set to test set
def apply_normalization(test_df, train_df):
    # Extract user stats from training data
    user_means = train_df.groupby('userId')['rating'].mean()
    user_stds = train_df.groupby('userId')['rating'].std().fillna(1.0)
    
    # Apply to test data
    test_df = test_df.join(user_means.rename('user_mean'), on='userId')
    test_df = test_df.join(user_stds.rename('user_std'), on='userId')
    
    # Handle users not in training set - fixing the warnings
    global_mean = train_df['rating'].mean()
    global_std = train_df['rating'].std()
    
    # Fix pandas warning by avoiding chained assignment
    test_df.loc[test_df['user_mean'].isna(), 'user_mean'] = global_mean
    test_df.loc[test_df['user_std'].isna(), 'user_std'] = global_std
    
    # Calculate z_score
    test_df['z_score'] = (test_df['rating'] - test_df['user_mean']) / test_df['user_std']
    
    return test_df

# Main process
def main():
    # Load data
    print("Fetching data...")
    try:
        data = fetch_data(batch_size=100000, max_rows=2000000)  # Amend data size as required
        if data.empty:
            print("No data retrieved. Exiting.")
            return
    except Exception as e:
        print(f"Error loading data: {e}")
        return
    
    print(f"Raw data: {len(data)} ratings")
    
    # Filter to keep only top users and popular movies
    user_counts = data['userId'].value_counts()
    movie_counts = data['movieId'].value_counts()
    
    top_users = user_counts[user_counts >= 5].index  # Reduced minimum ratings per user
    top_movies = movie_counts[movie_counts >= 10].index  # Reduced minimum ratings per movie
    
    data = data[data['userId'].isin(top_users)]
    data = data[data['movieId'].isin(top_movies)]
    
    print(f"Filtered data: {len(data)} ratings, {len(top_users)} users, {len(top_movies)} movies")
    
    # Train-test split (do this before normalization to prevent data leakage)
    train_data, test_data = train_test_split(data, test_size=0.2, random_state=42)
    
    # Normalize ratings using z-score (training data only)
    train_data = normalize_ratings(train_data)
    
    # Apply normalization from training data to test data
    test_data = apply_normalization(test_data, train_data)
    
    # Build sparse matrix for training data
    user_ids = sorted(train_data['userId'].unique())
    movie_ids = sorted(train_data['movieId'].unique())
    
    user_idx_map = {uid: idx for idx, uid in enumerate(user_ids)}
    movie_idx_map = {mid: idx for idx, mid in enumerate(movie_ids)}
    
    # Create sparse matrix
    row_idx = [user_idx_map[uid] for uid in train_data['userId']]
    col_idx = [movie_idx_map[mid] for mid in train_data['movieId']]
    ratings = train_data['z_score'].values
    
    user_movie_matrix = coo_matrix((ratings, (row_idx, col_idx)), 
                                   shape=(len(user_ids), len(movie_ids)))
    user_movie_csr = user_movie_matrix.tocsr()
    
    # Build FAISS HNSW index
    print("Building FAISS HNSW index...")
    n_users, n_movies = user_movie_csr.shape
    
    try:
        d = n_movies  # Dimensions = number of movies
        faiss_index = faiss.IndexHNSWFlat(d, 32)  # 32 links per node
        
        # Convert sparse to dense in batches to avoid memory issues
        batch_size = 1000  # Adjust based on available RAM
        for start_idx in range(0, n_users, batch_size):
            end_idx = min(start_idx + batch_size, n_users)
            batch = user_movie_csr[start_idx:end_idx].toarray().astype('float32')
            
            # Replace NaN values (FAISS doesn't handle NaN)
            batch = np.nan_to_num(batch, nan=0.0)
            
            faiss_index.add(batch)
            # Only print every 10k users
            if (end_idx - 1) % 10000 < batch_size and end_idx > 10000:
                print(f"Added users up to {end_idx-1}")
        
        print("FAISS indexing complete")
    except Exception as e:
        print(f"Error building FAISS index: {e}")
        return
        
    # Predict ratings for test set
    print("Predicting ratings for test set...")
    predictions = []
    
    try:
        # Group test data by user for more efficient processing
        user_groups = test_data.groupby('userId')
        processed_users = 0
        
        for user_id, group in user_groups:
            if processed_users % 1000 == 0:
                print(f"Processed {processed_users} users")
            processed_users += 1
            
            if user_id in user_idx_map:
                # Get user vector from CSR matrix
                user_idx = user_idx_map[user_id]
                user_vector = user_movie_csr[user_idx].toarray().astype('float32')
                
                # Handle zero vectors (users with no ratings)
                if user_vector.sum() == 0:
                    continue
                    
                # Replace NaN values
                user_vector = np.nan_to_num(user_vector, nan=0.0)
                
                # Find similar users
                try:
                    D, I = faiss_index.search(user_vector, k=50)  #50 nearest neighbors gave better results than 20
                except Exception as e:
                    print(f"FAISS search error for user {user_id}: {e}")
                    continue
                
                # Process each movie the user rated in test set. Different weights not yet tested.
                for _, row in group.iterrows():
                    movie_id = row['movieId']
                    if movie_id in movie_idx_map:
                        movie_idx = movie_idx_map[movie_id]
                        
                        # Get ratings from similar users for this movie
                        similar_ratings = []
                        similar_distances = []
                        for i, neighbor_idx in enumerate(I[0]):
                            if 0 <= neighbor_idx < n_users:  # Valid index
                                rating = user_movie_csr[neighbor_idx, movie_idx]
                                if rating != 0:  # Rating exists
                                    similar_ratings.append(float(rating))
                                    similar_distances.append(D[0][i])
                        
                        if similar_ratings:
                            # Fix weights sum to zero issue
                            weights = np.exp(-np.array(similar_distances))
                            
                            # Check if weights sum to zero and fix if needed
                            if weights.sum() == 0:
                                # Use a simple average instead
                                weighted_avg = np.mean(similar_ratings)
                            else:
                                # Use weighted average
                                weighted_avg = np.average(similar_ratings, weights=weights)
                            
                            # Ensure we have a finite value
                            if np.isfinite(weighted_avg) and np.isfinite(row['z_score']):
                                # Store prediction
                                predictions.append({
                                    'userId': user_id,
                                    'movieId': movie_id,
                                    'actual': float(row['z_score']),
                                    'predicted': float(weighted_avg)
                                })
        
        predictions_df = pd.DataFrame(predictions)
        
        # Evaluate model
        if not predictions_df.empty:
            # Check for and remove any remaining NaN values
            predictions_df = predictions_df.dropna(subset=['actual', 'predicted'])
            
            if len(predictions_df) > 0:
                rmse = np.sqrt(mean_squared_error(predictions_df['actual'], predictions_df['predicted']))
                mae = mean_absolute_error(predictions_df['actual'], predictions_df['predicted'])
                
                # Convert z-scores back to ratings for interpretability
                # Do the merges first and explicit column access to avoid NaN issues
                user_mean_df = predictions_df.merge(test_data[['userId', 'user_mean']], on='userId')
                user_std_df = predictions_df.merge(test_data[['userId', 'user_std']], on='userId')
                
                predictions_df['actual_rating'] = (predictions_df['actual'] * 
                                                  user_std_df['user_std'] + 
                                                  user_mean_df['user_mean'])
                
                predictions_df['predicted_rating'] = (predictions_df['predicted'] * 
                                                    user_std_df['user_std'] + 
                                                    user_mean_df['user_mean'])
                
                # Clip predictions to valid rating range and handle any NaNs
                predictions_df['predicted_rating'] = predictions_df['predicted_rating'].clip(1, 5)
                predictions_df = predictions_df.dropna(subset=['actual_rating', 'predicted_rating'])
                
                if len(predictions_df) > 0:
                    rmse_raw = np.sqrt(mean_squared_error(predictions_df['actual_rating'], 
                                                        predictions_df['predicted_rating']))
                    
                    print(f"\nResults:")
                    print(f"Z-Score RMSE: {rmse:.4f}")
                    print(f"Z-Score MAE: {mae:.4f}")
                    print(f"Rating RMSE: {rmse_raw:.4f}")
                    print(f"Number of predictions: {len(predictions_df)}")
                    
                    # Show a few sample predictions
                    print("\nSample predictions:")
                    sample = predictions_df.sample(5) if len(predictions_df) >= 5 else predictions_df
                    for _, row in sample.iterrows():
                        print(f"User {row['userId']}, Movie {row['movieId']}: Actual {row['actual_rating']:.2f}, Predicted {row['predicted_rating']:.2f}")
                else:
                    print("After cleaning NaN values, no valid predictions remain")
            else:
                print("After cleaning NaN values, no valid predictions remain")
        else:
            print("No predictions were generated.")
    except Exception as e:
        print(f"Error during prediction: {e}")
        import traceback
        traceback.print_exc()

if __name__ == "__main__":
    start_time = time.time()
    main()
    print(f"Total execution time: {time.time() - start_time:.2f} seconds")

Fetching data...
Fetched 100000 rows
Fetched 200000 rows
Fetched 300000 rows
Fetched 400000 rows
Fetched 500000 rows
Fetched 600000 rows
Fetched 700000 rows
Fetched 800000 rows
Fetched 900000 rows
Fetched 1000000 rows
Fetched 1100000 rows
Fetched 1200000 rows
Fetched 1300000 rows
Fetched 1400000 rows
Fetched 1500000 rows
Fetched 1600000 rows
Fetched 1700000 rows
Fetched 1800000 rows
Fetched 1900000 rows
Fetched 2000000 rows
Raw data: 2000000 ratings
Filtered data: 1758095 ratings, 102658 users, 10418 movies
Building FAISS HNSW index...
Added users up to 10999
Added users up to 20999
Added users up to 30999
Added users up to 40999
Added users up to 50999
Added users up to 60999
Added users up to 70999
Added users up to 80999
Added users up to 90999
Added users up to 100999
FAISS indexing complete
Predicting ratings for test set...
Processed 0 users
Processed 1000 users
Processed 2000 users
Processed 3000 users
Processed 4000 users
Processed 5000 users
Processed 6000 users
Processed 7000

In [19]:
user_ids = sorted(train_data['userId'].unique())
movie_ids = sorted(train_data['movieId'].unique())
user_idx_map = {uid: idx for idx, uid in enumerate(user_ids)}
movie_idx_map = {mid: idx for idx, mid in enumerate(movie_ids)}
n_movies = len(movie_ids)  # Add this line for n_movies
n_users = len(user_ids) 

In [21]:
# Create the user-movie matrix (CSR format)
# First create the COO matrix
row_idx = [user_idx_map[uid] for uid in train_data['userId']]
col_idx = [movie_idx_map[mid] for mid in train_data['movieId']]
ratings = train_data['z_score'].values  # Use z-scores for better prediction

user_movie_matrix = coo_matrix((ratings, (row_idx, col_idx)), 
                               shape=(len(user_ids), len(movie_ids)))
user_movie_csr = user_movie_matrix.tocsr()

# Also need to initialize FAISS index
# This is a simplified version - in the original code it's more complex
d = n_movies  # Dimensions = number of movies
faiss_index = faiss.IndexHNSWFlat(d, 32)  # 32 links per node

# Convert sparse to dense for FAISS indexing
# In a real implementation, you'd do this in batches to avoid memory issues
user_vectors = user_movie_csr.toarray().astype('float32')
user_vectors = np.nan_to_num(user_vectors, nan=0.0)  # Replace NaN values
faiss_index.add(user_vectors)  # Add vectors to index

In [22]:
#Testing on my csv which I can add to BQ later

my_csv = '/Users/adamdyerson/Downloads/IMDB My Ratings.csv'
user_ratings = pd.read_csv(my_csv)
user_ratings.rename(columns={'Your Rating': 'rating', 'Const': 'imdbId'}, inplace=True)
user_ratings['rating'] = user_ratings['rating'] / 2  # Convert 10-point to 5-point scale

# Fetch movies mapping IMDb to MovieLens IDs
query_movies = '''
SELECT movieId, title, imdbId, tmdbId 
FROM `film-wizard-453315.Grouplens.movies_with_imdb`
'''
movies_with_imdb = client.query(query_movies).to_dataframe()

# Normalize IMDb IDs for consistent matching
user_ratings['imdbId'] = user_ratings['imdbId'].str.replace('tt', '').astype(str).str.zfill(7)
movies_with_imdb['imdbId'] = movies_with_imdb['imdbId'].astype(str).str.zfill(7)

# Merge your ratings with the MovieLens IDs
test_ratings = user_ratings.merge(movies_with_imdb, on="imdbId", how="inner")
print(f"Found {len(test_ratings)} of your ratings in the MovieLens dataset")

# Calculate z-scores and add user mean/std for denormalization later
test_ratings['user_mean'] = test_ratings['rating'].mean()
test_ratings['user_std'] = test_ratings['rating'].std()
test_ratings['z_score'] = (test_ratings['rating'] - test_ratings['user_mean']) / test_ratings['user_std']

# Check if required variables exist
required_vars = ['movie_idx_map', 'movie_idx_map', 'user_movie_csr', 'n_movies', 'n_users', 'faiss_index']
for var in required_vars:
    if var not in globals():
        print(f"Error: Variable '{var}' is not defined. Run the main ANN code first.")
        # Uncomment this to stop execution if variables are missing
        # raise NameError(f"Variable '{var}' is not defined")

# Generate predictions for your movies
personal_predictions = []
for _, row in test_ratings.iterrows():
    movie_id = row['movieId']
    if movie_id in movie_idx_map:
        movie_idx = movie_idx_map[movie_id]
        
        # Get your normalized user vector (from your existing ratings)
        user_vector = np.zeros((1, n_movies), dtype=np.float32)
        
        # Create a user vector from your ratings
        for _, r in test_ratings.iterrows():
            if r['movieId'] in movie_idx_map:
                m_idx = movie_idx_map[r['movieId']]
                user_vector[0, m_idx] = r['z_score']
        
        # Replace NaN values
        user_vector = np.nan_to_num(user_vector, nan=0.0)
        
        # Find similar users excluding yourself (since you won't be in the training data)
        D, I = faiss_index.search(user_vector, k=30)  # Use more neighbors for better predictions
        
        # Get ratings from similar users for this movie
        similar_ratings = []
        similar_distances = []
        for i, neighbor_idx in enumerate(I[0]):
            if 0 <= neighbor_idx < n_users:  # Valid index
                rating = user_movie_csr[neighbor_idx, movie_idx]
                if rating != 0:  # Rating exists
                    similar_ratings.append(float(rating))
                    similar_distances.append(D[0][i])
        
        if similar_ratings:
            # Fix weights sum to zero issue
            weights = np.exp(-np.array(similar_distances))
            
            # Check if weights sum to zero and fix if needed
            if weights.sum() == 0:
                # Use a simple average instead
                weighted_avg = np.mean(similar_ratings)
            else:
                # Use weighted average
                weighted_avg = np.average(similar_ratings, weights=weights)
            
            # Convert back to rating scale
            predicted_rating = (weighted_avg * row['user_std']) + row['user_mean']
            
            # Clip predictions to valid rating range
            predicted_rating = max(1, min(5, predicted_rating))
            
            personal_predictions.append({
                'movieId': movie_id,
                'title': row['title'],
                'actual': row['rating'],
                'predicted': predicted_rating,
                'difference': row['rating'] - predicted_rating
            })

# Create DataFrame from predictions
personal_df = pd.DataFrame(personal_predictions)

# Calculate metrics
if len(personal_df) > 0:
    rmse = np.sqrt(mean_squared_error(personal_df['actual'], personal_df['predicted']))
    mae = mean_absolute_error(personal_df['actual'], personal_df['predicted'])
    
    print(f"\nYour Personal Recommendation Results:")
    print(f"RMSE: {rmse:.4f}")
    print(f"MAE: {mae:.4f}")
    print(f"Number of predictions: {len(personal_df)}")
    
    # Sort by largest discrepancies
    personal_df['abs_diff'] = personal_df['difference'].abs()
    personal_df = personal_df.sort_values('abs_diff', ascending=False)
    
    # Display the top matches and mismatches
    print("\nBiggest Rating Differences (Movies you might have unusual taste in):")
    print(personal_df.head(5)[['title', 'actual', 'predicted', 'difference']])
    
    print("\nClosest Rating Matches (Movies the system understands your taste in):")
    print(personal_df.sort_values('abs_diff').head(5)[['title', 'actual', 'predicted', 'difference']])
else:
    print("No movies from your ratings were found in the dataset")
# The key addition I made is checking if the required variables exist before proceeding with the code. You need to run this code snippet after running the main ANN model code where these variables are defined.RetryClaude can make mistakes. Please double-check responses.

Found 108 of your ratings in the MovieLens dataset

Your Personal Recommendation Results:
RMSE: 0.5060
MAE: 0.3671
Number of predictions: 12

Biggest Rating Differences (Movies you might have unusual taste in):
                        title  actual  predicted  difference
6    Wolf of Wall Street, The     4.5   3.131709    1.368291
0                  Casablanca     5.0   4.307172    0.692828
1   Shawshank Redemption, The     5.0   4.470659    0.529341
8               Departed, The     4.0   4.353638   -0.353638
11               Garden State     2.5   2.837544   -0.337544

Closest Rating Matches (Movies the system understands your taste in):
                                               title  actual  predicted  \
9                                    American Beauty     5.0   4.941948   
5   Birdman: Or (The Unexpected Virtue of Ignorance)     4.0   3.875277   
10                                       Frost/Nixon     4.0   4.156893   
4                                 The Imitation Game

In [25]:
def get_movie_recommendations(test_ratings, n_recommendations=5):
    """
    Recommend new movies based on similar users' preferences
    
    Parameters:
        test_ratings: DataFrame with your movie ratings
        n_recommendations: Number of recommendations to return
        
    Returns:
        DataFrame with recommended movies
    """
    # Create your user vector from existing ratings
    user_vector = np.zeros((1, n_movies), dtype=np.float32)
    
    # Fill in the user vector with your ratings (z-scores)
    for _, row in test_ratings.iterrows():
        if row['movieId'] in movie_idx_map:
            movie_idx = movie_idx_map[row['movieId']]
            user_vector[0, movie_idx] = row['z_score']
    
    # Replace NaN values
    user_vector = np.nan_to_num(user_vector, nan=0.0)
    
    # Find similar users
    D, I = faiss_index.search(user_vector, k=100)  # Use more neighbors for better recommendations
    
    # Get the movies you've already rated
    rated_movie_ids = set(test_ratings['movieId'])
    
    # Collect candidate movies from similar users
    candidate_movies = {}
    
    # Create a simpler method to get movies rated highly by similar users
    for i, neighbor_idx in enumerate(I[0]):
        if 0 <= neighbor_idx < n_users:  # Valid index
            similarity = np.exp(-D[0][i])  # Convert distance to similarity
            
            # Get this user's original movie ratings
            for movie_id, movie_idx in movie_idx_map.items():
                if movie_id not in rated_movie_ids:  # Skip movies you've already rated
                    rating_z = user_movie_csr[neighbor_idx, movie_idx]
                    
                    if rating_z > 0.5:  # Only consider movies the user rated positively
                        if movie_id not in candidate_movies:
                            candidate_movies[movie_id] = {'count': 0, 'score_sum': 0, 'similarity_sum': 0}
                        
                        candidate_movies[movie_id]['count'] += 1
                        candidate_movies[movie_id]['score_sum'] += rating_z * similarity
                        candidate_movies[movie_id]['similarity_sum'] += similarity
    
    # Calculate scores for each candidate movie
    recommendations = []
    for movie_id, data in candidate_movies.items():
        if data['count'] >= 2:  # Require at least 3 similar users to recommend
            # Calculate weighted average score
            if data['similarity_sum'] > 0:
                score = data['score_sum'] / data['similarity_sum']
            else:
                score = data['score_sum'] / data['count']
            
            # Get movie details
            movie_title = "Unknown"
            
            # Try to get movie details from the movies dataset
            movie_data = movies_with_imdb[movies_with_imdb['movieId'] == movie_id]
            if not movie_data.empty:
                movie_title = movie_data.iloc[0]['title']
            
            recommendations.append({
                'movieId': movie_id,
                'title': movie_title,
                'score': score,
                'recommender_count': data['count']
            })
    
    # Sort by score and select top N
    recommendations_df = pd.DataFrame(recommendations)
    if not recommendations_df.empty:
        recommendations_df = recommendations_df.sort_values('score', ascending=False).head(n_recommendations)
        
    return recommendations_df

# Get 5 movie recommendations
recommendations = get_movie_recommendations(test_ratings, n_recommendations=5)

if not recommendations.empty:
    print("Top 5 Movie Recommendations for You:")
    for i, row in recommendations.iterrows():
        print(f"{i+1}. {row['title']} - Recommendation Score: {row['score']:.2f} (Recommended by {row['recommender_count']} similar users)")
else:
    print("Could not generate recommendations. Try increasing the number of similar users or reducing filtering criteria.")

Top 5 Movie Recommendations for You:
39. Pulp Fiction - Recommendation Score: 1.33 (Recommended by 3 similar users)
42. Alien - Recommendation Score: 1.12 (Recommended by 2 similar users)
26. Terminator 2: Judgment Day - Recommendation Score: 1.10 (Recommended by 3 similar users)
8. The Hunger Games: Catching Fire - Recommendation Score: 1.07 (Recommended by 2 similar users)
31. Highlander - Recommendation Score: 0.98 (Recommended by 2 similar users)


In [27]:
# Select just 3 random movies from your ratings
import random
minimal_ratings = test_ratings.sample(n=3)
print("Using only these 3 movies for recommendations:")
print(minimal_ratings[['title', 'rating']])

# Get recommendations based on just these 3 movies
# Modify this to match your actual function definition
minimal_recommendations = get_movie_recommendations(
    minimal_ratings, 
    n_recommendations=5
)

print("\nRecommendations based on only 3 movies:")
if not minimal_recommendations.empty:
    for i, row in minimal_recommendations.iterrows():
        print(f"{i+1}. {row['title']} - Score: {row['score']:.2f} (Recommended by {row['recommender_count']} similar users)")
else:
    print("Could not generate recommendations with just 3 movies. Try different movies or adjust parameters.")

Using only these 3 movies for recommendations:
                                                title  rating
4                                                Sing     4.0
58  Good, the Bad and the Ugly, The (Buono, il bru...     4.5
0                                   Big Lebowski, The     3.5

Recommendations based on only 3 movies:
21. American Beauty - Score: 1.39 (Recommended by 3 similar users)
43. Inside Man - Score: 1.16 (Recommended by 2 similar users)
28. Terminator 2: Judgment Day - Score: 1.09 (Recommended by 3 similar users)
24. Lord of the Rings: The Two Towers, The - Score: 1.08 (Recommended by 2 similar users)
17. Harry Potter and the Prisoner of Azkaban - Score: 1.06 (Recommended by 2 similar users)
