# User Features
In this notebook, we select the 10k users with the highest number of movie ratings (`top_netflix_users.json`) and create user-specific features (`user_profiles_{n}.pickle`): for each movie feature, the ratio of the user's high ratings to all of their ratings of movies with that feature (on a scale of 0 to 1).

Naturally, this dataset will be very sparse as many users have not rated any movies pertaining to a large set of movie features. Such values are imputed as -1 for geometric interpretability in downstream tasks.

**Note**: The resulting `user_profiles_{n}.pickle` file is too large for GitHub. To produce the file, please run this notebook as is.

In [1]:
import os
import json
import pickle
from collections import defaultdict
from typing import Dict, Set, List, Tuple
import heapq

# Root data directory; a relative path, so it resolves against the kernel's working directory.
DATA_PATH = "../data"
# The `netflix_prize` folder under DATA_PATH.
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")
# The `training_set` folder; the same location create_user_profiles scans for `mv_*` files
# when it is called with DATA_PATH.
USER_FOLDER_PATH = os.path.join(NETFLIX_FOLDER_PATH, "training_set")
# The `imdb` folder under DATA_PATH.
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")

# Selects which processed artifacts are loaded (`feature_mapping_{n}.pickle`,
# `movie_features_{n}.pickle`) and names the output (`user_profiles_{n}.pickle`).
# NOTE(review): presumably the minimum feature-occurrence threshold used when those
# artifacts were built upstream -- confirm against the notebook that produces them.
MIN_OCCURRENCES = 20

In [2]:
def load_data(data_path: str, min_occurrences: int = 5) -> Tuple[Dict, Dict, Set[str]]:
    """
    Read the three input artifacts needed to build user profiles.

    Args:
        data_path: Directory containing `netflix_to_imdb.json` and a `processed`
            sub-directory with the pickled artifacts.
        min_occurrences: Suffix used to pick `feature_mapping_{n}.pickle` and
            `movie_features_{n}.pickle`.

    Returns: feature_mapping, movie_features, netflix_movies_set
        The two unpickled objects (returned as stored) and the set of top-level
        keys of `netflix_to_imdb.json`.
    """
    processed_dir = os.path.join(data_path, 'processed')
    mapping_file = os.path.join(processed_dir, f'feature_mapping_{min_occurrences}.pickle')
    features_file = os.path.join(processed_dir, f'movie_features_{min_occurrences}.pickle')

    # Only the keys of the Netflix -> IMDB mapping are needed here.
    with open(os.path.join(data_path, 'netflix_to_imdb.json'), 'r') as json_handle:
        id_mapping = json.load(json_handle)
    netflix_ids = set(id_mapping.keys())

    # pickle.load can execute arbitrary code embedded in the file, so these paths
    # must only ever point at trusted, locally generated artifacts.
    with open(mapping_file, 'rb') as pickle_handle:
        mapping = pickle.load(pickle_handle)

    with open(features_file, 'rb') as pickle_handle:
        features_by_movie = pickle.load(pickle_handle)

    return mapping, features_by_movie, netflix_ids

def load_top_users(data_path: str) -> Set[str]:
    """Return the top-level keys of `top_netflix_users.json` (found under data_path) as a set."""
    users_file = os.path.join(data_path, 'top_netflix_users.json')
    with open(users_file, 'r') as handle:
        payload = json.load(handle)
    # Only the keys are kept; the values stored in the file are not used.
    return {user_id for user_id in payload.keys()}

class UserProfile:
    """Running tally of one user's high and low ratings, overall and per feature id."""

    def __init__(self, num_features: int):
        # Feature ids 0 .. num_features - 1 are the ones reported by calculate_preferences.
        self.num_features = num_features
        self.total_ratings = 0
        self.high_ratings = 0
        self.low_ratings = 0
        # Feature ids that appeared in at least one rated movie.
        self.seen_features = set()
        # feature id -> {"high": count, "low": count}
        self.feature_ratings = defaultdict(lambda: {"high": 0, "low": 0})

    def update(self, features: List[int], rating: int):
        """Record one rated movie: bump the overall counters and every listed feature."""
        self.total_ratings += 1
        # A rating of 4 or more counts as "high"; anything below counts as "low".
        bucket = "high" if rating >= 4 else "low"

        if bucket == "high":
            self.high_ratings += 1
        else:
            self.low_ratings += 1

        for feature_id in features:
            self.seen_features.add(feature_id)
            self.feature_ratings[feature_id][bucket] += 1

    def calculate_preferences(self) -> Dict:
        """
        Build the final profile dictionary.

        For every feature id in range(num_features) the preference is
        high / (high + low) when the feature was seen, and -1 otherwise.
        The overall total/high/low counters are returned alongside.
        """
        preferences = {}

        for feature_id in range(self.num_features):
            if feature_id not in self.seen_features:
                # Sentinel for "this user never rated a movie with this feature".
                preferences[feature_id] = -1
                continue

            counts = self.feature_ratings[feature_id]
            rated = counts["high"] + counts["low"]
            preferences[feature_id] = counts["high"] / rated if rated > 0 else 0

        return {
            "feature_preferences": preferences,
            "total_ratings": self.total_ratings,
            "high_ratings": self.high_ratings,
            "low_ratings": self.low_ratings
        }

def process_rating_file(filepath: str,
                       netflix_movies: Set[str],
                       movie_features: Dict[str, List[int]],
                       top_users: Set[str],
                       user_profiles: Dict[str, "UserProfile"]) -> None:
    """
    Process a single rating file and update user profiles.

    The file is read in the Netflix Prize training-set layout: a header line
    ``<movie_id>:`` followed by one ``<customer_id>,<rating>,<date>`` row per
    rating of that movie. The previous implementation read the header as a user
    id and the first column as a movie id, so ``top_users`` was compared against
    movie ids and almost no rating reached a profile.
    NOTE(review): this relies on the files under ``training_set`` being the
    unmodified ``mv_*.txt`` files of the public dataset -- confirm.

    Args:
        filepath: Path of the rating file to read.
        netflix_movies: Movie ids that are in scope.
        movie_features: Movie id -> list of feature ids.
        top_users: Customer ids whose profiles are being built.
        user_profiles: Customer id -> profile object exposing
            ``update(features, rating)``; updated in place.

    Raises:
        ValueError: A row of an in-scope movie does not have exactly three
            comma-separated fields, or its rating is not an integer.
        KeyError: A customer id is in ``top_users`` but has no entry in
            ``user_profiles``.
    """
    # Feature list of the movie whose rows are currently being read; None while
    # no header has been seen yet or the current movie is out of scope.
    current_features = None

    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines (e.g. a trailing newline) carry no rating.
                continue

            if line.endswith(':'):
                movie_id = line[:-1]
                in_scope = movie_id in netflix_movies and movie_id in movie_features
                current_features = movie_features[movie_id] if in_scope else None
                continue

            if current_features is None:
                # Rows of an out-of-scope movie are skipped without being parsed.
                continue

            user_id, rating, _ = line.split(',')
            if user_id in top_users:
                user_profiles[user_id].update(current_features, int(rating))

def create_user_profiles(data_path: str, min_occurrences: int = 5) -> Dict:
    """
    Build the final preference profile of every top user.

    Loads the inputs from data_path, creates one empty UserProfile per top user,
    feeds every `mv_*` file found in `<data_path>/netflix_prize/training_set`
    (in sorted filename order) through process_rating_file, and finally converts
    each profile with calculate_preferences().

    Args:
        data_path: Root data directory.
        min_occurrences: Suffix selecting the processed artifacts (see load_data).

    Returns:
        Mapping of user id to the dictionary returned by
        UserProfile.calculate_preferences().
    """
    feature_mapping, movie_features, netflix_movies = load_data(data_path, min_occurrences)
    top_users = load_top_users(data_path)
    num_features = len(feature_mapping['feature_to_id'])

    # Every top user starts with an empty profile, so users without any
    # matching rating still appear in the result.
    user_profiles = {uid: UserProfile(num_features) for uid in top_users}

    netflix_folder = os.path.join(data_path, "netflix_prize", "training_set")
    rating_files = [
        name for name in sorted(os.listdir(netflix_folder))
        if name.startswith('mv_')
    ]
    for name in rating_files:
        process_rating_file(
            os.path.join(netflix_folder, name),
            netflix_movies,
            movie_features,
            top_users,
            user_profiles
        )

    return {
        uid: profile.calculate_preferences()
        for uid, profile in user_profiles.items()
    }

## Select Top 10k Users By Ratings Count
- Only count ratings for movies within scope of the task, per `netflix_to_imdb.json`

In [None]:
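# NOTE: This cell is commented out. It calls an older two-argument
# process_rating_file(filepath, netflix_movies) that returned per-user rating counts;
# the process_rating_file defined earlier in this notebook takes five arguments and
# returns None, so this cell will not run if it is simply uncommented.
# top_n = 10000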

# # Load Netflix to IMDB mapping
# with open(os.path.join(DATA_PATH, 'netflix_to_imdb.json'), 'r') as f: netflix_to_imdb = json.load(f)
# netflix_movies = set(netflix_to_imdb.keys())

# print("Processing rating files...")
# all_user_ratings = defaultdict(int)

# # Process each rating file
# for filename in sorted(os.listdir(os.path.join(NETFLIX_FOLDER_PATH, "training_set"))):
#     if filename.startswith('mv_'):
#         filepath = os.path.join(os.path.join(NETFLIX_FOLDER_PATH, "training_set"), filename)
#         file_user_ratings = process_rating_file(filepath, netflix_movies)
        
#         # Merge ratings
#         for user, count in file_user_ratings.items():
#             all_user_ratings[user] += count

# # Get top N users
# print(f"Finding top {top_n} users...")
# top_users = dict(heapq.nlargest(top_n, all_user_ratings.items(), key=lambda x: x[1]))

# # Save results
# output_path = os.path.join(DATA_PATH, 'top_netflix_users.json')
# print(f"Saving results to {output_path}...")
# with open(output_path, 'w') as f:
#     json.dump(top_users, f)

# print("Complete!")

## Create User Preference Features
- For each of the 10k users, for each movie that they’ve reviewed, compute feature preference ratios (high ratings of the feature divided by the total ratings of the feature)
- Impute missing feature values as -1
- Also keep counts of `total_ratings`, `high_ratings`, and `low_ratings` for each user

In [None]:
print("Creating user profiles...")
user_profiles = create_user_profiles(DATA_PATH, MIN_OCCURRENCES)

# Save the results
output_path = os.path.join(DATA_PATH, f'user_profiles_{MIN_OCCURRENCES}.pickle')
print(f"Saving profiles to {output_path}...")
with open(output_path, 'wb') as f:
    pickle.dump(user_profiles, f)

print("Complete!")

# Print sample statistics for the first user in the result.
# next(..., None) avoids a StopIteration when no profiles were built.
sample_user = next(iter(user_profiles), None)
if sample_user is None:
    print("\nNo user profiles were created; nothing to sample.")
else:
    print(f"\nSample user {sample_user} statistics:")
    print(f"Total ratings: {user_profiles[sample_user]['total_ratings']}")
    print(f"High ratings: {user_profiles[sample_user]['high_ratings']}")
    print(f"Low ratings: {user_profiles[sample_user]['low_ratings']}")