In [26]:
import os
import json
from collections import defaultdict
import heapq

# Base data directory; a relative path, so it resolves against the working directory.
DATA_PATH = "../data"
# Netflix Prize dataset folder; rating files are read from its "training_set" subfolder.
NETFLIX_FOLDER_PATH = os.path.join(DATA_PATH, "netflix_prize")
# IMDB dataset folder (not referenced by the cells visible in this part of the notebook).
IMDB_FOLDER_PATH = os.path.join(DATA_PATH, "imdb")

In [27]:
def process_rating_file(filepath, netflix_movies):
    """Count, per customer, the ratings contained in one Netflix Prize rating file.

    Netflix Prize rating files consist of a ``<movie_id>:`` header line
    followed by one ``<customer_id>,<rating>,<date>`` row for every rating of
    that movie. The header therefore identifies the *movie*, and the first
    field of each row identifies the *customer*; ratings are counted only for
    movies whose ID is present in ``netflix_movies``.

    Args:
        filepath: Path of the rating file to read.
        netflix_movies: Container of movie IDs to keep. Membership is tested
            with the ID exactly as written in the header, as a string
            (e.g. ``"1"`` for the header ``1:``).
            NOTE(review): assumes the IDs in this container are unpadded
            strings like the headers -- confirm against the mapping file.

    Returns:
        A ``defaultdict(int)`` mapping customer ID (``int``) to the number of
        ratings that customer gave to kept movies in this file.

    Raises:
        ValueError: If a rating row under a kept movie does not start with an
            integer customer ID.
    """
    user_ratings = defaultdict(int)
    # Whether the movie section currently being read is one we want to count.
    keep_current_movie = False

    with open(filepath, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Blank lines carry no rating information.
                continue
            if line.endswith(':'):
                # Header line: starts a new movie section.
                keep_current_movie = line[:-1] in netflix_movies
                continue
            if keep_current_movie:
                # Rating row: the first comma-separated field is the customer ID.
                customer_id = int(line.split(',', 1)[0])
                user_ratings[customer_id] += 1

    return user_ratings

In [None]:
top_n = 10000

# Read the Netflix -> IMDB mapping; only its keys are needed, as the filter set.
mapping_path = os.path.join(DATA_PATH, 'netflix_to_imdb.json')
with open(mapping_path, 'r') as f:
    netflix_to_imdb = json.load(f)
netflix_movies = {key for key in netflix_to_imdb.keys()}

print("Processing rating files...")
all_user_ratings = defaultdict(int)

# Visit every 'mv_*' file in name order and fold its counts into the running totals.
training_dir = os.path.join(NETFLIX_FOLDER_PATH, "training_set")
for filename in sorted(os.listdir(training_dir)):
    if not filename.startswith('mv_'):
        continue
    filepath = os.path.join(training_dir, filename)
    file_user_ratings = process_rating_file(filepath, netflix_movies)
    for user, count in file_user_ratings.items():
        all_user_ratings[user] += count

# Keep only the top_n entries with the highest totals.
print(f"Finding top {top_n} users...")
ranked_entries = heapq.nlargest(
    top_n,
    all_user_ratings.items(),
    key=lambda entry: entry[1],
)
top_users = dict(ranked_entries)

# Persist the selection as JSON (integer keys are written out as strings).
output_path = os.path.join(DATA_PATH, 'top_netflix_users.json')
print(f"Saving results to {output_path}...")
with open(output_path, 'w') as f:
    json.dump(top_users, f)

print("Complete!")