# Python code for generating synthetic user data to enrich the MovieLens dataset.

The code does the following:

- Loads ratings.csv to extract all unique user IDs, and movies_enriched.csv to look up the genres of each rated movie.
- Optionally adds approximately 25% new user records with synthetic user IDs (controlled by the `add_new_users` flag, which is off by default).
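- Augments the data with synthetic features (age, sex, number of reviews, average rating, and spending category) whose values are generated from reasonable distributions, plus favorite genres sampled at random from the genres of the movies each user rated.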
- Writes the resulting enriched user data to a CSV file users_enriched.csv.

In [12]:
import pandas as pd
import numpy as np
import random
from collections import defaultdict
from tqdm import tqdm  # For progress bar

# Fix the seeds of both random number generators (stdlib `random` and NumPy)
# so that repeated runs generate the same synthetic values
random.seed(42)
np.random.seed(42)

# Input files: the raw ratings and the enriched movie metadata
movies_path = "data/ml-32m/movies_enriched.csv"
ratings_path = "data/ml-32m/ratings.csv"

# Read both CSV files into DataFrames
ratings = pd.read_csv(ratings_path)
movies = pd.read_csv(movies_path)

# User options
add_new_users = False  # True appends synthetic users; False keeps only the users found in the ratings

# Unique user IDs that occur in the ratings data
existing_user_ids = ratings['userId'].unique()
n_existing_users = len(existing_user_ids)

# Start from the existing users and extend the list only when requested
all_user_ids = existing_user_ids
if add_new_users:
    # Append 25% additional synthetic user IDs, numbered consecutively
    # starting right after the largest existing ID
    n_new_users = int(n_existing_users * 0.25)
    first_new_id = existing_user_ids.max() + 1
    new_user_ids = range(first_new_id, first_new_id + n_new_users)
    all_user_ids = np.concatenate([existing_user_ids, new_user_ids])

# Build the user table (written to users_enriched.csv further down), one row per user ID
user_enriched = pd.DataFrame({'userId': all_user_ids})
n_users = len(user_enriched)

# Generate synthetic features.
# The NumPy draws below stay in this exact order (age, sex, num_reviews,
# avg_rating, spending_category) so a seeded run keeps producing the same values.

# Age: normal around 35 (std 10), clipped to 18-65, then truncated to whole years
user_enriched['age'] = np.random.normal(loc=35, scale=10, size=n_users).clip(18, 65).astype(int)

# Sex: 'male' / 'female' with equal probability
user_enriched['sex'] = np.random.choice(['male', 'female'], size=n_users, p=[0.5, 0.5])

# Number of reviews: Poisson with mean 50, clipped to 1-200
user_enriched['num_reviews'] = np.random.poisson(lam=50, size=n_users).clip(1, 200)

# Average rating: normal around 3.5 (std 0.5), clipped to 1-5, two decimals
user_enriched['avg_rating'] = np.random.normal(loc=3.5, scale=0.5, size=n_users).clip(1, 5).round(2)

# Spending category: uniform draw bucketed into three bins.
# np.random.uniform samples from the half-open interval [0, 1), while pd.cut's
# first bin is (0, 0.33] by default; include_lowest=True closes it on the left
# so a draw of exactly 0.0 is labelled 'Low' instead of becoming NaN.
user_enriched['spending_category'] = pd.cut(
    np.random.uniform(0, 1, size=n_users),
    bins=[0, 0.33, 0.66, 1],
    labels=['Low', 'Medium', 'High'],
    include_lowest=True,
)

# Preprocess genres into a lookup table for efficiency: movieId -> set of genre names.
# Missing genres are replaced by '' first, and ''.split('|') yields [''], so
# empty tokens are dropped to keep a blank "genre" out of the sets.
# NOTE(review): a textual placeholder such as "(no genres listed)" would still
# be treated as a genre here - confirm whether movies_enriched.csv contains one.
movies['genres'] = movies['genres'].fillna('')
movie_genres_lookup = (
    movies.set_index('movieId')['genres']
    .str.split('|')
    .apply(lambda names: {name for name in names if name})
    .to_dict()
)

# Create a mapping of userId to the set of all genres from the movies they rated
user_genres = defaultdict(set)
print("Aggregating genres for users...")
for user_id, movie_id in zip(ratings['userId'], ratings['movieId']):
    rated_genres = movie_genres_lookup.get(movie_id)
    # Skip ratings of movies missing from the lookup and movies without any genre
    if rated_genres:
        user_genres[user_id].update(rated_genres)
# Per-user helper (not vectorized): derives the favorite genres of a single user
def derive_favorite_genres(user_id, user_genres, pick_subset=True, min_favs=1, max_favs=3):
    """Return a user's favorite genres as a ', '-separated string.

    Args:
        user_id: Key to look up in ``user_genres``.
        user_genres: Mapping from user ID to an iterable of genre names.
        pick_subset: If True, return a random subset of the user's genres;
            if False, return all of them in sorted order.
        min_favs: Smallest number of genres to pick when ``pick_subset`` is True.
        max_favs: Largest number of genres to pick when ``pick_subset`` is True.
            The pick is capped at the number of genres the user has.

    Returns:
        The selected genre names joined with ', ', or an empty string when the
        user is not in ``user_genres`` or has no non-empty genre names.

    Raises:
        ValueError: If ``min_favs`` is greater than ``max_favs``
            (raised by ``random.randint``).
    """
    # Sorting gives random.sample a stable input order regardless of how the
    # underlying set iterates; blank names are dropped so the result never
    # contains an empty entry such as ', Action'.
    genres = sorted(genre for genre in user_genres.get(user_id, ()) if genre)

    # Nothing to sample from, or the caller asked for the full list
    if not genres or not pick_subset:
        return ', '.join(genres)

    n_genres = random.randint(min_favs, max_favs)
    return ', '.join(random.sample(genres, min(n_genres, len(genres))))

# Pick favorite genres for every user; tqdm shows a progress bar while iterating
print("Assigning favorite genres to users...")
user_progress = tqdm(user_enriched['userId'], desc="Processing Users", unit="user")
user_enriched['favorite_genres'] = [
    derive_favorite_genres(uid, user_genres, pick_subset=True, min_favs=1, max_favs=3)
    for uid in user_progress
]

# Write the enriched user table to CSV, leaving out the DataFrame index
output_path = "data/ml-32m/users_enriched.csv"
user_enriched.to_csv(output_path, index=False)
print(f"User enriched data written to {output_path}")

# Show the first ten rows as a quick sanity check
preview = user_enriched.head(10)
print(preview)

Aggregating genres for users...
Assigning favorite genres to users...


Processing Users: 100%|███████████████████████████████████████████████████| 200948/200948 [00:00<00:00, 271749.04user/s]


User enriched data written to data/ml-32m/users_enriched.csv
   userId  age     sex  num_reviews  avg_rating spending_category  \
0       1   39    male           48        2.58            Medium   
1       2   33  female           51        2.71            Medium   
2       3   41  female           51        3.38            Medium   
3       4   50    male           49        2.73               Low   
4       5   32    male           49        3.52            Medium   
5       6   32    male           43        3.49              High   
6       7   50    male           47        3.49               Low   
7       8   42  female           53        3.31              High   
8       9   30    male           59        4.06               Low   
9      10   40  female           47        4.04              High   

             favorite_genres  
0    Comedy, Action, Romance  
1              Children, War  
2                   Children  
3  Sci-Fi, Adventure, Horror  
4                     Ac