# Python code for generating synthetic user data to enrich the MovieLens dataset.

The code does the following:

- Loads the ratings.csv to extract all unique user IDs.
- Adds approximately 25% new user records with synthetic user IDs.
- Augments every user record (existing and new) with synthetic features — age, sex, favorite genres, number of reviews, average rating, and spending category — whose values are drawn from simple random distributions.
- Writes the resulting enriched user data to a CSV file user_enriched.csv.

In [1]:
import pandas as pd
import numpy as np
import random

def sample_favorite_genres(n_users, genre_pool, min_genres=1, max_genres=3):
    """Return one comma-separated favorite-genre string per user.

    For each user a count is drawn uniformly from ``min_genres`` to
    ``max_genres`` (inclusive) and that many distinct genres are sampled
    from ``genre_pool`` with Python's ``random`` module.

    Args:
        n_users: Number of strings to generate.
        genre_pool: Sequence of genre names to sample from.
        min_genres: Smallest number of genres per user.
        max_genres: Largest number of genres per user.

    Returns:
        A list of ``n_users`` strings such as ``"Comedy, Action"``.

    Raises:
        ValueError: If a drawn count exceeds ``len(genre_pool)``.
    """
    # A plain comprehension avoids building a row Series per user just to
    # ignore it, while drawing from `random` in the same order as before.
    return [
        ', '.join(random.sample(genre_pool, random.randint(min_genres, max_genres)))
        for _ in range(n_users)
    ]


def assign_spending_category(draws):
    """Bucket values in [0, 1] into 'Low', 'Medium' or 'High'.

    Args:
        draws: Array-like of floats in the closed interval [0, 1].

    Returns:
        A pandas Categorical with one label per input value.
    """
    # include_lowest=True closes the first bin on the left. Without it the
    # first bin is (0, 0.33], so a draw of exactly 0.0 (possible, because
    # np.random.uniform samples from [0, 1)) would silently become NaN.
    return pd.cut(
        draws,
        bins=[0, 0.33, 0.66, 1],
        labels=['Low', 'Medium', 'High'],
        include_lowest=True,
    )


# The pipeline runs when this cell/script is executed directly (where
# __name__ is "__main__"); the helpers above stay importable without
# touching the filesystem.
if __name__ == "__main__":
    # Set seed for reproducibility
    np.random.seed(42)
    random.seed(42)

    # Load ratings.csv; only the userId column is used below
    ratings_path = "data/ml-latest-small/ratings.csv"
    ratings = pd.read_csv(ratings_path)

    # Extract unique user IDs
    existing_user_ids = ratings['userId'].unique()
    n_existing_users = len(existing_user_ids)

    # Generate 25% additional synthetic user IDs, numbered right after the
    # largest existing ID so they cannot collide with real users
    n_new_users = int(n_existing_users * 0.25)
    first_new_id = existing_user_ids.max() + 1
    new_user_ids = range(first_new_id, first_new_id + n_new_users)

    # Combine existing and new user IDs
    all_user_ids = np.concatenate([existing_user_ids, new_user_ids])

    # Create a DataFrame for user_enriched.csv
    user_enriched = pd.DataFrame({'userId': all_user_ids})
    n_users = len(user_enriched)

    # Generate synthetic features. Every column below is random and is
    # generated for existing and new users alike; none of it is derived
    # from the actual ratings.

    # Age: Normal(mean=35, sd=10), clipped to [18, 65], truncated to int
    user_enriched['age'] = np.random.normal(loc=35, scale=10, size=n_users).clip(18, 65).astype(int)

    # Sex: Randomly assign "male" or "female" with equal probability
    user_enriched['sex'] = np.random.choice(['male', 'female'], size=n_users, p=[0.5, 0.5])

    # Favorite genres: Randomly assign 1-3 favorite genres from a predefined list
    genres = ['Action', 'Comedy', 'Drama', 'Fantasy', 'Horror', 'Mystery', 'Romance', 'Sci-Fi', 'Thriller']
    user_enriched['favorite_genres'] = sample_favorite_genres(n_users, genres)

    # Number of reviews: Poisson(lam=50) clipped to [1, 200].
    # NOTE: with lam=50 this is roughly symmetric around 50, not heavily
    # skewed; swap in a heavier-tailed distribution if skew is required.
    user_enriched['num_reviews'] = np.random.poisson(lam=50, size=n_users).clip(1, 200)

    # Average rating: Normal(mean=3.5, sd=0.5), clipped to [1, 5], 2 decimals
    user_enriched['avg_rating'] = np.random.normal(loc=3.5, scale=0.5, size=n_users).clip(1, 5).round(2)

    # Spending category: Simulate how much users spend on movie services
    # (Low, Medium, High) by bucketing a uniform draw
    user_enriched['spending_category'] = assign_spending_category(
        np.random.uniform(0, 1, size=n_users)
    )

    # Write enriched user data to CSV
    output_path = "data/ml-latest-small/user_enriched.csv"
    user_enriched.to_csv(output_path, index=False)
    print(f"User enriched data written to {output_path}")

    # Preview the DataFrame
    print(user_enriched.head())


User enriched data written to data/ml-latest-small/user_enriched.csv
   userId  age     sex           favorite_genres  num_reviews  avg_rating  \
0       1   39    male   Comedy, Action, Mystery           58        3.44   
1       2   33  female         Fantasy, Thriller           56        3.18   
2       3   41    male                    Comedy           48        4.37   
3       4   50  female  Thriller, Comedy, Horror           56        3.34   
4       5   32    male          Action, Thriller           51        4.42   

  spending_category  
0              High  
1              High  
2            Medium  
3               Low  
4               Low  
