In [1]:
import os
import pandas as pd
import numpy as np

def create_reduced_dataset(input_dir, output_dir, num_users):
    """
    Create a reduced dataset with a specified number of users.
    
    Args:
        input_dir (str): Directory containing the original 32m dataset files.
        output_dir (str): Directory where the reduced dataset will be saved.
        num_users (int): Number of users to include in the reduced dataset.
    """
    # File paths
    ratings_file = os.path.join(input_dir, "ratings.csv")
    movies_file = os.path.join(input_dir, "movies_enriched.csv")
    tags_file = os.path.join(input_dir, "tags.csv")
    users_file = os.path.join(input_dir, "users_enriched.csv")

    # Read ratings.csv to extract users and their ratings
    print("Loading ratings.csv...")
    ratings = pd.read_csv(ratings_file)

    # Select a random sample of unique users
    print(f"Selecting {num_users} random users...")
    selected_users = np.random.choice(ratings['userId'].unique(), num_users, replace=False)

    # Filter ratings for the selected users
    print("Filtering ratings.csv...")
    reduced_ratings = ratings[ratings['userId'].isin(selected_users)]

    # Save the reduced ratings.csv
    os.makedirs(output_dir, exist_ok=True)
    reduced_ratings_file = os.path.join(output_dir, "ratings.csv")
    reduced_ratings.to_csv(reduced_ratings_file, index=False)
    print(f"Reduced ratings.csv saved to {reduced_ratings_file}")

    # Filter users_enriched.csv for the selected users
    print("Filtering users_enriched.csv...")
    users = pd.read_csv(users_file)
    reduced_users = users[users['userId'].isin(selected_users)]
    reduced_users_file = os.path.join(output_dir, "users_enriched.csv")
    reduced_users.to_csv(reduced_users_file, index=False)
    print(f"Reduced users_enriched.csv saved to {reduced_users_file}")

    # Filter tags.csv for the selected users
    print("Filtering tags.csv...")
    tags = pd.read_csv(tags_file)
    reduced_tags = tags[tags['userId'].isin(selected_users)]
    reduced_tags_file = os.path.join(output_dir, "tags.csv")
    reduced_tags.to_csv(reduced_tags_file, index=False)
    print(f"Reduced tags.csv saved to {reduced_tags_file}")

    # Filter movies_enriched.csv for the movies present in reduced ratings
    print("Filtering movies_enriched.csv...")
    movies = pd.read_csv(movies_file)
    selected_movie_ids = reduced_ratings['movieId'].unique()
    reduced_movies = movies[movies['movieId'].isin(selected_movie_ids)]
    reduced_movies_file = os.path.join(output_dir, "movies_enriched.csv")
    reduced_movies.to_csv(reduced_movies_file, index=False)
    print(f"Reduced movies_enriched.csv saved to {reduced_movies_file}")

    print("Reduced dataset creation complete.")


# Parameters
INPUT_DIR = "data/ml-32m"  # Replace with the actual directory of the full dataset
OUTPUT_DIR = "data/ml-10k-users"  # Replace with the desired output directory for the reduced dataset
NUM_USERS = 10000  # Replace with the desired number of users

# Create reduced dataset
create_reduced_dataset(INPUT_DIR, OUTPUT_DIR, NUM_USERS)


Loading ratings.csv...
Selecting 10000 random users...
Filtering ratings.csv...
Reduced ratings.csv saved to data/ml-10k-users/ratings.csv
Filtering users_enriched.csv...
Reduced users_enriched.csv saved to data/ml-10k-users/users_enriched.csv
Filtering tags.csv...
Reduced tags.csv saved to data/ml-10k-users/tags.csv
Filtering movies_enriched.csv...
Reduced movies_enriched.csv saved to data/ml-10k-users/movies_enriched.csv
Reduced dataset creation complete.


Validate the newly created dataset

In [2]:
import pandas as pd

def validate_users(input_dir):
    """
    Validates if users in ratings.csv match those in users_enriched.csv and tags.csv.

    Args:
        input_dir (str): Directory containing the dataset files (ratings.csv, users_enriched.csv, tags.csv).
    """
    # File paths
    ratings_file = f"{input_dir}/ratings.csv"
    users_enriched_file = f"{input_dir}/users_enriched.csv"
    tags_file = f"{input_dir}/tags.csv"

    # Load datasets
    print("Loading ratings.csv...")
    ratings = pd.read_csv(ratings_file)
    print("Loading users_enriched.csv...")
    users_enriched = pd.read_csv(users_enriched_file)
    print("Loading tags.csv...")
    tags = pd.read_csv(tags_file)

    # Extract unique user IDs from each file
    ratings_users = set(ratings['userId'].unique())
    users_enriched_users = set(users_enriched['userId'].unique())
    tags_users = set(tags['userId'].unique())

    # Validate users
    print("\nValidation Results:")

    # Check if all users in ratings are in users_enriched
    missing_in_users_enriched = ratings_users - users_enriched_users
    if missing_in_users_enriched:
        print(f"Users in ratings.csv but missing in users_enriched.csv: {len(missing_in_users_enriched)}")
        print(f"Example missing users: {list(missing_in_users_enriched)[:10]}")
    else:
        print("All users in ratings.csv are present in users_enriched.csv.")

    # Check if all users in ratings are in tags
    missing_in_tags = ratings_users - tags_users
    if missing_in_tags:
        print(f"Users in ratings.csv but missing in tags.csv: {len(missing_in_tags)}")
        print(f"Example missing users: {list(missing_in_tags)[:10]}")
    else:
        print("All users in ratings.csv are present in tags.csv.")

    # Check for extra users in users_enriched
    extra_in_users_enriched = users_enriched_users - ratings_users
    if extra_in_users_enriched:
        print(f"Extra users in users_enriched.csv not in ratings.csv: {len(extra_in_users_enriched)}")
        print(f"Example extra users: {list(extra_in_users_enriched)[:10]}")
    else:
        print("No extra users in users_enriched.csv.")

    # Check for extra users in tags
    extra_in_tags = tags_users - ratings_users
    if extra_in_tags:
        print(f"Extra users in tags.csv not in ratings.csv: {len(extra_in_tags)}")
        print(f"Example extra users: {list(extra_in_tags)[:10]}")
    else:
        print("No extra users in tags.csv.")

    print("\nValidation Complete.")

# Directory containing the dataset
INPUT_DIR = "data/ml-10k-users"  # Replace with the directory of your reduced dataset

# Validate the users
validate_users(INPUT_DIR)


Loading ratings.csv...
Loading users_enriched.csv...
Loading tags.csv...

Validation Results:
All users in ratings.csv are present in users_enriched.csv.
Users in ratings.csv but missing in tags.csv: 9185
Example missing users: [65537, 98306, 98310, 8, 10, 32778, 65555, 65567, 65568, 131117]
No extra users in users_enriched.csv.
No extra users in tags.csv.

Validation Complete.


Now let's integrate the tags in the movies/users enriched files.

In [3]:
import pandas as pd
import os


def integrate_tags(input_dir, output_dir):
    """
    Integrates tags from tags.csv into movies_enriched.csv and users_enriched.csv.
    
    Args:
        input_dir (str): Directory containing the reduced dataset files.
        output_dir (str): Directory where the updated dataset files will be saved.
    """
    # File paths
    tags_file = os.path.join(input_dir, "tags.csv")
    movies_file = os.path.join(input_dir, "movies_enriched.csv")
    users_file = os.path.join(input_dir, "users_enriched.csv")

    # Load files
    print("Loading tags.csv...")
    tags = pd.read_csv(tags_file)
    print("Loading movies_enriched.csv...")
    movies = pd.read_csv(movies_file)
    print("Loading users_enriched.csv...")
    users = pd.read_csv(users_file)

    # Step 1: Aggregate tags for movies
    print("Aggregating tags by movie...")
    tags_by_movie = tags.groupby('movieId')['tag'].apply(lambda x: ', '.join(sorted(set(x)))).reset_index()
    tags_by_movie.rename(columns={'tag': 'tags_by_users'}, inplace=True)

    # Merge aggregated tags into movies_enriched.csv
    print("Merging tags into movies_enriched.csv...")
    movies = movies.merge(tags_by_movie, on='movieId', how='left')
    movies['tags_by_users'] = movies['tags_by_users'].fillna('')  # Fill missing tags with empty strings

    # Save the updated movies_enriched.csv
    movies_output_file = os.path.join(output_dir, "movies_enriched_tags.csv")
    movies.to_csv(movies_output_file, index=False)
    print(f"Updated movies_enriched.csv saved to {movies_output_file}")

    # Step 2: Aggregate tags for users
    print("Aggregating tags by user...")
    tags_by_user = tags.groupby('userId')['tag'].apply(lambda x: ', '.join(sorted(set(x)))).reset_index()
    tags_by_user.rename(columns={'tag': 'tags_user'}, inplace=True)

    # Merge aggregated tags into users_enriched.csv
    print("Merging tags into users_enriched.csv...")
    users = users.merge(tags_by_user, on='userId', how='left')
    users['tags_user'] = users['tags_user'].fillna('')  # Fill missing tags with empty strings

    # Save the updated users_enriched.csv
    users_output_file = os.path.join(output_dir, "users_enriched_tags.csv")
    users.to_csv(users_output_file, index=False)
    print(f"Updated users_enriched.csv saved to {users_output_file}")

    print("Tags integration complete.")


# Parameters
INPUT_DIR = "data/ml-10k-users"  # Replace with the directory of the reduced dataset
OUTPUT_DIR = "data/ml-10k-users"  # Replace with the desired output directory

# Integrate tags
integrate_tags(INPUT_DIR, OUTPUT_DIR)

Loading tags.csv...
Loading movies_enriched.csv...
Loading users_enriched.csv...
Aggregating tags by movie...
Merging tags into movies_enriched.csv...
Updated movies_enriched.csv saved to data/ml-10k-users/movies_enriched_tags.csv
Aggregating tags by user...
Merging tags into users_enriched.csv...
Updated users_enriched.csv saved to data/ml-10k-users/users_enriched_tags.csv
Tags integration complete.
