In [2]:
pip install pandas Faker numpy

Note: you may need to restart the kernel to use updated packages.


In [3]:
import pandas as pd
import numpy as np
from faker import Faker
import random
from datetime import datetime, timedelta

In [4]:
# --- Dataset sizing knobs -------------------------------------------------
NUM_USERS = 1_000        # number of user rows to generate
NUM_EXERCISES = 150      # number of rows in the exercise catalogue

# Per-user counts are drawn with random.randint(5, AVG + k) in the generators
# (k = 10 for workouts, k = 5 for progress entries), so these values sit at
# the midpoint of the ranges actually used.
AVG_WORKOUTS_PER_USER = 15
AVG_PROGRESS_ENTRIES_PER_USER = 10

In [5]:
# Shared Faker instance using the Indian-English locale.
fake = Faker('en_IN')

# Seed every random source the generators draw from (Faker's shared RNG, the
# stdlib `random` module and NumPy's global RNG) so that
# Restart Kernel -> Run All reproduces the same draws.
# Dates that Faker computes relative to "now"/"today" still shift with the
# wall clock, so only the random choices are repeatable, not absolute dates.
SEED = 42
Faker.seed(SEED)
random.seed(SEED)
np.random.seed(SEED)

In [6]:
# --- Categorical vocabularies used by the generators ----------------------
# Order matters: several generators pair these lists positionally with
# hard-coded probability vectors, and generate_exercises slices the first
# four equipment options.

FITNESS_LEVELS = [
    'beginner',
    'intermediate',
    'advanced',
]

PRIMARY_GOALS = [
    'weight_loss',
    'muscle_gain',
    'endurance',
    'skill_boxing',
    'general_fitness',
]

EQUIPMENT_OPTIONS = [
    'bodyweight',
    'dumbbells',
    'resistance_bands',
    'kettlebell',
    'punching_bag',
]

# Language codes offered as a user's preferred language.
LANGUAGES = ['en', 'hi', 'mr']

SUBSCRIPTION_STATUSES = [
    'free',
    'premium',
    'expired',
]

EXERCISE_TYPES = [
    'strength',
    'cardio',
    'plyometric',
    'stretch',
]

MUSCLE_GROUPS = [
    'Chest',
    'Back',
    'Legs',
    'Shoulders',
    'Biceps',
    'Triceps',
    'Core',
    'Full Body',
]

In [7]:
def generate_exercises(n):
    """Build a synthetic exercise catalogue.

    Args:
        n: Number of exercises to create; ids run from 1 to ``n``.

    Returns:
        A pandas DataFrame with one row per exercise and the columns
        id, exercise_name, muscle_group, exercise_type, equipment_needed,
        video_url, instructions and difficulty_level.
    """
    print("Generating exercises data...")
    rows = []
    for exercise_id in range(1, n + 1):
        kind = random.choice(EXERCISE_TYPES)
        # Faker catch phrases stand in for exercise names, purely for variety.
        name = fake.catch_phrase().title()
        muscle = random.choice(MUSCLE_GROUPS)

        # Stretches never need gear; everything else picks from the first
        # four equipment options (which leaves out 'punching_bag').
        if kind == 'stretch':
            equipment = 'bodyweight'
        else:
            equipment = random.choice(EQUIPMENT_OPTIONS[:4])

        rows.append({
            'id': exercise_id,
            'exercise_name': name,
            'muscle_group': muscle,
            'exercise_type': kind,
            'equipment_needed': equipment,
            'video_url': f'https://fightfit.ai/videos/ex_{exercise_id}.mp4',
            'instructions': fake.paragraph(nb_sentences=3),
            'difficulty_level': random.choice(FITNESS_LEVELS)
        })
    return pd.DataFrame(rows)

In [8]:
def generate_users(n):
    """Build a synthetic users table.

    Compared with drawing every column independently, this keeps two pairs
    of columns mutually consistent:

    * ``subscription_expiry`` agrees with ``subscription_status``: 'expired'
      accounts get a date in the past 30 days, 'premium' accounts a date
      between today and one year ahead.
    * ``updated_at`` is never earlier than ``created_at``.

    Args:
        n: Number of users to create; ids run from 1 to ``n``.

    Returns:
        A pandas DataFrame with one row per user and the columns id,
        full_name, email, password_hash, phone_number, preferred_language,
        subscription_status, subscription_expiry, created_at and updated_at.
    """
    print("Generating users data...")
    data = []
    for i in range(n):
        # Probabilities are positional: free 50%, premium 40%, expired 10%.
        status = np.random.choice(SUBSCRIPTION_STATUSES, p=[0.5, 0.4, 0.1])

        if status == 'expired':
            # An expired subscription must have lapsed already.
            expiry = fake.date_between(start_date='-30d', end_date='-1d')
        elif status == 'premium':
            # An active subscription must not have lapsed yet.
            expiry = fake.date_between(start_date='today', end_date='+1y')
        else:
            # NOTE(review): free accounts keep the original unconstrained
            # range; confirm whether this column should be null for them.
            expiry = fake.date_between(start_date='-30d', end_date='+1y')

        created_at = fake.date_time_between(start_date='-2y', end_date='now')
        # A row created this month could otherwise be "updated" before it
        # existed, so clamp the update time to the creation time.
        updated_at = max(created_at, fake.date_time_this_month())

        data.append({
            'id': i + 1,
            'full_name': fake.name(),
            'email': fake.unique.email(),
            'password_hash': fake.sha256(),
            'phone_number': fake.phone_number(),
            'preferred_language': random.choice(LANGUAGES),
            'subscription_status': status,
            'subscription_expiry': expiry,
            'created_at': created_at,
            'updated_at': updated_at
        })
    return pd.DataFrame(data)

In [9]:
def generate_user_profiles(user_ids):
    """Build one synthetic fitness profile per user.

    Args:
        user_ids: Iterable of user ids. Each id is used both as the profile's
            own ``id`` and as its ``user_id``.

    Returns:
        A pandas DataFrame with one row per id. ``available_equipment`` holds
        a Python list of 1-3 distinct equipment names.
    """
    print("Generating user_profiles data...")
    profiles = []
    for uid in user_ids:
        n_equipment = random.randint(1, 3)

        profile = {'id': uid, 'user_id': uid}
        profile['age'] = random.randint(18, 65)
        profile['gender'] = random.choice(['male', 'female'])
        profile['height_cm'] = round(random.uniform(150.0, 190.0), 2)
        # Weighted draws; the probabilities pair positionally with the lists.
        profile['fitness_level'] = np.random.choice(FITNESS_LEVELS, p=[0.5, 0.4, 0.1])
        profile['primary_goal'] = np.random.choice(PRIMARY_GOALS, p=[0.4, 0.3, 0.1, 0.1, 0.1])
        profile['available_equipment'] = random.sample(EQUIPMENT_OPTIONS, n_equipment)
        profile['days_per_week'] = random.choice([3, 4, 5])
        profile['time_per_session'] = random.choice([30, 45, 60])

        # Roughly one profile in five carries a medical note.
        has_knee_issue = random.random() <= 0.2
        profile['medical_notes'] = 'Minor knee pain' if has_knee_issue else 'None'

        profiles.append(profile)
    return pd.DataFrame(profiles)

In [10]:
def generate_workout_logs_and_details(user_ids, exercises_df):
    """Build synthetic workout logs and their per-exercise detail rows.

    Each user gets between 5 and ``AVG_WORKOUTS_PER_USER + 10`` logs dated
    within the last year, and each log gets 5-8 detail rows. Strength
    exercises record reps and a weight; every other type records a duration.

    Args:
        user_ids: Iterable of user ids to generate logs for.
        exercises_df: Exercise catalogue; must provide ``id`` and
            ``exercise_type`` columns and at least one row.

    Returns:
        A ``(logs_df, details_df)`` tuple of pandas DataFrames.
        ``details_df['log_id']`` refers to ``logs_df['id']``.

    Raises:
        ValueError: If ``exercises_df`` has no rows.
    """
    print("Generating workout logs and details data...")
    if exercises_df.empty:
        raise ValueError("exercises_df must contain at least one exercise")

    # Extract the two needed columns once. Calling DataFrame.sample() for
    # every detail row dominated the runtime; random.choice over a plain list
    # draws from the same uniform distribution.
    catalogue = list(zip(exercises_df['id'].tolist(),
                         exercises_df['exercise_type'].tolist()))

    logs_data = []
    details_data = []
    log_id_counter = 1
    detail_id_counter = 1

    for user_id in user_ids:
        num_logs = random.randint(5, AVG_WORKOUTS_PER_USER + 10)
        for _ in range(num_logs):
            log_date = fake.date_time_between(start_date='-1y', end_date='now')
            logs_data.append({
                'id': log_id_counter,
                'user_id': user_id,
                'workout_name': f'{random.choice(MUSCLE_GROUPS)} Day',
                'log_date': log_date.date(),
                'duration_minutes': random.randint(25, 65),
                'perceived_exertion': random.randint(4, 9),
                'user_notes': 'Felt good!' if random.random() > 0.3 else 'Tough one today.'
            })

            # Detail rows for this log. set_number is the row's 1-based
            # position within the log; each row draws its own exercise.
            num_exercises_in_log = random.randint(5, 8)
            for position in range(1, num_exercises_in_log + 1):
                exercise_id, exercise_type = random.choice(catalogue)
                detail = {
                    'id': detail_id_counter,
                    'log_id': log_id_counter,
                    'exercise_id': exercise_id,
                    'set_number': position,
                    'reps_completed': None,
                    'weight_kg': None,
                    'time_seconds': None
                }
                if exercise_type == 'strength':
                    detail['reps_completed'] = random.choice([8, 10, 12, 15])
                    # Snap to 2.5 kg steps to mimic plate increments.
                    detail['weight_kg'] = round(random.uniform(5.0, 100.0) / 2.5) * 2.5
                else:  # cardio, plyometric, stretch
                    detail['time_seconds'] = random.choice([30, 45, 60, 90])

                details_data.append(detail)
                detail_id_counter += 1

            log_id_counter += 1

    return pd.DataFrame(logs_data), pd.DataFrame(details_data)

In [11]:
def generate_user_progress(user_ids):
    """Build a synthetic weekly body-progress series for every user.

    Each user gets between 5 and ``AVG_PROGRESS_ENTRIES_PER_USER + 5``
    entries, one per week, in chronological order and ending in the most
    recent week.

    Weight follows a random walk: each week's change, drawn from
    ``uniform(-0.5, 0.7)`` kg, is accumulated onto the previous reading, so
    consecutive entries stay close together. Body fat starts from a per-user
    baseline and falls by 0.1 points per entry. Previously both values were
    re-randomised for every entry, which made adjacent weeks jump by
    implausible amounts.

    Args:
        user_ids: Iterable of user ids to generate progress entries for.

    Returns:
        A pandas DataFrame with the columns id, user_id, log_date, weight_kg,
        body_fat_percentage and measurements (a JSON-formatted string).
    """
    print("Generating user progress data...")
    data = []
    progress_id_counter = 1
    for user_id in user_ids:
        num_entries = random.randint(5, AVG_PROGRESS_ENTRIES_PER_USER + 5)
        current_weight = round(random.uniform(50.0, 110.0), 1)
        baseline_body_fat = random.uniform(10.0, 35.0)
        for i in range(num_entries):
            # Entry i falls inside the week that started `weeks_ago` weeks back.
            weeks_ago = num_entries - i
            data.append({
                'id': progress_id_counter,
                'user_id': user_id,
                'log_date': fake.date_between(start_date=f'-{weeks_ago}w', end_date=f'-{weeks_ago - 1}w'),
                'weight_kg': round(current_weight, 1),
                'body_fat_percentage': round(baseline_body_fat - (i * 0.1), 1),
                # NOTE: measurements are still drawn independently per entry.
                'measurements': f'{{"waist_cm": {random.randint(70,100)}, "chest_cm": {random.randint(90,120)}}}'
            })
            progress_id_counter += 1
            # Positive draws are losses; the mean is a 0.1 kg loss per week.
            current_weight -= random.uniform(-0.5, 0.7)
    return pd.DataFrame(data)

In [13]:
"""if __name__ == "__main__":
    # Generate data
    exercises_df = generate_exercises(NUM_EXERCISES)
    users_df = generate_users(NUM_USERS)
    user_profiles_df = generate_user_profiles(users_df['id'])
    workout_logs_df, workout_log_details_df = generate_workout_logs_and_details(users_df['id'], exercises_df)
    user_progress_df = generate_user_progress(users_df['id'])

    # Save to CSV files
    print("\nSaving data to CSV files...")
    exercises_df.to_csv('exercises.csv', index=False)
    users_df.to_csv('users.csv', index=False)
    user_profiles_df.to_csv('user_profiles.csv', index=False)
    workout_logs_df.to_csv('workout_logs.csv', index=False)
    workout_log_details_df.to_csv('workout_log_details.csv', index=False)
    user_progress_df.to_csv('user_progress.csv', index=False)
    
    print("\nData generation complete! Check for the .csv files in this directory.")"""

'if __name__ == "__main__":\n    # Generate data\n    exercises_df = generate_exercises(NUM_EXERCISES)\n    users_df = generate_users(NUM_USERS)\n    user_profiles_df = generate_user_profiles(users_df[\'id\'])\n    workout_logs_df, workout_log_details_df = generate_workout_logs_and_details(users_df[\'id\'], exercises_df)\n    user_progress_df = generate_user_progress(users_df[\'id\'])\n\n    # Save to CSV files\n    print("\nSaving data to CSV files...")\n    exercises_df.to_csv(\'exercises.csv\', index=False)\n    users_df.to_csv(\'users.csv\', index=False)\n    user_profiles_df.to_csv(\'user_profiles.csv\', index=False)\n    workout_logs_df.to_csv(\'workout_logs.csv\', index=False)\n    workout_log_details_df.to_csv(\'workout_log_details.csv\', index=False)\n    user_progress_df.to_csv(\'user_progress.csv\', index=False)\n    \n    print("\nData generation complete! Check for the .csv files in this directory.")'