In [26]:
import random
from datetime import datetime, timedelta
from pathlib import Path

import numpy as np
import pandas as pd
from faker import Faker

In [27]:
# --- CONFIGURATION ---
# Tunable sizes for the synthetic dataset.
NUM_USERS = 1_000                    # users to generate; ids run 1..NUM_USERS
NUM_EXERCISES = 150                  # uniquely named exercises in the base catalogue
AVG_WORKOUTS_PER_USER = 25           # workout iterations attempted per user (a fixed count, not a mean)
AVG_PROGRESS_ENTRIES_PER_USER = 15   # upper bound on progress entries recorded per user

In [28]:
# Faker instance using the Indian-English locale for names and phone numbers.
fake = Faker('en_IN')

# --- REVISED DATA COMPONENTS (WITH REALISTIC EMAIL DOMAINS) ---
COMMON_DOMAINS = [
    'gmail.com', 'yahoo.com', 'outlook.com', 'hotmail.com', 'rediffmail.com',
]
# Every email handed out so far; consulted to keep generated emails unique.
generated_emails = set()

# Categorical vocabularies for user and profile attributes.
FITNESS_LEVELS = ['beginner', 'intermediate', 'advanced']
PRIMARY_GOALS = [
    'weight_loss', 'muscle_gain', 'endurance', 'skill_boxing', 'general_fitness',
]
EQUIPMENT_TYPES = [
    'Dumbbell', 'Barbell', 'Kettlebell', 'Bodyweight', 'Resistance Band',
    'Punching Bag',
]
LANGUAGES = ['en', 'hi', 'mr']
SUBSCRIPTION_STATUSES = ['free', 'premium', 'expired']

# Building blocks from which exercise names are assembled.
STRENGTH_MOVEMENTS = [
    'Squat', 'Lunge', 'Deadlift', 'Bench Press', 'Overhead Press', 'Row',
    'Curl', 'Extension', 'Push Up', 'Pull Up', 'Swing', 'Calf Raise',
]
CARDIO_PLYO_MOVEMENTS = [
    'Jumping Jack', 'Burpee', 'High Knees', 'Mountain Climber', 'Box Jump',
    'Rope Skip',
]
COMBAT_STRIKES = [
    'Jab', 'Cross', 'Hook', 'Uppercut', 'Roundhouse Kick', 'Front Kick',
    'Knee Strike', 'Elbow Strike',
]
YOGA_POSES = [
    'Downward Dog', 'Warrior II', 'Triangle Pose', 'Plank Pose', 'Cobra Pose',
    'Sun Salutation',
]
MODIFIERS = [
    'Alternating', 'Single Arm', 'Incline', 'Decline', 'Goblet', 'Sumo',
    'Wide Grip', 'Close Grip', 'Jumping', 'Plyo', 'Pause', 'Russian',
]

In [29]:
# --- 1. GENERATE BASE EXERCISES ---
def generate_base_exercises(n):
    """Build a catalogue of ``n`` uniquely named synthetic exercises.

    Each exercise is drawn from one of four categories (strength, cardio,
    combat, stretch) and its name is assembled from the module-level
    vocabulary lists. Names that have already been produced are discarded
    and another candidate is drawn.

    Args:
        n: Number of distinct exercises to generate. The loop only stops once
            ``n`` unique names exist, so ``n`` must not exceed the number of
            distinct names the vocabularies can produce.

    Returns:
        pandas.DataFrame with one row per exercise and the columns ``id``
        (1-based, in generation order), ``exercise_name``, ``muscle_group``,
        ``exercise_type``, ``equipment_needed`` (lower-case), ``base_weight_kg``
        (0 for non-strength exercises) and ``difficulty_level``.
    """
    print(f"Step 1: Generating {n} base exercises...")
    # Strength moves are named after a held implement, so the two entries that
    # are not held weights are excluded. Loop-invariant, hence built once.
    strength_equipment = [e for e in EQUIPMENT_TYPES if e not in ['Bodyweight', 'Punching Bag']]
    data = []
    exercise_names = set()
    while len(exercise_names) < n:
        name = ""
        # Default for every category that needs no implement.
        equipment_needed = 'bodyweight'
        category = random.choice(['strength', 'cardio', 'combat', 'stretch'])
        if category == 'strength':
            equipment = random.choice(strength_equipment)
            movement = random.choice(STRENGTH_MOVEMENTS)
            modifier = random.choice(MODIFIERS + [''])
            # An empty modifier leaves a double space behind; collapse it.
            name = f"{equipment} {modifier} {movement}".replace("  ", " ").strip()
            # Use the chosen equipment itself rather than the first word of the
            # name, so multi-word equipment ("Resistance Band") is not
            # mislabelled as bodyweight.
            equipment_needed = equipment.lower()
        elif category == 'cardio':
            name = random.choice(CARDIO_PLYO_MOVEMENTS)
        elif category == 'combat':
            prefix = random.choice(['Shadow Boxing', 'Heavy Bag Drill'])
            combo_length = random.randint(2, 4)
            combo = "-".join(random.sample(COMBAT_STRIKES, combo_length))
            name = f"{prefix} ({combo})"
            # Heavy-bag drills need the bag; shadow boxing needs nothing.
            if prefix == 'Heavy Bag Drill':
                equipment_needed = 'punching bag'
        elif category == 'stretch':
            name = f"Yoga: {random.choice(YOGA_POSES)}"

        if name in exercise_names:
            # Duplicate name: draw again.
            continue

        exercise_names.add(name)
        base_weight = round(random.uniform(20.0, 60.0), 1) if category == 'strength' else 0
        data.append({
            'id': len(exercise_names),
            'exercise_name': name,
            'muscle_group': fake.word(ext_word_list=['Chest', 'Back', 'Legs', 'Shoulders', 'Arms', 'Core', 'Full Body']),
            'exercise_type': category,
            'equipment_needed': equipment_needed,
            'base_weight_kg': base_weight,
            'difficulty_level': random.choice(FITNESS_LEVELS)
        })
    return pd.DataFrame(data)

In [30]:
# --- 2. GENERATE A SINGLE USER'S COMPLETE, REALISTIC HISTORY ---
def generate_user_and_full_history(user_id, exercises_df, log_id_counter, detail_id_counter, progress_id_counter):
    """Generate one synthetic user plus profile, workout logs and progress.

    Side effect: the generated email is added to the module-level
    ``generated_emails`` set so that later calls never reuse it.

    Args:
        user_id: Identifier used for the user row and as the profile's ``id``
            and ``user_id``.
        exercises_df: Exercise catalogue; the columns ``id``, ``muscle_group``,
            ``exercise_type``, ``equipment_needed`` and ``base_weight_kg`` are
            read from it.
        log_id_counter: Iterator yielding the next workout-log id.
        detail_id_counter: Iterator yielding the next log-detail id.
        progress_id_counter: Iterator yielding the next progress-entry id.

    Returns:
        Tuple ``(user, profile, user_logs, user_log_details,
        user_progress_entries)``: two dicts followed by three lists of dicts.
        The three lists are empty when ``exercises_df`` has no rows.
    """
    # --- Create User with realistic email ---
    full_name = fake.name()
    name_parts = full_name.lower().split(' ')

    # Retry with a fresh numeric suffix / domain until the email is unused.
    while True:
        username = f"{name_parts[0]}.{name_parts[-1]}{random.randint(1,99)}"
        domain = random.choice(COMMON_DOMAINS)
        email = f"{username}@{domain}"
        if email not in generated_emails:
            generated_emails.add(email)
            break

    created_date = fake.date_time_between(start_date='-2y', end_date='-1y')
    user = {
        'id': user_id, 'full_name': full_name, 'email': email,
        'password_hash': fake.sha256(), 'phone_number': fake.phone_number(),
        'preferred_language': random.choice(LANGUAGES),
        'subscription_status': np.random.choice(SUBSCRIPTION_STATUSES, p=[0.5, 0.4, 0.1]),
        'subscription_expiry': fake.date_between(start_date='-30d', end_date='+1y'),
        'created_at': created_date, 'updated_at': fake.date_time_this_month()
    }

    # --- Create Profile ---
    fitness_level = np.random.choice(FITNESS_LEVELS, p=[0.5, 0.4, 0.1])
    primary_goal = np.random.choice(PRIMARY_GOALS, p=[0.4, 0.3, 0.1, 0.1, 0.1])
    # Scales lifted weights (and the exertion floor) by fitness level.
    strength_multiplier = {'beginner': 0.6, 'intermediate': 1.0, 'advanced': 1.5}[fitness_level]
    # Body-weight change applied per logged workout; goals not listed get 0.
    goal_weight_trend = {'weight_loss': -0.2, 'muscle_gain': 0.1, 'endurance': -0.05}.get(primary_goal, 0)
    user_equipment = random.sample(EQUIPMENT_TYPES, random.randint(1, 4))
    if primary_goal == 'skill_boxing' and 'Punching Bag' not in user_equipment:
        user_equipment.append('Punching Bag')

    profile = {
        'id': user_id, 'user_id': user_id, 'age': random.randint(18, 65),
        'gender': random.choice(['male', 'female']),
        'height_cm': round(random.uniform(150.0, 190.0), 2),
        'fitness_level': fitness_level, 'primary_goal': primary_goal,
        'available_equipment': user_equipment, 'days_per_week': random.choice([3, 4, 5]),
        'time_per_session': random.choice([30, 45, 60]),
        'medical_notes': 'None' if random.random() > 0.2 else 'Minor knee pain'
    }

    # --- Generate Activity Data (Logs and Progress) ---
    user_logs, user_log_details, user_progress_entries = [], [], []

    if exercises_df.empty:
        # Nothing to sample from, so the user simply has no activity history.
        return user, profile, user_logs, user_log_details, user_progress_entries

    # The catalogue stores equipment in lower case while EQUIPMENT_TYPES is
    # title case; normalise before matching so owned equipment is recognised.
    usable_equipment = [item.lower() for item in user_equipment] + ['bodyweight']
    available_exercises = exercises_df[exercises_df['equipment_needed'].isin(usable_equipment)]
    if available_exercises.empty:
        # Re-filtering on 'bodyweight' would still be empty here, so fall back
        # to the whole catalogue to keep generation going.
        available_exercises = exercises_df

    # Loop-invariant: the muscle groups a session can be named after.
    session_muscle_groups = available_exercises["muscle_group"].unique()

    current_date = created_date + timedelta(days=random.randint(1, 7))
    start_weight = round(random.uniform(55.0, 100.0), 1)

    for i in range(AVG_WORKOUTS_PER_USER):
        # Never log workouts in the future.
        if current_date > datetime.now():
            break
        log_id = next(log_id_counter)
        user_logs.append({
            'id': log_id, 'user_id': user_id,
            'workout_name': f'{random.choice(session_muscle_groups)} Session',
            'log_date': current_date.date(), 'duration_minutes': random.randint(25, 65),
            'perceived_exertion': random.randint(max(4, int(strength_multiplier * 5)), 9),
            'user_notes': 'Felt strong!' if random.random() > 0.3 else 'Good session.'
        })
        # Weights grow by 1.5% of the base per completed workout.
        progression_factor = 1 + (i * 0.015)
        for j in range(random.randint(5, 8)):
            exercise = available_exercises.sample(1).iloc[0]
            # Draw exactly one id per row; drawing twice left gaps in the ids.
            detail = {
                'id': next(detail_id_counter), 'log_id': log_id,
                'exercise_id': exercise['id'], 'set_number': j + 1
            }
            if exercise['exercise_type'] == 'strength':
                detail['reps_completed'] = random.choice([8, 10, 12]) + (int(strength_multiplier) if strength_multiplier > 1 else 0)
                # Snap the load to the nearest 2.5 kg step.
                detail['weight_kg'] = round((exercise['base_weight_kg'] * strength_multiplier * progression_factor) / 2.5) * 2.5
            else:
                detail['time_seconds'] = random.choice([30, 45, 60]) * (0.8 if fitness_level == 'beginner' else 1.2 if fitness_level == 'advanced' else 1)
            user_log_details.append(detail)
        # Record progress on every other workout, up to the configured cap.
        if i % 2 == 0 and len(user_progress_entries) < AVG_PROGRESS_ENTRIES_PER_USER:
            current_weight = start_weight + (len(user_logs) * goal_weight_trend) + random.uniform(-0.2, 0.2)
            user_progress_entries.append({
                'id': next(progress_id_counter), 'user_id': user_id, 'log_date': current_date.date(),
                'weight_kg': round(current_weight, 1),
                # NOTE(review): subtracting a negative trend raises body fat
                # over time for weight-loss users - confirm the intended sign.
                'body_fat_percentage': round(random.uniform(10.0, 35.0) - (i * (goal_weight_trend * 0.1)), 1),
                'measurements': f'{{"waist_cm": {random.randint(70,100)}, "chest_cm": {random.randint(90,120)}}}'
            })
        current_date += timedelta(days=random.choice([2, 3, 4]))
    return user, profile, user_logs, user_log_details, user_progress_entries

def id_counter(start=1):
    """Yield consecutive integers without end, beginning at ``start``."""
    current = start
    while True:
        yield current
        current += 1


In [31]:
# --- MAIN EXECUTION ---
if __name__ == "__main__":
    # The exercise catalogue comes first: every user history samples from it.
    base_exercises_df = generate_base_exercises(NUM_EXERCISES)

    print(f"\nStep 2: Generating full, realistic histories for {NUM_USERS} users...")
    all_users, all_profiles, all_logs, all_log_details, all_progress = [], [], [], [], []
    # One shared counter per table keeps ids unique across all users.
    log_id_gen, detail_id_gen, progress_id_gen = id_counter(), id_counter(), id_counter()
    for i in range(1, NUM_USERS + 1):
        if i % 100 == 0:
            print(f"  ...generating user {i}/{NUM_USERS}")
        user, profile, logs, details, progress = generate_user_and_full_history(
            i, base_exercises_df, log_id_gen, detail_id_gen, progress_id_gen
        )
        all_users.append(user)
        all_profiles.append(profile)
        all_logs.extend(logs)
        all_log_details.extend(details)
        all_progress.extend(progress)

    print("\nStep 3: Assembling DataFrames...")
    users_df = pd.DataFrame(all_users)
    user_profiles_df = pd.DataFrame(all_profiles)
    workout_logs_df = pd.DataFrame(all_logs)
    workout_log_details_df = pd.DataFrame(all_log_details)
    user_progress_df = pd.DataFrame(all_progress)

    print("\nStep 4: Saving all data to CSV files...")
    # to_csv does not create missing directories, so make sure the target
    # folder exists before writing.
    output_dir = Path('dataset')
    output_dir.mkdir(parents=True, exist_ok=True)
    tables = {
        'exercises.csv': base_exercises_df,
        'users.csv': users_df,
        'user_profiles.csv': user_profiles_df,
        'workout_logs.csv': workout_logs_df,
        'workout_log_details.csv': workout_log_details_df,
        'user_progress.csv': user_progress_df,
    }
    for filename, frame in tables.items():
        frame.to_csv(output_dir / filename, index=False)

    print("\n--- Generation Complete! ---")
    print("Your final CSV files with realistic data (including emails) are ready.")



Step 1: Generating 150 base exercises...

Step 2: Generating full, realistic histories for 1000 users...
  ...generating user 100/1000
  ...generating user 200/1000
  ...generating user 300/1000
  ...generating user 400/1000
  ...generating user 500/1000
  ...generating user 600/1000
  ...generating user 700/1000
  ...generating user 800/1000
  ...generating user 900/1000
  ...generating user 1000/1000

Step 3: Assembling DataFrames...

Step 4: Saving all data to CSV files...

--- Generation Complete! ---
Your final CSV files with realistic data (including emails) are ready.
