In [None]:
from faker import Faker
from faker.providers import BaseProvider
import random
import csv
from datetime import datetime, timedelta

# One shared Faker instance; every provider below draws its fake values from it
fake = Faker()

# Module-level ID registries. Each provider appends the IDs it generates so
# that dependent tables can pick foreign keys from IDs that already exist.
global_participant_ids, global_course_ids, global_enrollment_ids = [], [], []

# --- Providers ---

class ParticipantsProvider(BaseProvider):
    """Faker provider that builds participant records."""

    def participants(self, num_participants=100):
        """Return a list of ``num_participants`` participant dicts.

        IDs are sequential integers starting at 1. Each ID is also appended
        to the module-level ``global_participant_ids`` list so that other
        providers can reference it.
        """
        rows = []
        for pid in range(1, num_participants + 1):
            global_participant_ids.append(pid)  # register the ID for dependent tables

            # Values are drawn in the same order as the output columns
            given_name = fake.first_name()
            family_name = fake.last_name()
            dept = fake.job()
            state = fake.state()
            joined_on = fake.date_between(start_date='-5y', end_date='today')

            rows.append({
                'participant_id': pid,
                'first_name': given_name,
                'last_name': family_name,
                'department': dept,
                'region': state,
                'start_date_at_company': joined_on.isoformat(),
            })
        return rows

class CourseProvider(BaseProvider):
    """Faker provider that builds course records."""

    def courses(self, num_courses=50):
        """Return a list of ``num_courses`` course dicts.

        IDs are sequential integers starting at 1. Each ID is also appended
        to the module-level ``global_course_ids`` list so that other
        providers can reference it.
        """
        category_options = ['Technology', 'Business', 'Health', 'Arts', 'Compliance', 'Leadership']
        rows = []
        for cid in range(1, num_courses + 1):
            global_course_ids.append(cid)  # register the ID for dependent tables

            title = fake.catch_phrase()
            teacher = fake.name()
            hours = random.randint(1, 40)
            category = random.choice(category_options)

            rows.append({
                'course_id': cid,
                'course_name': title,
                'instructor': teacher,
                'duration_hours': hours,
                'category': category,
            })
        return rows

class EnrollmentProvider(BaseProvider):
    """Faker provider that builds enrollment records linking participants to courses."""

    def enrollments(self, num_enrollments=200):
        """Return a list of ``num_enrollments`` enrollment dicts.

        Each row references a participant ID and a course ID picked at random
        from ``global_participant_ids`` and ``global_course_ids``. Enrollment
        IDs are sequential integers starting at 1 and are appended to
        ``global_enrollment_ids``.

        Status-dependent fields:
          * ``completed``   - ``completion_date`` between the enrollment date
            and today, ``score`` in 50..100.
          * ``dropped``     - ``score`` in 0..49; ``completion_date`` is the
            drop date (7 to 90 days after enrollment) when that date is not
            in the future, otherwise ``None``.
          * ``in_progress`` / ``not_started`` - both fields are ``None``.

        Raises:
            ValueError: if participants or courses have not been generated yet.
        """
        # Ensure participants and courses are generated first for their IDs
        if not global_participant_ids or not global_course_ids:
            raise ValueError("Participants and Courses must be generated before Enrollments.")

        today = datetime.now().date()
        enrollments_data = []

        for enrollment_id in range(1, num_enrollments + 1):
            global_enrollment_ids.append(enrollment_id)  # Store ID globally

            # Pick existing participant and course IDs
            participant_id = random.choice(global_participant_ids)
            course_id = random.choice(global_course_ids)

            enroll_date = fake.date_between(start_date='-2y', end_date='today')

            status = random.choice(['completed', 'in_progress', 'not_started', 'dropped'])
            completion_date = None
            score = None

            if status == 'completed':
                # Ensure completion date is after enrollment date
                completion_date = fake.date_between(start_date=enroll_date, end_date='today').isoformat()
                score = random.randint(50, 100)  # Higher scores for completed
            elif status == 'dropped':
                # Draw the drop offset exactly once, so the date that is
                # checked against today is the same date that gets stored.
                # Drawing it twice could store a drop date in the future.
                drop_date = enroll_date + timedelta(days=random.randint(7, 90))
                if drop_date <= today:
                    completion_date = drop_date.isoformat()
                score = random.randint(0, 49)  # Low score for dropped/failed

            enrollments_data.append({
                'enrollment_id': enrollment_id,
                'participant_id': participant_id,
                'course_id': course_id,
                'enrollment_date': enroll_date.isoformat(),
                'completion_date': completion_date,
                'status': status,
                'score': score,
            })
        return enrollments_data

class FeedbackProvider(BaseProvider):
    """Faker provider that builds feedback records attached to enrollments."""

    def feedback(self, num_feedbacks=150):
        """Return a list of ``num_feedbacks`` feedback dicts.

        Each row points at an enrollment ID chosen at random from
        ``global_enrollment_ids``; feedback IDs are sequential from 1.

        Raises:
            ValueError: if no enrollments have been generated yet.
        """
        if not global_enrollment_ids:
            raise ValueError("Enrollments must be generated before Feedbacks.")

        rows = []
        for fid in range(1, num_feedbacks + 1):
            target_enrollment = random.choice(global_enrollment_ids)
            stars = random.randint(1, 5)
            remark = fake.text(max_nb_chars=100)  # capped at 100 characters
            submitted = fake.date_time_between(start_date='-1y', end_date='now')

            rows.append({
                'feedback_id': fid,
                'enrollment_id': target_enrollment,
                'rating': stars,
                'comments': remark,
                'submitted_at': submitted.isoformat(),
            })
        return rows

class EngagementProvider(BaseProvider):
    """Faker provider that builds participant activity (engagement) records."""

    def engagement(self, num_engagements=500):
        """Return a list of ``num_engagements`` engagement dicts.

        Each row references a participant ID chosen at random from
        ``global_participant_ids``; engagement IDs are sequential from 1.
        ``duration_minutes`` is ``None`` for roughly 30% of rows.

        Raises:
            ValueError: if no participants have been generated yet.
        """
        if not global_participant_ids:
            raise ValueError("Participants must be generated before Engagements.")

        activity_kinds = ['login', 'course_view', 'quiz_attempt', 'video_watch', 'forum_post']
        rows = []
        for eid in range(1, num_engagements + 1):
            who = random.choice(global_participant_ids)
            kind = random.choice(activity_kinds)
            when = fake.date_time_between(start_date='-1y', end_date='now')

            # Decide first whether this activity carries a duration at all,
            # then draw the duration only if it does.
            if random.random() > 0.3:
                minutes = random.randint(1, 120)
            else:
                minutes = None

            rows.append({
                'engagement_id': eid,
                'participant_id': who,
                'activity_type': kind,
                'timestamp': when.isoformat(),
                'duration_minutes': minutes,
            })
        return rows

# --- Register Providers and Generate Data ---
# Attach each custom provider to the shared Faker instance so its method
# (fake.participants, fake.courses, ...) becomes callable on `fake`.
for provider_cls in (
    ParticipantsProvider,
    CourseProvider,
    EnrollmentProvider,
    FeedbackProvider,
    EngagementProvider,
):
    fake.add_provider(provider_cls)


def generate_all_data():
    """Generate every dataset and return them in a dict keyed by table name.

    Parent tables (participants, courses) are generated before the tables
    that reference their IDs (enrollments, then feedbacks and engagements),
    because the dependent providers raise ValueError when the ID lists they
    draw from are still empty.
    """
    datasets = {}

    # Parent entities first
    datasets['participants'] = fake.participants(num_participants=100)
    datasets['courses'] = fake.courses(num_courses=50)

    # Dependent entities, which pick from the IDs registered above
    datasets['enrollments'] = fake.enrollments(num_enrollments=200)
    datasets['feedbacks'] = fake.feedback(num_feedbacks=150)
    datasets['engagements'] = fake.engagement(num_engagements=500)

    return datasets

def save_to_csv(data, filename):
    """Write a sequence of row dicts to ``filename`` as a UTF-8 CSV file.

    The header is taken from the keys of the first row. When ``data`` is
    empty, a notice is printed and no file is created.
    """
    if not data:
        print(f"No data to save for {filename}. Skipping.")
        return

    # Column order follows the key order of the first row
    columns = list(data[0])
    with open(filename, mode='w', newline='', encoding='utf-8') as out_file:
        csv_writer = csv.DictWriter(out_file, fieldnames=columns)
        csv_writer.writeheader()
        csv_writer.writerows(data)

def main():
    """Generate all datasets and write each one to its own CSV in data/raw."""
    print("Generating data...")
    data_sets = generate_all_data()  # parents are generated before dependents

    import os  # local import, used only for the output paths below

    # Make sure the 'data/raw' output directory exists
    output_dir = 'data/raw'
    os.makedirs(output_dir, exist_ok=True)

    # One CSV per table, named after its key in data_sets
    for table in ('participants', 'courses', 'enrollments', 'feedbacks', 'engagements'):
        save_to_csv(data_sets[table], os.path.join(output_dir, f'{table}.csv'))

    print("Data generation complete. CSV files created in 'data/raw/' directory.")


# Run the generation only when executed as a script, not when imported
if __name__ == "__main__":
    main()

Data generation complete. CSV files created.
