In [29]:
# Use the %pip magic (not !pip) so the package is installed into the
# environment of the kernel that is running this notebook.
%pip install Faker



In [3]:
from pymongo import MongoClient
from datetime import datetime, timedelta
import os
import json
import random
import time
import pandas as pd
from faker import Faker 
fake = Faker()


In [30]:
# Connect to MongoDB
client = MongoClient('mongodb://localhost:27017/')
db = client['eduhub_db']

# MongoClient connects lazily: the constructor returns immediately even when
# no server is reachable. Issue a cheap round trip so the success message
# below is only printed once the server has actually answered; an unreachable
# server raises here instead of failing later in an unrelated cell.
client.admin.command('ping')

print("Connected to MongoDB successfully!")


Connected to MongoDB successfully!


### Creating Collections

### Adding Validations for Collections

In [31]:
# Start from a clean slate: drop every collection this notebook (re)creates.
# ------------------------------
managed_collection_names = ('Users', 'Courses', 'Enrollments', 'Lessons', 'Assignments', 'Submissions')
for collection in managed_collection_names:
    stale_collection = db[collection]
    stale_collection.drop()

# ------------------------------
# Users Collection
# ------------------------------
# JSON Schema for documents in 'Users'. Every field in 'required' must be
# present; 'profile' and its sub-fields are optional but type-checked when given.
users_validator = {
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['userId', 'email', 'firstName', 'lastName', 'role', 'dateJoined', 'isActive'],
        'properties': {
            'userId': {'bsonType': 'string'},
            'email': {'bsonType': 'string'},
            'firstName': {'bsonType': 'string'},
            'lastName': {'bsonType': 'string'},
            'role': {'enum': ['student', 'instructor']},
            'dateJoined': {'bsonType': 'date'},
            'isActive': {'bsonType': 'bool'},
            'profile': {
                'bsonType': 'object',
                'properties': {
                    'bio': {'bsonType': 'string'},
                    'avatar': {'bsonType': 'string'},
                    'skills': {'bsonType': 'array', 'items': {'bsonType': 'string'}}
                }
            }
        }
    }
}
# Pass the validator as part of the create command so the collection never
# exists without its schema (no separate collMod round trip is needed).
db.create_collection('Users', validator=users_validator, validationLevel='moderate')
print("Users collection created with validation")

# ------------------------------
# Courses Collection
# ------------------------------
# JSON Schema for documents in 'Courses'. 'level' is restricted to three
# values; 'duration' and 'price' are optional non-negative numbers.
courses_validator = {
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['courseId', 'title', 'instructorId', 'level', 'isPublished', 'createdAt'],
        'properties': {
            'courseId': {'bsonType': 'string'},
            'title': {'bsonType': 'string'},
            'description': {'bsonType': 'string'},
            'instructorId': {'bsonType': 'string'},
            'category': {'bsonType': 'string'},
            'level': {'enum': ['beginner', 'intermediate', 'advanced']},
            'duration': {'bsonType': 'number', 'minimum': 0},
            'price': {'bsonType': 'number', 'minimum': 0},
            'tags': {'bsonType': 'array', 'items': {'bsonType': 'string'}},
            'createdAt': {'bsonType': 'date'},
            'updatedAt': {'bsonType': 'date'},
            'isPublished': {'bsonType': 'bool'}
        }
    }
}
# Pass the validator as part of the create command so the collection never
# exists without its schema (no separate collMod round trip is needed).
db.create_collection('Courses', validator=courses_validator, validationLevel='moderate')
print("Courses collection created with validation")

# ------------------------------
# Enrollments Collection
# ------------------------------
# JSON Schema for documents in 'Enrollments'. 'progress' is an optional
# number in [0, 100]; 'completedAt' may be a date or an explicit null.
enrollments_validator = {
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['enrollmentId', 'studentId', 'courseId', 'enrolledAt', 'status'],
        'properties': {
            'enrollmentId': {'bsonType': 'string'},
            'studentId': {'bsonType': 'string'},
            'courseId': {'bsonType': 'string'},
            'enrolledAt': {'bsonType': 'date'},
            'status': {'enum': ['active', 'completed', 'dropped']},
            'progress': {'bsonType': 'number', 'minimum': 0, 'maximum': 100},
            'completedAt': {'bsonType': ['date', 'null']},
            'lastAccessedAt': {'bsonType': 'date'}
        }
    }
}
# Pass the validator as part of the create command so the collection never
# exists without its schema (no separate collMod round trip is needed).
db.create_collection('Enrollments', validator=enrollments_validator, validationLevel='moderate')
print("Enrollments collection created with validation")

# ------------------------------
# Lessons Collection
# ------------------------------
# JSON Schema for documents in 'Lessons'. 'order' must be a number >= 1;
# 'resources' is an optional array of strings.
lessons_validator = {
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['lessonId', 'courseId', 'title', 'content', 'order'],
        'properties': {
            'lessonId': {'bsonType': 'string'},
            'courseId': {'bsonType': 'string'},
            'title': {'bsonType': 'string'},
            'content': {'bsonType': 'string'},
            'order': {'bsonType': 'number', 'minimum': 1},
            'resources': {'bsonType': 'array', 'items': {'bsonType': 'string'}},
            'createdAt': {'bsonType': 'date'},
            'updatedAt': {'bsonType': 'date'}
        }
    }
}
# Pass the validator as part of the create command so the collection never
# exists without its schema (no separate collMod round trip is needed).
db.create_collection('Lessons', validator=lessons_validator, validationLevel='moderate')
print("Lessons collection created with validation")

# ------------------------------
# Assignments Collection
# ------------------------------
# JSON Schema for documents in 'Assignments'. 'maxScore' is an optional
# non-negative number; the timestamps other than 'dueDate' are optional.
assignments_validator = {
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['assignmentId', 'courseId', 'title', 'description', 'dueDate'],
        'properties': {
            'assignmentId': {'bsonType': 'string'},
            'courseId': {'bsonType': 'string'},
            'title': {'bsonType': 'string'},
            'description': {'bsonType': 'string'},
            'dueDate': {'bsonType': 'date'},
            'createdAt': {'bsonType': 'date'},
            'updatedAt': {'bsonType': 'date'},
            'maxScore': {'bsonType': 'number', 'minimum': 0}
        }
    }
}
# Pass the validator as part of the create command so the collection never
# exists without its schema (no separate collMod round trip is needed).
db.create_collection('Assignments', validator=assignments_validator, validationLevel='moderate')
print("Assignments collection created with validation")

# ------------------------------
# Submissions Collection
# ------------------------------
# JSON Schema for documents in 'Submissions'. 'grade' and 'gradedAt' accept an
# explicit null so ungraded submissions can carry the fields; 'status' is
# optional but restricted to three values when present.
submissions_validator = {
    '$jsonSchema': {
        'bsonType': 'object',
        'required': ['submissionId', 'assignmentId', 'studentId', 'submittedAt'],
        'properties': {
            'submissionId': {'bsonType': 'string'},
            'assignmentId': {'bsonType': 'string'},
            'studentId': {'bsonType': 'string'},
            'courseId': {'bsonType': 'string'},
            'submittedAt': {'bsonType': 'date'},
            'content': {'bsonType': 'string'},
            'fileUrl': {'bsonType': 'string'},
            'grade': {'bsonType': ['number', 'null'], 'minimum': 0},
            'feedback': {'bsonType': 'string'},
            'gradedAt': {'bsonType': ['date', 'null']},
            'gradedBy': {'bsonType': 'string'},
            'status': {'enum': ['submitted', 'graded', 'returned']}
        }
    }
}
# Pass the validator as part of the create command so the collection never
# exists without its schema (no separate collMod round trip is needed).
db.create_collection('Submissions', validator=submissions_validator, validationLevel='moderate')
print("Submissions collection created with validation")

print("\nAll collections created with validations successfully!")


Users collection created with validation
Courses collection created with validation
Enrollments collection created with validation
Lessons collection created with validation
Assignments collection created with validation
Submissions collection created with validation

All collections created with validations successfully!


### Clearing Existing Collections to Avoid Duplicates

In [32]:
# Empty every collection before inserting fresh sample data.
# Collection names are case-sensitive in MongoDB, so these must match the
# capitalised names the collections were created with ('Users', not 'users');
# the lowercase names would address different, non-existent collections.
for collection_name in ('Users', 'Courses', 'Enrollments', 'Lessons', 'Assignments', 'Submissions'):
    db[collection_name].delete_many({})

print("All collections cleared!")

All collections cleared!


### Inserting Users and Courses

In [33]:

# ------------------------------
# Instructor and student data
# ------------------------------
instructor_skills = ['Python', 'AI', 'Web Dev', 'Data Analysis', 'Cloud']
instructor_bios = [
    "Passionate about teaching Python and building scalable systems.",
    "Expert in AI and Data Science with 10+ years of experience.",
    "Focused on modern web development with React and Node.js.",
    "Loves mentoring students in cloud technologies and DevOps.",
    "Dedicated to simplifying Machine Learning concepts for everyone."
]

student_bios = [
    "Aspiring data analyst turning numbers into insights.",
    "Interested in web development and coding.",
    "Motivated learner exploring AI and machine learning.",
    "Excited about solving problems with Python and SQL.",
    "Building foundations in statistics and data visualization."
]


def _build_user(user_id, role, bios, joined_start, joined_end, min_skills, max_skills):
    """Return one synthetic user document shaped for the 'Users' collection.

    Args:
        user_id: Value stored in 'userId'.
        role: 'student' or 'instructor' (the values the Users validator allows).
        bios: Pool of strings from which 'profile.bio' is drawn.
        joined_start, joined_end: Bounds handed to Faker's date_time_between
            for 'dateJoined' (e.g. '-2y', 'now').
        min_skills, max_skills: Inclusive bounds on how many entries of
            instructor_skills are sampled into 'profile.skills'.
    """
    return {
        'userId': user_id,
        'email': fake.email(),
        'firstName': fake.first_name(),
        'lastName': fake.last_name(),
        'role': role,
        'dateJoined': fake.date_time_between(start_date=joined_start, end_date=joined_end),
        'profile': {
            'bio': random.choice(bios),
            'avatar': fake.image_url(),
            'skills': random.sample(instructor_skills, k=random.randint(min_skills, max_skills))
        },
        'isActive': random.choice([True, False])
    }


# ------------------------------
# Generate Users
# ------------------------------
users_data = []
instructors_ids = []

# Instructors: INST001..INST005, joined between two years and one year ago.
for i in range(1, 6):
    instructor_id = f'INST{i:03d}'
    instructors_ids.append(instructor_id)
    users_data.append(
        _build_user(instructor_id, 'instructor', instructor_bios, '-2y', '-1y', 3, 5)
    )

# Students: STU001..STU015, joined within the last year; may have no skills yet.
for i in range(1, 16):
    users_data.append(
        _build_user(f'STU{i:03d}', 'student', student_bios, '-1y', 'now', 0, 3)
    )

# Insert into MongoDB
db.Users.insert_many(users_data)
print(f"Inserted {len(users_data)} users into 'Users' collection")
print("Instructor IDs:", instructors_ids)

# Save users to JSON. The output directory may not exist on a fresh checkout,
# so create it first. default=str stringifies values json cannot encode
# natively (the datetimes, and the '_id' that insert_many adds to each dict).
os.makedirs("./data", exist_ok=True)
with open("./data/users.json", "w") as f:
    json.dump(users_data, f, indent=4, default=str)

# ------------------------------
# Course content
# ------------------------------
# For each category, 'titles' and 'descriptions' are parallel lists:
# descriptions[k] describes titles[k].
category_content = {
    "Data Science": {
        "titles": [
            "Introduction to Data Science",
            "Python for Data Analysis",
            "Machine Learning for Beginners"
        ],
        "descriptions": [
            "Learn the fundamentals of data science, from cleaning datasets to building predictive models.",
            "Master Python libraries such as Pandas, NumPy, and Matplotlib for real-world data tasks.",
            "Understand supervised and unsupervised learning with practical machine learning examples."
        ]
    },
    "Web Development": {
        "titles": [
            "Frontend Development with React",
            "Backend Development with Node.js",
            "Full-Stack Web Development"
        ],
        "descriptions": [
            "Build responsive user interfaces using React and modern JavaScript tools.",
            "Learn backend principles, APIs, and server management with Node.js and Express.",
            "Master full-stack skills by building complete, scalable web applications."
        ]
    },
    "Cloud Computing": {
        "titles": [
            "Cloud Computing with AWS",
            "DevOps Fundamentals",
            "Serverless Applications on Cloud"
        ],
        "descriptions": [
            "Understand cloud architecture and work hands-on with AWS services.",
            "Learn automation, CI/CD pipelines, and containerization for efficient deployments.",
            "Explore serverless computing and how to build applications without managing servers."
        ]
    }
}

levels = ["beginner", "intermediate", "advanced"]

# ------------------------------
# Generate Courses
# ------------------------------
courses = []
for i in range(1, 9):
    category = random.choice(list(category_content.keys()))
    content = category_content[category]

    # Pick ONE index and use it for both lists so the description always
    # belongs to the chosen title (independent choices would mismatch them).
    topic_index = random.randrange(len(content["titles"]))
    now = datetime.now()

    courses.append({
        "courseId": f"CRS{i:03d}",
        "title": content["titles"][topic_index],
        "description": content["descriptions"][topic_index],
        "instructorId": random.choice(instructors_ids),
        "category": category,
        "level": random.choice(levels),
        "duration": random.randint(5, 100),  # hours
        "price": round(random.uniform(10, 200), 2),
        "tags": random.sample(instructor_skills, k=random.randint(1, 3)),  # skills as tags
        "createdAt": now - timedelta(days=random.randint(1, 365)),
        "updatedAt": now,
        "isPublished": random.choice([True, False])
    })

# Insert courses into MongoDB
db.Courses.insert_many(courses)
print(f"Inserted {len(courses)} courses into 'Courses' collection")

# Save courses to JSON. Create the output directory first so a fresh checkout
# does not fail with FileNotFoundError.
os.makedirs("./data", exist_ok=True)
with open("./data/courses.json", "w") as f:
    json.dump(courses, f, indent=4, default=str)


# ------------------------------
# Collect IDs
# ------------------------------
# Gather the userId of every student and the courseId of every course so
# later cells can reference them when generating related documents.
students_ids = []
for user_doc in users_data:
    if user_doc["role"] == "student":
        students_ids.append(user_doc["userId"])

course_ids = []
for course_doc in courses:
    course_ids.append(course_doc["courseId"])

print("Collected student and course IDs successfully")

Inserted 20 users into 'Users' collection
Instructor IDs: ['INST001', 'INST002', 'INST003', 'INST004', 'INST005']
Inserted 8 courses into 'Courses' collection
Collected student and course IDs successfully


In [34]:
# ------------------------------
# Enrollment Generation
# ------------------------------
statuses = ["active", "completed", "dropped"]
enrollments = []

# Draw distinct (student, course) pairs so the same student is never enrolled
# in the same course twice. Up to 15 enrollments are generated.
all_pairs = [(student_id, course_id) for student_id in students_ids for course_id in course_ids]
enrollment_pairs = random.sample(all_pairs, k=min(15, len(all_pairs)))

for i, (student_id, course_id) in enumerate(enrollment_pairs, start=1):
    status_value = random.choice(statuses)
    enrolled_at = fake.date_time_between(start_date="-1y", end_date="now")
    now = datetime.now()

    if status_value == "completed":
        # A completed enrollment is fully progressed, and its completion time
        # must fall between the enrollment time and now (never before enrolling).
        progress = 100
        completed_at = enrolled_at + (now - enrolled_at) * random.random()
    else:
        # Active/dropped enrollments have not reached 100%.
        progress = random.randint(0, 99)
        completed_at = None

    enrollment = {
        "enrollmentId": f"ENR{i:03d}",
        "studentId": student_id,
        "courseId": course_id,
        "enrolledAt": enrolled_at,
        "status": status_value,
        "progress": progress,
        "completedAt": completed_at,
        "lastAccessedAt": now
    }
    enrollments.append(enrollment)

# ------------------------------
# Insert into MongoDB
# ------------------------------
db.Enrollments.insert_many(enrollments)
print(f"Inserted {len(enrollments)} enrollments into 'Enrollments' collection")

# ------------------------------
# Save to JSON
# ------------------------------
# Create the output directory first so a fresh checkout does not fail.
os.makedirs("./data", exist_ok=True)
with open("./data/enrollments.json", "w") as f:
    json.dump(enrollments, f, indent=4, default=str)


Inserted 15 enrollments into 'Enrollments' collection


### Inserting Lessons

In [37]:
# Inserting 25 Lessons (shorter predefined lists)
# lesson_titles and lesson_contents are parallel: lesson_contents[k] is the
# summary that belongs to lesson_titles[k].
lesson_titles = [
    "Intro to Programming",
    "Data Structures",
    "Web Development Basics",
    "Machine Learning Intro",
    "SQL Fundamentals",
    "Power BI Dashboards",
    "APIs and Integration",
    "Data Cleaning",
    "Python Functions",
    "Final Project Overview"
]

lesson_contents = [
    "Covers key programming concepts and examples.",
    "Learn how to store and manage data efficiently.",
    "Hands-on session building a basic web app.",
    "Introduction to ML principles and workflows.",
    "Query and manage structured data using SQL.",
    "Create visuals and dashboards using Power BI.",
    "Connect external data using APIs.",
    "Clean and prepare datasets for analysis.",
    "Write efficient and reusable Python code.",
    "Wrap-up and final course project session."
]

# Pair each title with its own content so a lesson never gets the summary of
# a different topic.
lesson_topics = list(zip(lesson_titles, lesson_contents))

# courseId -> position the next lesson of that course should take, so 'order'
# counts 1, 2, 3, ... within each course instead of across all lessons.
next_order_by_course = {}

lessons = []
for i in range(25):
    lesson_course_id = random.choice(course_ids)
    lesson_title, lesson_content = random.choice(lesson_topics)

    lesson_order = next_order_by_course.get(lesson_course_id, 1)
    next_order_by_course[lesson_course_id] = lesson_order + 1

    now = datetime.now()
    lessons.append({
        "lessonId": f"LES{i+1:03d}",
        "courseId": lesson_course_id,
        "title": lesson_title,
        "content": lesson_content,
        "order": lesson_order,
        "resources": [fake.url() for _ in range(random.randint(0, 2))],
        "createdAt": now - timedelta(days=random.randint(1, 100)),
        "updatedAt": now
    })

# Insert into MongoDB
db.Lessons.insert_many(lessons)
print(f"{len(lessons)} lessons inserted.")

# Save to JSON. Create the output directory first so a fresh checkout does
# not fail with FileNotFoundError.
os.makedirs("./data", exist_ok=True)
with open("./data/lessons.json", "w") as f:
    json.dump(lessons, f, indent=4, default=str)


25 lessons inserted.


### Inserting Assignments

In [35]:
# Generate 10 assignments, each attached to a randomly chosen course.
assignments = []
descriptions = [
    "Analyze dataset and share insights.", "Automate task using Python.",
    "Build Power BI dashboard.", "Clean and preprocess dataset.",
    "Write report on market findings.", "Develop predictive model.",
    "Prepare summary presentation.", "Conduct statistical analysis.",
    "Visualize data trends.", "Implement problem-solving algorithm."
]

for i in range(10):
    # One clock reading per document keeps its three timestamps consistent.
    now = datetime.now()
    assignments.append({
        "assignmentId": f"ASM{i+1:03d}",
        "courseId": random.choice(course_ids),
        "title": f"Assignment {i+1}",
        "description": random.choice(descriptions),
        "dueDate": now + timedelta(days=random.randint(5, 30)),    # 5-30 days ahead
        "createdAt": now - timedelta(days=random.randint(1, 30)),  # 1-30 days ago
        "updatedAt": now,
        "maxScore": 100
    })

db.Assignments.insert_many(assignments)
print(f"{len(assignments)} assignments inserted.")

# Create the output directory first so a fresh checkout does not fail with
# FileNotFoundError when the JSON snapshot is written.
os.makedirs("./data", exist_ok=True)
with open("./data/assignments.json", "w") as f:
    json.dump(assignments, f, indent=4, default=str)


# IDs of the generated assignments, in insertion order.
assignment_ids = [a["assignmentId"] for a in assignments]

10 assignments inserted.


### Inserting Submissions

In [36]:
# Generate 12 submissions against the assignments created in the assignments cell.
submissions = []
contents = [
    "Completed with explanations.", "Attached solution file.",
    "Submitted project report.", "Code and documentation.",
    "Final essay version.", "Uploaded analysis charts.",
    "Included all reasoning.", "Attached case study.",
    "Solution with references.", "Report and summary attached."
]

statuses = ["graded", "submitted", "returned"]

reviewer_feedback = [
    "Excellent work.", "Needs clarity.", "Recheck details.",
    "Great effort.", "Work incomplete."
]

for i in range(12):
    s = random.choice(statuses)
    # 'graded' and 'returned' submissions have been looked at by an instructor;
    # only 'graded' ones carry a numeric grade.
    reviewed = s in ("graded", "returned")

    # Choose the whole assignment document (not just its id) so the submission
    # records the course that assignment actually belongs to.
    assignment = random.choice(assignments)

    now = datetime.now()
    submitted_at = now - timedelta(days=random.randint(1, 10))
    # Grading can only happen after submission: pick a moment between the
    # submission time and now.
    graded_at = submitted_at + (now - submitted_at) * random.random() if reviewed else None

    grade = random.randint(50, 100) if s == "graded" else None
    feedback = random.choice(reviewer_feedback) if reviewed else "Pending review."
    graded_by = random.choice(instructors_ids) if reviewed else ""

    submissions.append({
        "submissionId": f"SUB{i+1:03d}",
        "assignmentId": assignment["assignmentId"],
        "studentId": random.choice(students_ids),
        "courseId": assignment["courseId"],
        "submittedAt": submitted_at,
        "content": random.choice(contents),
        "fileUrl": fake.url(),
        "grade": grade,
        "feedback": feedback,
        "gradedAt": graded_at,
        "gradedBy": graded_by,
        "status": s
    })

# Insert submissions
db.Submissions.insert_many(submissions)
print(f"{len(submissions)} submissions inserted.")

# Save submissions to JSON file. Create the output directory first so a fresh
# checkout does not fail with FileNotFoundError.
os.makedirs("./data", exist_ok=True)
with open("./data/submissions.json", "w") as f:
    json.dump(submissions, f, indent=4, default=str)


12 submissions inserted.
