In [None]:
# This code was created with the help of AI
from faker import Faker
import numpy as np
import pandas as pd

# Shared Faker generator; the cells below use it for names, emails,
# course-title sentences and dates.
fake = Faker()

In [None]:
# Pin both random sources so repeated top-to-bottom runs draw the same random
# sequences: Faker's generator and NumPy's legacy global RNG.
Faker.seed(42)
np.random.seed(42)

In [None]:
# Number of synthetic rows to generate; the per-row columns built in the
# cells below are all sized from this value.
n_records = 27_549

In [None]:
# Generate student data
# Each row gets an identifier of the form "ID" + 9 random digits, plus a
# Faker-generated name and email (the three values are drawn independently).
digit_pool = list("0123456789")
student_ids = [
    "ID" + "".join(np.random.choice(digit_pool, 9)) for _ in range(n_records)
]
student_names = [fake.name() for _ in range(n_records)]
student_emails = [fake.email() for _ in range(n_records)]

In [None]:
# Categories and tags
category_options = ["Academic", "Extracurricular", "Financial", "Residential", "Online"]
tag_options = ["Active", "Pending", "Completed", "Enrolled", ""]

# Each row gets 1-3 distinct categories, stored as one comma-separated string.
categories = []
for _ in range(n_records):
    how_many = np.random.randint(1, 4)  # upper bound is exclusive -> 1, 2 or 3
    picked = np.random.choice(category_options, how_many, replace=False)
    categories.append(", ".join(picked))

# Exactly one tag per row; the empty string is a fifth, equally likely option.
tags = [
    np.random.choice(tag_options, p=[0.2, 0.2, 0.2, 0.2, 0.2])
    for _ in range(n_records)
]

In [None]:
# Academic info
class_levels = ["Freshman", "Sophomore", "Junior", "Senior", "Graduate"]
major_options = [
    "Computer Science",
    "Business Administration",
    "Psychology",
    "Education",
    "Engineering",
]

# Uniform draws with replacement, one per row.
classifications = np.random.choice(class_levels, size=n_records)
majors = np.random.choice(major_options, size=n_records)

# GPA is drawn from a normal distribution (mean 3, std 0.5) and clamped to [0, 4].
gpas = np.random.normal(loc=3, scale=0.5, size=n_records).clip(0, 4)

In [None]:
# Course info
course_depts = ["MTH", "ENG", "HSC", "EDL"]

# 25 courses per dept, numbered 001-025 (e.g. "MTH-001").
course_numbers = [
    f"{dept}-{seq:03d}" for dept in course_depts for seq in range(1, 26)
]

# One unique three-word-ish sentence per course number, aligned by position.
course_titles = [fake.unique.sentence(nb_words=3) for _ in course_numbers]

# Pool of 5-digit section codes (one generated per row; sampled from later).
sections = [
    "".join(np.random.choice(list("0123456789"), 5)) for _ in range(n_records)
]


def _fake_instructor():
    """Return one instructor string: 'Last, First (ID#########) <email>'."""
    # Call order (last name, first name, digits, email) is kept fixed so the
    # seeded random streams are consumed in a stable sequence.
    last = fake.last_name()
    first = fake.first_name()
    id_digits = "".join(np.random.choice(list("0123456789"), 9))
    email = fake.email()
    return f"{last}, {first} (ID{id_digits}) <{email}>"


# Fixed pool of 150 instructors that rows are later sampled from.
instructors = [_fake_instructor() for _ in range(150)]

In [None]:
# Enrollment info
# "Yes" with probability 0.24, "No" with probability 0.76.
dropped = np.random.choice(["Yes", "No"], size=n_records, p=[0.24, 0.76])

# Only dropped rows get a date (within the past year up to today); others get None.
dropped_dates = [
    None if flag != "Yes" else fake.date_between(start_date="-1y", end_date="today")
    for flag in dropped
]

In [None]:
# Grades
# Midterm and final grades are drawn independently from the same weighted
# distribution over letter grades.
grade_letters = ["A", "B", "C", "D", "F", "P", "I"]
grade_weights = [0.3, 0.25, 0.2, 0.1, 0.1, 0.025, 0.025]

midterm_grades = [
    np.random.choice(grade_letters, p=grade_weights) for _ in range(n_records)
]
final_grades = [
    np.random.choice(grade_letters, p=grade_weights) for _ in range(n_records)
]

In [None]:
# Attendance
# Both counts are Poisson-distributed per row (rates 0.4 and 0.5 respectively).
total_progress_reports = np.random.poisson(lam=0.4, size=n_records)
absences = np.random.poisson(lam=0.5, size=n_records)

In [None]:
# Schedule
# Start dates are spread over roughly three months either side of today.
# The offsets are given in days: in Faker's relative-date strings a lowercase
# "m" means minutes (months are uppercase "M"), so the previous "-3m"/"+3m"
# did not produce a +/- 3 month window.
start_dates = [
    fake.date_between(start_date="-90d", end_date="+90d") for _ in range(n_records)
]

# Every course runs for 16 weeks from its start date.
end_dates = [start_date + pd.Timedelta(weeks=16) for start_date in start_dates]

# Classes start on the hour between 8 and 11 AM.
start_times = [f"{hour}:00 AM CT" for hour in np.random.choice(range(8, 12), n_records)]

# End hour is a base of 1-4 PM plus a random 1-3 hour offset (2-7 PM overall).
end_times = [
    f"{hour + np.random.choice([1, 2, 3])}:00 PM CT"
    for hour in np.random.choice(range(1, 5), n_records)
]

# One or two distinct meeting days per row ("R" is the code used for the
# day between "W" and "F"); randint's upper bound is exclusive.
class_days = [
    ", ".join(
        np.random.choice(
            ["M", "T", "W", "R", "F", "Sa"], size=np.random.randint(1, 3), replace=False
        )
    )
    for _ in range(n_records)
]

In [None]:
# Create DataFrame
# Draw each row's course exactly once and derive the title from that same
# draw, so "Course Number" and "Course Title" always describe the same course.
# (Sampling the two columns separately pairs numbers with unrelated titles.)
title_by_course = dict(zip(course_numbers, course_titles))
row_course_numbers = np.random.choice(course_numbers, n_records)

df = pd.DataFrame(
    {
        "Student ID": student_ids,
        "Student Name": student_names,
        "Student Email": student_emails,
        "Categories": categories,
        "Tags": tags,
        "Classification": classifications,
        "Major": majors,
        "Cumulative GPA": gpas,
        "Course Number": row_course_numbers,
        "Course Title": [title_by_course[num] for num in row_course_numbers],
        # Section and instructor are sampled independently from their pools.
        "Section": np.random.choice(sections, n_records),
        "Instructors": np.random.choice(instructors, n_records),
        "Dropped?": dropped,
        "Dropped Date": dropped_dates,
        "Midterm Grade": midterm_grades,
        "Final Grade": final_grades,
        "Total Progress Reports": total_progress_reports,
        "Absences": absences,
        "Start Date": start_dates,
        "End Date": end_dates,
        "Start Time": start_times,
        "End Time": end_times,
        "Class Days": class_days,
    }
)

In [None]:
# Show the assembled DataFrame as this cell's output.
df