In [None]:
# This code was mostly created by ChatGPT
from __future__ import annotations

import random

from faker import Faker
import numpy as np
import pandas as pd

# One shared Faker instance; the cells below use it for names, e-mails, course
# titles and dates.
fake = Faker()

In [None]:
# Seed all three random sources used in this notebook (NumPy's global RNG, Faker,
# and the stdlib `random` module) so repeated runs draw the same random values.
# Dates generated relative to "today" will still shift with the run date.
np.random.seed(42)
Faker.seed(42)
random.seed(42)

In [None]:
# Target number of rows (one row per student/course enrollment); the final frame
# is truncated to at most this many rows.
n_records = 85_253
# Only used to derive how many unique students to generate from `n_records`.
average_courses_per_student = 3.1

In [None]:
# Relative frequency of each "Classification" label. The literal string "None"
# stands for a missing classification and is turned into NaN once the student
# frame has been built.
classification_probs = {
    "Foo (Winter 2024)": 0.6141,
    "Foo (Fall 2023)": 0.2039,
    "Graduate (Winter 2024)": 0.0479,
    "None": 0.0403,
    "Foo (Spring 2024)": 0.0313,
    "Graduate (Fall 2023)": 0.0154,
    "Foo (Spring 2023)": 0.0126,
    "Foo (Summer 2023)": 0.0112,
    "Foo (Winter 2023)": 0.0057,
    "Foo (Fall 2022)": 0.0048,
    "Graduate (Spring 2023)": 0.0016,
    "Foo (Spring 2022)": 0.0014,
    "Graduate (Summer 2023)": 0.0014,
    "Graduate (Spring 2024)": 0.001,
    "Graduate (Winter 2023)": 0.0008,
    "Graduate (Fall 2022)": 0.0005,
    "Graduate (Summer 2022)": 0.0004,
    "Foo (Fall 2021)": 0.0004,
    "Foo (Spring 2020)": 0.0004,
    "Foo (Fall 2019)": 0.0003,
    "Foo (Summer 2024)": 0.0003,
    "Foo (Winter 2022)": 0.0003,
    "Foo (Summer 2022)": 0.0003,
    "Graduate (Fall 2018)": 0.0002,
    "Graduate (Winter 2022)": 0.0002,
    "Foo (Fall 2020)": 0.0002,
    "Foo (Winter 2019)": 0.0002,
    "Foo (Winter 2020)": 0.0002,
    "Graduate (Winter 2018)": 0.0002,
    "Graduate (Spring 2021)": 0.0002,
    "Foo (Winter 2017)": 0.0002,
    "Graduate (Winter 2021)": 0.0002,
    "Graduate (Summer 2021)": 0.0002,
    "Freshman (Winter 2024)": 0.0001,
    "Foo (Winter 2021)": 0.0001,
    "Foo (Spring 2016)": 0.0001,
    "Graduate (Spring 2017)": 0.0001,
    "Graduate (Summer 2020)": 0.0001,
    "Graduate (Spring 2020)": 0.0001,
    "Foo (Spring 2019)": 0.0001,
    "Graduate (Fall 2020)": 0.0001,
    "Graduate (Spring 2019)": 0.0001,
    "Graduate (Spring 2022)": 0.0001,
    "Foo (Summer 2021)": 0.0001,
    "Graduate (Fall 2021)": 0.0001,
    "Foo (Fall 2016)": 0.0001,
    "Foo (Summer 2019)": 0.0001,
    "Freshman (Fall 2022)": 0.0001,
    "Graduate (Summer 2024)": 0.0001,
    "Foo (Fall 2024)": 0.0001,
    "Foo (Spring 2021)": 0.0001,
}

# Ensure the probs sum to 1: the rounded values above are only approximately
# normalised, and `np.random.choice` requires `p` to sum to 1. The total is
# computed once up front instead of once per key.
classification_weight_total = sum(classification_probs.values())
classification_probs = {
    key: value / classification_weight_total
    for key, value in classification_probs.items()
}

In [None]:
# Calculate the number of unique students based on the average number of courses per
# student
n_unique_students = int(n_records / average_courses_per_student)

# Generate unique student data (one row per student). The literal string "None" is
# one of the classification labels; it is converted to a real missing value by the
# trailing `.replace`.
unique_students = pd.DataFrame(
    {
        "Student ID": [
            # The builtin `random.sample` from the Python standard lib can sample
            # without replacement directly from a `range` object, whereas
            # `np.random.choice(..., replace=False)` would first have to materialise
            # all 10**9 candidates in memory.
            f"ID{rand_id:09}"
            for rand_id in random.sample(range(10**9), n_unique_students)
        ],
        # `np.nan` is the canonical spelling; the `np.NaN` alias was removed in
        # NumPy 2.0 and raises AttributeError there.
        "Student Alternate ID": np.nan,
        "Student Name": [fake.name() for _ in range(n_unique_students)],
        "Student E-mail": [fake.email() for _ in range(n_unique_students)],
        "Classification": np.random.choice(
            list(classification_probs.keys()),
            n_unique_students,
            p=list(classification_probs.values()),
        ),
        # Majors are drawn uniformly (no `p` given).
        "Major": np.random.choice(
            [
                "Computer Science",
                "Business Administration",
                "Psychology",
                "Education",
                "Engineering",
            ],
            n_unique_students,
        ),
        # Normal around 3.0 (sd 0.5), clipped into the valid 0-4 GPA range.
        "Cumulative GPA": np.clip(np.random.normal(3, 0.5, n_unique_students), 0, 4),
    }
).replace("None", np.nan)

In [None]:
# Generate Categories
def select_categories() -> str | None:
    categories = []

    def _include_category(chance: float = 0.9) -> bool:
        """Randomly decide to include a category or not"""
        return np.random.rand() < chance

    campus_options = ["On-line", "Main Campus", "Satellite Campus"]
    graduated_options = [  # Mutually exclusive with certain other categories
        "Graduated: Yes",
        "Graduated: No",
    ]
    level_options = [
        "Undergraduate",
        "Graduate",
    ]
    hold_types = ["Hold: Financial", "Hold: Academic", "Hold: Administrative"]
    comp_rate_options = ["Completion Rate: >= 66.67%", "Completion Rate: < 66.67%"]
    start_term_options = [
        f"Start Term: {season} {year}"
        for year in range(2000, 2024)
        for season in ["Fall", "Winter", "Spring", "Summer"]
    ]
    term_status_options = [  # Mutually exclusive with "Graduated: Yes"
        "Term Status: Registered",
        "Term Status: Not Registered",
    ]
    fafsa_options = [  # Mutually exclusive with "Graduated: Yes"
        "FAFSA: Yes",
        "FAFSA: No",
    ]

    # Build up categories one-by-one, randomly deciding which will be included.
    if _include_category():
        categories.append(
            f"Campus: {np.random.choice(campus_options, p=[0.5, 0.4, 0.1])}"
        )
    if _include_category():
        categories.append(np.random.choice(graduated_options, p=[0.1, 0.9]))
    if _include_category():
        categories.append(np.random.choice(level_options))
    if _include_category(chance=0.10):  # Only 10% chance of showing a Hold
        categories.append(np.random.choice(hold_types))
    if _include_category():
        categories.append(np.random.choice(comp_rate_options, p=[0.75, 0.25]))
    if _include_category():
        total_weight = sum(range(1, len(start_term_options) + 1))
        start_term_weights = [
            i / total_weight for i in range(1, len(start_term_options) + 1)
        ]
        categories.append(np.random.choice(start_term_options, p=start_term_weights))

    if "Graduated: Yes" not in categories:
        # We can only be here if you do NOT show "Graduated: Yes"
        if _include_category():
            categories.append(np.random.choice(term_status_options))
        if _include_category():
            categories.append(np.random.choice(fafsa_options))

    return ", ".join(categories) if categories else None


# One independent random category string (or None) per student row.
unique_students["Categories"] = [select_categories() for _ in unique_students.index]

In [None]:
def select_tags() -> str | None:
    tags = []

    # General tags
    general_tags = [
        "Honor Student",
        "Scholarship Recipient",
        "At Risk",
        "Needs Tutoring",
        "Athlete",
        "International",
        "Transfer",
    ]
    # Tuples of mutually exclusive tags
    mutually_exclusive_tags = [("Part-Time", "Full-Time")]

    max_tags = len(general_tags) + len(mutually_exclusive_tags)

    def _include_tag(chance: float = 1 - 0.675 ** (1 / max_tags)) -> bool:
        """Randomly decide to include a tag or not.
        We want a 67.5% chance that a student has no tags whatsoever.
        With 8 possible tags, a little algebra reveals that the default chance must be
        `1 - 0.675 ** (1 / num_possible_tags)`."""
        return np.random.rand() < chance

    for tag in general_tags:
        if _include_tag():
            tags.append(tag)

    for tuple_of_mutually_exclusive_tags in mutually_exclusive_tags:
        if _include_tag():
            tags.append(np.random.choice(tuple_of_mutually_exclusive_tags))

    return ", ".join(tags) if tags else None


# One independent random tag string (or None) per student row.
unique_students["Tags"] = [select_tags() for _ in unique_students.index]

In [None]:
# Probability that a student is enrolled in exactly k courses (k = 1..17); adjust
# these weights to better fit your own data. One course count is then drawn per
# unique student.
num_courses_per_student_distribution = {
    1: 0.2114755722921415,
    2: 0.21167376870478644,
    3: 0.2546823902487365,
    4: 0.1274402933306907,
    5: 0.09315231394311763,
    6: 0.05054008522445744,
    7: 0.020909721534040235,
    8: 0.008621543950054504,
    9: 0.005450401347735606,
    10: 0.005648597760380537,
    11: 0.0016846695074819145,
    12: 0.004657615697155881,
    13: 0.0027747497770290357,
    14: 0.0004954910316123278,
    15: 9.909820632246556e-05,
    16: 0.0004954910316123278,
    17: 0.00019819641264493112,
}
num_courses_per_student = np.random.choice(
    a=list(num_courses_per_student_distribution),
    size=n_unique_students,
    p=list(num_courses_per_student_distribution.values()),
)

In [None]:
# Replicate each student entry based on the number of courses they're taking:
# `index.repeat` repeats every row label by that student's course count, `.loc`
# materialises the repeated rows, and the index is then renumbered from 0 so each
# enrollment row gets its own label.
replicated_students = unique_students.loc[
    unique_students.index.repeat(num_courses_per_student)
].reset_index(drop=True)

In [None]:
# Course info
# Generate course-specific information for each enrollment
course_depts = ["MTH", "ENG", "HSC", "EDL"]
course_numbers = [
    f"{dept}-{str(i).zfill(3)}" for dept in course_depts for i in range(1, 101)
]  # Assuming 100 courses per department
course_titles = [
    fake.unique.sentence(nb_words=3) for _ in range(len(course_numbers))
]  # Unique course titles
# Pair every course number with its title once, so the per-row title lookup below
# is a constant-time dict access instead of a linear `list.index` scan per row.
course_title_by_number = dict(zip(course_numbers, course_titles))

# Each enrollment row gets a uniformly random course; the name always follows
# from the number, so a given course number maps to the same title on every row.
replicated_students["Course Number"] = np.random.choice(
    course_numbers, len(replicated_students)
)
replicated_students["Course Name"] = [
    course_title_by_number[cn] for cn in replicated_students["Course Number"]
]
# Section: five random digits, drawn independently for every row.
replicated_students["Section"] = [
    "".join(np.random.choice(list("0123456789"), 5))
    for _ in range(len(replicated_students))
]
# Instructor string of the form "Last, First (ID#########) <e-mail>", generated
# independently for every row.
replicated_students["Instructors"] = [
    (
        f"{fake.last_name()},"
        f" {fake.first_name()} (ID{''.join(np.random.choice(list('0123456789'), 9))})"
        f" <{fake.email()}>"
    )
    for _ in range(len(replicated_students))
]

In [None]:
# Enrollment info: roughly 24% of enrollments are flagged as dropped, and only
# those rows receive a drop date (some day within the last year, up to today).
replicated_students["Dropped?"] = np.random.choice(
    a=["Yes", "No"], size=len(replicated_students), p=[0.24, 0.76]
)
dropped_dates = [
    None if drop != "Yes" else fake.date_between(start_date="-1y", end_date="today")
    for drop in replicated_students["Dropped?"]
]
replicated_students["Dropped Date"] = dropped_dates

In [None]:
# Per-enrollment outcomes. Midterm and final grades are drawn uniformly and
# independently of each other; the two count columns are Poisson-distributed
# with means 0.4 and 0.5.
replicated_students["Midterm Grade"] = np.random.choice(
    list("ABCDF"), size=len(replicated_students)
)
replicated_students["Final Grade"] = np.random.choice(
    list("ABCDF"), size=len(replicated_students)
)
replicated_students["Total Progress Reports"] = np.random.poisson(
    lam=0.4, size=len(replicated_students)
)
replicated_students["Absences"] = np.random.poisson(
    lam=0.5, size=len(replicated_students)
)

In [None]:
# Schedule information
# Start dates fall within roughly three months either side of today. Faker's
# relative offsets are case-sensitive: a lowercase "m" means minutes (months are
# "M"), so "-3m"/"+3m" would not span months. Day offsets are used to keep the
# intent unambiguous.
replicated_students["Start Date"] = [
    fake.date_between(start_date="-90d", end_date="+90d")
    for _ in range(len(replicated_students))
]
# Every course runs for 16 weeks.
replicated_students["End Date"] = [
    sd + pd.Timedelta(weeks=16) for sd in replicated_students["Start Date"]
]
# Start on the hour between 8 and 11 AM.
replicated_students["Start Time"] = [
    f"{hour}:00 AM CT"
    for hour in np.random.choice(range(8, 12), len(replicated_students))
]
# End on the hour: a base hour of 1-4 plus 1-3 more, i.e. between 2 and 7 PM.
replicated_students["End Time"] = [
    f"{hour + np.random.choice([1, 2, 3])}:00 PM CT"
    for hour in np.random.choice(range(1, 5), len(replicated_students))
]
# One or two distinct class days per row (`randint`'s upper bound is exclusive),
# joined in the order they were drawn.
replicated_students["Class Days"] = [
    ", ".join(
        np.random.choice(
            ["M", "T", "W", "R", "F", "Sa"], size=np.random.randint(1, 3), replace=False
        )
    )
    for _ in range(len(replicated_students))
]

In [None]:
# Build the final frame: keep at most the first `n_records` enrollment rows...
df_unordered_columns = replicated_students.head(n_records)

# ...and select the columns in the expected output order. Commented-out names
# are columns that are not generated by this notebook.
df = df_unordered_columns.loc[
    :,
    [
        "Student Name",
        "Student E-mail",
        "Student ID",
        "Student Alternate ID",
        "Categories",
        "Tags",
        "Classification",
        "Major",
        "Cumulative GPA",
        # "Assigned Staff",
        "Course Name",
        "Course Number",
        "Section",
        "Instructors",
        "Dropped?",
        "Dropped Date",
        "Midterm Grade",
        "Final Grade",
        "Total Progress Reports",
        "Absences",
        # "Unexcused Absences",
        # "Excused Absences",
        # "Credit Hours",
        "Start Date",
        "End Date",
        "Start Time",
        "End Time",
        "Class Days",
    ],
]

In [None]:
# Show the assembled synthetic dataset as this cell's output.
df