In [1]:
import pandas as pd
import numpy as np
import random
from datetime import datetime

# Seed both the stdlib and NumPy generators so repeated runs produce the
# same dataset.
random.seed(42)
np.random.seed(42)

# How many students to generate for each two-digit year prefix of the
# student number.
student_distribution = {
    '25': 400,
    '24': 380,
    '23': 250,
    '22': 220,
    '21': 80,
}

# Degree programmes a student can be enrolled in.
courses = [
    "Business Intelligence & Analytics (BSBA‑BIA)",
    "Business Solutions & Applications (BSBA‑BSAA)",
    "Human Resource Management (BSBA‑HRM)",
    "Information Systems (BS‑IS)",
    "Cybersecurity (BSCSEC)",
    "Interactive Entertainment & Multimedia Computing (BS‑IEMC)",
]

# Municipalities in NCR plus nearby areas.
municipalities = [
    "Manila",
    "Quezon City",
    "Makati",
    "Taguig",
    "Pasig",
    "Mandaluyong",
    "Caloocan",
    "San Juan",
    "Marikina",
    "Las Piñas",
    "Muntinlupa",
    "Parañaque",
    "Pasay",
    "Valenzuela",
    "Navotas",
    "Malabon",
    "Antipolo",
    "San Mateo",
    "Rodriguez",
    "Cainta",
    "Bacoor",
    "Imus",
]

# Nationality -> relative sampling weight. The mapping keeps each option
# next to its weight; the two parallel lists are derived from it in
# insertion order.
_nationality_shares = {
    'Filipino': 95,
    'Chinese': 1,
    'Korean': 1,
    'Japanese': 1,
    'Vietnamese': 1,
    'American': 1,
}
nationality_options = list(_nationality_shares)
nationality_weights = list(_nationality_shares.values())

# Civil status -> relative sampling weight.
_civil_status_shares = {
    'Single': 97,
    'Married': 3,
}
civil_status_options = list(_civil_status_shares)
civil_status_weights = list(_civil_status_shares.values())

# Possible academic probation labels.
probation_options = ['advising', 'monitoring', 'restrictions']

# GWA value -> sampling weight.
_gwa_shares = {
    4.0: 0.25,
    3.5: 0.25,
    3.0: 0.3,
    2.0: 0.15,
    1.0: 0.05,
}
gwa_choices = list(_gwa_shares)
gwa_weights = list(_gwa_shares.values())

# Registry of student numbers already handed out, used to keep them unique.
used_student_numbers = set()

def generate_unique_student_number(prefix):
    """Return a student number of the form ``"<prefix>-<NNNN>"`` not issued before.

    The four-digit suffix is drawn uniformly from 1000-9999 and redrawn until
    the resulting number is absent from the module-level
    ``used_student_numbers`` set. The returned number is added to that set.

    Args:
        prefix: Value placed before the hyphen (the year prefix in this script).

    Returns:
        The newly issued student number string.

    Raises:
        RuntimeError: If every suffix from 1000 to 9999 is already taken for
            ``prefix``; without this check the retry loop would never end.
    """
    low, high = 1000, 9999
    capacity = high - low + 1

    # Exhaustion guard. A prefix can only be full once the registry holds at
    # least `capacity` entries, so the exact (9000-lookup) scan is skipped
    # entirely in the common case.
    if len(used_student_numbers) >= capacity and all(
        f"{prefix}-{n}" in used_student_numbers for n in range(low, high + 1)
    ):
        raise RuntimeError(
            f"No unused student numbers remain for prefix {prefix!r}"
        )

    while True:
        suffix = random.randint(low, high)
        student_number = f"{prefix}-{suffix}"
        if student_number not in used_student_numbers:
            used_student_numbers.add(student_number)
            return student_number

# Data collection: one dict per generated student, in generation order.
students = []

# Running count of students assigned an outlier (pre-2000) birth year.
# Tracking it directly avoids rescanning `students` and re-parsing every
# birthdate string on each iteration.
max_older_students = 5
older_student_count = 0

for year_prefix, count in student_distribution.items():
    for _ in range(count):
        student_number = generate_unique_student_number(year_prefix)
        course = random.choice(courses)

        # Outlier: at most `max_older_students` students get a birth year in
        # 1990–1995, each chosen with a 1% chance. The cap is tested first so
        # that no random draw is consumed once the cap has been reached.
        if older_student_count < max_older_students and random.random() < 0.01:
            birth_year = random.randint(1990, 1995)
            older_student_count += 1
        else:
            birth_year = random.randint(2000, 2007)
        # Day is limited to 1–28 so the date is valid for every month.
        birthdate = datetime(birth_year, random.randint(1, 12), random.randint(1, 28))

        nationality = random.choices(nationality_options, weights=nationality_weights, k=1)[0]
        civil_status = random.choices(civil_status_options, weights=civil_status_weights, k=1)[0]
        gender = random.choice(['Male', 'Female'])
        municipality = random.choice(municipalities)

        # 10% of students hold a scholarship; they are always fully paid.
        has_scholarship = random.random() < 0.10
        if has_scholarship:
            payment_status = "Paid"
            balance = 0
        else:
            # Non-scholars: 70/30 split between paid and owing; only the
            # latter carry a non-zero balance.
            payment_status = random.choices(["Paid", "With Balance"], weights=[70, 30])[0]
            balance = 0 if payment_status == "Paid" else random.randint(10000, 40000)

        units = random.choice([15, 18, 21, 24])
        # 15% of students are on probation; everyone else gets an empty label.
        is_on_probation = random.random() < 0.15
        probation_status = random.choice(probation_options) if is_on_probation else ""

        last_term_gwa = random.choices(gwa_choices, weights=gwa_weights, k=1)[0]

        students.append({
            "student_number": student_number,
            "course": course,
            "birthdate": birthdate.strftime("%Y-%m-%d"),
            "nationality": nationality,
            "civil_status": civil_status,
            "gender": gender,
            "municipality": municipality,
            "payment_status": payment_status,
            "balance": balance,
            "has_scholarship": has_scholarship,
            "number_of_units_enrolled": units,
            "academic_probation_status": probation_status,
            "last_term_GWA": last_term_gwa
        })

# Convert to DataFrame
df_students = pd.DataFrame(students)

# Save to CSV (index column omitted)
df_students.to_csv("student_dataset.csv", index=False)
print("✅ student_dataset.csv with unique student numbers has been generated!")


✅ student_dataset.csv with unique student numbers has been generated!
