# Import Required Libraries
Import libraries such as pandas, numpy, and random for data manipulation and generation.

In [1]:
import pandas as pd  # For data manipulation and storage
import numpy as np  # For numerical operations
import random  # For random data generation

# Load and Inspect Existing Data
Load the provided CSV file to understand its structure and existing data.

In [2]:
# Read the reference admissions CSV; its column layout is reused later when
# the synthetic rows are turned into a DataFrame.
file_path = r'c:\xampp\htdocs\SMS\sms3_admissions_data.csv'
existing_data = pd.read_csv(file_path)

# Each entry pairs a heading with the view printed beneath it:
# a preview of the first rows, the per-column dtypes, and summary statistics
# covering every column (numeric and non-numeric alike).
inspection_views = (
    ("First 5 rows of the dataset:", existing_data.head()),
    ("\nColumn names and data types:", existing_data.dtypes),
    ("\nBasic statistics of the dataset:", existing_data.describe(include='all')),
)

for heading, view in inspection_views:
    print(heading)
    print(view)

First 5 rows of the dataset:
   id  student_number first_name middle_name last_name  department_id branch  \
0   5        25100006       asfa        fgds       asf              1   Main   
1   6        25100006       asfa        fgds       asf              1   Main   
2   7        25100006       asfa        fgds       asf              1   Main   
3   8        25100006       asfa        fgds       asf              1   Main   
4   9        25100006       asfa        fgds       asf              1   Main   

  admission_type year_level   sex  ... good_moral    form137  \
0    New Regular        1st  Male  ...  Submitted  Submitted   
1    New Regular        1st  Male  ...  Submitted  Submitted   
2    New Regular        1st  Male  ...  Submitted  Submitted   
3    New Regular        1st  Male  ...  Submitted  Submitted   
4    New Regular        1st  Male  ...  Submitted  Submitted   

  birth_certificate  brgy_clearance  honorable_dismissal  \
0    To Be Followed  To Be Followed          

# Define Data Generation Functions
Create functions to generate random values for each column based on the specified constraints, such as branch, department_id, student_number, admission type, and year level.

In [3]:
def generate_branch():
    """Pick a campus branch at random.

    Returns:
        str: 'Main' or 'Bulacan', chosen with equal probability.
    """
    branches = ['Main', 'Bulacan']
    return random.choice(branches)

def generate_department_id():
    """Draw a department identifier.

    Returns:
        int: A uniformly random integer between 1 and 12, both ends included.
    """
    lowest_id, highest_id = 1, 12
    return random.randint(lowest_id, highest_id)

def generate_student_number(year):
    """Build a student number from an admission year plus a random serial.

    Args:
        year: Admission year; only the last two characters of ``str(year)``
            are used (e.g. 2024 -> "24").

    Returns:
        str: The year suffix followed by six random decimal digits.
        Uniqueness is not checked here; two calls may return the same value.
    """
    prefix = str(year)[-2:]
    # One randint call per digit, six in total.
    serial = ''.join(str(random.randint(0, 9)) for _ in range(6))
    return prefix + serial

def generate_admission_type():
    """Pick an admission type at random.

    Returns:
        str: One of 'Freshmen', 'Transferee' or 'Returnee', equally likely.
    """
    # NOTE(review): the sample rows printed from the existing CSV show
    # 'New Regular' in admission_type; confirm these labels match the schema.
    admission_types = ['Freshmen', 'Transferee', 'Returnee']
    return random.choice(admission_types)

def generate_year_level(admission_type):
    """Choose a year level that is consistent with the admission type.

    Args:
        admission_type: Admission type label, e.g. 'Freshmen'.

    Returns:
        str: Always '1st' for 'Freshmen'; for any other value, a random pick
        from '1st', '2nd', '3rd' and '4th'.
    """
    # Freshmen are pinned to first year and consume no random draw.
    if admission_type == 'Freshmen':
        return '1st'

    year_levels = ['1st', '2nd', '3rd', '4th']
    return random.choice(year_levels)

def generate_sex():
    """Pick a sex label at random.

    Returns:
        str: 'Male' or 'Female', chosen with equal probability.
    """
    sexes = ['Male', 'Female']
    return random.choice(sexes)

def generate_civil_status():
    """Pick a civil status at random.

    Returns:
        str: 'Single' or 'Married', chosen with equal probability.
    """
    civil_statuses = ['Single', 'Married']
    return random.choice(civil_statuses)

def generate_religion():
    """Pick a religion label at random.

    Returns:
        str: One of 'Christianity', 'Roman Catholic', 'Protestant' or
        'Other', each equally likely.
    """
    religions = ['Christianity', 'Roman Catholic', 'Protestant', 'Other']
    return random.choice(religions)

def generate_birthday(year):
    """Generate a random birth date for a student admitted in ``year``.

    The date is drawn uniformly, by whole days, from January 1 of
    ``year - 25`` through December 31 of ``year - 16`` (both inclusive), i.e.
    students who are roughly 16 to 25 years old in the admission year.

    Sampling whole days (instead of nanoseconds up to midnight of the last
    day) keeps every calendar date in the window equally likely, including
    December 31 of the final year.

    Args:
        year (int): Admission year.

    Returns:
        str: The birth date formatted as 'YYYY-MM-DD'.
    """
    earliest = pd.Timestamp(year=year - 25, month=1, day=1)
    latest = pd.Timestamp(year=year - 16, month=12, day=31)

    # randint is inclusive on both ends, so the offset can land on `latest`.
    day_offset = random.randint(0, (latest - earliest).days)
    return (earliest + pd.Timedelta(days=day_offset)).strftime('%Y-%m-%d')

def generate_email(first_name, last_name):
    """Build an e-mail address of the form ``first.last@domain``.

    Args:
        first_name (str): Given name; lower-cased for the local part.
        last_name (str): Family name; lower-cased for the local part.

    Returns:
        str: The lower-cased names joined by a dot, followed by '@' and a
        domain picked at random from three placeholder domains.
    """
    given = first_name.lower()
    family = last_name.lower()
    domain = random.choice(['example.com', 'school.edu', 'university.org'])
    return f"{given}.{family}@{domain}"

def generate_contact_number():
    """Generate an 11-digit contact number that starts with '09'.

    Returns:
        str: '09' followed by a random nine-digit integer between
        100000000 and 999999999 (inclusive).
    """
    subscriber_part = random.randint(100_000_000, 999_999_999)
    return f"09{subscriber_part}"

def generate_working_student():
    """Pick the working-student flag at random.

    Returns:
        str: 'Yes' or 'No', chosen with equal probability.
    """
    answers = ['Yes', 'No']
    return random.choice(answers)

def generate_address(branch):
    """Assemble a random address string.

    Args:
        branch: Accepted for interface compatibility; the value is not read,
            so it has no influence on the generated address.

    Returns:
        str: "<number>, <barangay>, <municipality>, <region>", where the
        number is between 1 and 999 and the other parts are random picks from
        fixed lists.
    """
    # NOTE(review): municipality and region are drawn independently, so the
    # pair is not guaranteed to match geographically. Confirm this is fine.
    house_number = random.randint(1, 999)
    barangay_options = [f'Barangay {index}' for index in range(1, 6)]
    barangay_name = random.choice(barangay_options)

    municipality_options = [
        'Quezon City', 'Manila', 'Davao City', 'Cebu City',
        'Zamboanga City', 'Taguig', 'Pasig', 'Cagayan de Oro',
        'Baguio City', 'Iloilo City', 'Makati', 'Bacolod City',
        'General Santos', 'Antipolo', 'Pasay', 'Calamba',
        'Marikina', 'Mandaluyong', 'San Fernando', 'Batangas City',
    ]
    municipality_name = random.choice(municipality_options)

    region_options = [
        'NCR',
        'CAR',
        'BARMM',
        'Region I - Ilocos',
        'Region II - Cagayan Valley',
        'Region III - Central Luzon',
        'Region IV - A - CALABARZON',
        'Region IV - B - MIMAROPA',
        'Region V - Bicol',
        'Region VI - Western Visayas',
        'Region VII - Central Visayas',
        'Region VIII - Southern Visayas',
        'Region IX - Zamboanga',
        'Region X - Northern Mindanao',
        'Region XI - Davao',
        'Region XII - SOCCSKSARGEN',
        'Region XIII - Caraga',
    ]
    region_name = random.choice(region_options)

    return f"{house_number}, {barangay_name}, {municipality_name}, {region_name}"

def generate_guardian_name():
    """Compose a placeholder guardian name.

    Returns:
        str: "<role> <surname>", where the role is one of 'Guardian',
        'Parent' or 'Relative' and the surname comes from a short fixed list.
    """
    role = random.choice(['Guardian', 'Parent', 'Relative'])
    surname = random.choice(['Smith', 'Johnson', 'Garcia', 'Martinez', 'Brown'])
    return f"{role} {surname}"

def generate_guardian_contact():
    """Generate an 11-digit guardian contact number that starts with '09'.

    Returns:
        str: '09' followed by a random nine-digit integer between
        100000000 and 999999999 (inclusive).
    """
    subscriber_part = random.randint(100_000_000, 999_999_999)
    return f"09{subscriber_part}"

def generate_school_info(school_type):
    """Produce a placeholder school name together with a random year.

    Args:
        school_type (str): Suffix appended to the school name, e.g.
            'Elementary', 'High School' or 'College'.

    Returns:
        tuple[str, int]: ("<three-letter name> <school_type>", year), with the
        year drawn from 2000 to 2025 inclusive. The year is independent of
        ``school_type``.
    """
    # The year is drawn before the name; keep this order so seeded runs match.
    attended_year = random.randint(2000, 2025)
    name_prefix = random.choice(['ABC', 'DEF', 'GHI', 'JKL', 'MNO', 'PQR', 'UVW', 'XYZ'])
    school_label = f"{name_prefix} {school_type}"
    return school_label, attended_year

def generate_referral_source():
    """Pick a referral source at random.

    Returns:
        str: One of 'Social Media', 'Friend', 'Relative' or 'Referral',
        each equally likely.
    """
    referral_sources = ['Social Media', 'Friend', 'Relative', 'Referral']
    return random.choice(referral_sources)

def generate_document_status():
    """Pick a submission status for a single requirement document.

    Returns:
        str: 'Submitted' or 'To Be Followed', chosen with equal probability.
    """
    document_statuses = ['Submitted', 'To Be Followed']
    return random.choice(document_statuses)

def generate_status():
    """Return the admission status and receipt status for a row.

    No randomness is involved: every call yields the same pair.

    Returns:
        tuple[str, str]: ('Accepted', 'Paid').
    """
    admission_status = 'Accepted'
    receipt_status = 'Paid'
    return admission_status, receipt_status

def generate_created_at(year):
    """Generate a random record-creation timestamp within ``year``.

    The month is July or August, and the day is capped at 28 so the date is
    always valid. Hour, minute and second are drawn across their full ranges.

    Args:
        year (int): Calendar year of the timestamp.

    Returns:
        str: The timestamp formatted as 'YYYY-MM-DD HH:MM:SS'.
    """
    # Draw order (month, day, hour, minute, second) is kept so seeded runs
    # reproduce the same values.
    month = random.choice([7, 8])
    day, hour = random.randint(1, 28), random.randint(0, 23)
    minute, second = random.randint(0, 59), random.randint(0, 59)

    moment = pd.Timestamp(
        year=year, month=month, day=day,
        hour=hour, minute=minute, second=second,
    )
    return moment.strftime('%Y-%m-%d %H:%M:%S')

def generate_name():
    """Pick a random given name and a random surname.

    Returns:
        tuple[str, str]: (first_name, last_name), each drawn independently
        from a fixed list of 40 entries.
    """
    given_names = [
        'Ethan', 'Olivia', 'Sophia', 'Liam', 'Emma',
        'Noah', 'Ava', 'Isabella', 'Mason', 'Mia',
        'Lucas', 'Charlotte', 'Elijah', 'Amelia', 'James',
        'Harper', 'Benjamin', 'Evelyn', 'Alexander', 'Abigail',
        'Daniel', 'Victoria', 'Sebastian', 'Scarlett', 'Matthew',
        'Chloe', 'Henry', 'Grace', 'Samuel', 'Hannah',
        'David', 'Luna', 'Joseph', 'Zoe', 'Michael',
        'Ellie', 'Gabriel', 'Aria', 'William', 'Layla',
    ]
    # 'Perez' appears twice, so it is twice as likely as any other surname.
    surnames = [
        'Smith', 'Johnson', 'Williams', 'Brown', 'Jones',
        'Garcia', 'Martinez', 'Hernandez', 'Lopez', 'Gonzalez',
        'Wilson', 'Anderson', 'Thomas', 'Taylor', 'Moore',
        'Jackson', 'Martin', 'Lee', 'Perez', 'Thompson',
        'Clark', 'Rodriguez', 'Lewis', 'Walker', 'Hall',
        'Allen', 'Young', 'King', 'Wright', 'Scott',
        'Green', 'Adams', 'Baker', 'Nelson', 'Carter',
        'Mitchell', 'Perez', 'Roberts', 'Turner', 'Phillips',
    ]
    chosen_given = random.choice(given_names)
    chosen_surname = random.choice(surnames)
    return chosen_given, chosen_surname

# Generate Synthetic Data
Use the defined functions to generate approximately 100,000 rows of synthetic data, ensuring all constraints are met.

In [4]:
# Generate approximately 100,000 rows of synthetic data
# (15 admission years x 7,000 rows per year = 105,000 rows).
SEED = 42
FIRST_YEAR, LAST_YEAR = 2010, 2024
ROWS_PER_YEAR = 7000

# Seed the RNG so that Restart Kernel -> Run All reproduces the same dataset.
random.seed(SEED)

synthetic_data = []
# Student numbers already handed out. Each year draws 7,000 numbers from only
# 1,000,000 possible six-digit serials, so repeats are expected unless they
# are rejected explicitly.
used_student_numbers = set()

for year in range(FIRST_YEAR, LAST_YEAR + 1):
    for _ in range(ROWS_PER_YEAR):
        # Generate random values for each column
        admission_type = generate_admission_type()
        branch = generate_branch()
        department_id = generate_department_id()

        # Redraw until the number is unused; the pool per year is far larger
        # than ROWS_PER_YEAR, so this terminates quickly.
        student_number = generate_student_number(year)
        while student_number in used_student_numbers:
            student_number = generate_student_number(year)
        used_student_numbers.add(student_number)

        year_level = generate_year_level(admission_type)
        sex = generate_sex()
        civil_status = generate_civil_status()
        religion = generate_religion()
        birthday = generate_birthday(year)
        # Use the shared name helper instead of a second, smaller inline list.
        first_name, last_name = generate_name()
        email = generate_email(first_name, last_name)
        contact_number = generate_contact_number()
        working_student = generate_working_student()
        address = generate_address(branch)
        guardian_name = generate_guardian_name()
        guardian_contact = generate_guardian_contact()
        primary_school, primary_year = generate_school_info('Elementary')
        secondary_school, secondary_year = generate_school_info('High School')
        last_school, last_school_year = generate_school_info('College')
        referral_source = generate_referral_source()
        form138 = generate_document_status()
        good_moral = generate_document_status()
        form137 = generate_document_status()
        birth_certificate = generate_document_status()
        brgy_clearance = generate_document_status()
        honorable_dismissal = generate_document_status()
        transcript_of_records = generate_document_status()
        certificate_of_grades = generate_document_status()
        status, receipt_status = generate_status()
        created_at = generate_created_at(year)

        # Append the generated row; the value order must line up with the
        # column order of the existing CSV, which is reused below.
        synthetic_data.append([
            len(synthetic_data) + 1,  # Auto-increment ID
            student_number,
            first_name,
            '',  # Middle name left blank
            last_name,
            department_id,
            branch,
            admission_type,
            year_level,
            sex,
            civil_status,
            religion,
            birthday,
            email,
            contact_number,
            working_student,
            address,
            guardian_name,
            guardian_contact,
            primary_school,
            primary_year,
            secondary_school,
            secondary_year,
            last_school,
            last_school_year,
            referral_source,
            form138,
            good_moral,
            form137,
            birth_certificate,
            brgy_clearance,
            honorable_dismissal,
            transcript_of_records,
            certificate_of_grades,
            status,
            receipt_status,
            created_at
        ])

# Convert the synthetic data into a DataFrame
columns = existing_data.columns  # Use the same column names as the existing data
synthetic_data_df = pd.DataFrame(synthetic_data, columns=columns)

# Invariant: every generated row carries a distinct student number.
assert len(used_student_numbers) == len(synthetic_data_df)

# Save the synthetic data to a new CSV file
output_file_path = r'c:\xampp\htdocs\SMS\synthetic_admissions_data.csv'
synthetic_data_df.to_csv(output_file_path, index=False)

print(f"Synthetic data generated and saved to {output_file_path}")

Synthetic data generated and saved to c:\xampp\htdocs\SMS\synthetic_admissions_data.csv


# Save Generated Data to CSV
Save the generated data to a new CSV file for further use.

In [5]:
# Write the generated frame to disk as CSV, leaving out the DataFrame index
# so only the admissions columns appear in the file.
output_file_path = r'c:\xampp\htdocs\SMS\synthetic_admissions_data.csv'
synthetic_data_df.to_csv(path_or_buf=output_file_path, index=False)

# Confirm where the file was written.
print(f"Synthetic data generated and saved to {output_file_path}")

Synthetic data generated and saved to c:\xampp\htdocs\SMS\synthetic_admissions_data.csv
