In [1]:
import numpy as np
import pandas as pd

# Seed NumPy's global RNG so every run yields the same synthetic dataset.
np.random.seed(42)

# Size of the synthetic student population.
num_samples = 1000

# Shared building blocks for the columns below.
_student_ids = range(1, num_samples + 1)
_three_levels = ['Low', 'Medium', 'High']

# Assemble the columns one by one. The insertion order fixes the column
# order of the final table, and the order of the np.random calls fixes the
# values drawn from the seeded generator, so neither may be shuffled.
data = {}
data['id'] = _student_ids
data['name'] = [f'Student_{i}' for i in _student_ids]
data['sex'] = np.random.choice(['Male', 'Female'], size=num_samples)
data['urban/rural'] = np.random.choice(['Urban', 'Rural'], size=num_samples)
data['income'] = np.random.choice(_three_levels, size=num_samples)
data['GPA'] = np.random.uniform(2.0, 4.0, size=num_samples)
data['average attendance'] = np.random.uniform(50, 100, size=num_samples)
data['Parents Qualification'] = np.random.choice(_three_levels, size=num_samples)

# Pick 5% of the rows (without repetition) and force both parental
# qualification and income to 'Low' for them.
_outlier_count = int(0.05 * num_samples)
outlier_indices = np.random.choice(num_samples, size=_outlier_count, replace=False)
for _column in ('Parents Qualification', 'income'):
    data[_column][outlier_indices] = 'Low'

# Individual conditions of the labelling rule, each an element-wise mask.
_gpa_above_3 = data['GPA'] > 3.0
_attendance_above_75 = data['average attendance'] > 75
_parents_high = data['Parents Qualification'] == 'High'
_lives_urban = data['urban/rural'] == 'Urban'
_income_high = data['income'] == 'High'

# A row satisfies the rule only when GPA, attendance and parental
# qualification all pass AND the student is urban or has high income.
criteria = (
    _gpa_above_3
    & _attendance_above_75
    & _parents_high
    & (_lives_urban | _income_high)
)

# Rows that satisfy the rule are labelled 'No'; every other row is 'Yes'.
data['is dropout?'] = np.where(~criteria, 'Yes', 'No')

# Turn the column mapping into a table.
df = pd.DataFrame(data)

# Persist the table as CSV, leaving out the positional index.
df.to_csv('student_dropout_data.csv', index=False)

# Show the first five rows as a quick sanity check.
print(df.head(5))


   id       name     sex urban/rural  income       GPA  average attendance  \
0   1  Student_1    Male       Rural    High  3.984316           80.387606   
1   2  Student_2  Female       Urban  Medium  3.702850           64.765074   
2   3  Student_3    Male       Urban    High  2.417021           56.830045   
3   4  Student_4    Male       Urban     Low  3.861190           82.581988   
4   5  Student_5    Male       Urban     Low  2.232733           86.929873   

  Parents Qualification is dropout?  
0                   Low         Yes  
1                   Low         Yes  
2                   Low         Yes  
3                   Low         Yes  
4                   Low         Yes  
