In [6]:
!pip install faker



In [7]:
import pandas as pd
import numpy as np
import random
from faker import Faker

# One seed shared by every random source used below, so that re-running the
# notebook regenerates exactly the same synthetic dataset.
SEED = 42

Faker.seed(SEED)       # Faker's shared generator (student names and ages)
fake = Faker("en_IN")  # English (India) locale for the generated names
random.seed(SEED)      # stdlib `random`: categorical choices and probability draws
np.random.seed(SEED)   # NumPy global RNG: normal and Poisson draws

N_STUDENTS = 800  # number of student records to generate (500+ wanted)
WEEKS = 20  # weeks of attendance data

# Engineering branches a student can be assigned to.
# Keep the order stable: the branch is picked with the seeded `random.choice`,
# so reordering this list changes which branch each student receives.
branches = [
    # Computing and electronics
    "Computer Science Engineering",
    "Information Technology",
    "Electronics and Communication Engineering",
    "Electrical Engineering",
    # Core engineering
    "Mechanical Engineering",
    "Civil Engineering",
    "Chemical Engineering",
    # Interdisciplinary programmes
    "Biotechnology",
    "Production and Industrial Engineering",
    "Mathematics and Computing",
]

# Build one record (a dict) per synthetic student.
# The order of the random draws below is kept fixed on purpose: with the seeded
# generators, reordering or adding draws would change the generated dataset.
students = []

for i in range(1, N_STUDENTS + 1):
    student_id = f"S{i:04d}"
    name = fake.name()
    age = fake.random_int(min=17, max=23)  # age in years (used instead of a date of birth)
    # NOTE(review): admission_year, year of study and age are drawn independently,
    # so a record can combine e.g. year 1 with admission_year 2021. Confirm this is
    # acceptable for downstream use before relying on these columns together.
    admission_year = random.choice([2020, 2021, 2022, 2023, 2024, 2025])
    branch = random.choice(branches)

    # Attendance time series: per-student baseline + linear weekly trend + noise,
    # with every weekly value clipped to the 0-100 % range.
    base_att = np.clip(np.random.normal(75, 10), 30, 100)  # random baseline attendance percentage
    trend = np.clip(np.random.normal(0, 1.5), -3, 3)  # change in percentage points per week
    weekly_att = [
        np.clip(base_att + trend * w + np.random.normal(0, 5), 0, 100)
        for w in range(WEEKS)
    ]

    avg_attendance_overall = round(np.mean(weekly_att), 2)
    last_4_week_attendance = round(np.mean(weekly_att[-4:]), 2)

    # Academic performance
    year = random.choice([1, 2, 3, 4])  # current year of study
    N_SEMS = year * 2  # two semesters of SGPA history per year of study
    if random.random() < 0.1:
        # roughly 10% of students are academically weak
        base_sgpa = np.clip(np.random.normal(4.5, 0.5), 0.0, 6.0)
    else:
        # the remaining ~90% are regular students
        base_sgpa = np.clip(np.random.normal(7.0, 0.8), 5.0, 9.9)
    sem_trend = np.clip(np.random.normal(0, 0.15), -0.5, 0.5)  # SGPA drift per semester

    # One SGPA per semester: baseline + drift + noise, clipped to 0-10, 2 decimals.
    sgpas = [
        round(float(np.clip(base_sgpa + sem_trend * s + np.random.normal(0, 0.2), 0.0, 10.0)), 2)
        for s in range(N_SEMS)
    ]

    current_sgpa = sgpas[-1]  # most recent semester
    cgpa = round(np.mean(sgpas), 2)

    # Backlogs: previous count is Poisson-distributed; a student whose current
    # SGPA is below 5.0 gains one more backlog with probability 0.4.
    backlog_prev = np.random.poisson(0.3)
    backlog_increase = 1 if (current_sgpa < 5.0 and random.random() < 0.4) else 0
    backlog_curr = backlog_prev + backlog_increase

    # Fees: 75% paid, 25% overdue
    fee_status = "paid" if random.random() < 0.75 else "overdue"

    # LMS logins in the last 30 days; int() truncates the float draw to a whole count
    lms_logins_30d = int(np.clip(np.random.normal(8, 4), 0, 30))

    # Dropout rule: label 1 when any of the three risk conditions holds
    dropout_label = 1 if (
        (last_4_week_attendance < 50 and current_sgpa < 5.0) or
        backlog_curr >= 3 or
        (cgpa < 5.0 and current_sgpa < 5.5)
    ) else 0

    students.append({
        "student_id": student_id,
        "name": name,
        "branch": branch,
        "year": year,
        "admission_year": admission_year,
        "age": age,
        "avg_attendance_overall": avg_attendance_overall,
        "last_4_week_attendance": last_4_week_attendance,
        "current_sgpa": current_sgpa,
        "cgpa": cgpa,
        "backlog_prev": backlog_prev,
        "backlog_curr": backlog_curr,
        "fee_status": fee_status,
        "lms_logins_30d": lms_logins_30d,
        "dropout_label": dropout_label
    })

# Assemble the per-student records into a single table.
df = pd.DataFrame(students)

# index=False leaves the positional row index out of the CSV file.
df.to_csv("synthetic_students_final.csv", index=False)

# Quick sanity check: first rows and the balance of the target label.
print(df.head())
label_counts = df["dropout_label"].value_counts()
print("\nDropout label distribution:\n", label_counts)

# Columns dropped from the exported dataset: fee overdue days,
# attendance slope, backlog trend.

  student_id           name                                     branch  year  \
0      S0001  Aryan Maharaj                     Information Technology     1   
1      S0002   Liam Chaudry                     Information Technology     1   
2      S0003    Pahal Balay                     Electrical Engineering     2   
3      S0004     Tejas Kaul      Production and Industrial Engineering     4   
4      S0005   Rushil Saini  Electronics and Communication Engineering     4   

   admission_year  age  avg_attendance_overall  last_4_week_attendance  \
0            2025   18                   77.36                   74.99   
1            2021   22                   87.75                   87.24   
2            2020   17                   62.85                   55.98   
3            2021   21                   77.51                   80.25   
4            2020   18                   58.01                   43.38   

   current_sgpa  cgpa  backlog_prev  backlog_curr fee_status  lms_logins_3

In [12]:
# Preview the first 20 generated records (all columns).
df.head(20)

Unnamed: 0,student_id,name,branch,year,admission_year,age,avg_attendance_overall,last_4_week_attendance,current_sgpa,cgpa,backlog_prev,backlog_curr,fee_status,lms_logins_30d,dropout_label
0,S0001,Aryan Maharaj,Information Technology,1,2025,18,77.36,74.99,6.86,6.9,0,0,paid,5,0
1,S0002,Liam Chaudry,Information Technology,1,2021,22,87.75,87.24,7.37,7.33,0,0,paid,11,0
2,S0003,Pahal Balay,Electrical Engineering,2,2020,17,62.85,55.98,6.08,6.68,0,0,paid,7,0
3,S0004,Tejas Kaul,Production and Industrial Engineering,4,2021,21,77.51,80.25,8.2,8.01,0,0,paid,6,0
4,S0005,Rushil Saini,Electronics and Communication Engineering,4,2020,18,58.01,43.38,6.09,6.97,0,0,paid,10,0
5,S0006,Arunima Ahuja,Information Technology,1,2022,23,91.01,94.46,7.56,7.5,0,0,paid,7,0
6,S0007,Pahal Oak,Mathematics and Computing,3,2022,19,71.09,80.78,5.42,5.78,0,0,paid,8,0
7,S0008,Tanveer Nayar,Information Technology,4,2024,17,60.75,51.57,5.89,5.22,0,0,paid,1,0
8,S0009,Lajita Chatterjee,Mathematics and Computing,3,2025,19,74.49,86.11,7.86,7.89,1,1,paid,14,0
9,S0010,Aishani Bassi,Electrical Engineering,3,2020,22,87.03,100.0,8.22,7.62,0,0,overdue,11,0


In [9]:
# Number of students per value of the dropout label.
df["dropout_label"].value_counts()


dropout_label
0    738
1     62
Name: count, dtype: int64

In [10]:
# Count of missing values in each column.
df.isna().sum()

student_id                0
name                      0
branch                    0
year                      0
admission_year            0
age                       0
avg_attendance_overall    0
last_4_week_attendance    0
current_sgpa              0
cgpa                      0
backlog_prev              0
backlog_curr              0
fee_status                0
lms_logins_30d            0
dropout_label             0
dtype: int64