<a href="https://colab.research.google.com/github/nour-ezzehi/Detecting-Student-Dropout-Risk/blob/main/dropout_risk.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [12]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from xgboost import XGBClassifier
from sklearn.metrics import classification_report

In [7]:
def load_data():
    data = {
        'student_id': range(1000),
        'grade_avg': np.random.normal(75, 10, 1000),
        'attendance_rate': np.random.uniform(0.6, 1.0, 1000),
        'lms_logins': np.random.poisson(20, 1000),
        'discussion_posts': np.random.poisson(5, 1000),
        'late_submissions': np.random.poisson(3, 1000),
        'socio_economic_status': np.random.choice(['low', 'medium', 'high'], 1000),
        'commute_time': np.random.uniform(10, 60, 1000),
        'extracurricular_hours': np.random.uniform(0, 15, 1000),
        'dropout': np.random.choice([0, 1], 1000, p=[0.8, 0.2])
        }
    return pd.DataFrame(data)


In [9]:
def engineer_features(df):
    df['grade_trend'] = df['grade_avg'].diff().fillna(0)

    df['lms_engagement'] = df['lms_logins'] * df['discussion_posts']

    df['high_absence'] = (df['attendance_rate'] < 0.8).astype(int)
    df['frequent_late'] = (df['late_submissions'] > 5).astype(int)

    df = pd.get_dummies(df, columns=['socio_economic_status'], drop_first=True)

    return df

In [14]:
def train_model(df):
    features = [col for col in df.columns if col not in ['student_id', 'dropout']]
    X = df[features]
    y = df['dropout']

    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_test_scaled = scaler.transform(X_test)

    model = XGBClassifier(n_estimators=100, learning_rate=0.1, random_state=42)
    model.fit(X_train_scaled, y_train)

    y_pred = model.predict(X_test_scaled)
    print("Model Performance:")
    print(classification_report(y_test, y_pred))

    return model, scaler

In [15]:
def main():

    df = load_data()
    df = engineer_features(df)

    model, scaler = train_model(df)

    new_student = df.iloc[0:1].drop(['student_id', 'dropout'], axis=1)
    new_student_scaled = scaler.transform(new_student)
    risk_score = model.predict_proba(new_student_scaled)[0][1]
    print(f"Predicted dropout risk for new student: {risk_score:.2%}")

if __name__ == "__main__":
    main()

Model Performance:
              precision    recall  f1-score   support

           0       0.84      0.97      0.90       168
           1       0.17      0.03      0.05        32

    accuracy                           0.82       200
   macro avg       0.50      0.50      0.48       200
weighted avg       0.73      0.82      0.76       200

Predicted dropout risk for new student: 14.13%
