# Mahmoud Abdelnasser

## Task 3

In [12]:
import pandas as pd
import numpy as np 
from sklearn.model_selection import train_test_split, KFold, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, mean_absolute_error, mean_squared_error, roc_auc_score, roc_curve
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier


In [13]:
# Load the dataset
df = pd.read_csv('student_dropout.csv')

# Preprocess the target variable: encode the outcome labels as integer codes.
target_codes = {
    'Graduate': 0,
    'Dropout': 1,
    'Enrolled': 2,
}

# Series.map() turns any label that is missing from the dict into NaN without
# warning, so reject unexpected labels explicitly before overwriting the column.
unexpected_labels = set(df['Target'].dropna().unique()) - set(target_codes)
if unexpected_labels:
    raise ValueError(
        f"Unexpected Target labels: {sorted(unexpected_labels, key=str)}"
    )

df['Target'] = df['Target'].map(target_codes)


In [14]:
# Keep only the columns whose correlation with the encoded target has an
# absolute value of at least 0.1. 'Target' itself normally passes the filter
# (self-correlation), which is why it is still present in data_filtered.
# NOTE(review): the correlations are computed on every row of df, i.e. before
# the train/validation/test split performed further down — confirm intended.
corr = df.corr()
target_corr = corr.loc[:, 'Target']
high_corr = target_corr.loc[target_corr.abs().ge(0.1)]
data_filtered = df.loc[:, high_corr.index]


In [15]:
# Discard rows that exactly repeat an earlier row (the first occurrence is kept).
is_repeat = data_filtered.duplicated()
data_filtered = data_filtered.loc[~is_repeat]


In [16]:
# Separate the label column (Y) from the predictor columns (X)
Y = data_filtered['Target']
X = data_filtered.drop(columns=['Target'])


In [17]:
# Two-stage stratified split: hold out 30% of the rows first, then cut that
# hold-out in half, giving roughly 70% train / 15% validation / 15% test.
X_train, X_temp, Y_train, Y_temp = train_test_split(
    X, Y,
    test_size=0.30,
    stratify=Y,
    random_state=42,
)
X_val, X_test, Y_val, Y_test = train_test_split(
    X_temp, Y_temp,
    test_size=0.50,
    stratify=Y_temp,
    random_state=42,
)


In [18]:
# Standardize the features: the scaler is fitted on the training split only,
# then the same fitted transform is applied to all three splits.
scaler = StandardScaler().fit(X_train)
X_train, X_val, X_test = [
    scaler.transform(split) for split in (X_train, X_val, X_test)
]


In [19]:
# Models to evaluate.
# Random Forest and SVC's probability estimates both involve random number
# generation, so they are seeded here (same seed as the splits and the KFold
# below) to make a Restart & Run All reproduce the reported numbers.
models = {
    "Logistic Regression": LogisticRegression(),
    "Random Forest": RandomForestClassifier(n_estimators=300, random_state=42),
    # probability=True enables predict_proba, which the ROC AUC computation needs
    "SVM": SVC(kernel='linear', probability=True, random_state=42),
}

# K-Fold Cross Validation setup
kf = KFold(n_splits=5, shuffle=True, random_state=42)


In [20]:
# Train and evaluate models
# For each model: report the mean 5-fold CV accuracy on the training split,
# fit on the full training split, then score on the validation split.

# Label 1 is 'Dropout' in the Target encoding; ROC-AUC is computed
# one-vs-rest for that class.
positive_class = 1

for name, model in models.items():
    print(f"\nEvaluating {name}")

    # K-Fold Cross Validation Accuracy
    cv_scores = cross_val_score(model, X_train, Y_train, cv=kf, scoring='accuracy')
    print(f"{name} Cross-Validation Accuracy: {np.mean(cv_scores):.4f}")

    # Train the model
    model.fit(X_train, Y_train)

    # Predictions on validation set
    Y_val_pred = model.predict(X_val)

    # Evaluation Metrics
    # NOTE(review): MAE/MSE treat the integer class codes as magnitudes;
    # confirm they are wanted for a categorical target.
    accuracy = accuracy_score(Y_val, Y_val_pred)
    mae = mean_absolute_error(Y_val, Y_val_pred)
    mse = mean_squared_error(Y_val, Y_val_pred)

    print(f"{name} Validation Accuracy: {accuracy:.4f}")
    print(f"{name} MAE: {mae:.4f}")
    print(f"{name} MSE: {mse:.4f}")

    # ROC AUC
    if hasattr(model, "predict_proba"):
        # predict_proba columns are ordered like model.classes_, so locate the
        # positive-class column by label instead of assuming its position.
        positive_col = list(model.classes_).index(positive_class)
        Y_val_prob = model.predict_proba(X_val)[:, positive_col]
        roc_auc = roc_auc_score((Y_val == positive_class).astype(int), Y_val_prob)
        print(f"{name} ROC-AUC: {roc_auc:.4f}")



Evaluating Logistic Regression
Logistic Regression Cross-Validation Accuracy: 0.7581
Logistic Regression Validation Accuracy: 0.7753
Logistic Regression MAE: 0.3333
Logistic Regression MSE: 0.5505
Logistic Regression ROC-AUC: 0.9253

Evaluating Random Forest
Random Forest Cross-Validation Accuracy: 0.7581
Random Forest Validation Accuracy: 0.7843
Random Forest MAE: 0.3107
Random Forest MSE: 0.5008
Random Forest ROC-AUC: 0.9164

Evaluating SVM
SVM Cross-Validation Accuracy: 0.7523
SVM Validation Accuracy: 0.7738
SVM MAE: 0.3394
SVM MSE: 0.5656
SVM ROC-AUC: 0.9252


In [21]:
# Final evaluation on the test set
# Score every already-fitted model on the held-out test split.
# Label 1 is 'Dropout' in the Target encoding; ROC-AUC is computed
# one-vs-rest for that class.
positive_class = 1

for name, model in models.items():
    Y_test_pred = model.predict(X_test)
    test_accuracy = accuracy_score(Y_test, Y_test_pred)
    test_mae = mean_absolute_error(Y_test, Y_test_pred)
    test_mse = mean_squared_error(Y_test, Y_test_pred)

    print(f"\n{name} Test Accuracy: {test_accuracy:.4f}")
    print(f"{name} Test MAE: {test_mae:.4f}")
    print(f"{name} Test MSE: {test_mse:.4f}")

    if hasattr(model, "predict_proba"):
        # predict_proba columns are ordered like model.classes_, so locate the
        # positive-class column by label instead of assuming its position.
        positive_col = list(model.classes_).index(positive_class)
        Y_test_prob = model.predict_proba(X_test)[:, positive_col]
        test_roc_auc = roc_auc_score((Y_test == positive_class).astype(int), Y_test_prob)
        print(f"{name} Test ROC-AUC: {test_roc_auc:.4f}")



Logistic Regression Test Accuracy: 0.7605
Logistic Regression Test MAE: 0.3524
Logistic Regression Test MSE: 0.5783
Logistic Regression Test ROC-AUC: 0.9208

Random Forest Test Accuracy: 0.7771
Random Forest Test MAE: 0.3298
Random Forest Test MSE: 0.5437
Random Forest Test ROC-AUC: 0.9088

SVM Test Accuracy: 0.7560
SVM Test MAE: 0.3599
SVM Test MSE: 0.5919
SVM Test ROC-AUC: 0.9203
