# MLP for Student Dropout and Academic Success Dataset

## Importing the Data

In [1]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Read the semicolon-delimited student records, then separate the "Target"
# column (used below as the class label) from the remaining feature columns.
df = pd.read_csv(
    'all_datasets/student_droupout_data.csv',
    sep=";",
)

y = df["Target"]
X = df.drop(columns=["Target"])

In [2]:
# Integer-code every categorical feature.
# Text-typed columns are picked up automatically; the columns listed by name
# below are treated as categorical as well.
text_cols = X.select_dtypes(include=["object"]).columns.tolist()
named_cols = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance\t",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]
# A column may appear in both lists; keep each name once.
categorical_cols = list(set(text_cols) | set(named_cols))

for col in categorical_cols:
    # Values are cast to str before encoding, so each column is encoded as text
    # with its own freshly fitted encoder.
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Whatever is still missing after encoding is replaced by 0.
X = X.fillna(0)

In [3]:
# Convert the target class names to integer ids. The fitted encoder remains
# available as `label_encoder`.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

In [4]:
class MLP(nn.Module):
    """Feed-forward classifier with two hidden layers (64 and 32 units).

    Layer stack: Linear -> ReLU -> Dropout(0.3) -> Linear -> ReLU -> Linear.
    The final layer has no activation, so ``forward`` returns raw scores
    (one per output unit).

    Args:
        input_size: Number of input features per sample.
        output_size: Number of output units.
    """

    def __init__(self, input_size: int, output_size: int) -> None:
        super().__init__()
        first_hidden, second_hidden = 64, 32
        stack = [
            nn.Linear(input_size, first_hidden),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(first_hidden, second_hidden),
            nn.ReLU(),
            nn.Linear(second_hidden, output_size),
        ]
        # Positional construction keeps the sub-modules indexed "0".."5".
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the output scores."""
        scores = self.model(x)
        return scores

# Test-set fractions to evaluate; each value is passed as `test_size`, so the
# training share is the complement (e.g. 0.3 -> 70/30).
split_ratios = [
    0.5,
    0.3,
    0.2,
    0.1,
]


In [5]:
# Train a fresh MLP for every test fraction in `split_ratios` and print the
# accuracy and macro-averaged precision / recall / F1 on the held-out rows.
# Everything that does not depend on the split is set up once, outside the loop.
input_dim = X.shape[1]
output_dim = len(np.unique(y_encoded))
criterion = nn.CrossEntropyLoss()
epochs = 1500

for split in split_ratios:
    # round() instead of int(): a product such as 0.29 * 100 evaluates to
    # 28.999999999999996 and int() would truncate it to the wrong percentage.
    test_pct = round(split * 100)
    print(f"\n--- Evaluating {100 - test_pct}/{test_pct} Train/Test Split ---")

    X_train, X_test, y_train, y_test = train_test_split(
        X, y_encoded, test_size=split, random_state=0, stratify=y_encoded
    )

    # The scaler is fit on the training rows only and then applied to the
    # test rows, so no test statistics reach the model.
    scaler = StandardScaler()
    X_train = scaler.fit_transform(X_train)
    X_test = scaler.transform(X_test)

    X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train, dtype=torch.long)
    X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
    y_test_tensor = torch.tensor(y_test, dtype=torch.long)

    # The data split is already pinned with random_state=0; seed torch as well
    # so weight initialisation and dropout masks repeat when the cell is rerun.
    torch.manual_seed(0)
    model = MLP(input_dim, output_dim)
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    # Full-batch training: one optimizer step per epoch on all training rows.
    model.train()
    for epoch in range(epochs):
        optimizer.zero_grad()
        logits = model(X_train_tensor)
        loss = criterion(logits, y_train_tensor)
        loss.backward()
        optimizer.step()

    # eval() switches dropout off; no_grad() skips gradient tracking.
    model.eval()
    with torch.no_grad():
        preds = model(X_test_tensor).argmax(dim=1).numpy()
    y_true = y_test_tensor.numpy()

    acc = accuracy_score(y_true, preds)
    prec = precision_score(y_true, preds, average='macro', zero_division=0)
    rec = recall_score(y_true, preds, average='macro', zero_division=0)
    f1 = f1_score(y_true, preds, average='macro', zero_division=0)

    print(f"Accuracy:  {acc:.4f}")
    print(f"Precision: {prec:.4f}")
    print(f"Recall:    {rec:.4f}")
    print(f"F1 Score:  {f1:.4f}")


--- Evaluating 50/50 Train/Test Split ---
Accuracy:  0.7622
Precision: 0.7028
Recall:    0.6834
F1 Score:  0.6900

--- Evaluating 70/30 Train/Test Split ---
Accuracy:  0.7508
Precision: 0.6832
Recall:    0.6707
F1 Score:  0.6754

--- Evaluating 80/20 Train/Test Split ---
Accuracy:  0.7661
Precision: 0.7081
Recall:    0.6915
F1 Score:  0.6979

--- Evaluating 90/10 Train/Test Split ---
Accuracy:  0.7675
Precision: 0.7072
Recall:    0.6860
F1 Score:  0.6934


## Testing out different MLPs

In [6]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the semicolon-delimited dataset and split off the "Target" column.
df = pd.read_csv(
    'all_datasets/student_droupout_data.csv',
    sep=";",
)
y = df["Target"]
X = df.drop(columns=["Target"])

# Integer-code the categorical features: every text-typed column plus the
# columns listed by name below.
text_cols = X.select_dtypes(include=["object"]).columns.tolist()
named_cols = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance\t",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]
# De-duplicate in case a named column is also text-typed.
categorical_cols = list(set(text_cols) | set(named_cols))

for col in categorical_cols:
    # Each column gets its own encoder, fitted on the string form of its values.
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Remaining missing values are replaced by 0.
X = X.fillna(0)

# Convert the target class names to integer ids.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Stratified 70/30 train/test split with a fixed random_state.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    stratify=y_encoded,
    random_state=0,
)

# Standardise using statistics from the training rows only.
scaler = StandardScaler().fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

# Features become float32 tensors, labels int64 tensors.
X_train_tensor, X_test_tensor = (
    torch.tensor(features, dtype=torch.float32) for features in (X_train, X_test)
)
y_train_tensor, y_test_tensor = (
    torch.tensor(labels, dtype=torch.long) for labels in (y_train, y_test)
)

# Candidate networks, from smallest to largest. All map `input_dim` features
# to `output_dim` scores; they differ in depth, width and use of dropout.
input_dim = X.shape[1]
output_dim = len(np.unique(y_encoded))

# MLP1: a single hidden layer of 64 units.
mlp1 = nn.Sequential(
    nn.Linear(input_dim, 64),
    nn.ReLU(),
    nn.Linear(64, output_dim),
)

# MLP2: two hidden layers (128, 64), no dropout.
mlp2 = nn.Sequential(
    nn.Linear(input_dim, 128),
    nn.ReLU(),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, output_dim),
)

# MLP3: same widths as MLP2, with dropout 0.3 after each hidden layer.
mlp3 = nn.Sequential(
    nn.Linear(input_dim, 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(64, output_dim),
)

# MLP4: three hidden layers (256, 128, 64), dropout 0.3 after the first two.
mlp4 = nn.Sequential(
    nn.Linear(input_dim, 256),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(256, 128),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(128, 64),
    nn.ReLU(),
    nn.Linear(64, output_dim),
)

model_architectures = [mlp1, mlp2, mlp3, mlp4]

# Train + Evaluate
# Each candidate network is trained on the training tensors and scored on the
# test tensors; one (name, accuracy, precision, recall, f1) row is collected
# per network, with precision / recall / F1 macro-averaged over classes.
results = []
criterion = nn.CrossEntropyLoss()

for i, architecture in enumerate(model_architectures, 1):
    # Train the nn.Sequential directly. Wrapping it in a class named MLP
    # declared inside this loop would shadow the two-argument MLP class
    # defined earlier in this file and break any later MLP(input, output) call.
    model = architecture
    optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

    # Full-batch training: one optimizer step per epoch on all training rows.
    model.train()
    for epoch in range(1500):
        optimizer.zero_grad()
        outputs = model(X_train_tensor)
        loss = criterion(outputs, y_train_tensor)
        loss.backward()
        optimizer.step()

    # eval() switches dropout off; no_grad() skips gradient tracking.
    model.eval()
    with torch.no_grad():
        preds = model(X_test_tensor).argmax(dim=1).numpy()
    y_true = y_test_tensor.numpy()

    acc = accuracy_score(y_true, preds)
    prec = precision_score(y_true, preds, average='macro', zero_division=0)
    rec = recall_score(y_true, preds, average='macro', zero_division=0)
    f1 = f1_score(y_true, preds, average='macro', zero_division=0)

    results.append((f"MLP{i}", acc, prec, rec, f1))

# Print one tab-separated row per model, metrics shown to four decimals.
table_header = "\nMODEL\tAccuracy\tPrecision\tRecall\t\tF1-Score"
print(table_header)
for row in results:
    model_name, acc, prec, rec, f1 = row
    metric_cells = "\t\t".join(f"{score:.4f}" for score in (acc, prec, rec, f1))
    print(f"{model_name}\t{metric_cells}")



MODEL	Accuracy	Precision	Recall		F1-Score
MLP1	0.7304		0.6710		0.6677		0.6692
MLP2	0.7056		0.6361		0.6360		0.6361
MLP3	0.7613		0.6988		0.6849		0.6903
MLP4	0.7440		0.6796		0.6713		0.6748


## Cross Validation
Using the best-performing model from the previous section (MLP3: hidden layers of 128 and 64 units, each followed by dropout 0.3).

In [7]:
import pandas as pd
import numpy as np 
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score

# Load the semicolon-delimited dataset; "Target" is the label column and all
# other columns are features.
df = pd.read_csv(
    'all_datasets/student_droupout_data.csv',
    sep=";",
)
y = df["Target"]
X = df.drop(columns=["Target"])

# Columns to integer-code: all text-typed columns, plus the ones named here.
text_cols = X.select_dtypes(include=["object"]).columns.tolist()
named_cols = [
    "Marital status",
    "Application mode",
    "Application order",
    "Course",
    "Daytime/evening attendance\t",
    "Previous qualification",
    "Nacionality",
    "Mother's qualification",
    "Father's qualification",
    "Mother's occupation",
    "Father's occupation",
    "Displaced",
    "Educational special needs",
    "Debtor",
    "Tuition fees up to date",
    "Gender",
    "Scholarship holder",
    "International",
]
# Keep each column name once even if it appears in both lists.
categorical_cols = list(set(text_cols) | set(named_cols))

for col in categorical_cols:
    # One encoder per column, fitted on the string form of the values.
    X[col] = LabelEncoder().fit_transform(X[col].astype(str))

# Replace any values that are still missing with 0.
X = X.fillna(0)

# Integer ids for the target classes.
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# Standardise every feature column.
# NOTE(review): this scaler is fit on all rows, before any fold split;
# `X_scaled` is the matrix later handed to cross-validation.
scaler = StandardScaler().fit(X)
X_scaled = scaler.transform(X)

# Use the GPU when torch reports one, otherwise stay on the CPU.
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")

# Define custom architecture
class MLP(nn.Module):
    """Feed-forward classifier with hidden layers of 128 and 64 units.

    Each hidden layer is followed by ReLU and Dropout(0.3); the final linear
    layer has no activation, so ``forward`` returns raw scores.

    Args:
        input_dim: Number of input features per sample.
        output_dim: Number of output units.
    """

    def __init__(self, input_dim: int, output_dim: int) -> None:
        super().__init__()
        wide, narrow = 128, 64
        drop_prob = 0.3
        stack = [
            nn.Linear(input_dim, wide),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(wide, narrow),
            nn.ReLU(),
            nn.Dropout(drop_prob),
            nn.Linear(narrow, output_dim),
        ]
        # Positional construction keeps the sub-modules indexed "0".."6".
        self.model = nn.Sequential(*stack)

    def forward(self, x):
        """Run ``x`` through the layer stack and return the output scores."""
        scores = self.model(x)
        return scores

# Cross-validation function
def run_cv(X, y, n_splits):
    """Evaluate the MLP with stratified k-fold cross-validation.

    For every fold a new ``MLP`` is trained full-batch for 1500 epochs on the
    training rows and scored on the held-out rows.

    Features are standardised inside each fold with a scaler fitted on that
    fold's training rows only, so held-out rows never influence the scaling
    statistics. Because per-feature standardisation is unaffected by a prior
    per-feature shift and rescale, this holds for raw input and for input that
    was already standardised on the full dataset.

    Args:
        X: 2-D array-like of numeric features, one row per sample.
        y: 1-D array-like of integer class labels, one per row of ``X``.
        n_splits: Number of folds.

    Returns:
        A list with one ``(accuracy, precision, recall, f1)`` tuple per fold;
        precision, recall and F1 are macro-averaged over classes.
    """
    # Positional indexing below needs plain arrays; this also lets callers
    # pass a DataFrame / Series.
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    input_dim = X.shape[1]
    output_dim = len(np.unique(y))
    criterion = nn.CrossEntropyLoss()
    results = []

    for fold, (train_idx, test_idx) in enumerate(skf.split(X, y), 1):
        print(f"\n[Fold {fold}/{n_splits}]")

        # Scaling statistics come from the training rows of this fold only.
        fold_scaler = StandardScaler()
        X_train = fold_scaler.fit_transform(X[train_idx])
        X_test = fold_scaler.transform(X[test_idx])
        y_train, y_test = y[train_idx], y[test_idx]

        X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
        y_train_tensor = torch.tensor(y_train, dtype=torch.long).to(device)
        X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

        model = MLP(input_dim, output_dim).to(device)
        optimizer = optim.Adam(model.parameters(), lr=0.001, weight_decay=1e-5)

        # Full-batch training: one optimizer step per epoch.
        model.train()
        for _ in range(1500):
            optimizer.zero_grad()
            outputs = model(X_train_tensor)
            loss = criterion(outputs, y_train_tensor)
            loss.backward()
            optimizer.step()

        # eval() switches dropout off; no_grad() skips gradient tracking.
        model.eval()
        with torch.no_grad():
            preds = model(X_test_tensor).argmax(dim=1).cpu().numpy()

        acc = accuracy_score(y_test, preds)
        prec = precision_score(y_test, preds, average='macro', zero_division=0)
        rec = recall_score(y_test, preds, average='macro', zero_division=0)
        f1 = f1_score(y_test, preds, average='macro', zero_division=0)

        results.append((acc, prec, rec, f1))

    return results

# Cross-validate twice on the same data: first with 5 folds, then with 10.
results_5fold, results_10fold = (
    run_cv(X_scaled, y_encoded, n_splits=fold_count) for fold_count in (5, 10)
)

# Print results
def print_results(results, label):
    """Print the mean of each metric over all folds as a one-row table.

    Args:
        results: Sequence of ``(accuracy, precision, recall, f1)`` tuples,
            one per fold.
        label: Name shown in the banner line and in the first table column.
    """
    # Transpose the per-fold rows into one column per metric.
    acc_col, prec_col, rec_col, f1_col = zip(*results)

    print(f"\n=== {label} Results ===")
    print("MODEL\tAccuracy\tPrecision\tRecall\t\tF1-Score")

    fold_means = [np.mean(column) for column in (acc_col, prec_col, rec_col, f1_col)]
    metric_cells = "\t\t".join(f"{mean:.4f}" for mean in fold_means)
    print(f"{label}\t{metric_cells}")

# Report the fold-averaged metrics for both cross-validation runs.
for summary_label, fold_results in (
    ("5-Fold", results_5fold),
    ("10-Fold", results_10fold),
):
    print_results(fold_results, summary_label)


[Fold 1/5]

[Fold 2/5]

[Fold 3/5]

[Fold 4/5]

[Fold 5/5]

[Fold 1/10]

[Fold 2/10]

[Fold 3/10]

[Fold 4/10]

[Fold 5/10]

[Fold 6/10]

[Fold 7/10]

[Fold 8/10]

[Fold 9/10]

[Fold 10/10]

=== 5-Fold Results ===
MODEL	Accuracy	Precision	Recall		F1-Score
5-Fold	0.7547		0.6892		0.6741		0.6789

=== 10-Fold Results ===
MODEL	Accuracy	Precision	Recall		F1-Score
10-Fold	0.7568		0.6954		0.6777		0.6837
