<a href="https://colab.research.google.com/github/nmaketh/Summative-Assignment---Model-Training-and-Evaluation/blob/main/nhial_model_training%20_%26_evaluation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

 STEP 1: DATA PREPROCESSING
Let's start with loading, cleaning, and preparing your dataset.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import classification_report, accuracy_score, precision_score, recall_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
import xgboost as xgb
import joblib
import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.regularizers import l2
import csv

In [None]:
# Dataset: Factors affecting university student grades
# Mission: Classify peer influence risk level based on behavioral/social indicators

df = pd.read_csv("Factors_ affecting_ university_student_grades_dataset.csv")

# Fill missing peer columns
peer_cols = ['Peer_Group', 'Bullying', 'Lack_of_Interest', 'Motivation', 'Parental_Involvement', 'Time_Wasted_on_Social_Media']
df[peer_cols] = df[peer_cols].fillna({
    'Peer_Group': 'Unknown',
    'Bullying': 'Unknown',
    'Lack_of_Interest': 'Unknown',
    'Motivation': 'Unknown',
    'Parental_Involvement': 'Unknown',
    'Time_Wasted_on_Social_Media': 0
})

# Compute score

def compute_peer_risk(row):
    score = 0
    if row['Peer_Group'] == "Negative": score += 1
    if row['Bullying'] == "Yes": score += 1
    if row['Lack_of_Interest'] == "High": score += 1
    if row['Motivation'] == "Low": score += 1
    if row['Parental_Involvement'] == "Low": score += 1
    if row['Time_Wasted_on_Social_Media'] >= 5: score += 1
    return score

df['Peer_Influence_Score'] = df.apply(compute_peer_risk, axis=1)

def assign_risk_label(score):
    if score >= 5: return "High"
    elif score >= 3: return "Moderate"
    return "Low"

df['Peer_Risk_Level'] = df['Peer_Influence_Score'].apply(assign_risk_label)
df = df.dropna(subset=['Peer_Risk_Level'])

# Select features
features = df.drop(columns=['Grades', 'Peer_Influence_Score', 'Peer_Risk_Level'])
target = df['Peer_Risk_Level']

categorical_cols = features.select_dtypes(include='object').columns
features[categorical_cols] = features[categorical_cols].fillna('Unknown')
features_encoded = pd.get_dummies(features, columns=categorical_cols)
features_encoded = features_encoded.fillna(features_encoded.median())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_encoded)
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

num_classes = len(np.unique(y_train))
y_train_cat = to_categorical(y_train, num_classes)
y_val_cat = to_categorical(y_val, num_classes)
y_test_cat = to_categorical(y_test, num_classes)


In [None]:
# Compute peer influence score
def compute_peer_risk(row):
    score = 0
    if row['Peer_Group'] == "Negative": score += 1
    if row['Bullying'] == "Yes": score += 1
    if row['Lack_of_Interest'] == "High": score += 1
    if row['Motivation'] == "Low": score += 1
    if row['Parental_Involvement'] == "Low": score += 1
    if row['Time_Wasted_on_Social_Media'] >= 5: score += 1
    return score

df['Peer_Influence_Score'] = df.apply(compute_peer_risk, axis=1)

def assign_risk_label(score):
    if score >= 5: return "High"
    elif score >= 3: return "Moderate"
    return "Low"

df['Peer_Risk_Level'] = df['Peer_Influence_Score'].apply(assign_risk_label)
df = df.dropna(subset=['Peer_Risk_Level'])

Feature Selection and Encoding

In [None]:

# 🚀 Feature Selection and Encoding
features = df.drop(columns=['Grades', 'Peer_Influence_Score', 'Peer_Risk_Level'])
target = df['Peer_Risk_Level']

categorical_cols = features.select_dtypes(include='object').columns
features[categorical_cols] = features[categorical_cols].fillna('Unknown')
features_encoded = pd.get_dummies(features, columns=categorical_cols)
features_encoded = features_encoded.fillna(features_encoded.median())

scaler = StandardScaler()
X_scaled = scaler.fit_transform(features_encoded)

label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(target)

X_train, X_temp, y_train, y_temp = train_test_split(X_scaled, y_encoded, test_size=0.3, stratify=y_encoded, random_state=42)
X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=0.5, stratify=y_temp, random_state=42)

num_classes = len(np.unique(y_train))
y_train_cat = to_categorical(y_train, num_classes)
y_val_cat = to_categorical(y_val, num_classes)
y_test_cat = to_categorical(y_test, num_classes)


Step 3: Define Model Builder

In [None]:
def build_model(input_dim, layers=[64, 32], optimizer='adam', lr=None, dropout_rates=None, regularizer=None):
    if lr:
        if optimizer == 'adam': optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
        elif optimizer == 'rmsprop': optimizer = tf.keras.optimizers.RMSprop(learning_rate=lr)
        elif optimizer == 'sgd': optimizer = tf.keras.optimizers.SGD(learning_rate=lr)
    model = Sequential()
    for i, units in enumerate(layers):
        kwargs = {'activation': 'relu', 'kernel_regularizer': regularizer}
        if i == 0:
            model.add(Dense(units, input_shape=(input_dim,), **kwargs))
        else:
            model.add(Dense(units, **kwargs))
        if dropout_rates and i < len(dropout_rates):
            model.add(Dropout(dropout_rates[i]))
    model.add(Dense(3, activation='softmax'))
    model.compile(optimizer=optimizer, loss='categorical_crossentropy', metrics=['accuracy'])
    return model


 Step 4: Evaluation Function

In [None]:
def evaluate_and_record(model, X_test, y_test_cat, y_test, config):
    loss, acc = model.evaluate(X_test, y_test_cat, verbose=0)
    preds = model.predict(X_test)
    y_pred = np.argmax(preds, axis=1)
    return {
        **config,
        'Accuracy': round(acc, 4),
        'F1_Score': round(f1_score(y_test, y_pred, average='macro'), 4),
        'Recall': round(recall_score(y_test, y_pred, average='macro'), 4),
        'Precision': round(precision_score(y_test, y_pred, average='macro'), 4)
    }


Step 5: Train and Save Neural Network Models



In [None]:
models_results = []
early_stop = EarlyStopping(monitor='val_loss', patience=5, restore_best_weights=True)
input_dim = X_train.shape[1]

# Instance 1 - Baseline
model_1 = build_model(input_dim)
model_1.fit(X_train, y_train_cat, validation_data=(X_val, y_val_cat), epochs=10, batch_size=32, verbose=1)
model_1.save("saved_models/instance1.h5")
models_results.append(evaluate_and_record(model_1, X_test, y_test_cat, y_test, {
    'Instance': '1', 'Optimizer': 'adam', 'Regularizer': 'None', 'Epochs': 10, 'Early Stopping': 'No', 'Number of Layers': '2', 'Learning Rate': 'default'
}))







Epoch 1/10


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 4ms/step - accuracy: 0.7268 - loss: 0.6236 - val_accuracy: 0.9132 - val_loss: 0.2405
Epoch 2/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9358 - loss: 0.1970 - val_accuracy: 0.9444 - val_loss: 0.1494
Epoch 3/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9703 - loss: 0.1028 - val_accuracy: 0.9550 - val_loss: 0.1136
Epoch 4/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9836 - loss: 0.0591 - val_accuracy: 0.9603 - val_loss: 0.0989
Epoch 5/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9944 - loss: 0.0355 - val_accuracy: 0.9629 - val_loss: 0.0930
Epoch 6/10
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9981 - loss: 0.0217 - val_accuracy: 0.9629 - val_loss: 0.0879
Epoch 7/10
[1m221/221[0m [32m━━━━━━━



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [None]:
# Instance 2
model_2 = build_model(input_dim, layers=[128, 64], optimizer='adam', regularizer=l2(0.001))
model_2.fit(X_train, y_train_cat, validation_data=(X_val, y_val_cat), epochs=50, callbacks=[early_stop], batch_size=32, verbose=1)
model_2.save("saved_models/instance2.h5")
models_results.append(evaluate_and_record(model_2, X_test, y_test_cat, y_test, {
    'Instance': '2', 'Optimizer': 'adam', 'Regularizer': 'L2', 'Epochs': 50, 'Early Stopping': 'Yes', 'Number of Layers': '2', 'Learning Rate': 'default'
}))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 4ms/step - accuracy: 0.7411 - loss: 0.7908 - val_accuracy: 0.9358 - val_loss: 0.3847
Epoch 2/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9562 - loss: 0.3215 - val_accuracy: 0.9603 - val_loss: 0.2699
Epoch 3/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9809 - loss: 0.2310 - val_accuracy: 0.9596 - val_loss: 0.2441
Epoch 4/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9958 - loss: 0.1789 - val_accuracy: 0.9689 - val_loss: 0.2197
Epoch 5/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9982 - loss: 0.1515 - val_accuracy: 0.9662 - val_loss: 0.1987
Epoch 6/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.9994 - loss: 0.1272 - val_accuracy: 0.9689 - val_loss: 0.1798
Epoch 7/50
[1m221/221[0m [32m━━━━━━━



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


In [None]:

# Instance 3
model_3 = build_model(input_dim, layers=[128, 64], optimizer='rmsprop', dropout_rates=[0.3, 0.2])
model_3.fit(X_train, y_train_cat, validation_data=(X_val, y_val_cat), epochs=50, callbacks=[early_stop], batch_size=32, verbose=1)
model_3.save("saved_models/instance3.h5")
models_results.append(evaluate_and_record(model_3, X_test, y_test_cat, y_test, {
    'Instance': '3', 'Optimizer': 'rmsprop', 'Regularizer': 'Dropout', 'Epochs': 50, 'Early Stopping': 'Yes', 'Number of Layers': '2 + Dropout', 'Learning Rate': 'default'
}))

Epoch 1/50


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 6ms/step - accuracy: 0.7361 - loss: 0.5846 - val_accuracy: 0.9212 - val_loss: 0.2088
Epoch 2/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.9057 - loss: 0.2415 - val_accuracy: 0.9550 - val_loss: 0.1229
Epoch 3/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9313 - loss: 0.1669 - val_accuracy: 0.9583 - val_loss: 0.1038
Epoch 4/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9519 - loss: 0.1164 - val_accuracy: 0.9603 - val_loss: 0.0914
Epoch 5/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9591 - loss: 0.0982 - val_accuracy: 0.9603 - val_loss: 0.0901
Epoch 6/50
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.9672 - loss: 0.0802 - val_accuracy: 0.9656 - val_loss: 0.0825
Epoch 7/50
[1m221/221[0m [32m━━━━━━━



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 2ms/step


In [None]:
# Instance 4
model_4 = build_model(input_dim, layers=[256, 128, 64], optimizer='sgd', lr=0.01, dropout_rates=[0.4, 0.3, 0.2])
model_4.fit(X_train, y_train_cat, validation_data=(X_val, y_val_cat), epochs=60, callbacks=[early_stop], batch_size=32, verbose=1)
model_4.save("saved_models/instance4.h5")
models_results.append(evaluate_and_record(model_4, X_test, y_test_cat, y_test, {
    'Instance': '4', 'Optimizer': 'sgd', 'Regularizer': 'Dropout', 'Epochs': 60, 'Early Stopping': 'Yes', 'Number of Layers': '3 + Dropout', 'Learning Rate': 0.01
}))

Epoch 1/60


  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 5ms/step - accuracy: 0.5935 - loss: 0.8782 - val_accuracy: 0.6927 - val_loss: 0.6138
Epoch 2/60
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.7025 - loss: 0.6383 - val_accuracy: 0.7298 - val_loss: 0.5181
Epoch 3/60
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7344 - loss: 0.5729 - val_accuracy: 0.8258 - val_loss: 0.3959
Epoch 4/60
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.7910 - loss: 0.4708 - val_accuracy: 0.8954 - val_loss: 0.3116
Epoch 5/60
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 4ms/step - accuracy: 0.8270 - loss: 0.4103 - val_accuracy: 0.9053 - val_loss: 0.2546
Epoch 6/60
[1m221/221[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 3ms/step - accuracy: 0.8512 - loss: 0.3551 - val_accuracy: 0.9272 - val_loss: 0.2161
Epoch 7/60
[1m221/221[0m [32m━━━━━━━



[1m48/48[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 3ms/step


Step 6: Display NN Results Table




In [None]:
nn_df = pd.DataFrame(models_results)
display(nn_df)
nn_df.to_csv("saved_models/final_nn_report.csv", index=False)

Unnamed: 0,Instance,Optimizer,Regularizer,Epochs,Early Stopping,Number of Layers,Learning Rate,Accuracy,F1_Score,Recall,Precision
0,1,adam,,10,No,2,default,0.9596,0.8802,0.841,0.9444
1,2,adam,L2,50,Yes,2,default,0.9768,0.9038,0.8683,0.9588
2,3,rmsprop,Dropout,50,Yes,2 + Dropout,default,0.9669,0.8936,0.868,0.9255
3,4,sgd,Dropout,60,Yes,3 + Dropout,0.01,0.9748,0.9267,0.9542,0.9038


Step 7: Classical ML Models

In [None]:
ml_reports = []

# Logistic Regression
lr_model = GridSearchCV(LogisticRegression(max_iter=1000, multi_class='ovr'), param_grid={'C': [0.1, 1, 10]}, cv=3)
lr_model.fit(X_train, y_train)
y_pred_lr = lr_model.predict(X_test)
ml_reports.append({
    'Instance': 'LogReg', 'Optimizer': 'liblinear', 'Regularizer': 'L2', 'Epochs': 'n/a', 'Early Stopping': 'No', 'Number of Layers': 'n/a', 'Learning Rate': 'n/a',
    'Accuracy': round(accuracy_score(y_test, y_pred_lr), 4),
    'F1 Score': round(f1_score(y_test, y_pred_lr, average='macro'), 4),
    'Recall': round(recall_score(y_test, y_pred_lr, average='macro'), 4),
    'Precision': round(precision_score(y_test, y_pred_lr, average='macro'), 4)
})
joblib.dump(lr_model.best_estimator_, "saved_models/logistic_regression.pkl")

# SVM
svm_model = GridSearchCV(SVC(probability=True), param_grid={'C': [1, 10], 'kernel': ['rbf', 'linear']}, cv=3)
svm_model.fit(X_train, y_train)
y_pred_svm = svm_model.predict(X_test)
ml_reports.append({
    'Instance': 'SVM', 'Optimizer': 'n/a', 'Regularizer': 'n/a', 'Epochs': 'n/a', 'Early Stopping': 'No', 'Number of Layers': 'n/a', 'Learning Rate': 'n/a',
    'Accuracy': round(accuracy_score(y_test, y_pred_svm), 4),
    'F1 Score': round(f1_score(y_test, y_pred_svm, average='macro'), 4),
    'Recall': round(recall_score(y_test, y_pred_svm, average='macro'), 4),
    'Precision': round(precision_score(y_test, y_pred_svm, average='macro'), 4)
})
joblib.dump(svm_model.best_estimator_, "saved_models/svm_model.pkl")

# XGBoost
xgb_model = xgb.XGBClassifier(use_label_encoder=False, eval_metric='mlogloss')
xgb_model.fit(X_train, y_train)
y_pred_xgb = xgb_model.predict(X_test)
ml_reports.append({
    'Instance': 'XGBoost', 'Optimizer': 'tree booster', 'Regularizer': 'L1 & L2', 'Epochs': 'auto', 'Early Stopping': 'No', 'Number of Layers': 'n/a', 'Learning Rate': '0.3 (default)',
    'Accuracy': round(accuracy_score(y_test, y_pred_xgb), 4),
    'F1 Score': round(f1_score(y_test, y_pred_xgb, average='macro'), 4),
    'Recall': round(recall_score(y_test, y_pred_xgb, average='macro'), 4),
    'Precision': round(precision_score(y_test, y_pred_xgb, average='macro'), 4)
})
joblib.dump(xgb_model, "saved_models/xgboost_model.pkl")


Parameters: { "use_label_encoder" } are not used.



['saved_models/xgboost_model.pkl']

 Step 8: Display Classical ML Results Table

In [None]:
ml_df = pd.DataFrame(ml_reports)
display(ml_df)
ml_df.to_csv("saved_models/final_ml_report.csv", index=False)

Unnamed: 0,Instance,Optimizer,Regularizer,Epochs,Early Stopping,Number of Layers,Learning Rate,Accuracy,F1 Score,Recall,Precision
0,LogReg,liblinear,L2,,No,,,0.9861,0.751,0.72,0.9851
1,SVM,,,,No,,,0.994,0.9378,0.9429,0.9329
2,XGBoost,tree booster,L1 & L2,auto,No,,0.3 (default),1.0,1.0,1.0,1.0
