In [None]:
# --- FIRST CELL IN COLAB ---
# Pin every source of randomness before any model code runs.
# The environment variables below are assigned *before* `import tensorflow`;
# keep that order when editing this cell.
import os, random, numpy as np
SEED = 42
# NOTE: Python reads PYTHONHASHSEED at interpreter start-up, so assigning it here
# does not change hashing in the already-running kernel (only in child processes).
os.environ["PYTHONHASHSEED"] = str(SEED)
os.environ["TF_DETERMINISTIC_OPS"] = "1"
os.environ["TF_CUDNN_DETERMINISTIC"] = "1"
# Thread-count settings, all forced to a single thread.
os.environ["TF_NUM_INTRAOP_THREADS"] = "1"
os.environ["TF_NUM_INTEROP_THREADS"] = "1"
os.environ["OMP_NUM_THREADS"] = "1"

import tensorflow as tf
random.seed(SEED); np.random.seed(SEED); tf.random.set_seed(SEED)

try:
    tf.config.experimental.enable_op_determinism()  # ok if available
except Exception as exc:
    # Still best-effort (the notebook keeps running), but no longer silent:
    # without this call results may not be repeatable, so tell the reader.
    print(
        f"WARNING: TensorFlow op determinism not enabled "
        f"({type(exc).__name__}: {exc}); results may differ between runs."
    )



Data Loading

In [None]:
import pandas as pd, numpy as np, glob, os, random, tensorflow as tf

# Set TensorFlow parallelism threads at the very beginning
tf.config.threading.set_intra_op_parallelism_threads(1)
tf.config.threading.set_inter_op_parallelism_threads(1)

from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error

# ==== 0) Make runs reproducible ====
SEED = 42
os.environ["PYTHONHASHSEED"] = str(SEED)
random.seed(SEED)
np.random.seed(SEED)
tf.random.set_seed(SEED)
try:
    tf.config.experimental.enable_op_determinism()  # may be missing on older TF builds
except Exception as exc:
    # Best effort: keep going, but report it so non-reproducible runs are not a surprise.
    print(
        f"WARNING: TensorFlow op determinism not enabled "
        f"({type(exc).__name__}: {exc}); results may differ between runs."
    )


# ==== 1) Load data ====

CSV_PATH = "/content/sample_data/student_exam_scores.csv"
print("Using CSV:", CSV_PATH)

df = pd.read_csv(CSV_PATH)
print("Shape:", df.shape)
print(df.head(3))

# ==== 2) Set target column ====
TARGET = "exam_score"  # <-- change if your column is named differently
assert TARGET in df.columns, f"{TARGET} not found. Columns are: {df.columns.tolist()}"

# ==== 3) Quick EDA (very light) ====
# Missing values are only reported here; nothing in this cell imputes or drops them.
print("\nMissing per column (count):\n", df.isna().sum())
print("\nTarget describe:\n", df[TARGET].describe())

# ==== 4) Basic cleaning: drop obvious IDs if present (optional) ====
# Only the identifier-like columns that actually exist are removed, in one call.
id_like_columns = [col for col in ("student_id", "id", "Index", "index") if col in df.columns]
df = df.drop(columns=id_like_columns)

# ==== 5) Split X/y ====
y = df[TARGET].astype(float)   # regression target as float
X = df.drop(columns=[TARGET])  # every remaining column is used as a feature

print("\nDataTypes are: \n", df.dtypes)





Splitting into Train/Val/Test sets

In [None]:
# Two-stage hold-out split (target ≈ 80/10/10):
#   1. carve 10% off as the test set,
#   2. take 0.1111 of the remaining 90% (≈ 10% of the whole) as validation.
# Both stages use the same fixed random_state.
split_seed = {"random_state": 42}

X_trainval, X_test, y_trainval, y_test = train_test_split(
    X, y, test_size=0.10, **split_seed
)
X_train, X_val, y_train, y_val = train_test_split(
    X_trainval, y_trainval, test_size=0.1111, **split_seed
)

shape_report = ", ".join(
    f"{split_name} {split_frame.shape}"
    for split_name, split_frame in (("train", X_train), ("val", X_val), ("test", X_test))
)
print(f"\nX shapes → {shape_report}")

Simplest Model (no scaling, no regularisation)

In [None]:

# Convert to float32 arrays for Keras.
# np.asarray accepts pandas objects *and* NumPy arrays, so this cell can be
# re-run after the names below have already been rebound to arrays
# (the previous `.astype(...).values` form raised AttributeError on a re-run).
X_train = np.asarray(X_train, dtype="float32")
X_val   = np.asarray(X_val,   dtype="float32")
X_test  = np.asarray(X_test,  dtype="float32")
y_train = np.asarray(y_train, dtype="float32")
y_val   = np.asarray(y_val,   dtype="float32")
y_test  = np.asarray(y_test,  dtype="float32")

# ==== 6) Simple deep learning model (Keras MLP) ====

from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Input
from tensorflow.keras.callbacks import EarlyStopping

# Two hidden ReLU layers and a single linear output unit; inputs are unscaled here.
model = Sequential([
    Input(shape=(X_train.shape[1],)),
    Dense(64, activation="relu"),
    Dense(32, activation="relu"),
    Dense(1)  # regression output
])

model.compile(optimizer="adam", loss="mse", metrics=["mae"])  # MSE loss, MAE metric

# Stop once val_loss has not improved for 10 epochs and roll back to the best weights.
es = EarlyStopping(patience=10, restore_best_weights=True, monitor="val_loss")

history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    epochs=200,
    batch_size=32,    # with ~200 rows this is fine; you can also set to len(X_train) for full-batch
    callbacks=[es]
)

# ==== 7) Evaluate on test set ====
pred = model.predict(X_test, verbose=0).ravel()  # flatten (n, 1) predictions to (n,)
mae  = mean_absolute_error(y_test, pred)
rmse = root_mean_squared_error(y_test, pred)
r2   = r2_score(y_test, pred)
print(f"\nTest MAE:  {mae:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R^2:  {r2:.4f}")

Simple Model with scaled features and regularisation

In [None]:
# === (A) Standardise features: statistics come from the training split only ===
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
X_train_s = scaler.fit_transform(X_train).astype("float32")
X_val_s   = scaler.transform(X_val).astype("float32")
X_test_s  = scaler.transform(X_test).astype("float32")

# === (B) Small Keras MLP with L2 weight penalties and one dropout layer ===
import tensorflow as tf
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
tf.random.set_seed(42)

n_features = X_train_s.shape[1]
model = Sequential([
    Input(shape=(n_features,)),
    Dense(64, activation="relu", kernel_regularizer=l2(1e-4)),
    Dropout(0.25),
    Dense(32, activation="relu", kernel_regularizer=l2(1e-4)),
    Dense(1),
])

adam = tf.keras.optimizers.Adam(learning_rate=1e-3)
model.compile(optimizer=adam, loss="mse", metrics=["mae"])

# Early stopping watches val_loss; the learning-rate schedule watches val_mae.
es = EarlyStopping(
    monitor="val_loss",
    min_delta=1e-4,
    patience=8,
    restore_best_weights=True,
)
rlr = ReduceLROnPlateau(monitor="val_mae", factor=0.5, patience=1, min_lr=1e-5)

fit_options = dict(
    epochs=400,
    batch_size=min(32, len(X_train_s)),  # never larger than the training set
    shuffle=False,
    verbose=0,
    callbacks=[es, rlr],
)
history = model.fit(
    X_train_s, y_train,
    validation_data=(X_val_s, y_val),
    **fit_options,
)

# Training was silent (verbose=0), so print the per-epoch losses afterwards.
epoch_losses = zip(history.history["loss"], history.history["val_loss"])
for epoch_no, (train_loss, val_loss) in enumerate(epoch_losses, start=1):
    print(f"Epoch {epoch_no:03d} | train_loss {train_loss:.4f} | val_loss {val_loss:.4f}")

# === (C) Test metrics ===
pred = model.predict(X_test_s).ravel()
mae, rmse, r2 = (
    mean_absolute_error(y_test, pred),
    root_mean_squared_error(y_test, pred),
    r2_score(y_test, pred),
)
print(f"Test MAE:  {mae:.4f}")
print(f"Test RMSE: {rmse:.4f}")
print(f"Test R^2:  {r2:.4f}")


In [None]:
import numpy as np, pandas as pd
from sklearn.metrics import mean_absolute_error

# ---- 1) Permutation importance for a Keras regressor ----
def permutation_importance_keras(model, X_s, y, n_repeats=20, seed=42):
    """
    Permutation importance for a fitted regressor: the mean increase in MAE
    observed when a single feature column is shuffled.

    model:     object exposing ``predict(X, verbose=0)`` (e.g. a Keras model)
    X_s:       2D array-like of (scaled) features, shape (n_samples, n_features)
    y:         1D array-like of true targets, length n_samples
    n_repeats: number of shuffles per feature; must be >= 1
    seed:      seed for the shuffling RNG
    returns:   (importances, baseline_mae); importances has one entry per feature
    raises:    ValueError if X_s is not 2D, if X_s and y disagree on the number
               of samples, or if n_repeats < 1

    The caller's X_s is never modified; shuffling happens on a private copy.
    """
    X_base = np.asarray(X_s)
    y_true = np.asarray(y).ravel()

    if X_base.ndim != 2:
        raise ValueError(f"X_s must be 2D (n_samples, n_features); got ndim={X_base.ndim}")
    if X_base.shape[0] != y_true.shape[0]:
        raise ValueError(
            f"X_s has {X_base.shape[0]} rows but y has {y_true.shape[0]} values"
        )
    if n_repeats < 1:
        # With zero repeats the mean of an empty list would silently yield NaN.
        raise ValueError(f"n_repeats must be >= 1; got {n_repeats}")

    rng = np.random.RandomState(seed)
    base_pred = model.predict(X_base, verbose=0).ravel()
    baseline_mae = mean_absolute_error(y_true, base_pred)

    Xw = X_base.copy()  # working copy; one column at a time is shuffled in it
    imps = np.zeros(X_base.shape[1], dtype=float)

    for j in range(X_base.shape[1]):
        scores = []
        saved_col = Xw[:, j].copy()
        for _ in range(n_repeats):
            idx = rng.permutation(len(y_true))
            Xw[:, j] = saved_col[idx]          # shuffle column j
            pred = model.predict(Xw, verbose=0).ravel()
            scores.append(mean_absolute_error(y_true, pred))
        imps[j] = np.mean(scores) - baseline_mae
        Xw[:, j] = saved_col                  # restore column before the next feature
    return imps, baseline_mae

# Labels for the importance table: the column names of the feature frame X
feature_names = X.columns.tolist()

# Score the current `model` on the scaled test split (30 shuffles per feature)
imps, base_mae = permutation_importance_keras(
    model=model, X_s=X_test_s, y=y_test, n_repeats=30, seed=42
)

# One row per feature, largest MAE increase first
imp_df = pd.DataFrame({"feature": feature_names, "mae_increase": imps})
imp_df = imp_df.sort_values(by="mae_increase", ascending=False)

print(f"Baseline MAE on test: {base_mae:.4f}")
display(imp_df.head(5))


N-fold cross validation for a simple Keras regressor

In [None]:
# ===== N-fold cross validation for a simple Keras regressor =====
from sklearn.model_selection import KFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from tensorflow.keras import Sequential
from tensorflow.keras.layers import Dense, Dropout, Input
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau
from tensorflow.keras.regularizers import l2

# --- reproducibility (simple) ---
# Re-seed Python, NumPy and TensorFlow before the cross-validation run.
SEED = 42
for seed_fn in (random.seed, np.random.seed, tf.random.set_seed):
    seed_fn(SEED)

# X (feature frame) and y (float target) come from the data-loading cell;
# cross-validation works on the full data as float32 arrays.
X_arr = X.astype("float32").to_numpy()
y_arr = y.to_numpy().astype("float32")

K = 5  # <-- change to your desired number of folds (e.g., 5 or 10)

def build_model(input_dim, dropout_rate=0.25, l2_strength=1e-4,
                learning_rate=1e-3, huber_delta=3.0):
    """
    Build and compile the small MLP regressor used in cross-validation.

    Architecture: Dense(64, relu) -> Dropout -> Dense(32, relu) -> Dense(1),
    with an L2 kernel penalty on both hidden layers.

    input_dim:     number of input features
    dropout_rate:  rate of the single Dropout layer (default 0.25)
    l2_strength:   L2 penalty applied to both hidden layers (default 1e-4)
    learning_rate: Adam learning rate (default 1e-3)
    huber_delta:   delta of the Huber loss (default 3.0)
    returns:       a compiled Sequential model that also tracks MAE

    The defaults reproduce the previously hard-coded configuration, so
    ``build_model(n)`` behaves as before.
    """
    model = Sequential([
        Input(shape=(input_dim,)),
        Dense(64, activation="relu", kernel_regularizer=l2(l2_strength)),
        Dropout(dropout_rate),
        Dense(32, activation="relu", kernel_regularizer=l2(l2_strength)),
        Dense(1)  # linear output for regression
    ])
    model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate=learning_rate),
                  loss=tf.keras.losses.Huber(delta=huber_delta),  # or "mse"
                  metrics=["mae"])
    return model

# ---- K-fold evaluation loop ----
# Each fold: scale on the training part, fit a fresh model, score on the held-out part.
# Early stopping / LR scheduling are driven by an *inner* split of the training
# fold. Previously the held-out fold itself was passed as validation_data and
# then scored, so the "best weights" were selected on the very data used for
# the reported metrics (optimistically biased estimates).
from sklearn.model_selection import train_test_split

ES_HOLDOUT_FRACTION = 0.15  # share of each training fold reserved for early stopping

kf = KFold(n_splits=K, shuffle=True, random_state=SEED)

maes, rmses, r2s = [], [], []

for fold, (train_idx, val_idx) in enumerate(kf.split(X_arr, y_arr), start=1):
    X_tr, X_va = X_arr[train_idx], X_arr[val_idx]
    y_tr, y_va = y_arr[train_idx], y_arr[val_idx]

    # Scale features *inside each fold* (fit on train, transform val)
    sc = StandardScaler()
    X_trs = sc.fit_transform(X_tr).astype("float32")
    X_vas = sc.transform(X_va).astype("float32")

    # Inner split of the (scaled) training fold: X_fit drives the gradient
    # updates, X_stop drives the callbacks. The held-out fold is not involved.
    X_fit, X_stop, y_fit, y_stop = train_test_split(
        X_trs, y_tr, test_size=ES_HOLDOUT_FRACTION, random_state=SEED
    )

    model = build_model(X_trs.shape[1])

    es  = EarlyStopping(monitor="val_loss", patience=10, restore_best_weights=True, min_delta=1e-4)
    rlr = ReduceLROnPlateau(monitor="val_loss", factor=0.5, patience=5, min_lr=1e-5)

    history = model.fit(
        X_fit, y_fit,
        validation_data=(X_stop, y_stop),
        epochs=400,
        batch_size=min(32, len(X_fit)),
        shuffle=False,
        verbose=0,
        callbacks=[es, rlr]
    )

    # Evaluate on the held-out fold (untouched during fitting and early stopping)
    pred = model.predict(X_vas, verbose=0).ravel()
    mae  = mean_absolute_error(y_va, pred)
    rmse = root_mean_squared_error(y_va, pred)
    r2   = r2_score(y_va, pred)

    maes.append(mae); rmses.append(rmse); r2s.append(r2)
    print(f"Fold {fold}/{K} -> MAE {mae:.4f} | RMSE {rmse:.4f} | R^2 {r2:.4f} | epochs {len(history.history['loss'])}")

# Mean ± standard deviation of each metric across the K folds
print("\n==== Cross-validated results ====")
print(f"MAE : {np.mean(maes):.4f} ± {np.std(maes):.4f}")
print(f"RMSE: {np.mean(rmses):.4f} ± {np.std(rmses):.4f}")
print(f"R^2 : {np.mean(r2s):.4f} ± {np.std(r2s):.4f}")


In [None]:
import numpy as np, pandas as pd
from sklearn.metrics import mean_absolute_error

# ---- Permutation importance for the last cross-validation fold ----
# `permutation_importance_keras` is defined once, in the earlier
# permutation-importance cell; it is reused here instead of being declared a
# second time with an identical body.

# Labels for the importance table: the column names of the feature frame X
feature_names = X.columns.tolist()

# `model`, `X_vas` and `y_va` are whatever the K-fold loop left bound when it
# finished, i.e. the last fold's model and that fold's held-out data
# (this is NOT the hold-out test split).
imps, base_mae = permutation_importance_keras(model, X_vas, y_va, n_repeats=30, seed=42)

imp_df = pd.DataFrame({
    "feature": feature_names,
    "mae_increase": imps
}).sort_values("mae_increase", ascending=False)

print(f"Baseline MAE on last CV fold (held-out part): {base_mae:.4f}")
display(imp_df.head(5))


In [None]:
# Put the last CV fold's errors on a relative scale.
# `rmse` and `mae` are the values the K-fold loop computed for this same
# held-out fold (y_va). They replace numbers that had been pasted in from an
# earlier output and would go stale on any re-run.
fold_std = y_va.std()
fold_range = y_va.max() - y_va.min()
nrmse = rmse / fold_std          # RMSE / std of y_va
perc_range = mae / fold_range    # MAE as a fraction of the target range
print(f"NRMSE: {nrmse:.4f} | MAE as % of range: {perc_range:.2%}")

In [None]:
# Relative error against the hold-out test targets.
# NOTE(review): 3.0614 and 2.5437 are typed-in literals (used as RMSE and MAE
# respectively), not values computed in this cell. Confirm they really are
# test-set metrics from the current run before relying on this output.
test_std, test_min, test_max = y_test.std(), y_test.min(), y_test.max()
nrmse = 3.0614 / test_std  # RMSE / std of y_test
perc_range = 2.5437 / (test_max - test_min)
print("NRMSE:", nrmse, "MAE as % of range:", perc_range)


In [None]:
import matplotlib.pyplot as plt
# Learning curves for whichever fit most recently assigned `history`
h = history.history
fig, ax = plt.subplots()
for series_key, series_label in (("loss", "train"), ("val_loss", "val")):
    ax.plot(h[series_key], label=series_label)
ax.set_title("Learning curves (loss)")
ax.set_xlabel("epoch")
ax.set_ylabel("loss")
ax.legend()
plt.show()


In [None]:
from sklearn.metrics import mean_absolute_error

# Train-vs-validation MAE for the model currently bound to `model`.
# When the notebook is run top to bottom, the K-fold loop has rebound `model`
# to the last fold's network, which was fit on features scaled by that fold's
# scaler (`sc`). It is therefore scored on that fold's own arrays
# (X_trs / X_vas with y_tr / y_va) rather than on X_train_s / X_val_s, which
# were produced by a different scaler and a different split.
train_pred = model.predict(X_trs).ravel()
val_pred   = model.predict(X_vas).ravel()

train_mae = mean_absolute_error(y_tr, train_pred)
val_mae   = mean_absolute_error(y_va, val_pred)

print(f"Train MAE: {train_mae:.3f}   Val MAE: {val_mae:.3f}")
