In [None]:
# Colab-only step: mount Google Drive at /content/gdrive so that files stored
# under that path can be read by the cells below.
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models

###############################################
# 1) LOAD DATA
###############################################
# Path of the CSV to load (located under the mounted Google Drive).
data_path = "/content/gdrive/MyDrive/Colab Notebooks/Untitled folder/new_PISA_cleaned_dataset.csv"  # Update if needed

# Fail fast with an exception that names the file actually being looked for,
# instead of printing a message with a different file name and exiting silently.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"❌ File not found: {data_path}. "
        f"Please upload '{os.path.basename(data_path)}' into your environment."
    )

data = pd.read_csv(data_path)
print("Data loaded. Full shape:", data.shape)
display(data.head())

###############################################
# 2) MAKE A COPY OF THE DATA FOR MODELING
###############################################
# No explicit data.copy() is needed: the non-inplace drop in step 3 already
# returns a new DataFrame, so `data` stays untouched and we avoid holding an
# extra full-size copy of the raw frame in memory.
print("\nInitial model_data shape:", data.shape)

###############################################
# 3) DROP UNWANTED COLUMNS
###############################################
columns_to_remove = [
    "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR",
    "CURIOAGR", "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR",
    "INFOSEEK", "FAMSUP", "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA",
    "EXPO21ST", "MATHEFF", "MATHEF21", "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF",
    "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", "CREATOP", "OPENART", "IMAGINE",
    "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", "SDLEFF", "ICTRES",
    "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", "ACCESSFA",
    "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ",
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC",
    "BODYIMA", "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP",
    "PQMIMP", "PQMCAR", "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME",
    "CREATACT", "CREATOPN", "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD",
    "ENCOURPG", "DIGDVPOL", "TEAFDBK", "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT",
    "EDUSHORT", "STUBEHA", "TEACHBEHA", "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC",
    "CREENVSC", "ACTCRESC", "OPENCUL", "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP",
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH",
    "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]
# errors='ignore' skips any listed name that is not a column of `data`.
model_data = data.drop(columns=columns_to_remove, errors='ignore')
print("After dropping specified columns:", model_data.shape)

###############################################
# 4) REMOVE FULLY EMPTY (ALL-NaN) COLUMNS
###############################################
# A column qualifies only when every one of its rows is missing.
empty_mask = model_data.isnull().all()
all_nan_cols = model_data.columns[empty_mask]
if len(all_nan_cols) > 0:
    print("Dropping all-NaN columns:", list(all_nan_cols))
    model_data.drop(columns=all_nan_cols, inplace=True)

###############################################
# 5) DETECT & DROP ZERO-VARIANCE COLUMNS
###############################################
numeric_cols_all = model_data.select_dtypes(include=[np.number]).columns
# A numeric column is treated as constant when it holds at most one distinct
# value, with NaN counted as a value of its own (dropna=False).
zero_var_cols = [
    name
    for name in numeric_cols_all
    if model_data[name].nunique(dropna=False) <= 1
]

if len(zero_var_cols) > 0:
    print("Dropping zero-variance columns:", zero_var_cols)
    model_data.drop(columns=zero_var_cols, inplace=True, errors='ignore')

###############################################
# 6) CHECK FOR AND HANDLE INFINITE VALUES
###############################################
num_df = model_data.select_dtypes(include=[np.number])
has_inf = np.isinf(num_df).any()
inf_cols = num_df.columns[has_inf]
if len(inf_cols) > 0:
    print("Found inf/-inf in columns:", list(inf_cols))
    # Turn +/-inf into NaN so the later imputation step fills them
    for inf_col in inf_cols:
        column_values = model_data[inf_col]
        model_data[inf_col] = np.where(np.isinf(column_values), np.nan, column_values)

###############################################
# 7) REORDER COLUMNS (TARGET FIRST)
###############################################
target_col = "MATH_Proficient"
if target_col not in model_data.columns:
    raise ValueError(f"Target column '{target_col}' not found in data.")

# Every non-target column, kept in its existing order.
other_cols = [name for name in model_data.columns if name != target_col]
model_data = model_data[[target_col, *other_cols]]

###############################################
# 8) TRAIN/VAL/TEST SPLIT
###############################################
X = model_data.drop(columns=[target_col])
y = model_data[target_col].values

# A binary classifier needs both classes to be present
n_classes = len(np.unique(y))
if n_classes < 2:
    raise ValueError("Target column has only one unique class, cannot train a binary classifier.")

# First carve off 30% of the rows, then halve that part into validation and
# test, giving a 70/15/15 split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=1729, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=1729, shuffle=True
)

print("\nData Splits:")
for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name} shape:", split_X.shape, split_y.shape)

###############################################
# 9) LABEL ENCODE OBJECT COLUMNS
###############################################
# Integer-encode every object column using categories learned from the
# training split only. LabelEncoder.transform raises ValueError on a category
# it did not see during fit, so validation/test values are mapped through the
# fitted classes instead and any unseen category receives UNSEEN_CODE.
# NOTE(review): integer codes impose an arbitrary order on nominal columns;
# consider one-hot encoding or an embedding if that matters for the model.
UNSEEN_CODE = -1

obj_cols = X_train.select_dtypes(include=['object']).columns
print("\nLabel-encoding these columns:", list(obj_cols))
for col in obj_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    # classes_ is sorted; a category's position is the code fit_transform assigned.
    code_of = {str(category): code for code, category in enumerate(le.classes_)}
    X_val[col] = X_val[col].astype(str).map(code_of).fillna(UNSEEN_CODE).astype(int)
    X_test[col] = X_test[col].astype(str).map(code_of).fillna(UNSEEN_CODE).astype(int)

###############################################
# 10) FILL NUMERIC MISSING WITH TRAIN MEAN
###############################################
num_cols = X_train.select_dtypes(include=[np.number]).columns

# Per-column means come from the training split only and are reused for
# validation/test, so no statistics from held-out rows enter the imputation.
train_means = X_train[num_cols].mean()

# DataFrame.fillna with a Series fills each column with the value stored under
# that column's label. This replaces the chained `df[col].fillna(..., inplace=True)`
# pattern, which pandas warns will stop modifying the frame in pandas 3.0.
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)
X_test = X_test.fillna(train_means)

print("\nNaNs remaining in X_train:", X_train.isnull().sum().sum())
print("NaNs remaining in X_val:", X_val.isnull().sum().sum())
print("NaNs remaining in X_test:", X_test.isnull().sum().sum())

###############################################
# 11) STANDARD SCALING (Numeric Columns)
###############################################
# Scaling statistics are learned from the training split and then applied
# unchanged to all three splits.
scaler = StandardScaler().fit(X_train[num_cols])
for split_frame in (X_train, X_val, X_test):
    split_frame[num_cols] = scaler.transform(split_frame[num_cols])

# Sanity check: scaling must not have introduced NaNs
for split_name, split_frame in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"NaNs in {split_name} after scaling:", split_frame.isnull().sum().sum())

###############################################
# 12) BUILD THE MODEL
###############################################
tf.keras.backend.clear_session()

# Seed the Python, NumPy and TensorFlow RNGs (same seed as the data split) so
# weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(1729)

# Small fully connected binary classifier: two ReLU hidden layers and a
# sigmoid output that yields a score in [0, 1].
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

# EarlyStopping to avoid overfitting: stop once validation AUC has not improved
# for 5 consecutive epochs and roll back to the best-scoring weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=5,
    mode='max',
    restore_best_weights=True
)

###############################################
# 13) TRAIN THE MODEL
###############################################
# Training settings gathered in one place; early stopping may end the run
# before the epoch limit is reached.
training_options = {
    "validation_data": (X_val, y_val),
    "epochs": 50,
    "batch_size": 32,
    "callbacks": [early_stop],
    "verbose": 1,
}
history = model.fit(X_train, y_train, **training_options)

###############################################
# 14) EVALUATE ON VALIDATION
###############################################
val_preds = model.predict(X_val).ravel()
val_has_nan = np.isnan(val_preds).any()
print("\nAny NaNs in val_preds?", val_has_nan)
if not val_has_nan:
    val_auc = roc_auc_score(y_val, val_preds)
    print("Validation AUC:", round(val_auc, 4))
else:
    print("❌ NaNs present in validation predictions. Consider lowering the learning rate or checking outliers.")

###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# The test split is scored only when the validation predictions were clean.
if not val_has_nan:
    test_preds = model.predict(X_test).ravel()
    test_has_nan = np.isnan(test_preds).any()
    print("Any NaNs in test_preds?", test_has_nan)
    if test_has_nan:
        print("❌ NaNs in test_preds. Check for numeric instability.")
    else:
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

print("\n✅ Finished end-to-end training (global model)!")


Data loaded. Full shape: (591857, 1121)


Unnamed: 0,CNT,CNTSCHID,CNTSTUID,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,...,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
0,Albania,800282.0,800001.0,0.0,,,1.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,Albania,800115.0,800002.0,0.0,,2.0,2.0,2.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,Albania,800242.0,800003.0,0.0,,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,Albania,800245.0,800005.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,Albania,800285.0,800006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0



Initial model_data shape: (591857, 1121)
After dropping specified columns: (591857, 1084)

Data Splits:
Train shape: (414299, 1083) (414299,)
Validation shape: (88779, 1083) (88779,)
Test shape: (88779, 1083) (88779,)

Label-encoding these columns: ['CNT']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(train_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(train_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw


NaNs remaining in X_train: 0
NaNs remaining in X_val: 0
NaNs remaining in X_test: 0
NaNs in X_train after scaling: 0
NaNs in X_val after scaling: 0
NaNs in X_test after scaling: 0
Epoch 1/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m36s[0m 3ms/step - auc: 0.9140 - loss: 0.3684 - val_auc: 0.9288 - val_loss: 0.3375
Epoch 2/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - auc: 0.9319 - loss: 0.3300 - val_auc: 0.9293 - val_loss: 0.3385
Epoch 3/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - auc: 0.9374 - loss: 0.3167 - val_auc: 0.9308 - val_loss: 0.3353
Epoch 4/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - auc: 0.9421 - loss: 0.3047 - val_auc: 0.9299 - val_loss: 0.3382
Epoch 5/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - auc: 0.9450 - loss: 0.2970 - val_auc: 0.9282 - val_loss: 0.3447
Epoch 6/50
[1m12947/12947[0m [3

In [None]:
###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# Relies on `model`, `val_preds`, `X_test` and `y_test` from the training cell.
if not np.isnan(val_preds).any():
    test_preds = model.predict(X_test).ravel()
    print("Any NaNs in test_preds?", np.isnan(test_preds).any())
    if not np.isnan(test_preds).any():
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

        # ======================
        # CONFUSION MATRIX CODE
        # ======================

        # 1. Create a copy of X_test for confusion matrix analysis
        test_data_small = X_test.copy()
        # Insert the actual label into the DataFrame
        test_data_small['MATH_Proficient'] = y_test

        # 2. Assign the predictions and threshold
        predictions_small = test_preds
        threshold = 0.66  # Adjust as desired
        # Scores at or above the threshold are predicted as class 1
        predicted_labels = (predictions_small >= threshold).astype(int)

        # 3. Create confusion matrix via crosstab
        cm_small = pd.crosstab(
            index=test_data_small['MATH_Proficient'],
            columns=predicted_labels,
            rownames=['actuals'],
            colnames=['predictions']
        )
        # Force a full 2x2 table: crosstab omits a row/column for a class that
        # never occurs (e.g. every score falls below the threshold), which
        # would otherwise make the cell lookups below raise KeyError.
        cm_small = (
            cm_small
            .reindex(index=[0.0, 1.0], columns=[0, 1], fill_value=0)
            .rename_axis(index='actuals', columns='predictions')
        )

        # 4. Extract TN, FP, FN, TP
        TN_small = cm_small.loc[0.0, 0]
        FP_small = cm_small.loc[0.0, 1]
        FN_small = cm_small.loc[1.0, 0]
        TP_small = cm_small.loc[1.0, 1]

        # 5. Calculate various metrics (percentages; 0 when a denominator is empty)
        total_small = TP_small + TN_small + FP_small + FN_small
        accuracy_small = ((TP_small + TN_small) / total_small * 100) if total_small > 0 else 0
        precision_small = (TP_small / (TP_small + FP_small) * 100) if (TP_small + FP_small) > 0 else 0
        recall_small = (TP_small / (TP_small + FN_small) * 100) if (TP_small + FN_small) > 0 else 0
        f1_score_small = (2 * (precision_small * recall_small) / (precision_small + recall_small)
                          if (precision_small + recall_small) > 0 else 0)
        specificity_small = (TN_small / (TN_small + FP_small) * 100) if (TN_small + FP_small) > 0 else 0

        # 6. Print results, labelled with the number of features the evaluated
        #    model actually uses rather than a hard-coded count.
        feature_label = "({} features)".format(X_test.shape[1])
        print("\nConfusion Matrix:\n", cm_small)
        print("\nAccuracy {}: {:.1f}".format(feature_label, accuracy_small))
        print("F1 Score {}: {:.1f}".format(feature_label, f1_score_small))
        print("Precision {}: {:.1f}".format(feature_label, precision_small))
        print("Recall {}: {:.1f}".format(feature_label, recall_small))
        print("Specificity {}: {:.1f}".format(feature_label, specificity_small))

    else:
        print("❌ NaNs in test_preds. Check for numeric instability.")

print("\n✅ Finished end-to-end training!")

[1m2775/2775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Any NaNs in test_preds? False
Test AUC: 0.9306

Confusion Matrix:
 predictions      0      1
actuals                  
0.0          36838   3735
1.0          10802  37404

Accuracy (20 features): 83.6
F1 Score (20 features): 83.7
Precision (20 features): 90.9
Recall (20 features): 77.6
Specificity (20 features): 90.8

✅ Finished end-to-end training!


In [None]:
# 1) Install shap if not installed (done once in your environment)
# !pip install shap

import shap
import numpy as np

# 2) We already have our trained model, X_train, X_test, etc.

# 3) Prepare background data
X_train_np = X_train.values if hasattr(X_train, "values") else X_train
# Cap the sample at the number of available rows: sampling without replacement
# raises if more rows are requested than exist.
background_size = min(100, X_train_np.shape[0])
# Seeded generator so the same background rows (and therefore the same SHAP
# values) are obtained on every run.
background_rng = np.random.default_rng(1729)
background_indices = background_rng.choice(X_train_np.shape[0], background_size, replace=False)
background_data = X_train_np[background_indices]

# 4) Initialize DeepExplainer
explainer = shap.DeepExplainer(model, background_data)

# 5) Choose the data to explain (test set)
# NOTE(review): if these SHAP values are used to pick features for a model that
# is later scored on this same test split, the test data influences model
# design; consider explaining X_val instead.
X_test_np = X_test.values if hasattr(X_test, "values") else X_test
shap_values = explainer.shap_values(X_test_np)


Expected: keras_tensor
Received: inputs=['Tensor(shape=(100, 1083))']
Expected: keras_tensor
Received: inputs=['Tensor(shape=(200, 1083))']
Expected: keras_tensor
Received: inputs=['Tensor(shape=(88779, 1083))']


In [None]:
import numpy as np
import pandas as pd

# 1) Ensure shap_values is a NumPy array of shape (n_samples, n_features).
#    For a single-output model, shap.DeepExplainer may return a list holding
#    one array, or a single array with a trailing output axis of length 1.
if isinstance(shap_values, list) and len(shap_values) == 1:
    shap_values_arr = shap_values[0]
else:
    shap_values_arr = shap_values
shap_values_arr = np.asarray(shap_values_arr)
if shap_values_arr.ndim == 3 and shap_values_arr.shape[-1] == 1:
    shap_values_arr = shap_values_arr[..., 0]
if shap_values_arr.ndim != 2:
    # Flattening anything else would silently misalign values and feature names.
    raise ValueError(
        f"Expected SHAP values of shape (n_samples, n_features), got {shap_values_arr.shape}."
    )

# 2) Compute mean absolute SHAP values per feature -> 1D array of length n_features
mean_abs_shap = np.abs(shap_values_arr).mean(axis=0)

# 3) Retrieve feature names (if X_test is a DataFrame)
if isinstance(X_test, pd.DataFrame):
    feature_names = X_test.columns
else:
    # Fallback names if X_test is a NumPy array
    feature_names = [f"Feature_{i}" for i in range(shap_values_arr.shape[1])]

# 4) Sort features by their mean absolute SHAP value (descending order)
sorted_idx = np.argsort(mean_abs_shap)[::-1]
top_n = 20  # how many top features to show
top_idx = sorted_idx[:top_n]

# 5) Create a DataFrame for easier display
top_features_df = pd.DataFrame({
    'Feature': [feature_names[i] for i in top_idx],
    'MeanAbsSHAP': mean_abs_shap[top_idx]
})

# 6) Print or display; the heading reports how many rows are actually shown
print(f"Top {len(top_features_df)} features by mean absolute SHAP value:")
display(top_features_df)

Top 20 features by mean absolute SHAP value:


Unnamed: 0,Feature,MeanAbsSHAP
0,ST253Q01JA,0.030462
1,ST004D01T,0.029439
2,ST059Q02JA,0.022255
3,ST255Q01JA,0.018002
4,LANGN_156,0.014288
5,GRADE,0.014119
6,ST349Q01JA_1,0.01396
7,SC211Q03JA,0.012966
8,ST259Q01JA,0.012462
9,ST256Q03JA,0.01231


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models

###############################################
# 1) LOAD DATA
###############################################
# Path of the CSV to load (located under the mounted Google Drive).
data_path = "/content/gdrive/MyDrive/Colab Notebooks/Untitled folder/new_PISA_cleaned_dataset.csv"  # Update if needed

# Fail fast with an exception that names the file actually being looked for,
# instead of printing a message with a different file name and exiting silently.
if not os.path.isfile(data_path):
    raise FileNotFoundError(
        f"❌ File not found: {data_path}. "
        f"Please upload '{os.path.basename(data_path)}' into your environment."
    )

data = pd.read_csv(data_path)
print("Data loaded. Full shape:", data.shape)
display(data.head())

###############################################
# 2) MAKE A COPY OF THE DATA FOR MODELING
###############################################
# No explicit data.copy() is needed: the non-inplace drop in step 3 already
# returns a new DataFrame, so `data` stays untouched and we avoid holding an
# extra full-size copy of the raw frame in memory.
print("\nInitial model_data shape:", data.shape)

###############################################
# 3) DROP UNWANTED COLUMNS
###############################################
columns_to_remove = [
    "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR",
    "CURIOAGR", "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR",
    "INFOSEEK", "FAMSUP", "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA",
    "EXPO21ST", "MATHEFF", "MATHEF21", "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF",
    "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", "CREATOP", "OPENART", "IMAGINE",
    "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", "SDLEFF", "ICTRES",
    "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", "ACCESSFA",
    "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ",
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC",
    "BODYIMA", "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP",
    "PQMIMP", "PQMCAR", "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME",
    "CREATACT", "CREATOPN", "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD",
    "ENCOURPG", "DIGDVPOL", "TEAFDBK", "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT",
    "EDUSHORT", "STUBEHA", "TEACHBEHA", "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC",
    "CREENVSC", "ACTCRESC", "OPENCUL", "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP",
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH",
    "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]
# errors='ignore' skips any listed name that is not a column of `data`.
model_data = data.drop(columns=columns_to_remove, errors='ignore')
print("After dropping specified columns:", model_data.shape)

###############################################
# 3.5) LIMIT TO YOUR TOP FEATURES + TARGET
###############################################
target_col = "MATH_Proficient"

# Make sure this list matches exactly your column names in model_data:
features_to_keep = [
    "ST253Q01JA", "ST004D01T", "ST059Q02JA", "ST255Q01JA", "LANGN_156",
    "GRADE", "ST349Q01JA_1", "SC211Q03JA", "ST259Q01JA", "ST256Q03JA",
    "ST268Q04JA", "REPEAT", "ST297Q09JA", "WORKPAY", "ST251Q04JA",
    "EXERPRAC", "EXPECEDU", "ST349Q01JA_2", "ST251Q06JA", "CNT"
]

# Subset only target + these features
needed_cols = [target_col] + features_to_keep

# Report every missing name at once instead of failing inside the indexer.
missing_cols = [name for name in needed_cols if name not in model_data.columns]
if missing_cols:
    raise KeyError(f"Columns not found in model_data: {missing_cols}")

# .copy() makes the subset an independent frame: later steps modify model_data
# in place, which on a plain column selection can trigger pandas'
# SettingWithCopyWarning.
model_data = model_data[needed_cols].copy()
print("\nColumns retained:", list(model_data.columns))
print("Shape after subsetting:", model_data.shape)

###############################################
# 4) REMOVE FULLY EMPTY (ALL-NaN) COLUMNS
###############################################
# A column qualifies only when every one of its rows is missing.
empty_mask = model_data.isnull().all()
all_nan_cols = model_data.columns[empty_mask]
if len(all_nan_cols) > 0:
    print("Dropping all-NaN columns:", list(all_nan_cols))
    model_data.drop(columns=all_nan_cols, inplace=True)

###############################################
# 5) DETECT & DROP ZERO-VARIANCE COLUMNS
###############################################
numeric_cols_all = model_data.select_dtypes(include=[np.number]).columns
# A numeric column is treated as constant when it holds at most one distinct
# value, with NaN counted as a value of its own (dropna=False).
zero_var_cols = [
    name
    for name in numeric_cols_all
    if model_data[name].nunique(dropna=False) <= 1
]

if len(zero_var_cols) > 0:
    print("Dropping zero-variance columns:", zero_var_cols)
    model_data.drop(columns=zero_var_cols, inplace=True, errors='ignore')

###############################################
# 6) CHECK FOR AND HANDLE INFINITE VALUES
###############################################
num_df = model_data.select_dtypes(include=[np.number])
has_inf = np.isinf(num_df).any()
inf_cols = num_df.columns[has_inf]
if len(inf_cols) > 0:
    print("Found inf/-inf in columns:", list(inf_cols))
    # Turn +/-inf into NaN so the later imputation step fills them
    for inf_col in inf_cols:
        column_values = model_data[inf_col]
        model_data[inf_col] = np.where(np.isinf(column_values), np.nan, column_values)

###############################################
# 7) REORDER COLUMNS (TARGET FIRST) -- optional
###############################################
# Only reshuffle when the target is not already the leading column.
target_is_first = model_data.columns[0] == target_col
if not target_is_first:
    other_cols = [name for name in model_data.columns if name != target_col]
    model_data = model_data[[target_col, *other_cols]]

###############################################
# 8) TRAIN/VAL/TEST SPLIT
###############################################
X = model_data.drop(columns=[target_col])
y = model_data[target_col].values

# A binary classifier needs both classes to be present
n_classes = len(np.unique(y))
if n_classes < 2:
    raise ValueError("Target column has only one unique class, cannot train a binary classifier.")

# First carve off 30% of the rows, then halve that part into validation and
# test, giving a 70/15/15 split.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=1729, shuffle=True
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=1729, shuffle=True
)

print("\nData Splits:")
for split_name, split_X, split_y in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name} shape:", split_X.shape, split_y.shape)

###############################################
# 9) LABEL ENCODE OBJECT COLUMNS
###############################################
# Integer-encode every object column using categories learned from the
# training split only. LabelEncoder.transform raises ValueError on a category
# it did not see during fit, so validation/test values are mapped through the
# fitted classes instead and any unseen category receives UNSEEN_CODE.
# NOTE(review): integer codes impose an arbitrary order on nominal columns;
# consider one-hot encoding or an embedding if that matters for the model.
UNSEEN_CODE = -1

obj_cols = X_train.select_dtypes(include=['object']).columns
print("\nLabel-encoding these columns:", list(obj_cols))
for col in obj_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))
    # classes_ is sorted; a category's position is the code fit_transform assigned.
    code_of = {str(category): code for code, category in enumerate(le.classes_)}
    X_val[col] = X_val[col].astype(str).map(code_of).fillna(UNSEEN_CODE).astype(int)
    X_test[col] = X_test[col].astype(str).map(code_of).fillna(UNSEEN_CODE).astype(int)

###############################################
# 10) FILL NUMERIC MISSING WITH TRAIN MEAN
###############################################
num_cols = X_train.select_dtypes(include=[np.number]).columns

# Per-column means come from the training split only and are reused for
# validation/test, so no statistics from held-out rows enter the imputation.
train_means = X_train[num_cols].mean()

# DataFrame.fillna with a Series fills each column with the value stored under
# that column's label. This replaces the chained `df[col].fillna(..., inplace=True)`
# pattern, which pandas warns will stop modifying the frame in pandas 3.0.
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)
X_test = X_test.fillna(train_means)

print("\nNaNs remaining in X_train:", X_train.isnull().sum().sum())
print("NaNs remaining in X_val:", X_val.isnull().sum().sum())
print("NaNs remaining in X_test:", X_test.isnull().sum().sum())

###############################################
# 11) STANDARD SCALING (Numeric Columns)
###############################################
# Scaling statistics are learned from the training split and then applied
# unchanged to all three splits.
scaler = StandardScaler().fit(X_train[num_cols])
for split_frame in (X_train, X_val, X_test):
    split_frame[num_cols] = scaler.transform(split_frame[num_cols])

# Sanity check: scaling must not have introduced NaNs
for split_name, split_frame in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"NaNs in {split_name} after scaling:", split_frame.isnull().sum().sum())

###############################################
# 12) BUILD THE MODEL
###############################################
tf.keras.backend.clear_session()

# Seed the Python, NumPy and TensorFlow RNGs (same seed as the data split) so
# weight initialisation and batch shuffling are repeatable across runs.
tf.keras.utils.set_random_seed(1729)

# Small fully connected binary classifier: two ReLU hidden layers and a
# sigmoid output that yields a score in [0, 1].
model = models.Sequential([
    layers.Input(shape=(X_train.shape[1],)),
    layers.Dense(64, activation='relu'),
    layers.Dense(32, activation='relu'),
    layers.Dense(1, activation='sigmoid')
])

model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

# EarlyStopping to avoid overfitting: stop once validation AUC has not improved
# for 5 consecutive epochs and roll back to the best-scoring weights.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=5,
    mode='max',
    restore_best_weights=True
)

###############################################
# 13) TRAIN THE MODEL
###############################################
# Training settings gathered in one place; early stopping may end the run
# before the epoch limit is reached.
training_options = {
    "validation_data": (X_val, y_val),
    "epochs": 50,
    "batch_size": 32,
    "callbacks": [early_stop],
    "verbose": 1,
}
history = model.fit(X_train, y_train, **training_options)

###############################################
# 14) EVALUATE ON VALIDATION
###############################################
val_preds = model.predict(X_val).ravel()
val_has_nan = np.isnan(val_preds).any()
print("\nAny NaNs in val_preds?", val_has_nan)
if not val_has_nan:
    val_auc = roc_auc_score(y_val, val_preds)
    print("Validation AUC:", round(val_auc, 4))
else:
    print("❌ NaNs present in validation predictions. Consider lowering the learning rate or checking outliers.")

###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# The test split is scored only when the validation predictions were clean.
if not val_has_nan:
    test_preds = model.predict(X_test).ravel()
    test_has_nan = np.isnan(test_preds).any()
    print("Any NaNs in test_preds?", test_has_nan)
    if test_has_nan:
        print("❌ NaNs in test_preds. Check for numeric instability.")
    else:
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

print("\n✅ Finished end-to-end training with top features only!")


Data loaded. Full shape: (591857, 1121)


Unnamed: 0,CNT,CNTSCHID,CNTSTUID,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,...,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
0,Albania,800282.0,800001.0,0.0,,,1.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,Albania,800115.0,800002.0,0.0,,2.0,2.0,2.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,Albania,800242.0,800003.0,0.0,,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,Albania,800245.0,800005.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,Albania,800285.0,800006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0



Initial model_data shape: (591857, 1121)
After dropping specified columns: (591857, 1084)

Columns retained: ['MATH_Proficient', 'ST253Q01JA', 'ST004D01T', 'ST059Q02JA', 'ST255Q01JA', 'LANGN_156', 'GRADE', 'ST349Q01JA_1', 'SC211Q03JA', 'ST259Q01JA', 'ST256Q03JA', 'ST268Q04JA', 'REPEAT', 'ST297Q09JA', 'WORKPAY', 'ST251Q04JA', 'EXERPRAC', 'EXPECEDU', 'ST349Q01JA_2', 'ST251Q06JA', 'CNT']
Shape after subsetting: (591857, 21)

Data Splits:
Train shape: (414299, 20) (414299,)
Validation shape: (88779, 20) (88779,)
Test shape: (88779, 20) (88779,)

Label-encoding these columns: ['CNT']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(train_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(train_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw


NaNs remaining in X_train: 0
NaNs remaining in X_val: 0
NaNs remaining in X_test: 0
NaNs in X_train after scaling: 0
NaNs in X_val after scaling: 0
NaNs in X_test after scaling: 0
Epoch 1/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - auc: 0.8635 - loss: 0.4564 - val_auc: 0.8767 - val_loss: 0.4374
Epoch 2/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - auc: 0.8762 - loss: 0.4374 - val_auc: 0.8797 - val_loss: 0.4329
Epoch 3/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - auc: 0.8786 - loss: 0.4335 - val_auc: 0.8812 - val_loss: 0.4308
Epoch 4/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m29s[0m 2ms/step - auc: 0.8809 - loss: 0.4297 - val_auc: 0.8816 - val_loss: 0.4300
Epoch 5/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m30s[0m 2ms/step - auc: 0.8816 - loss: 0.4284 - val_auc: 0.8827 - val_loss: 0.4287
Epoch 6/50
[1m12947/12947[0m [3

In [None]:
###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# Relies on `model`, `val_preds`, `X_test` and `y_test` from the training cell.
if not np.isnan(val_preds).any():
    test_preds = model.predict(X_test).ravel()
    print("Any NaNs in test_preds?", np.isnan(test_preds).any())
    if not np.isnan(test_preds).any():
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

        # ======================
        # CONFUSION MATRIX CODE
        # ======================

        # 1. Create a copy of X_test for confusion matrix analysis
        test_data_small = X_test.copy()
        # Insert the actual label into the DataFrame
        test_data_small['MATH_Proficient'] = y_test

        # 2. Assign the predictions and threshold
        predictions_small = test_preds
        threshold = 0.66  # Adjust as desired
        # Scores at or above the threshold are predicted as class 1
        predicted_labels = (predictions_small >= threshold).astype(int)

        # 3. Create confusion matrix via crosstab
        cm_small = pd.crosstab(
            index=test_data_small['MATH_Proficient'],
            columns=predicted_labels,
            rownames=['actuals'],
            colnames=['predictions']
        )
        # Force a full 2x2 table: crosstab omits a row/column for a class that
        # never occurs (e.g. every score falls below the threshold), which
        # would otherwise make the cell lookups below raise KeyError.
        cm_small = (
            cm_small
            .reindex(index=[0.0, 1.0], columns=[0, 1], fill_value=0)
            .rename_axis(index='actuals', columns='predictions')
        )

        # 4. Extract TN, FP, FN, TP
        TN_small = cm_small.loc[0.0, 0]
        FP_small = cm_small.loc[0.0, 1]
        FN_small = cm_small.loc[1.0, 0]
        TP_small = cm_small.loc[1.0, 1]

        # 5. Calculate various metrics (percentages; 0 when a denominator is empty)
        total_small = TP_small + TN_small + FP_small + FN_small
        accuracy_small = ((TP_small + TN_small) / total_small * 100) if total_small > 0 else 0
        precision_small = (TP_small / (TP_small + FP_small) * 100) if (TP_small + FP_small) > 0 else 0
        recall_small = (TP_small / (TP_small + FN_small) * 100) if (TP_small + FN_small) > 0 else 0
        f1_score_small = (2 * (precision_small * recall_small) / (precision_small + recall_small)
                          if (precision_small + recall_small) > 0 else 0)
        specificity_small = (TN_small / (TN_small + FP_small) * 100) if (TN_small + FP_small) > 0 else 0

        # 6. Print results, labelled with the number of features the evaluated
        #    model actually uses rather than a hard-coded count.
        feature_label = "({} features)".format(X_test.shape[1])
        print("\nConfusion Matrix:\n", cm_small)
        print("\nAccuracy {}: {:.1f}".format(feature_label, accuracy_small))
        print("F1 Score {}: {:.1f}".format(feature_label, f1_score_small))
        print("Precision {}: {:.1f}".format(feature_label, precision_small))
        print("Recall {}: {:.1f}".format(feature_label, recall_small))
        print("Specificity {}: {:.1f}".format(feature_label, specificity_small))

    else:
        print("❌ NaNs in test_preds. Check for numeric instability.")

print("\n✅ Finished end-to-end training!")

[1m2775/2775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Any NaNs in test_preds? False
Test AUC: 0.8865

Confusion Matrix:
 predictions      0      1
actuals                  
0.0          35293   5280
1.0          13714  34492

Accuracy (20 features): 78.6
F1 Score (20 features): 78.4
Precision (20 features): 86.7
Recall (20 features): 71.6
Specificity (20 features): 87.0

✅ Finished end-to-end training!


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

# ==================================================
# 1) LOAD DATA
# ==================================================
data_path = "/content/gdrive/MyDrive/Colab Notebooks/Untitled folder/new_PISA_cleaned_dataset.csv"  # Update if needed

# Fail with an exception that names the path that was actually checked.
# The previous hint referred to a differently named file, and a bare
# `SystemExit` carries neither a message nor a failure status.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"❌ File not found: {data_path!r}. Mount Google Drive or update data_path."
    )

data = pd.read_csv(data_path)
print("Data loaded. Full shape:", data.shape)
display(data.head())

# ==================================================
# 2) COPY DATA
# ==================================================
# All later steps operate on `model_data`; `data` keeps the frame as loaded.
model_data = data.copy()
print("\nInitial model_data shape:", model_data.shape)

# ==================================================
# 3) DROP UNWANTED COLUMNS
# ==================================================
# Columns excluded from modelling. Names that are not present in the frame
# are skipped silently (errors='ignore').
columns_to_remove = [
    "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR",
    "CURIOAGR", "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR",
    "INFOSEEK", "FAMSUP", "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA",
    "EXPO21ST", "MATHEFF", "MATHEF21", "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF",
    "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", "CREATOP", "OPENART", "IMAGINE",
    "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", "SDLEFF", "ICTRES",
    "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", "ACCESSFA",
    "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ",
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC",
    "BODYIMA", "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP",
    "PQMIMP", "PQMCAR", "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME",
    "CREATACT", "CREATOPN", "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD",
    "ENCOURPG", "DIGDVPOL", "TEAFDBK", "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT",
    "EDUSHORT", "STUBEHA", "TEACHBEHA", "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC",
    "CREENVSC", "ACTCRESC", "OPENCUL", "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP",
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH",
    "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]
# Rebind to the reduced frame rather than mutating in place.
model_data = model_data.drop(columns=columns_to_remove, errors='ignore')
print("After dropping specified columns:", model_data.shape)

# ==================================================
# 3.5) KEEP ONLY TOP FEATURES + TARGET
# ==================================================
# The target goes first, followed by the 20 retained predictors in the order
# listed here.
target_col = "MATH_Proficient"
features_to_keep = [
    "ST253Q01JA", "ST004D01T", "ST059Q02JA", "ST255Q01JA",
    "LANGN_156", "GRADE", "ST349Q01JA_1", "SC211Q03JA",
    "ST259Q01JA", "ST256Q03JA", "ST268Q04JA", "REPEAT",
    "ST297Q09JA", "WORKPAY", "ST251Q04JA", "EXERPRAC",
    "EXPECEDU", "ST349Q01JA_2", "ST251Q06JA", "CNT",
]
model_data = model_data.loc[:, [target_col, *features_to_keep]]
print("\nColumns retained:", list(model_data.columns))
print("Shape after subsetting:", model_data.shape)

# ==================================================
# 4) REMOVE ALL-NaN COLUMNS
# ==================================================
# A column with no observed value at all carries no information.
all_nan_cols = model_data.columns[model_data.isna().all()]
if len(all_nan_cols) > 0:
    print("Dropping all-NaN columns:", list(all_nan_cols))
    model_data = model_data.drop(columns=all_nan_cols)

# ==================================================
# 5) DETECT & DROP ZERO-VARIANCE COLUMNS
# ==================================================
# NaN counts as a distinct value here (dropna=False), so a numeric column is
# only dropped when it holds a single value including missing entries.
numeric_cols_all = model_data.select_dtypes(include=[np.number]).columns
zero_var_cols = [
    name for name in numeric_cols_all
    if model_data[name].nunique(dropna=False) <= 1
]

if zero_var_cols:
    print("Dropping zero-variance columns:", zero_var_cols)
    model_data = model_data.drop(columns=zero_var_cols, errors='ignore')

# ==================================================
# 6) HANDLE INFINITE VALUES
# ==================================================
# Turn +/-inf into NaN so the later mean imputation treats them as missing.
num_df = model_data.select_dtypes(include=[np.number])
inf_cols = num_df.columns[np.isinf(num_df).any()]
if len(inf_cols) > 0:
    print("Found inf/-inf in columns:", list(inf_cols))
    for c in inf_cols:
        model_data[c] = model_data[c].mask(np.isinf(model_data[c]), np.nan)

# ==================================================
# 7) TRAIN/VAL/TEST SPLIT
# ==================================================
# 70% train; the remaining 30% is halved into validation and test.
X = model_data.drop(columns=[target_col])
y = model_data[target_col].values

n_classes = len(np.unique(y))
if n_classes < 2:
    raise ValueError("Target has only one unique class, cannot train a binary classifier.")

# Both splits share the same seed and shuffling settings.
split_options = {"random_state": 1729, "shuffle": True}
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, **split_options
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, **split_options
)

print("\nData Splits:")
print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)

# ==================================================
# 8) ENCODE CATEGORICALS
# ==================================================
# Take explicit copies of the split frames so the column assignments below
# (and in later steps) write to independent objects instead of frames pandas
# may flag as derived from X (SettingWithCopyWarning).
X_train = X_train.copy()
X_val = X_val.copy()
X_test = X_test.copy()

obj_cols = X_train.select_dtypes(include=['object']).columns
print("\nLabel-encoding these columns:", list(obj_cols))

# Code assigned to categories that occur in val/test but not in train.
# LabelEncoder.transform would raise ValueError on such values.
UNSEEN_CATEGORY_CODE = -1

for col in obj_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # Same label -> integer mapping the encoder learned on the training split.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    for held_out in (X_val, X_test):
        held_out[col] = (
            held_out[col].astype(str)
            .map(code_of)
            .fillna(UNSEEN_CATEGORY_CODE)
            .astype(int)
        )

# NOTE(review): the integer codes are later standard-scaled like ordinary
# numeric features, which imposes an arbitrary order on nominal categories
# such as CNT. Consider one-hot or embedding encoding.

# ==================================================
# 9) FILL NUMERIC MISSING WITH TRAIN MEAN
# ==================================================
# Means come from the training split only and are reused for validation and
# test, so no statistics from the held-out data are used.
num_cols = X_train.select_dtypes(include=[np.number]).columns
train_means = X_train[num_cols].mean()

X_train[num_cols] = X_train[num_cols].fillna(train_means)
X_val[num_cols] = X_val[num_cols].fillna(train_means)
X_test[num_cols] = X_test[num_cols].fillna(train_means)

print("\nNaNs remaining in X_train:", X_train.isna().sum().sum())
print("NaNs remaining in X_val:", X_val.isna().sum().sum())
print("NaNs remaining in X_test:", X_test.isna().sum().sum())

# ==================================================
# 10) STANDARD SCALING
# ==================================================
# Fit the scaler on the training split, then apply the same transform to the
# validation and test splits.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
for held_out in (X_val, X_test):
    held_out[num_cols] = scaler.transform(held_out[num_cols])

print("NaNs in X_train after scaling:", X_train.isna().sum().sum())
print("NaNs in X_val after scaling:", X_val.isna().sum().sum())
print("NaNs in X_test after scaling:", X_test.isna().sum().sum())

# ==================================================
# 11) BUILD IMPROVED MODEL
# ==================================================
tf.keras.backend.clear_session()

# Two regularised hidden blocks (Dense -> BatchNorm -> Dropout), followed by
# a plain 32-unit layer and a single sigmoid output unit.
network_layers = [layers.Input(shape=(X_train.shape[1],))]
for width in (128, 64):
    network_layers.append(
        layers.Dense(
            width, activation='relu',
            kernel_regularizer=regularizers.l2(0.0005)
        )
    )
    network_layers.append(layers.BatchNormalization())
    network_layers.append(layers.Dropout(0.3))
network_layers.append(layers.Dense(32, activation='relu'))
network_layers.append(layers.Dense(1, activation='sigmoid'))

model = models.Sequential(network_layers)

# Binary cross-entropy loss; AUC is tracked under the name 'auc' so the
# callbacks can monitor 'val_auc'.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

# Stop once val_auc has not improved for 5 epochs and roll back to the
# weights of the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=5,
    restore_best_weights=True
)

# Halve the learning rate after 3 epochs without val_auc improvement,
# never going below 1e-6.
reduce_lr = tf.keras.callbacks.ReduceLROnPlateau(
    monitor='val_auc',
    mode='max',
    factor=0.5,
    patience=3,
    min_lr=1e-6
)

# ==================================================
# 12) TRAIN THE MODEL
# ==================================================
training_callbacks = [early_stop, reduce_lr]
history = model.fit(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=50,
    callbacks=training_callbacks,
    verbose=1
)

# ==================================================
# 13) EVALUATE
# ==================================================
# The test set is only scored when the validation predictions are NaN-free.
val_preds = model.predict(X_val).ravel()
val_has_nan = np.isnan(val_preds).any()
print("\nAny NaNs in val_preds?", val_has_nan)

if val_has_nan:
    print("❌ NaNs in val_preds. Consider adjusting LR or checking outliers.")
else:
    val_auc = roc_auc_score(y_val, val_preds)
    print("Validation AUC:", round(val_auc, 4))

    test_preds = model.predict(X_test).ravel()
    test_has_nan = np.isnan(test_preds).any()
    print("Any NaNs in test_preds?", test_has_nan)
    if test_has_nan:
        print("❌ NaNs in test_preds. Check numeric stability.")
    else:
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

print("\n✅ Finished training with an adjusted architecture!")


Data loaded. Full shape: (591857, 1121)


Unnamed: 0,CNT,CNTSCHID,CNTSTUID,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,...,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
0,Albania,800282.0,800001.0,0.0,,,1.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,Albania,800115.0,800002.0,0.0,,2.0,2.0,2.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,Albania,800242.0,800003.0,0.0,,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,Albania,800245.0,800005.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,Albania,800285.0,800006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0



Initial model_data shape: (591857, 1121)
After dropping specified columns: (591857, 1084)

Columns retained: ['MATH_Proficient', 'ST253Q01JA', 'ST004D01T', 'ST059Q02JA', 'ST255Q01JA', 'LANGN_156', 'GRADE', 'ST349Q01JA_1', 'SC211Q03JA', 'ST259Q01JA', 'ST256Q03JA', 'ST268Q04JA', 'REPEAT', 'ST297Q09JA', 'WORKPAY', 'ST251Q04JA', 'EXERPRAC', 'EXPECEDU', 'ST349Q01JA_2', 'ST251Q06JA', 'CNT']
Shape after subsetting: (591857, 21)

Data Splits:
Train shape: (414299, 20) (414299,)
Validation shape: (88779, 20) (88779,)
Test shape: (88779, 20) (88779,)

Label-encoding these columns: ['CNT']

NaNs remaining in X_train: 0
NaNs remaining in X_val: 0
NaNs remaining in X_test: 0
NaNs in X_train after scaling: 0
NaNs in X_val after scaling: 0
NaNs in X_test after scaling: 0
Epoch 1/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m43s[0m 3ms/step - auc: 0.8462 - loss: 0.5235 - val_auc: 0.8730 - val_loss: 0.4566 - learning_rate: 0.0010
Epoch 2/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━

In [None]:
###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# Uses `model`, `val_preds`, `X_test` and `y_test` as left in the kernel by
# the cell that trained the model.
if np.isnan(val_preds).any():
    # This case used to fall through without any message.
    print("❌ NaNs in val_preds. Skipping test-set evaluation.")
else:
    test_preds = model.predict(X_test).ravel()
    print("Any NaNs in test_preds?", np.isnan(test_preds).any())
    if not np.isnan(test_preds).any():
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

        # ======================
        # CONFUSION MATRIX CODE
        # ======================

        # 1. Copy X_test and attach the true labels for the cross-tabulation
        test_data_small = X_test.copy()
        test_data_small['MATH_Proficient'] = y_test

        # 2. Scores at or above the threshold are classified as positive (1)
        predictions_small = test_preds
        threshold = 0.66  # Adjust as desired
        predicted_labels = (predictions_small >= threshold).astype(int)

        # 3. Cross-tabulate actual vs. predicted labels
        cm_small = pd.crosstab(
            index=test_data_small['MATH_Proficient'],
            columns=predicted_labels,
            rownames=['actuals'],
            colnames=['predictions']
        )
        # Force a full 2x2 table: crosstab omits a row/column for a class that
        # never occurs (e.g. a threshold so high that nothing is predicted
        # positive), which previously made the .loc lookups raise KeyError.
        cm_small = (
            cm_small
            .reindex(index=[0.0, 1.0], columns=[0, 1], fill_value=0)
            .rename_axis(index='actuals', columns='predictions')
        )

        # 4. Extract TN, FP, FN, TP (rows = actual, columns = predicted)
        TN_small = cm_small.loc[0.0, 0]
        FP_small = cm_small.loc[0.0, 1]
        FN_small = cm_small.loc[1.0, 0]
        TP_small = cm_small.loc[1.0, 1]

        # 5. Metrics in percent; a zero denominator yields 0 instead of an error
        accuracy_small = (TP_small + TN_small) / (TP_small + TN_small + FP_small + FN_small) * 100
        precision_small = (TP_small / (TP_small + FP_small) * 100) if (TP_small + FP_small) > 0 else 0
        recall_small = (TP_small / (TP_small + FN_small) * 100) if (TP_small + FN_small) > 0 else 0
        f1_score_small = (2 * (precision_small * recall_small) / (precision_small + recall_small)
                          if (precision_small + recall_small) > 0 else 0)
        specificity_small = (TN_small / (TN_small + FP_small) * 100) if (TN_small + FP_small) > 0 else 0

        # 6. Print results
        print("\nConfusion Matrix:\n", cm_small)
        print("\nAccuracy (20 features): {:.1f}".format(accuracy_small))
        print("F1 Score (20 features): {:.1f}".format(f1_score_small))
        print("Precision (20 features): {:.1f}".format(precision_small))
        print("Recall (20 features): {:.1f}".format(recall_small))
        print("Specificity (20 features): {:.1f}".format(specificity_small))

    else:
        print("❌ NaNs in test_preds. Check for numeric instability.")

print("\n✅ Finished end-to-end training!")

[1m2775/2775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Any NaNs in test_preds? False
Test AUC: 0.8833

Confusion Matrix:
 predictions      0      1
actuals                  
0.0          35782   4791
1.0          15363  32843

Accuracy (20 features): 77.3
F1 Score (20 features): 76.5
Precision (20 features): 87.3
Recall (20 features): 68.1
Specificity (20 features): 88.2

✅ Finished end-to-end training!


In [None]:
!pip install keras-tuner



In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models, regularizers

# Keras Tuner imports
import keras_tuner as kt

##############################################################################
# 1) LOAD AND PREPARE DATA
##############################################################################
data_path = "/content/gdrive/MyDrive/Colab Notebooks/Untitled folder/new_PISA_cleaned_dataset.csv"  # Modify if needed

# Fail with an exception that names the path that was actually checked.
# The previous hint referred to a differently named file, and a bare
# `SystemExit` carries neither a message nor a failure status.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"❌ File not found: {data_path!r}. Mount Google Drive or update data_path."
    )

data = pd.read_csv(data_path)
print("Data loaded. Full shape:", data.shape)

# All later steps operate on `model_data`; `data` keeps the frame as loaded.
model_data = data.copy()
print("\nInitial model_data shape:", model_data.shape)

##############################################################################
# 2) DROP UNWANTED COLUMNS
##############################################################################
# Columns excluded from modelling. Names that are not present in the frame
# are skipped silently (errors='ignore').
columns_to_remove = [
    "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR",
    "CURIOAGR", "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR",
    "INFOSEEK", "FAMSUP", "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA",
    "EXPO21ST", "MATHEFF", "MATHEF21", "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF",
    "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", "CREATOP", "OPENART", "IMAGINE",
    "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", "SDLEFF", "ICTRES",
    "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", "ACCESSFA",
    "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ",
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC",
    "BODYIMA", "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP",
    "PQMIMP", "PQMCAR", "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME",
    "CREATACT", "CREATOPN", "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD",
    "ENCOURPG", "DIGDVPOL", "TEAFDBK", "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT",
    "EDUSHORT", "STUBEHA", "TEACHBEHA", "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC",
    "CREENVSC", "ACTCRESC", "OPENCUL", "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP",
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH",
    "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]
# Rebind to the reduced frame rather than mutating in place.
model_data = model_data.drop(columns=columns_to_remove, errors='ignore')
print("After dropping specified columns:", model_data.shape)

##############################################################################
# 3) KEEP ONLY TOP FEATURES + TARGET
##############################################################################
# The target goes first, followed by the 20 retained predictors in the order
# listed here.
target_col = "MATH_Proficient"
features_to_keep = [
    "ST253Q01JA", "ST004D01T", "ST059Q02JA", "ST255Q01JA",
    "LANGN_156", "GRADE", "ST349Q01JA_1", "SC211Q03JA",
    "ST259Q01JA", "ST256Q03JA", "ST268Q04JA", "REPEAT",
    "ST297Q09JA", "WORKPAY", "ST251Q04JA", "EXERPRAC",
    "EXPECEDU", "ST349Q01JA_2", "ST251Q06JA", "CNT",
]
model_data = model_data.loc[:, [target_col, *features_to_keep]]
print("\nColumns retained:", list(model_data.columns))
print("Shape after subsetting:", model_data.shape)

##############################################################################
# 4) REMOVE ALL-NaN COLUMNS & ZERO-VARIANCE
##############################################################################
# A column with no observed value at all carries no information.
all_nan_cols = model_data.columns[model_data.isna().all()]
if len(all_nan_cols) > 0:
    print("Dropping all-NaN columns:", list(all_nan_cols))
    model_data = model_data.drop(columns=all_nan_cols)

# Count distinct values per numeric column, treating NaN as a value of its
# own (dropna=False); columns with at most one distinct value are dropped.
numeric_cols_all = model_data.select_dtypes(include=[np.number]).columns
distinct_counts = model_data[numeric_cols_all].nunique(dropna=False)
zero_var_cols = distinct_counts[distinct_counts <= 1].index.tolist()
if zero_var_cols:
    print("Dropping zero-variance columns:", zero_var_cols)
    model_data = model_data.drop(columns=zero_var_cols, errors='ignore')

##############################################################################
# 5) SPLIT DATA
##############################################################################
# 70% train; the remaining 30% is halved into validation and test.
X = model_data.drop(columns=[target_col])
y = model_data[target_col].values

n_classes = len(np.unique(y))
if n_classes < 2:
    raise ValueError("Target has only one unique class.")

# Both splits share the same seed and shuffling settings.
split_options = {"random_state": 1729, "shuffle": True}
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, **split_options
)
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, **split_options
)

print("\nData Splits:")
print("Train shape:", X_train.shape, y_train.shape)
print("Validation shape:", X_val.shape, y_val.shape)
print("Test shape:", X_test.shape, y_test.shape)

##############################################################################
# 6) ENCODE CATEGORICAL + FILL MISSING
##############################################################################
# Take explicit copies of the split frames so the column assignments below
# (and in the scaling step) write to independent objects instead of frames
# pandas may flag as derived from X (SettingWithCopyWarning).
X_train = X_train.copy()
X_val = X_val.copy()
X_test = X_test.copy()

obj_cols = X_train.select_dtypes(include=['object']).columns
print("\nLabel-encoding these columns:", list(obj_cols))

# Code assigned to categories that occur in val/test but not in train.
# LabelEncoder.transform would raise ValueError on such values.
UNSEEN_CATEGORY_CODE = -1

for col in obj_cols:
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col].astype(str))

    # Same label -> integer mapping the encoder learned on the training split.
    code_of = {label: code for code, label in enumerate(le.classes_)}
    for held_out in (X_val, X_test):
        held_out[col] = (
            held_out[col].astype(str)
            .map(code_of)
            .fillna(UNSEEN_CATEGORY_CODE)
            .astype(int)
        )

# Impute numeric gaps with the training-split mean so no statistics from the
# validation or test data are used.
num_cols = X_train.select_dtypes(include=[np.number]).columns
train_means = X_train[num_cols].mean()
X_train[num_cols] = X_train[num_cols].fillna(train_means)
X_val[num_cols] = X_val[num_cols].fillna(train_means)
X_test[num_cols] = X_test[num_cols].fillna(train_means)

##############################################################################
# 7) SCALE DATA
##############################################################################
# Fit the scaler on the training split, then apply the same transform to the
# validation and test splits.
scaler = StandardScaler()
X_train[num_cols] = scaler.fit_transform(X_train[num_cols])
for held_out in (X_val, X_test):
    held_out[num_cols] = scaler.transform(held_out[num_cols])

##############################################################################
# 8) DEFINE A HYPERMODEL FOR KERAS TUNER
##############################################################################
def build_model(hp):
    """
    Assemble and compile one candidate network for Keras Tuner.

    Searched hyperparameters:
      * num_layers    - number of hidden Dense layers (1 to 3)
      * learning_rate - Adam learning rate (1e-2, 1e-3 or 1e-4)
      * units_<i>     - width of hidden layer i (32 to 256, step 32)
      * l2_reg        - L2 kernel penalty; one name, so one value for all layers
      * dropout_<i>   - whether a Dropout layer follows hidden layer i
      * dropout_rate  - dropout rate; one name, so one value for all layers

    The input width is read from the module-level X_train.

    hp: a HyperParameters instance from Keras Tuner
    Returns the compiled Sequential model (binary cross-entropy, AUC metric).
    """
    tf.keras.backend.clear_session()

    depth = hp.Int('num_layers', min_value=1, max_value=3, step=1)
    lr = hp.Choice('learning_rate', values=[1e-2, 1e-3, 1e-4])

    stack = [layers.Input(shape=(X_train.shape[1],))]

    for layer_idx in range(depth):
        width = hp.Int(f'units_{layer_idx}', min_value=32, max_value=256, step=32)
        l2_strength = hp.Float('l2_reg', min_value=1e-5, max_value=1e-3, sampling='log')
        stack.append(
            layers.Dense(
                width,
                activation='relu',
                kernel_regularizer=regularizers.l2(l2_strength),
            )
        )

        # The rate is only requested when dropout is switched on for this layer.
        use_dropout = hp.Boolean(f'dropout_{layer_idx}')
        if use_dropout:
            rate = hp.Float('dropout_rate', min_value=0.1, max_value=0.5, step=0.1)
            stack.append(layers.Dropout(rate))

    # Single sigmoid unit for the binary target.
    stack.append(layers.Dense(1, activation='sigmoid'))

    network = models.Sequential(stack)
    network.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=lr),
        loss='binary_crossentropy',
        metrics=[tf.keras.metrics.AUC(name='auc')]
    )
    return network

##############################################################################
# 9) CREATE A TUNER (RANDOM SEARCH OR HYPERBAND, ETC.)
##############################################################################
# Random search over the space defined in build_model, ranked by validation
# AUC. `overwrite=True` discards any earlier results stored in the same
# directory/project.
tuner = kt.RandomSearch(
    hypermodel=build_model,
    objective=kt.Objective("val_auc", direction="max"),
    max_trials=120,          # number of hyperparameter configurations to try
    executions_per_trial=1, # how many times to train each configuration
    seed=1729,               # same seed as the data split, so the sampled
                             # configurations are the same on every run
    overwrite=True,
    directory="my_tuner_dir",
    project_name="pisa_hp_tuning"
)
# NOTE: the seed fixes which configurations are sampled. Weight initialisation
# and batch shuffling are still unseeded, so per-trial scores can vary.

##############################################################################
# 10) TUNE HYPERPARAMETERS
##############################################################################
# Stop a run once val_auc has not improved for 3 epochs, keeping the weights
# of the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    mode='max',
    patience=3,
    restore_best_weights=True
)

search_callbacks = [early_stop]
tuner.search(
    X_train, y_train,
    validation_data=(X_val, y_val),
    batch_size=32,
    epochs=10,     # keep epochs modest for faster tuning
    callbacks=search_callbacks,
    verbose=1
)

##############################################################################
# 11) GET BEST HYPERPARAMS & BUILD FINAL MODEL
##############################################################################
# Take the top-ranked configuration from the search.
best_hps = tuner.get_best_hyperparameters(num_trials=1)[0]
print("\nBest hyperparameters found:")
for param_name in best_hps.values:
    print(param_name, ":", best_hps.values[param_name])

# Build a fresh, untrained model from that configuration.
best_model = tuner.hypermodel.build(best_hps)

# OPTIONAL: retrain the best model more thoroughly
final_fit_options = {
    "validation_data": (X_val, y_val),
    "epochs": 30,            # more epochs now that we've found good hyperparams
    "batch_size": 32,
    "callbacks": [early_stop],
    "verbose": 1,
}
final_history = best_model.fit(X_train, y_train, **final_fit_options)

##############################################################################
# 12) PRINT THE BEST MODEL'S ARCHITECTURE
##############################################################################
print("\nBest Model Architecture:")
best_model.summary()

##############################################################################
# 13) EVALUATE ON VALIDATION & TEST
##############################################################################
# Flatten the model output to 1-D scores before computing ROC AUC.
val_scores = best_model.predict(X_val)
val_preds = val_scores.reshape(-1)
val_auc = roc_auc_score(y_val, val_preds)
print(f"\nValidation AUC: {val_auc:.4f}")

test_scores = best_model.predict(X_test)
test_preds = test_scores.reshape(-1)
test_auc = roc_auc_score(y_test, test_preds)
print(f"Test AUC: {test_auc:.4f}")


Trial 120 Complete [00h 02m 19s]
val_auc: 0.8664931058883667

Best val_auc So Far: 0.8852567672729492
Total elapsed time: 09h 09m 36s

Best hyperparameters found:
num_layers : 2
learning_rate : 0.001
units_0 : 160
l2_reg : 1.162911054826589e-05
dropout_0 : False
dropout_rate : 0.5
units_1 : 64
dropout_1 : True
units_2 : 192
dropout_2 : False
Epoch 1/30
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m35s[0m 3ms/step - auc: 0.8593 - loss: 0.4666 - val_auc: 0.8768 - val_loss: 0.4398
Epoch 2/30
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - auc: 0.8727 - loss: 0.4469 - val_auc: 0.8792 - val_loss: 0.4356
Epoch 3/30
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 2ms/step - auc: 0.8748 - loss: 0.4440 - val_auc: 0.8816 - val_loss: 0.4321
Epoch 4/30
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m31s[0m 2ms/step - auc: 0.8772 - loss: 0.4400 - val_auc: 0.8817 - val_loss: 0.4332
Epoch 5/30
[1m12947/12947[

[1m2775/2775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step

Validation AUC: 0.8856
[1m2775/2775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Test AUC: 0.8841


In [None]:
import pandas as pd
import numpy as np
import os

from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import LabelEncoder, StandardScaler

import tensorflow as tf
from tensorflow.keras import layers, models

###############################################
# 1) LOAD DATA
###############################################
# CSV read by this cell; adjust if the file lives elsewhere on the mounted drive.
data_path = "/content/gdrive/MyDrive/Colab Notebooks/Untitled folder/new_PISA_cleaned_dataset.csv"  # Update if needed

# Fail fast and name the file this cell actually expects. The previous hint
# mentioned 'PISA_cleaned_dataset.csv', which is not the file being read, and
# the bare SystemExit carried no message of its own.
if not os.path.exists(data_path):
    raise FileNotFoundError(
        f"❌ File not found: {data_path!r}. "
        f"Please upload '{os.path.basename(data_path)}' to that location or update data_path."
    )

data = pd.read_csv(data_path)
print("Data loaded. Full shape:", data.shape)
display(data.head())  # `display` is provided by the IPython/Colab runtime

###############################################
# 2) MAKE A COPY OF THE DATA FOR MODELING
###############################################
# Work on an independent copy so `data` keeps the contents as loaded.
model_data = data.copy()
print("\nInitial model_data shape:", model_data.shape)

###############################################
# 3) DROP UNWANTED COLUMNS
###############################################
# Columns excluded from modeling, listed by their exact names.
columns_to_remove = [
    "CNTSCHID", "CNTSTUID", "OECD",
    "HOMEPOS", "RELATST", "BELONG", "BULLIED", "FEELSAFE", "SCHRISK", "PERSEVAGR",
    "CURIOAGR", "COOPAGR", "EMPATAGR", "ASSERAGR", "STRESAGR", "EMOCOAGR", "GROSAGR",
    "INFOSEEK", "FAMSUP", "DISCLIM", "TEACHSUP", "COGACRCO", "COGACMCO", "EXPOFA",
    "EXPO21ST", "MATHEFF", "MATHEF21", "FAMCON", "ANXMAT", "MATHPERS", "CREATEFF",
    "CREATSCH", "CREATFAM", "CREATAS", "CREATOOS", "CREATOP", "OPENART", "IMAGINE",
    "SCHSUST", "LEARRES", "PROBSELF", "FAMSUPSL", "FEELLAH", "SDLEFF", "ICTRES",
    "FLSCHOOL", "FLMULTSB", "FLFAMILY", "ACCESSFP", "FLCONFIN", "FLCONICT", "ACCESSFA",
    "ATTCONFM", "FRINFLFM", "ICTSCH", "ICTHOME", "ICTQUAL", "ICTSUBJ", "ICTENQ",
    "ICTFEED", "ICTOUT", "ICTWKDY", "ICTWKEND", "ICTREG", "ICTINFO", "ICTEFFIC",
    "BODYIMA", "SOCONPA", "LIFESAT", "PSYCHSYM", "SOCCON", "EXPWB", "CURSUPP",
    "PQMIMP", "PQMCAR", "PARINVOL", "PQSCHOOL", "PASCHPOL", "ATTIMMP", "CREATHME",
    "CREATACT", "CREATOPN", "CREATOR", "SCHAUTO", "TCHPART", "EDULEAD", "INSTLEAD",
    "ENCOURPG", "DIGDVPOL", "TEAFDBK", "MTTRAIN", "DMCVIEWS", "NEGSCLIM", "STAFFSHORT",
    "EDUSHORT", "STUBEHA", "TEACHBEHA", "STDTEST", "TDTEST", "ALLACTIV", "BCREATSC",
    "CREENVSC", "ACTCRESC", "OPENCUL", "PROBSCRI", "SCPREPBP", "SCPREPAP", "DIGPREP",
    "ESCS", "BMMJ1", "BFMJ2", "EFFORT1", "EFFORT2", "Option_UH",
    "SC209Q04JA", "SC209Q05JA", "SC209Q06JA"
]
# errors='ignore': names that are not present in the frame are skipped silently.
model_data = model_data.drop(columns=columns_to_remove, errors='ignore')
print("After dropping specified columns:", model_data.shape)

###############################################
# 3.5) LIMIT TO YOUR TOP FEATURES + TARGET
###############################################
target_col = "MATH_Proficient"

# The 20 predictors to keep. Every name must match a column of model_data
# exactly, otherwise the selection below raises a KeyError.
features_to_keep = [
    "ST253Q01JA", "ST004D01T", "ST059Q02JA", "ST255Q01JA", "LANGN_156",
    "GRADE", "ST349Q01JA_1", "SC211Q03JA", "ST259Q01JA", "ST256Q03JA",
    "ST268Q04JA", "REPEAT", "ST297Q09JA", "WORKPAY", "ST251Q04JA",
    "EXERPRAC", "EXPECEDU", "ST349Q01JA_2", "ST251Q06JA", "CNT"
]

# Target first, followed by the selected predictors.
needed_cols = [target_col, *features_to_keep]
model_data = model_data[needed_cols]
print("\nColumns retained:", model_data.columns.tolist())
print("Shape after subsetting:", model_data.shape)

###############################################
# 4) REMOVE FULLY EMPTY (ALL-NaN) COLUMNS
###############################################
# Boolean mask over columns: True where every value in the column is missing.
all_nan_cols = model_data.columns[model_data.isna().all()]
if not all_nan_cols.empty:
    print("Dropping all-NaN columns:", all_nan_cols.tolist())
    model_data = model_data.drop(columns=all_nan_cols)

###############################################
# 5) DETECT & DROP ZERO-VARIANCE COLUMNS
###############################################
numeric_cols_all = model_data.select_dtypes(include=[np.number]).columns

# A column with <= 1 unique value (NaN counted as a value) is effectively constant.
# The target is deliberately excluded: if it were constant it would be dropped
# here, and the later steps would fail with a KeyError instead of reaching the
# explicit "only one unique class" check performed before the split.
zero_var_cols = [
    col for col in numeric_cols_all
    if col != target_col and model_data[col].nunique(dropna=False) <= 1
]

if zero_var_cols:
    print("Dropping zero-variance columns:", zero_var_cols)
    model_data = model_data.drop(columns=zero_var_cols, errors='ignore')

###############################################
# 6) CHECK FOR AND HANDLE INFINITE VALUES
###############################################
num_df = model_data.select_dtypes(include=[np.number])
# Numeric columns that contain at least one +inf or -inf entry.
inf_cols = num_df.columns[np.isinf(num_df).any()]
if len(inf_cols) > 0:
    print("Found inf/-inf in columns:", list(inf_cols))
    # Turn inf/-inf into NaN so the later mean-imputation step handles them.
    for c in inf_cols:
        model_data[c] = model_data[c].mask(np.isinf(model_data[c]), np.nan)

###############################################
# 7) REORDER COLUMNS (TARGET FIRST) -- optional
###############################################
# Only reshuffle when the target is not already the leading column.
target_is_first = model_data.columns[0] == target_col
if not target_is_first:
    other_cols = model_data.columns.drop(target_col).tolist()
    model_data = model_data[[target_col, *other_cols]]

###############################################
# 8) TRAIN/VAL/TEST SPLIT
###############################################
# Features are every column except the target; the target becomes a NumPy array.
X = model_data.drop(columns=[target_col])
y = model_data[target_col].values

# A binary classifier needs both classes to be present.
if np.unique(y).size < 2:
    raise ValueError("Target column has only one unique class, cannot train a binary classifier.")

# First cut: 70% train / 30% held out.
X_train, X_temp, y_train, y_temp = train_test_split(
    X, y, test_size=0.30, random_state=1729, shuffle=True
)
# Second cut: the held-out 30% is halved into validation and test.
X_val, X_test, y_val, y_test = train_test_split(
    X_temp, y_temp, test_size=0.50, random_state=1729, shuffle=True
)

print("\nData Splits:")
for split_name, X_part, y_part in (
    ("Train", X_train, y_train),
    ("Validation", X_val, y_val),
    ("Test", X_test, y_test),
):
    print(f"{split_name} shape:", X_part.shape, y_part.shape)

###############################################
# 9) LABEL ENCODE OBJECT COLUMNS
###############################################
obj_cols = X_train.select_dtypes(include=['object']).columns
print("\nLabel-encoding these columns:", list(obj_cols))
for col in obj_cols:
    # Learn the label -> integer mapping from the training split only, then
    # apply that same mapping to all three splits. Values are cast to str first.
    le = LabelEncoder().fit(X_train[col].astype(str))
    for part in (X_train, X_val, X_test):
        part[col] = le.transform(part[col].astype(str))

###############################################
# 10) FILL NUMERIC MISSING WITH TRAIN MEAN
###############################################
num_cols = X_train.select_dtypes(include=[np.number]).columns

# Per-column means computed on the training split only, so no information from
# the validation/test splits influences the imputation values.
train_means = X_train[num_cols].mean()

# DataFrame.fillna with a Series fills each column named in the Series' index
# with that column's value. This replaces the chained
# `X[col].fillna(value, inplace=True)` pattern, which pandas warns will stop
# modifying the parent frame in pandas 3.0.
X_train = X_train.fillna(train_means)
X_val = X_val.fillna(train_means)
X_test = X_test.fillna(train_means)

print("\nNaNs remaining in X_train:", X_train.isnull().sum().sum())
print("NaNs remaining in X_val:", X_val.isnull().sum().sum())
print("NaNs remaining in X_test:", X_test.isnull().sum().sum())

###############################################
# 11) STANDARD SCALING (Numeric Columns)
###############################################
# Fit the scaler on the training split only, then apply it to every split.
scaler = StandardScaler().fit(X_train[num_cols])
for split in (X_train, X_val, X_test):
    split[num_cols] = scaler.transform(split[num_cols])

# Double-check no new NaNs introduced
for label, split in (("X_train", X_train), ("X_val", X_val), ("X_test", X_test)):
    print(f"NaNs in {label} after scaling:", split.isnull().sum().sum())

###############################################
# 12) BUILD THE MODEL (With Best Hyperparams)
###############################################
tf.keras.backend.clear_session()

# Seed Python's `random`, NumPy and TensorFlow so repeated runs of this cell
# start from the same random state. Same value as the random_state used for
# the data split. (Some GPU ops may still be non-deterministic.)
tf.keras.utils.set_random_seed(1729)

# Best hyperparameters found:
# num_layers : 2
# learning_rate : 0.001
# units_0 : 160
# l2_reg : 1.162911054826589e-05
# dropout_0 : False
# dropout_rate : 0.5
# units_1 : 64
# dropout_1 : True
# units_2 : 192  (ignored since num_layers=2)
# dropout_2 : False (ignored since num_layers=2)

l2_reg_value = 1.162911054826589e-05
dropout_rate = 0.5  # This will be used only where 'dropout_*' is True

model = tf.keras.Sequential([
    # Input layer: one input per feature column
    layers.Input(shape=(X_train.shape[1],)),

    # 1st hidden layer (dropout_0 = False -> no dropout after it)
    layers.Dense(
        units=160,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(l2_reg_value)
    ),

    # 2nd hidden layer (dropout_1 = True -> followed by dropout)
    layers.Dense(
        units=64,
        activation='relu',
        kernel_regularizer=tf.keras.regularizers.l2(l2_reg_value)
    ),
    layers.Dropout(rate=dropout_rate),

    # Output layer: single sigmoid unit for the binary target
    layers.Dense(1, activation='sigmoid'),
])

# Compile model
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=0.001),
    loss='binary_crossentropy',
    metrics=[tf.keras.metrics.AUC(name='auc')]
)

# EarlyStopping to avoid overfitting: stop after 5 epochs without a val_auc
# improvement and restore the weights from the best epoch.
early_stop = tf.keras.callbacks.EarlyStopping(
    monitor='val_auc',
    patience=5,
    mode='max',
    restore_best_weights=True
)

###############################################
# 13) TRAIN THE MODEL
###############################################
# Up to 50 epochs; the early-stopping callback may end training sooner.
history = model.fit(
    x=X_train,
    y=y_train,
    batch_size=32,
    epochs=50,
    verbose=1,
    callbacks=[early_stop],
    validation_data=(X_val, y_val),
)

###############################################
# 14) EVALUATE ON VALIDATION
###############################################
val_preds = model.predict(X_val).ravel()
# AUC cannot be computed on NaN scores, so check for them once up front.
val_has_nan = np.isnan(val_preds).any()
print("\nAny NaNs in val_preds?", val_has_nan)
if not val_has_nan:
    val_auc = roc_auc_score(y_val, val_preds)
    print("Validation AUC:", round(val_auc, 4))
else:
    print("❌ NaNs present in validation predictions. Consider lowering the learning rate or checking outliers.")

###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# The test split is only scored when the validation predictions were NaN-free.
if not np.any(np.isnan(val_preds)):
    test_preds = model.predict(X_test).ravel()
    test_has_nan = np.isnan(test_preds).any()
    print("Any NaNs in test_preds?", test_has_nan)
    if test_has_nan:
        print("❌ NaNs in test_preds. Check for numeric instability.")
    else:
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

print("\n✅ Finished end-to-end training with top features only!")

Data loaded. Full shape: (591857, 1121)


Unnamed: 0,CNT,CNTSCHID,CNTSTUID,MATH_Proficient,SISCO,ST250Q01JA,ST250Q02JA,ST250Q03JA,ST250Q04JA,ST250Q05JA,...,LANGN_667,LANGN_829,LANGN_854,LANGN_855,LANGN_857,LANGN_859,LANGN_860,LANGN_866,LANGN_877,LANGN_922
0,Albania,800282.0,800001.0,0.0,,,1.0,,,,...,0,0,0,0,0,0,0,0,0,0
1,Albania,800115.0,800002.0,0.0,,2.0,2.0,2.0,1.0,2.0,...,0,0,0,0,0,0,0,0,0,0
2,Albania,800242.0,800003.0,0.0,,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
3,Albania,800245.0,800005.0,0.0,1.0,1.0,1.0,2.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0
4,Albania,800285.0,800006.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,...,0,0,0,0,0,0,0,0,0,0



Initial model_data shape: (591857, 1121)
After dropping specified columns: (591857, 1084)

Columns retained: ['MATH_Proficient', 'ST253Q01JA', 'ST004D01T', 'ST059Q02JA', 'ST255Q01JA', 'LANGN_156', 'GRADE', 'ST349Q01JA_1', 'SC211Q03JA', 'ST259Q01JA', 'ST256Q03JA', 'ST268Q04JA', 'REPEAT', 'ST297Q09JA', 'WORKPAY', 'ST251Q04JA', 'EXERPRAC', 'EXPECEDU', 'ST349Q01JA_2', 'ST251Q06JA', 'CNT']
Shape after subsetting: (591857, 21)

Data Splits:
Train shape: (414299, 20) (414299,)
Validation shape: (88779, 20) (88779,)
Test shape: (88779, 20) (88779,)

Label-encoding these columns: ['CNT']


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_train[col].fillna(train_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  X_val[col].fillna(train_mean, inplace=True)
The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values alw


NaNs remaining in X_train: 0
NaNs remaining in X_val: 0
NaNs remaining in X_test: 0
NaNs in X_train after scaling: 0
NaNs in X_val after scaling: 0
NaNs in X_test after scaling: 0
Epoch 1/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 3ms/step - auc: 0.8606 - loss: 0.4645 - val_auc: 0.8765 - val_loss: 0.4398
Epoch 2/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m32s[0m 3ms/step - auc: 0.8720 - loss: 0.4480 - val_auc: 0.8795 - val_loss: 0.4352
Epoch 3/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - auc: 0.8744 - loss: 0.4445 - val_auc: 0.8804 - val_loss: 0.4341
Epoch 4/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - auc: 0.8757 - loss: 0.4424 - val_auc: 0.8817 - val_loss: 0.4318
Epoch 5/50
[1m12947/12947[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m33s[0m 3ms/step - auc: 0.8779 - loss: 0.4390 - val_auc: 0.8832 - val_loss: 0.4320
Epoch 6/50
[1m12947/12947[0m [3

In [None]:
###############################################
# 15) EVALUATE ON TEST SET (IF NO NaNs)
###############################################
# NOTE: this cell reuses `model`, `X_test`, `y_test` and `val_preds` left in
# the kernel by the training cell; run that cell first.
if not np.isnan(val_preds).any():
    test_preds = model.predict(X_test).ravel()
    print("Any NaNs in test_preds?", np.isnan(test_preds).any())
    if not np.isnan(test_preds).any():
        test_auc = roc_auc_score(y_test, test_preds)
        print("Test AUC:", round(test_auc, 4))

        # ======================
        # CONFUSION MATRIX CODE
        # ======================

        # 1. Create a copy of X_test for confusion matrix analysis
        test_data_small = X_test.copy()
        # Insert the actual label into the DataFrame
        test_data_small['MATH_Proficient'] = y_test

        # 2. Assign the predictions and threshold
        predictions_small = test_preds
        threshold = 0.66  # Adjust as desired

        # 3. Create confusion matrix via crosstab
        #    (scores >= threshold are predicted as class 1, the rest as class 0)
        cm_small = pd.crosstab(
            index=test_data_small['MATH_Proficient'],
            columns=(predictions_small >= threshold).astype(int),
            rownames=['actuals'],
            colnames=['predictions']
        )
        # crosstab only emits rows/columns for classes that actually occur. If
        # every score falls on one side of the threshold (or the labels hold a
        # single class) the table is not 2x2 and the .loc lookups below would
        # raise KeyError. Force the full 2x2 layout with zero counts instead.
        cm_small = cm_small.reindex(index=[0.0, 1.0], columns=[0, 1], fill_value=0)

        # 4. Extract TN, FP, FN, TP
        TN_small = cm_small.loc[0.0, 0.0]
        FP_small = cm_small.loc[0.0, 1.0]
        FN_small = cm_small.loc[1.0, 0.0]
        TP_small = cm_small.loc[1.0, 1.0]

        # 5. Calculate various metrics (all in percent; 0 when the denominator is empty)
        accuracy_small = (TP_small + TN_small) / (TP_small + TN_small + FP_small + FN_small) * 100
        precision_small = (TP_small / (TP_small + FP_small) * 100) if (TP_small + FP_small) > 0 else 0
        recall_small = (TP_small / (TP_small + FN_small) * 100) if (TP_small + FN_small) > 0 else 0
        f1_score_small = (2 * (precision_small * recall_small) / (precision_small + recall_small)
                          if (precision_small + recall_small) > 0 else 0)
        specificity_small = (TN_small / (TN_small + FP_small) * 100) if (TN_small + FP_small) > 0 else 0

        # 6. Print results
        print("\nConfusion Matrix:\n", cm_small)
        print("\nAccuracy (20 features): {:.1f}".format(accuracy_small))
        print("F1 Score (20 features): {:.1f}".format(f1_score_small))
        print("Precision (20 features): {:.1f}".format(precision_small))
        print("Recall (20 features): {:.1f}".format(recall_small))
        print("Specificity (20 features): {:.1f}".format(specificity_small))

    else:
        print("❌ NaNs in test_preds. Check for numeric instability.")

print("\n✅ Finished end-to-end training!")

[1m2775/2775[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 1ms/step
Any NaNs in test_preds? False
Test AUC: 0.8863

Confusion Matrix:
 predictions      0      1
actuals                  
0.0          35483   5090
1.0          14352  33854

Accuracy (20 features): 78.1
F1 Score (20 features): 77.7
Precision (20 features): 86.9
Recall (20 features): 70.2
Specificity (20 features): 87.5

✅ Finished end-to-end training!
