# Load packages

In [100]:
# skip reinstalling packages we already have
using Pkg

pkgs = [
    "MLJ", "MLJBase", "MLJModels", "MLJEnsembles", "MLJLinearModels",
    "DecisionTree", "MLJDecisionTreeInterface", "NaiveBayes", 
    "MLJNaiveBayesInterface", "EvoTrees", "CategoricalArrays", "Random",
    "LIBSVM", "MLJLIBSVMInterface", "Plots", "MLJModelInterface",
    "CSV", "DataFrames", "UrlDownload", "XGBoost", "NNlib"
]

# Filter out packages already installed
missing_pkgs = filter(pkg -> !(pkg in keys(Pkg.project().dependencies)), pkgs)

if !isempty(missing_pkgs)
    println("Installing missing packages: ", missing_pkgs)
    Pkg.add(missing_pkgs)
else
    println(" All required packages are already installed.")
end


 All required packages are already installed.


In [121]:
using MLJ
using MLJBase
using LIBSVM
using NNlib
using Flux
using Flux.Losses
using Statistics

In [122]:
# Load PCA
PCA_model = MLJ.@load PCA pkg="MultivariateStats"

import MLJMultivariateStatsInterface ✔


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 


MLJMultivariateStatsInterface.PCA

In [123]:
#Load your library of functions
include("utils.jl")
# Set a global random seed for reproducibility
using Random
Random.seed!(42)

TaskLocalRNG()

# Load Data

In [124]:
using CSV, DataFrames, Random
using CategoricalArrays

df = CSV.read("./data/updated_pollution_dataset.csv", DataFrame)

# Some log
println("First 5 rows of df:")
show(df[1:5, :], allcols=true)


# Convert last column to categorical (in-place!)
df[!, end] = categorical(df[!, end])

# Extract the integer codes of the categories
targets = Float32.(levelcode.(df[!, end]))

# Use all columns except the last one as inputs
inputs = Matrix{Float32}(df[:, 1:end-1])

println("First 5 inputs::")
for i in 1:5
    println(inputs[i, :])
end

println("\n\nFirst 5 targets:")
println(targets[1:5])

# Extract labels (categories) as strings
label_names = levels(df[!, 10])
println("Labels: ", label_names)

First 5 rows of df:
[1m5×10 DataFrame[0m
[1m Row [0m│[1m Temperature [0m[1m Humidity [0m[1m PM2.5   [0m[1m PM10    [0m[1m NO2     [0m[1m SO2     [0m[1m CO      [0m[1m Proximity_to_Industrial_Areas [0m[1m Population_Density [0m[1m Air Quality [0m
     │[90m Float64     [0m[90m Float64  [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64                       [0m[90m Int64              [0m[90m String15    [0m
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │        29.8      59.1      5.2     17.9     18.9      9.2     1.72                            6.3                 319  Moderate
   2 │        28.3      75.6      2.3     12.2     30.8      9.7     1.64                            6.0                 611  Moderate
   3 │        23.1      74.7     26.7     33.8     24.4     12.6     1.63                   

[29.8, 59.1, 5.2, 17.9, 18.9, 9.2, 1.72, 6.3, 319.0]
Float32[28.3, 75.6, 2.3, 12.2, 30.8, 9.7, 1.64, 6.0, 611.0]
Float32[23.1, 74.7, 26.7, 33.8, 24.4, 12.6, 1.63, 5.2, 619.0]
Float32[27.1, 39.1, 6.1, 6.3, 13.5, 5.3, 1.15, 11.1, 551.0]
Float32[26.5, 70.7, 6.9, 16.0, 21.9, 5.6, 1.01, 12.7, 303.0]


First 5 targets:
Float32[3.0, 3.0, 3.0, 1.0, 1.0]
Labels: String15["Good", "Hazardous", "Moderate", "Poor"]


# Split train-test datasets

In [125]:
trainIdx, testIdx = holdOut(size(df,1), 0.2)

trainingInputs  = inputs[trainIdx, :]
testInputs      = inputs[testIdx, :]

trainingTargets = targets[trainIdx]
testTargets     = targets[testIdx]

1000-element Vector{Float32}:
 2.0
 3.0
 1.0
 2.0
 1.0
 1.0
 4.0
 4.0
 2.0
 1.0
 3.0
 1.0
 4.0
 ⋮
 3.0
 1.0
 1.0
 1.0
 1.0
 4.0
 4.0
 3.0
 3.0
 2.0
 1.0
 3.0

In [126]:
results = Dict()
dimsPCA = [6,7,8]
crossValidationIndices = crossvalidation(trainingTargets, 5)

4000-element Vector{Int64}:
 5
 5
 3
 2
 2
 1
 2
 1
 3
 2
 1
 2
 3
 ⋮
 1
 3
 2
 3
 4
 4
 4
 4
 4
 3
 5
 5

In [127]:
function printExperimentResult(model, hyperparams, dimPCA, results, class_labels)
    (
        (accuracy_mean, accuracy_std),
        (error_rate_mean, error_rate_std),
        (sensitivity_mean, sensitivity_std),
        (specificity_mean, specificity_std),
        (ppv_mean, ppv_std),
        (npv_mean, npv_std),
        (f1_mean, f1_std),
        cm
    ) = results

    println("\n=====================================================")
    println(" Model: $model | PCA outdim: $dimPCA")
    println(" Hyperparameters: $hyperparams")
    println("=====================================================")

    println(" Accuracy (mean)               : ", round(accuracy_mean, digits=4))
    println(" Accuracy (std)                : ", round(accuracy_std, digits=4))

    println(" Error Rate (mean)             : ", round(error_rate_mean, digits=4))
    println(" Error Rate (std)              : ", round(error_rate_std, digits=4))

    println(" Sensitivity/Recall (mean)     : ", round(sensitivity_mean, digits=4))
    println(" Sensitivity/Recall (std)      : ", round(sensitivity_std,  digits=4))

    println(" Specificity (mean)            : ", round(specificity_mean, digits=4))
    println(" Specificity (std)             : ", round(specificity_std,  digits=4))

    println(" PPV (mean)                    : ", round(ppv_mean,         digits=4))
    println(" PPV (std)                     : ", round(ppv_std,          digits=4))

    println(" NPV (mean)                    : ", round(npv_mean,         digits=4))
    println(" NPV (std)                     : ", round(npv_std,          digits=4))

    println(" F1 Score (mean)               : ", round(f1_mean,          digits=4))
    println(" F1 Score (std)                : ", round(f1_std,           digits=4))

    println("\nConfusion Matrix:")
    println(cm)

    PrettyTables.pretty_table(DataFrame(cm, :auto); header=class_labels, row_labels=class_labels)

    println("=====================================================\n")
end


printExperimentResult (generic function with 1 method)

# Artificial Neural Networks

In [9]:
############# 1. ARTIFICIAL NEURAL NETWORKS (8+ topologies) #############
default_ann = Dict(      
    "numExecutions" => 5,
    #"transferFunctions" => [σ, σ, σ, σ],
    "maxEpochs" => 200,
    "minLoss" => 0.0,
    "learningRate" => 0.01,
    "validationRatio" => 0.1,
    "maxEpochsVal" => 20
)

ann_search_space = [
    Dict("topology"=>[4, 4]),
    Dict("topology"=>[8, 8]),
    Dict("topology"=>[16, 16]),
    Dict("topology"=>[10, 4]),
    Dict("topology"=>[10, 6, 4]),
    Dict("topology"=>[10, 8, 4]),
    Dict("topology"=>[10, 8, 6, 4]),
    Dict("topology"=>[10, 12, 6, 4])
]

8-element Vector{Dict{String, Vector{Int64}}}:
 Dict("topology" => [4, 4])
 Dict("topology" => [8, 8])
 Dict("topology" => [16, 16])
 Dict("topology" => [10, 4])
 Dict("topology" => [10, 6, 4])
 Dict("topology" => [10, 8, 4])
 Dict("topology" => [10, 8, 6, 4])
 Dict("topology" => [10, 12, 6, 4])

In [10]:
########################
# 1. ANN GRID SEARCH
########################
ann_results = []

for hp in ann_search_space
    for dim in dimsPCA
        println("\n=== ANN experiment: topology = $(hp["topology"]) | PCA maxoutdim = $(dim) ===")
        full_hp = merge(default_ann, hp)
        res = modelCrossValidationPCA(:ANN, full_hp, (trainingInputs, trainingTargets), crossValidationIndices, dim)
        push!(ann_results, (model=:ANN, hyperparams=hp, dimPCA=dim, results=res))
    end
end

results[:ANN] = ann_results


=== ANN experiment: topology = [4, 4] | PCA maxoutdim = 6 ===

=== ANN experiment: topology = [4, 4] | PCA maxoutdim = 7 ===

=== ANN experiment: topology = [4, 4] | PCA maxoutdim = 8 ===

=== ANN experiment: topology = [8, 8] | PCA maxoutdim = 6 ===

=== ANN experiment: topology = [8, 8] | PCA maxoutdim = 7 ===

=== ANN experiment: topology = [8, 8] | PCA maxoutdim = 8 ===

=== ANN experiment: topology = [16, 16] | PCA maxoutdim = 6 ===

=== ANN experiment: topology = [16, 16] | PCA maxoutdim = 7 ===

=== ANN experiment: topology = [16, 16] | PCA maxoutdim = 8 ===

=== ANN experiment: topology = [10, 4] | PCA maxoutdim = 6 ===

=== ANN experiment: topology = [10, 4] | PCA maxoutdim = 7 ===

=== ANN experiment: topology = [10, 4] | PCA maxoutdim = 8 ===

=== ANN experiment: topology = [10, 6, 4] | PCA maxoutdim = 6 ===

=== ANN experiment: topology = [10, 6, 4] | PCA maxoutdim = 7 ===

=== ANN experiment: topology = [10, 6, 4] | PCA maxoutdim = 8 ===

=== ANN experiment: topology = [1

24-element Vector{Any}:
 (model = :ANN, hyperparams = Dict("topology" => [4, 4]), dimPCA = 6, results = ((0.94074315f0, 0.0042731683f0), (0.059256874f0, 0.0042731655f0), (0.94074315f0, 0.0042731683f0), (0.98274595f0, 0.0025842295f0), (0.9379606f0, 0.009482839f0), (0.98563397f0, 0.0011026161f0), (0.9388195f0, 0.007021723f0), Float32[267.36002 0.52 2.76 2.76; 3.64 173.36002 9.72 4.4800005; 2.3200002 4.0 211.23999 6.04; 2.76 4.5199995 3.8799999 100.64]))
 (model = :ANN, hyperparams = Dict("topology" => [4, 4]), dimPCA = 7, results = ((0.9431449f0, 0.004030533f0), (0.05685513f0, 0.0040305755f0), (0.9431449f0, 0.004030533f0), (0.9833366f0, 0.0019408528f0), (0.9432856f0, 0.004718385f0), (0.9857807f0, 0.0011526846f0), (0.942704f0, 0.0039187265f0), Float32[267.28 0.64 2.72 2.76; 3.48 175.16 7.84 4.72; 2.4 3.8799999 211.16 6.16; 2.84 4.12 3.92 100.920006]))
 (model = :ANN, hyperparams = Dict("topology" => [4, 4]), dimPCA = 8, results = ((0.9441973f0, 0.0050831344f0), (0.055802684f0, 0.00508313f

In [11]:
for entry in results[:ANN]
    printExperimentResult(entry.model, entry.hyperparams, entry.dimPCA, entry.results, label_names)
end


 Model: ANN | PCA outdim: 6
 Hyperparameters: Dict("topology" => [4, 4])
 Accuracy (mean)               : 0.9407
 Accuracy (std)                : 0.0043
 Error Rate (mean)             : 0.0593
 Error Rate (std)              : 0.0043
 Sensitivity/Recall (mean)     : 0.9407
 Sensitivity/Recall (std)      : 0.0043
 Specificity (mean)            : 0.9827
 Specificity (std)             : 0.0026
 PPV (mean)                    : 0.938
 PPV (std)                     : 0.0095
 NPV (mean)                    : 0.9856
 NPV (std)                     : 0.0011
 F1 Score (mean)               : 0.9388
 F1 Score (std)                : 0.007

Confusion Matrix:
Float32[267.36002 0.52 2.76 2.76; 3.64 173.36002 9.72 4.4800005; 2.3200002 4.0 211.23999 6.04; 2.76 4.5199995 3.8799999 100.64]
┌───────────┬────────┬───────────┬──────────┬────────┐
│[1m           [0m│[1m   Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m   Poor [0m│
├───────────┼────────┼───────────┼──────────┼────────┤
│[1m      Good 

In [12]:
########################
# 1. ANN FINAL TESTING
########################

# BEST HYPERPARAMETERS
topology = [10, 12, 6, 4]

hp = Dict(      
    "numExecutions" => 5,
    "maxEpochs" => 200,
    "minLoss" => 0.0,
    "learningRate" => 0.01,
    "validationRatio" => 0.1,
    "maxEpochsVal" => 20
)

learningRate = hp["learningRate"]
epochs = hp["maxEpochs"]
validationRatio = hp["validationRatio"]

dimPCA = 8

# ============================
# NORMALIZATION
# ============================
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
norm_train_inputs = normalizeMinMax(trainingInputs, normParams)
norm_test_inputs  = normalizeMinMax(testInputs,  normParams)

# ============================
# PCA
# ============================
pca_model = PCA_model(maxoutdim=dimPCA)
pca_mach = machine(pca_model, MLJ.table(norm_train_inputs))
MLJ.fit!(pca_mach, verbosity=0)

trainValInputs = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(norm_train_inputs)))
testInputsPCA  = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(norm_test_inputs)))

# ============================
# CREATE VALIDATION SPLIT
# ============================
N = size(inputs, 1)
nTrainVal = size(trainValInputs, 1)

nVal = Int(floor(validationRatio * N))
realValidationRatio = nVal / nTrainVal

(train_idx, val_idx) = holdOut(nTrainVal, realValidationRatio)

trainInputsPCA = trainValInputs[train_idx, :]
valInputsPCA   = trainValInputs[val_idx, :]

# ============================
# ONE–HOT ENCODING FOR TARGETS
# ============================
trainTargetsVec = trainingTargets[train_idx]
valTargetsVec   = trainingTargets[val_idx]
testTargetsVec  = testTargets

trainTargetsOH = Matrix(oneHotEncoding(vec(trainTargetsVec), sort(levels(categorical(trainingTargets)))))
valTargetsOH   = Matrix(oneHotEncoding(vec(valTargetsVec),   sort(levels(categorical(trainingTargets)))))
testTargetsOH  = Matrix(oneHotEncoding(vec(testTargetsVec),  sort(levels(categorical(trainingTargets)))))

# ============================
# TRAIN ANN
# ============================
finalAnn, trainLoss, valLoss, testLoss = trainClassANN(
    topology,
    (trainInputsPCA, trainTargetsOH),
    validationDataset = (valInputsPCA, valTargetsOH),
    testDataset = (testInputsPCA, testTargetsOH),
    maxEpochs = hp["maxEpochs"],
    minLoss = hp["minLoss"],
    learningRate = hp["learningRate"],
    maxEpochsVal = hp["maxEpochsVal"],
    showText = false
)

# ============================
# PREDICTION
# ============================
testOutputs = finalAnn(testInputsPCA')
testPredictions = classifyOutputs(testOutputs')   # boolean matrix (N × classes)

# ============================
# METRICS
# ============================
metrics = confusionMatrix(testPredictions, testTargetsOH)

printPCAANNResult(
    :ANN,
    Dict("topology" => topology),
    metrics,
    dimPCA,
    label_names
)



 Model: ANN   | PCA outdim = 8 
 Hyperparameters: Dict("topology" => [10, 12, 6, 4])
 Accuracy                : 0.803
 Error Rate              : 0.197
 Sensitivity/Recall      : 0.803
 Specificity             : 0.9308
 PPV                     : 0.7262
 NPV                     : 0.9625
 F1 Score                : 0.7597

Confusion Matrix:
[383 0 0 0; 0 0 4 99; 2 0 294 0; 0 0 92 126]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│  383 │         0 │        0 │    0 │
│[1m Hazardous [0m│    0 │         0 │        4 │   99 │
│[1m  Moderate [0m│    2 │         0 │      294 │    0 │
│[1m      Poor [0m│    0 │         0 │       92 │  126 │
└───────────┴──────┴───────────┴──────────┴──────┘


# Support Vector Machines

In [13]:
SVMClassifier = MLJ.@load SVC pkg=LIBSVM verbosity=0

MLJLIBSVMInterface.SVC

In [14]:
############# 2. SVM (8+ configs: kernels × C) #############
default_svm = Dict(
    "gamma" => 1.0,
    "degree" => 3,
    "coef0" => 0.0
)

svm_search_space = [
    Dict("kernel"=>"linear",  "C"=>0.1),
    Dict("kernel"=>"linear",  "C"=>1.0),
    Dict("kernel"=>"linear",  "C"=>10.0),
    Dict("kernel"=>"rbf",     "C"=>1.0, "gamma"=>2.0),
    Dict("kernel"=>"rbf",     "C"=>10.0,"gamma"=>0.5),
    Dict("kernel"=>"sigmoid", "C"=>1.0, "gamma"=>1.0),
    Dict("kernel"=>"poly",    "C"=>1.0, "degree"=>3, "gamma"=>1.0),
    Dict("kernel"=>"poly",    "C"=>5.0, "degree"=>4, "gamma"=>0.5),
]

8-element Vector{Dict{String, Any}}:
 Dict("C" => 0.1, "kernel" => "linear")
 Dict("C" => 1.0, "kernel" => "linear")
 Dict("C" => 10.0, "kernel" => "linear")
 Dict("C" => 1.0, "kernel" => "rbf", "gamma" => 2.0)
 Dict("C" => 10.0, "kernel" => "rbf", "gamma" => 0.5)
 Dict("C" => 1.0, "kernel" => "sigmoid", "gamma" => 1.0)
 Dict("C" => 1.0, "kernel" => "poly", "gamma" => 1.0, "degree" => 3)
 Dict("C" => 5.0, "kernel" => "poly", "gamma" => 0.5, "degree" => 4)

In [15]:
########################
# 2. SVM GRID SEARCH
########################
svm_results = []

for hp in svm_search_space
    for dim in dimsPCA
        println("\n=== SVM experiment: kernel=$(hp["kernel"]) C=$(get(hp,"C","-")) | PCA maxoutdim = $(dim) ===")
        full_hp = merge(default_svm, hp)
        res = modelCrossValidationPCA(:SVC, full_hp, (trainingInputs, trainingTargets), crossValidationIndices, dim)
        push!(svm_results, (model=:SVC, hyperparams=hp, dimPCA=dim, results=res))
    end
end

results[:SVC] = svm_results


=== SVM experiment: kernel=linear C=0.1 | PCA maxoutdim = 6 ===

=== SVM experiment: kernel=linear C=0.1 | PCA maxoutdim = 7 ===

=== SVM experiment: kernel=linear C=0.1 | PCA maxoutdim = 8 ===

=== SVM experiment: kernel=linear C=1.0 | PCA maxoutdim = 6 ===

=== SVM experiment: kernel=linear C=1.0 | PCA maxoutdim = 7 ===

=== SVM experiment: kernel=linear C=1.0 | PCA maxoutdim = 8 ===

=== SVM experiment: kernel=linear C=10.0 | PCA maxoutdim = 6 ===

=== SVM experiment: kernel=linear C=10.0 | PCA maxoutdim = 7 ===

=== SVM experiment: kernel=linear C=10.0 | PCA maxoutdim = 8 ===

=== SVM experiment: kernel=rbf C=1.0 | PCA maxoutdim = 6 ===

=== SVM experiment: kernel=rbf C=1.0 | PCA maxoutdim = 7 ===

=== SVM experiment: kernel=rbf C=1.0 | PCA maxoutdim = 8 ===

=== SVM experiment: kernel=rbf C=10.0 | PCA maxoutdim = 6 ===

=== SVM experiment: kernel=rbf C=10.0 | PCA maxoutdim = 7 ===

=== SVM experiment: kernel=rbf C=10.0 | PCA maxoutdim = 8 ===

=== SVM experiment: kernel=sigmoid C

24-element Vector{Any}:
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 0.1, "kernel" => "linear"), dimPCA = 6, results = ((0.92324656f0, 0.0020554946f0), (0.07675346f0, 0.0020554904f0), (0.92324656f0, 0.0020554946f0), (0.97217834f0, 0.001171714f0), (0.9237992f0, 0.002822116f0), (0.9826236f0, 0.0012845783f0), (0.92127085f0, 0.0025471877f0), Float32[267.6 0.0 5.2 0.6; 6.0 173.2 10.4 1.6; 3.0 6.6 208.2 5.8; 5.8 9.8 6.6 89.6]))
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 0.1, "kernel" => "linear"), dimPCA = 7, results = ((0.92374563f0, 0.0026537161f0), (0.0762544f0, 0.0026537185f0), (0.92374563f0, 0.0026537161f0), (0.972314f0, 0.0017245684f0), (0.92419225f0, 0.0026806244f0), (0.9825177f0, 0.0015376173f0), (0.9219354f0, 0.0028004707f0), Float32[267.4 0.0 5.4 0.6; 6.0 174.0 9.6 1.6; 2.8 6.6 207.8 6.4; 6.0 9.2 6.8 89.8]))
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 0.1, "kernel" => "linear"), dimPCA = 8, results = ((0.9304966f0, 0.005035769f0), (0.06950344f0, 0.0

In [16]:
for entry in results[:SVC]
    printExperimentResult(entry.model, entry.hyperparams, entry.dimPCA, entry.results, label_names)
end


 Model: SVC | PCA outdim: 6
 Hyperparameters: Dict{String, Any}("C" => 0.1, "kernel" => "linear")
 Accuracy (mean)               : 0.9232
 Accuracy (std)                : 0.0021
 Error Rate (mean)             : 0.0768
 Error Rate (std)              : 0.0021
 Sensitivity/Recall (mean)     : 0.9232
 Sensitivity/Recall (std)      : 0.0021
 Specificity (mean)            : 0.9722
 Specificity (std)             : 0.0012
 PPV (mean)                    : 0.9238
 PPV (std)                     : 0.0028
 NPV (mean)                    : 0.9826
 NPV (std)                     : 0.0013
 F1 Score (mean)               : 0.9213
 F1 Score (std)                : 0.0025

Confusion Matrix:
Float32[267.6 0.0 5.2 0.6; 6.0 173.2 10.4 1.6; 3.0 6.6 208.2 5.8; 5.8 9.8 6.6 89.6]
┌───────────┬───────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼───────┼───────────┼──────────┼──────┤
│[1m      Good [0m│ 267.6 │       0.0 │ 

In [17]:
########################
# 2. SVM FINAL TESTING 
########################

# BEST HYPERPARAMETERS
hp = Dict("kernel"=>"rbf", "C"=>1.0, "gamma"=>2.0)
dimPCA = 8

# Compute normalization parameters from TRAINING set only
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
train_df = normalizeMinMax(trainingInputs, normParams)
test_df = normalizeMinMax(testInputs, normParams)

pca_model = PCA_model(maxoutdim=dimPCA)
# Train the PCA model
pca_mach = machine(pca_model, MLJ.table(train_df))
MLJ.fit!(pca_mach, verbosity=0)
# Transform the data
train_df = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(train_df)))
test_df  = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(test_df)))

# --- Convert targets ---
train_y_cat = categorical(vec(trainingTargets))
test_y_cat  = categorical(vec(testTargets))

# Load model (insert hyperparameters manually)
model = SVMClassifier(kernel=LIBSVM.Kernel.RadialBasis, cost=Float64(1), gamma=Float64(2))
    
# --- Train machine ---
mach = machine(model, MLJ.table(train_df), train_y_cat)
MLJ.fit!(mach)
    
# --- Predict ---
ŷ = MLJ.predict(mach, test_df)              # probabilistic predictions
y_pred = CategoricalArray(ŷ)         # convert to class labels
    
# --- Compute metrics ---
metrics = confusionMatrix(y_pred, test_y_cat)

printPCAResult(:SVC, hp, metrics, dimPCA, label_names)

┌ Info: Training machine(SVC(kernel = RadialBasis, …), …).
└ @ MLJBase C:\Users\gianp\.julia\packages\MLJBase\7nGJF\src\machines.jl:499



 Model: SVC outdim 8 
 Hyperparameters: Dict{String, Any}("C" => 1.0, "kernel" => "rbf", "gamma" => 2.0)
 Accuracy                : 0.927
 Error Rate              : 0.073
 Sensitivity/Recall      : 0.927
 Specificity             : 0.9782
 PPV                     : 0.9264
 NPV                     : 0.982
 F1 Score                : 0.9262

Confusion Matrix:
[77 0 0 26; 0 283 1 12; 0 0 383 0; 12 22 0 184]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│   77 │         0 │        0 │   26 │
│[1m Hazardous [0m│    0 │       283 │        1 │   12 │
│[1m  Moderate [0m│    0 │         0 │      383 │    0 │
│[1m      Poor [0m│   12 │        22 │        0 │  184 │
└───────────┴──────┴───────────┴──────────┴──────┘


# Decission Trees

In [108]:
DTClassifier = MLJ.@load DecisionTreeClassifier pkg=DecisionTree verbosity=0

MLJDecisionTreeInterface.DecisionTreeClassifier

In [109]:
############# 3. DECISION TREES (6 depths) #############
default_dt = Dict(
    "rng" => Random.MersenneTwister(1)
)

dt_search_space = [
    Dict("max_depth"=>2),
    Dict("max_depth"=>3),
    Dict("max_depth"=>4),
    Dict("max_depth"=>5),
    Dict("max_depth"=>6),
    Dict("max_depth"=>8)
]

6-element Vector{Dict{String, Int64}}:
 Dict("max_depth" => 2)
 Dict("max_depth" => 3)
 Dict("max_depth" => 4)
 Dict("max_depth" => 5)
 Dict("max_depth" => 6)
 Dict("max_depth" => 8)

In [110]:
########################
# 3. DECISION TREE GRID SEARCH
########################
dt_results = []

for hp in dt_search_space
    for dim in dimsPCA
        println("\n=== Decision Tree experiment: max_depth=$(hp["max_depth"]) | PCA maxoutdim = $(dim) ===")
        full_hp = merge(default_dt, hp)
        res = modelCrossValidationPCA(:DecisionTreeClassifier, full_hp, (trainingInputs, trainingTargets), crossValidationIndices, dim)
        push!(dt_results, (model=:DT, hyperparams=hp, dimPCA=dim, results=res))
    end
end

results[:DT] = dt_results


=== Decision Tree experiment: max_depth=2 | PCA maxoutdim = 6 ===



=== Decision Tree experiment: max_depth=2 | PCA maxoutdim = 7 ===

=== Decision Tree experiment: max_depth=2 | PCA maxoutdim = 8 ===

=== Decision Tree experiment: max_depth=3 | PCA maxoutdim = 6 ===

=== Decision Tree experiment: max_depth=3 | PCA maxoutdim = 7 ===

=== Decision Tree experiment: max_depth=3 | PCA maxoutdim = 8 ===

=== Decision Tree experiment: max_depth=4 | PCA maxoutdim = 6 ===

=== Decision Tree experiment: max_depth=4 | PCA maxoutdim = 7 ===

=== Decision Tree experiment: max_depth=4 | PCA maxoutdim = 8 ===

=== Decision Tree experiment: max_depth=5 | PCA maxoutdim = 6 ===

=== Decision Tree experiment: max_depth=5 | PCA maxoutdim = 7 ===

=== Decision Tree experiment: max_depth=5 | PCA maxoutdim = 8 ===

=== Decision Tree experiment: max_depth=6 | PCA maxoutdim = 6 ===

=== Decision Tree experiment: max_depth=6 | PCA maxoutdim = 7 ===

=== Decision Tree experiment: max_depth=6 | PCA maxoutdim = 8 ===

=== Decision Tree experiment: max_depth=8 | PCA maxoutdim = 6

18-element Vector{Any}:
 (model = :DT, hyperparams = Dict("max_depth" => 2), dimPCA = 6, results = ((0.83324766f0, 0.0047318414f0), (0.1667523f0, 0.0047318325f0), (0.83324766f0, 0.0047318414f0), (0.94338644f0, 0.0029785472f0), (0.765292f0, 0.0050344025f0), (0.9658946f0, 0.0021908178f0), (0.79555875f0, 0.004722839f0), Float32[261.0 1.0 11.2 0.2; 8.6 158.8 23.8 0.0; 4.4 8.8 204.8 5.6; 20.6 31.8 17.4 42.0]))
 (model = :DT, hyperparams = Dict("max_depth" => 2), dimPCA = 7, results = ((0.83324766f0, 0.0047318414f0), (0.1667523f0, 0.0047318325f0), (0.83324766f0, 0.0047318414f0), (0.94338644f0, 0.0029785472f0), (0.765292f0, 0.0050344025f0), (0.9658946f0, 0.0021908178f0), (0.79555875f0, 0.004722839f0), Float32[261.0 1.0 11.2 0.2; 8.6 158.8 23.8 0.0; 4.4 8.8 204.8 5.6; 20.6 31.8 17.4 42.0]))
 (model = :DT, hyperparams = Dict("max_depth" => 2), dimPCA = 8, results = ((0.83324766f0, 0.0047318414f0), (0.1667523f0, 0.0047318325f0), (0.83324766f0, 0.0047318414f0), (0.94338644f0, 0.0029785472f0), (0.

In [111]:
for entry in results[:DT]
    printExperimentResult(entry.model, entry.hyperparams, entry.dimPCA, entry.results, label_names)
end




 Model: DT | PCA outdim: 6
 Hyperparameters: Dict("max_depth" => 2)
 Accuracy (mean)               : 0.8332
 Accuracy (std)                : 0.0047
 Error Rate (mean)             : 0.1668
 Error Rate (std)              : 0.0047
 Sensitivity/Recall (mean)     : 0.8332
 Sensitivity/Recall (std)      : 0.0047
 Specificity (mean)            : 0.9434
 Specificity (std)             : 0.003
 PPV (mean)                    : 0.7653
 PPV (std)                     : 0.005
 NPV (mean)                    : 0.9659
 NPV (std)                     : 0.0022
 F1 Score (mean)               : 0.7956
 F1 Score (std)                : 0.0047

Confusion Matrix:
Float32[261.0 1.0 11.2 0.2; 8.6 158.8 23.8 0.0; 4.4 8.8 204.8 5.6; 20.6 31.8 17.4 42.0]
┌───────────┬───────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼───────┼───────────┼──────────┼──────┤
│[1m      Good [0m│ 261.0 │       1.0 │     11.2 │  0.2 │
│[1m Hazar

In [118]:
########################
# 3. DT FINAL TESTING 
########################

# BEST HYPERPARAMETERS
hp = Dict("max_depth"=>5)
dimPCA = 8

# Compute normalization parameters from TRAINING set only
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
train_df = normalizeMinMax(trainingInputs, normParams)
test_df = normalizeMinMax(testInputs, normParams)

pca_model = PCA_model(maxoutdim=dimPCA)
# Train the PCA model
pca_mach = machine(pca_model, MLJ.table(train_df))
MLJ.fit!(pca_mach, verbosity=0)
# Transform the data
train_df = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(train_df)))
test_df  = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(test_df)))

# --- Convert targets ---
train_y_cat = categorical(vec(trainingTargets))
test_y_cat  = categorical(vec(testTargets))

# Load model (insert hyperparameters manually)
model = DTClassifier(max_depth=5, rng=Random.MersenneTwister(1))    

# --- Train machine ---
mach = machine(model, MLJ.table(train_df), train_y_cat)
MLJ.fit!(mach)
    
# --- Predict ---
ŷ = MLJ.predict(mach, MLJ.table(test_df))              # probabilistic predictions
y_pred = CategoricalArray(mode.(ŷ))         # convert to class labels
    
# --- Compute metrics ---
metrics = confusionMatrix(y_pred, test_y_cat)

printPCAResult(:DT, hp, metrics, dimPCA, label_names)




[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DecisionTreeClassifier(max_depth = 5, …), …).


 Model: DT outdim 8 
 Hyperparameters: Dict("max_depth" => 5)
 Accuracy                : 0.903
 Error Rate              : 0.097
 Sensitivity/Recall      : 0.903
 Specificity             : 0.9697
 PPV                     : 0.9023
 NPV                     : 0.9724
 F1 Score                : 0.9023

Confusion Matrix:
[76 0 0 27; 0 276 4 16; 0 6 377 0; 17 27 0 174]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│   76 │         0 │        0 │   27 │
│[1m Hazardous [0m│    0 │       276 │        4 │   16 │
│[1m  Moderate [0m│    0 │         6 │      377 │    0 │
│[1m      Poor [0m│   17 │        27 │        0 │  174 │
└───────────┴──────┴───────────┴──────────┴──────┘


# K-Nearest Neighbors

In [128]:
kNNClassifier = MLJ.@load KNNClassifier pkg=NearestNeighborModels verbosity=0

NearestNeighborModels.KNNClassifier

In [129]:
############# 4. kNN (6 values) #############
knn_search_space = [
    Dict("K"=>1),
    Dict("K"=>3),
    Dict("K"=>5),
    Dict("K"=>7),
    Dict("K"=>9),
    Dict("K"=>11)
]

6-element Vector{Dict{String, Int64}}:
 Dict("K" => 1)
 Dict("K" => 3)
 Dict("K" => 5)
 Dict("K" => 7)
 Dict("K" => 9)
 Dict("K" => 11)

In [130]:
########################
# 4. KNN GRID SEARCH
########################
knn_results = []

for hp in knn_search_space
    for dim in dimsPCA
        println("\n=== kNN experiment: K=$(hp["K"]) | PCA maxoutdim = $(dim) ===")
        res = modelCrossValidationPCA(:KNeighborsClassifier, hp, (trainingInputs, trainingTargets), crossValidationIndices, dim)
        push!(knn_results, (model=:KNN, hyperparams=hp, dimPCA=dim, results=res))
    end
end

results[:KNN] = knn_results


=== kNN experiment: K=1 | PCA maxoutdim = 6 ===

=== kNN experiment: K=1 | PCA maxoutdim = 7 ===

=== kNN experiment: K=1 | PCA maxoutdim = 8 ===

=== kNN experiment: K=3 | PCA maxoutdim = 6 ===

=== kNN experiment: K=3 | PCA maxoutdim = 7 ===

=== kNN experiment: K=3 | PCA maxoutdim = 8 ===

=== kNN experiment: K=5 | PCA maxoutdim = 6 ===

=== kNN experiment: K=5 | PCA maxoutdim = 7 ===

=== kNN experiment: K=5 | PCA maxoutdim = 8 ===

=== kNN experiment: K=7 | PCA maxoutdim = 6 ===

=== kNN experiment: K=7 | PCA maxoutdim = 7 ===

=== kNN experiment: K=7 | PCA maxoutdim = 8 ===

=== kNN experiment: K=9 | PCA maxoutdim = 6 ===

=== kNN experiment: K=9 | PCA maxoutdim = 7 ===

=== kNN experiment: K=9 | PCA maxoutdim = 8 ===

=== kNN experiment: K=11 | PCA maxoutdim = 6 ===

=== kNN experiment: K=11 | PCA maxoutdim = 7 ===

=== kNN experiment: K=11 | PCA maxoutdim = 8 ===


18-element Vector{Any}:
 (model = :KNN, hyperparams = Dict("K" => 1), dimPCA = 6, results = ((0.9144928f0, 0.014752504f0), (0.08550726f0, 0.014752496f0), (0.9144928f0, 0.014752504f0), (0.97475255f0, 0.0050279847f0), (0.91425735f0, 0.014948676f0), (0.97718126f0, 0.0047332426f0), (0.914081f0, 0.014821795f0), Float32[262.4 1.6 5.0 4.4; 2.6 173.8 10.0 4.8; 4.2 8.6 201.8 9.0; 5.0 6.6 6.6 93.6]))
 (model = :KNN, hyperparams = Dict("K" => 1), dimPCA = 7, results = ((0.91949683f0, 0.012265669f0), (0.080503166f0, 0.0122656645f0), (0.91949683f0, 0.012265669f0), (0.97496843f0, 0.0050757597f0), (0.91841316f0, 0.012384645f0), (0.9790126f0, 0.0044209934f0), (0.91873264f0, 0.012305275f0), Float32[264.2 0.8 5.6 2.8; 3.2 173.0 10.2 4.8; 5.0 7.0 204.2 7.4; 4.4 7.2 6.0 94.2]))
 (model = :KNN, hyperparams = Dict("K" => 1), dimPCA = 8, results = ((0.91624653f0, 0.011937856f0), (0.0837535f0, 0.011937853f0), (0.91624653f0, 0.011937856f0), (0.97444886f0, 0.0040559336f0), (0.91527414f0, 0.011805885f0), (0.9790

In [26]:
for entry in results[:KNN]
    printExperimentResult(entry.model, entry.hyperparams, entry.dimPCA, entry.results, label_names)
end


 Model: KNN | PCA outdim: 6
 Hyperparameters: Dict("K" => 1)
 Accuracy (mean)               : 0.9145
 Accuracy (std)                : 0.0148
 Error Rate (mean)             : 0.0855
 Error Rate (std)              : 0.0148
 Sensitivity/Recall (mean)     : 0.9145
 Sensitivity/Recall (std)      : 0.0148
 Specificity (mean)            : 0.9748
 Specificity (std)             : 0.005
 PPV (mean)                    : 0.9143
 PPV (std)                     : 0.0149
 NPV (mean)                    : 0.9772
 NPV (std)                     : 0.0047
 F1 Score (mean)               : 0.9141
 F1 Score (std)                : 0.0148

Confusion Matrix:
Float32[262.4 1.6 5.0 4.4; 2.6 173.8 10.0 4.8; 4.2 8.6 201.8 9.0; 5.0 6.6 6.6 93.6]
┌───────────┬───────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼───────┼───────────┼──────────┼──────┤
│[1m      Good [0m│ 262.4 │       1.6 │      5.0 │  4.4 │
│[1m Hazardous [0m

In [133]:
########################
# 4. KNN FINAL TESTING 
########################

# BEST HYPERPARAMETERS
hp = Dict("K"=>5)
dimPCA = 7

# Compute normalization parameters from TRAINING set only
normParams = calculateMinMaxNormalizationParameters(trainingInputs)
train_df = normalizeMinMax(trainingInputs, normParams)
test_df = normalizeMinMax(testInputs, normParams)

pca_model = PCA_model(maxoutdim=dimPCA)
# Train the PCA model
pca_mach = machine(pca_model, MLJ.table(train_df))
MLJ.fit!(pca_mach, verbosity=0)
# Transform the data
train_df = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(train_df)))
test_df  = MLJBase.matrix(MLJBase.transform(pca_mach, MLJ.table(test_df)))

# --- Convert targets ---
train_y_cat = categorical(vec(trainingTargets))
test_y_cat  = categorical(vec(testTargets))

# Load model (insert hyperparameters manually)
model = kNNClassifier(K=5)

# --- Train machine ---
mach = machine(model, MLJ.table(train_df), train_y_cat)
MLJ.fit!(mach)
    
# --- Predict ---
ŷ = MLJ.predict(mach, MLJ.table(test_df))              # probabilistic predictions
y_pred = CategoricalArray(mode.(ŷ))         # convert to class labels
    
# --- Compute metrics ---
metrics = confusionMatrix(y_pred, test_y_cat)

printPCAResult(:KNN, hp, metrics, dimPCA, label_names)


 Model: KNN outdim 7 
 Hyperparameters: Dict("K" => 5)
 Accuracy                : 0.912
 Error Rate              : 0.088
 Sensitivity/Recall      : 0.912
 Specificity             : 0.9709
 PPV                     : 0.9115
 NPV                     : 0.9779
 F1 Score                : 0.9105

Confusion Matrix:
[71 0 0 32; 0 277 7 12; 0 0 383 0; 10 27 0 181]

┌───────────┬──────┬───────────┬──────────┬──────┐
│[1m           [0m│[1m Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m Poor [0m│
├───────────┼──────┼───────────┼──────────┼──────┤
│[1m      Good [0m│   71 │         0 │        0 │   32 │
│[1m Hazardous [0m│    0 │       277 │        7 │   12 │
│[1m  Moderate [0m│    0 │         0 │      383 │    0 │
│[1m      Poor [0m│   10 │        27 │        0 │  181 │
└───────────┴──────┴───────────┴──────────┴──────┘


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(KNNClassifier(K = 5, …), …).


# Stacking Ensemble

In [134]:
SVMClassifier = @load ProbabilisticSVC pkg=LIBSVM verbosity=0
DTClassifier  = @load DecisionTreeClassifier pkg=DecisionTree verbosity=0
kNNClassifier = @load KNNClassifier pkg=NearestNeighborModels verbosity=0
PCA = MLJ.@load PCA pkg="MultivariateStats" verbosity=0

trainIdx, testIdx = holdOut(size(df,1), 0.2)

trainingInputs  = inputs[trainIdx, :]
testInputs      = inputs[testIdx, :]

trainingTargets = targets[trainIdx]
testTargets     = targets[testIdx]

# X as DataFrame
train_df = DataFrame(trainingInputs, :auto)
test_df  = DataFrame(testInputs, :auto)

# y as categorical
train_y_cat = categorical(trainingTargets)
test_y_cat  = categorical(testTargets)

# ---- Base Models ----
svm_pca = Pipeline(
    pca = PCA(maxoutdim=8),
    model = SVMClassifier(kernel=LIBSVM.Kernel.RadialBasis, cost=1.0, gamma=2.0)
)

dt_pca = Pipeline(
    pca = PCA(maxoutdim=8),
    model = DTClassifier(max_depth=5)
)

knn_pca = Pipeline(
    pca = PCA(maxoutdim=7),
    model = kNNClassifier(K=5)
)

# ---- Stacking Model ----
stack_model = Stack(;
    metalearner = DTClassifier(max_depth=3, rng=Random.MersenneTwister(1)),
    resampling = CV(nfolds=5, shuffle=true, rng=123),
    measure = accuracy, 
    svm = svm_pca,
    knn = knn_pca,
    dt = dt_pca
)

# Train the stacking model on your train dataset
stack_mach = machine(stack_model, train_df, train_y_cat) |> MLJ.fit!

y_pred = mode.(MLJ.predict(stack_mach, test_df))
acc = MLJ.accuracy(y_pred, test_y_cat)
println("Stack ensemble accuracy = $(round(acc*100, digits=2)) %")

[33m[1m└ [22m[39m[90m@ MLJBase ~/.julia/packages/MLJBase/GY2fM/src/composition/models/stacking.jl:187[39m
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(ProbabilisticStack(metalearner = DecisionTreeClassifier(max_depth = 3, …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:svm, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:knn, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:dt, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:svm, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:knn, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:dt, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:svm, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:knn, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:dt, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:svm, …).
[36m[1m[ [22

Stack ensemble accuracy = 63.0 %
