# Load packages

In [21]:
# Install only the packages that are not yet direct dependencies of the active project,
# so re-running this cell does not reinstall anything.
using Pkg

pkgs = [
    "MLJ", "MLJBase", "MLJModels", "MLJEnsembles", "MLJLinearModels",
    "DecisionTree", "MLJDecisionTreeInterface", "NaiveBayes",
    "MLJNaiveBayesInterface", "EvoTrees", "CategoricalArrays", "Random",
    "LIBSVM", "MLJLIBSVMInterface", "Plots", "MLJModelInterface",
    "CSV", "DataFrames", "UrlDownload", "XGBoost", "NNlib",
    # Loaded by later cells (`using Flux`, the KNNClassifier loader,
    # `PrettyTables.pretty_table`) but previously absent from this list
    "Flux", "NearestNeighborModels", "PrettyTables"
]

# Filter out packages already installed; the project's dependency names are looked up
# once instead of once per list entry
missing_pkgs = let installed = keys(Pkg.project().dependencies)
    filter(!in(installed), pkgs)
end

if isempty(missing_pkgs)
    println(" All required packages are already installed.")
else
    println("Installing missing packages: ", missing_pkgs)
    Pkg.add(missing_pkgs)
end


 All required packages are already installed.


In [22]:
using MLJ
using LIBSVM
using NNlib
using Flux
using Flux.Losses
using Statistics

In [23]:
# Load the project's helper functions into this session.
# NOTE(review): utils.jl is not shown here; presumably it defines the helpers called
# below (crossvalidation, modelCrossValidation, holdOut) — confirm.
include("utils.jl")
# Set a global random seed for reproducibility
using Random
# Seeds the default RNG with the fixed value 42
Random.seed!(42)

TaskLocalRNG()

# Load Data

In [24]:
using CSV, DataFrames, Random
using CategoricalArrays

# Read the pollution dataset; the last column is treated as the class label below
df = CSV.read("./data/updated_pollution_dataset.csv", DataFrame)

# Some log (first() tolerates tables with fewer than 5 rows, unlike df[1:5, :])
println("First 5 rows of df:")
show(first(df, 5), allcols=true)

# Convert last column to categorical (in-place!)
df[!, end] = categorical(df[!, end])

# Extract the integer codes of the categories.
# levelcode is the 1-based position of each value within levels(column); the levels
# are not reordered here, so the codes carry no severity ordering.
targets = Float32.(levelcode.(df[!, end]))

# Use all columns except the last one as inputs
inputs = Matrix{Float32}(df[:, 1:end-1])

println("First 5 inputs::")
# Bounded by the actual row count so a short dataset does not raise a BoundsError
for i in 1:min(5, size(inputs, 1))
    println(inputs[i, :])
end

println("\n\nFirst 5 targets:")
println(first(targets, 5))

# Extract labels (categories) as strings.
# Uses `end` like every other access to the target column instead of a hard-coded
# column number, so the cell keeps working if feature columns are added or removed.
label_names = levels(df[!, end])
println("Labels: ", label_names)

First 5 rows of df:
[1m5×10 DataFrame[0m
[1m Row [0m│[1m Temperature [0m[1m Humidity [0m[1m PM2.5   [0m[1m PM10    [0m[1m NO2     [0m[1m SO2     [0m[1m CO      [0m[1m Proximity_to_Industrial_Areas [0m[1m Population_Density [0m[1m Air Quality [0m
     │[90m Float64     [0m[90m Float64  [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64 [0m[90m Float64                       [0m[90m Int64              [0m[90m String15    [0m
─────┼────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
   1 │        29.8      59.1      5.2     17.9     18.9      9.2     1.72                            6.3                 319  Moderate
   2 │        28.3      75.6      2.3     12.2     30.8      9.7     1.64                            6.0                 611  Moderate
   3 │        23.1      74.7     26.7     33.8     24.4     12.6     1.63                   

[29.8, 59.1, 5.2, 17.9, 18.9, 9.2, 1.72, 6.3, 319.0]
Float32[28.3, 75.6, 2.3, 12.2, 30.8, 9.7, 1.64, 6.0, 611.0]
Float32[23.1, 74.7, 26.7, 33.8, 24.4, 12.6, 1.63, 5.2, 619.0]
Float32[27.1, 39.1, 6.1, 6.3, 13.5, 5.3, 1.15, 11.1, 551.0]
Float32[26.5, 70.7, 6.9, 16.0, 21.9, 5.6, 1.01, 12.7, 303.0]


First 5 targets:
Float32[3.0, 3.0, 3.0, 1.0, 1.0]
Labels: String15["Good", "Hazardous", "Moderate", "Poor"]


In [11]:
# Registry of experiment outcomes; later cells store one vector of records per model
# family under a Symbol key (e.g. results[:ANN])
results = Dict()
# One fold index per sample, reused by every experiment below so all models are
# evaluated on the same partition.
# NOTE(review): crossvalidation is not defined in this notebook — presumably it comes
# from utils.jl and the second argument is the number of folds; confirm.
crossValidationIndices = crossvalidation(targets, 5)

5000-element Vector{Int64}:
 1
 3
 5
 4
 1
 3
 3
 5
 1
 1
 5
 1
 2
 ⋮
 3
 4
 4
 4
 5
 1
 2
 4
 4
 3
 5
 1

In [12]:
"""
    printExperimentResult(model, hyperparams, results, class_labels)

Print a human-readable report for one cross-validated experiment.

# Arguments
- `model`: identifier of the model, interpolated into the header.
- `hyperparams`: hyperparameter collection, interpolated into the header.
- `results`: eight-element collection holding seven `(mean, std)` pairs in the order
  accuracy, error rate, sensitivity, specificity, PPV, NPV, F1, followed by the
  confusion matrix.
- `class_labels`: labels used as both column header and row labels of the
  confusion-matrix table.

Every statistic is rounded to 4 digits. The confusion matrix is printed raw and then
as a table.

NOTE(review): `PrettyTables` and `DataFrame` are not imported by this function; they
must already be in scope (presumably via utils.jl / an earlier cell — confirm).
"""
function printExperimentResult(model, hyperparams, results, class_labels)
    accuracy, error_rate, sensitivity, specificity, ppv, npv, f1, cm = results

    # One row per metric: label for the mean, label for the std, and the (mean, std) pair
    report_rows = (
        (" Accuracy (mean)               : ", " Accuracy (std)                : ", accuracy),
        (" Error Rate (mean)             : ", " Error Rate (std)              : ", error_rate),
        (" Sensitivity/Recall (mean)     : ", " Sensitivity/Recall (std)      : ", sensitivity),
        (" Specificity (mean)            : ", " Specificity (std)             : ", specificity),
        (" PPV (mean)                    : ", " PPV (std)                     : ", ppv),
        (" NPV (mean)                    : ", " NPV (std)                     : ", npv),
        (" F1 Score (mean)               : ", " F1 Score (std)                : ", f1),
    )

    rule = "====================================================="

    println("\n", rule)
    println(" Model: ", model)
    println(" Hyperparameters: ", hyperparams)
    println(rule)

    for (mean_label, std_label, stats) in report_rows
        avg, spread = stats
        println(mean_label, round(avg, digits=4))
        println(std_label, round(spread, digits=4))
    end

    println("\nConfusion Matrix:")
    println(cm)

    # Same matrix again, as a labelled table
    cm_table = DataFrame(cm, :auto)
    PrettyTables.pretty_table(cm_table; header=class_labels, row_labels=class_labels)

    println(rule, "\n")
end


printExperimentResult (generic function with 1 method)

# Artificial Neural Networks

In [13]:
############# 1. ARTIFICIAL NEURAL NETWORKS (8+ topologies) #############
# Training settings common to every ANN experiment; each search-space entry is merged
# on top of this dictionary before cross-validation.
default_ann = Dict(
    "numExecutions"   => 5,
    #"transferFunctions" => [σ, σ, σ, σ],
    "maxEpochs"       => 200,
    "minLoss"         => 0.0,
    "learningRate"    => 0.01,
    "validationRatio" => 0.1,
    "maxEpochsVal"    => 20,
)

# One dictionary per candidate topology (a vector of integers per entry)
ann_search_space = [
    Dict("topology" => layers) for layers in (
        [4, 4], [8, 8], [16, 16], [10, 4],
        [10, 6, 4], [10, 8, 4], [10, 8, 6, 4], [10, 12, 6, 4],
    )
]

8-element Vector{Dict{String, Vector{Int64}}}:
 Dict("topology" => [4, 4])
 Dict("topology" => [8, 8])
 Dict("topology" => [16, 16])
 Dict("topology" => [10, 4])
 Dict("topology" => [10, 6, 4])
 Dict("topology" => [10, 8, 4])
 Dict("topology" => [10, 8, 6, 4])
 Dict("topology" => [10, 12, 6, 4])

In [14]:
########################
# 1. ANN GRID SEARCH
########################
# Cross-validate each topology and keep one (model, hyperparams, results) record per run
ann_results = []

for topologyChoice in ann_search_space
    layerSizes = topologyChoice["topology"]
    println("\n=== ANN experiment: topology = ", layerSizes, " ===")
    cvOutcome = modelCrossValidation(
        :ANN,
        merge(default_ann, topologyChoice),  # the search-space entry overrides the defaults
        (inputs, targets),
        crossValidationIndices,
    )
    push!(ann_results, (model=:ANN, hyperparams=topologyChoice, results=cvOutcome))
end

results[:ANN] = ann_results


=== ANN experiment: topology = [4, 4] ===

=== ANN experiment: topology = [8, 8] ===

=== ANN experiment: topology = [16, 16] ===

=== ANN experiment: topology = [10, 4] ===

=== ANN experiment: topology = [10, 6, 4] ===

=== ANN experiment: topology = [10, 8, 4] ===

=== ANN experiment: topology = [10, 8, 6, 4] ===

=== ANN experiment: topology = [10, 12, 6, 4] ===


8-element Vector{Any}:
 (model = :ANN, hyperparams = Dict("topology" => [4, 4]), results = ((0.94048005f0, 0.007690985f0), (0.05952f0, 0.0076910355f0), (0.94048005f0, 0.007690985f0), (0.9826414f0, 0.0017484212f0), (0.94006604f0, 0.0076740775f0), (0.98477715f0, 0.0020668781f0), (0.94012755f0, 0.007616604f0), Float32[332.28 0.71999997 6.6 0.4; 0.6 243.96 8.2 7.2399993; 11.96 8.12 175.48001 4.44; 0.4 5.12 5.72 188.76]))
 (model = :ANN, hyperparams = Dict("topology" => [8, 8]), results = ((0.94152004f0, 0.007279571f0), (0.058480002f0, 0.00727956f0), (0.94152004f0, 0.007279571f0), (0.98277295f0, 0.001393233f0), (0.9412157f0, 0.007079065f0), (0.9850707f0, 0.0020939773f0), (0.94120026f0, 0.007131832f0), Float32[332.03998 0.76000005 6.8 0.4; 0.68 244.6 8.12 6.6; 11.6 7.84 176.76 3.8; 0.44 5.2400002 6.2 188.12]))
 (model = :ANN, hyperparams = Dict("topology" => [16, 16]), results = ((0.94128f0, 0.007293288f0), (0.058720004f0, 0.0072932867f0), (0.94128f0, 0.007293288f0), (0.98263663f0, 0.0014149

In [15]:
# Report every ANN run in the order it was recorded
foreach(results[:ANN]) do run
    printExperimentResult(run.model, run.hyperparams, run.results, label_names)
end


 Model: ANN
 Hyperparameters: Dict("topology" => [4, 4])
 Accuracy (mean)               : 0.9405
 Accuracy (std)                : 0.0077
 Error Rate (mean)             : 0.0595
 Error Rate (std)              : 0.0077
 Sensitivity/Recall (mean)     : 0.9405
 Sensitivity/Recall (std)      : 0.0077
 Specificity (mean)            : 0.9826
 Specificity (std)             : 0.0017
 PPV (mean)                    : 0.9401
 PPV (std)                     : 0.0077
 NPV (mean)                    : 0.9848
 NPV (std)                     : 0.0021
 F1 Score (mean)               : 0.9401
 F1 Score (std)                : 0.0076

Confusion Matrix:
Float32[332.28 0.71999997 6.6 0.4; 0.6 243.96 8.2 7.2399993; 11.96 8.12 175.48001 4.44; 0.4 5.12 5.72 188.76]
┌───────────┬────────┬───────────┬──────────┬────────┐
│[1m           [0m│[1m   Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m   Poor [0m│
├───────────┼────────┼───────────┼──────────┼────────┤
│[1m      Good [0m│ 332.28 │      0.72 │      

# Support Vector Machines

In [16]:
# Load the MLJ model type for LIBSVM's SVC and bind it to SVMClassifier
# (verbosity=0 silences the loader's log output)
SVMClassifier = MLJ.@load SVC pkg=LIBSVM verbosity=0

MLJLIBSVMInterface.SVC

In [17]:
############# 2. SVM (8+ configs: kernels × C) #############
# Fallback kernel parameters; a search-space entry replaces any key it also defines
# when the two dictionaries are merged.
default_svm = Dict("gamma" => 1.0, "degree" => Int32(3), "coef0" => 0.0)

svm_search_space = [
    # Linear kernel: only the cost C varies
    (Dict("kernel" => "linear", "C" => cost) for cost in (0.1, 1.0, 10.0))...,

    # RBF kernel: cost paired with its own gamma
    (Dict("kernel" => "rbf", "C" => cost, "gamma" => width)
     for (cost, width) in ((1.0, 2.0), (10.0, 0.5)))...,

    # Sigmoid kernel
    Dict("kernel" => "sigmoid", "C" => 1.0, "gamma" => 1.0),

    # Polynomial kernel: cost, degree and gamma all vary
    Dict("kernel" => "poly", "C" => 1.0, "degree" => 3, "gamma" => 1),
    Dict("kernel" => "poly", "C" => 5.0, "degree" => 4, "gamma" => 0.5),
]

8-element Vector{Dict{String, Any}}:
 Dict("C" => 0.1, "kernel" => "linear")
 Dict("C" => 1.0, "kernel" => "linear")
 Dict("C" => 10.0, "kernel" => "linear")
 Dict("C" => 1.0, "kernel" => "rbf", "gamma" => 2.0)
 Dict("C" => 10.0, "kernel" => "rbf", "gamma" => 0.5)
 Dict("C" => 1.0, "kernel" => "sigmoid", "gamma" => 1.0)
 Dict("C" => 1.0, "kernel" => "poly", "gamma" => 1, "degree" => 3)
 Dict("C" => 5.0, "kernel" => "poly", "gamma" => 0.5, "degree" => 4)

In [18]:
########################
# 2. SVM GRID SEARCH
########################
# Cross-validate each kernel/cost configuration and keep one record per run
svm_results = []

for svmChoice in svm_search_space
    kernelName = svmChoice["kernel"]
    costLabel = get(svmChoice, "C", "-")  # "-" is shown when an entry defines no cost
    println("\n=== SVM experiment: kernel=", kernelName, " C=", costLabel, " ===")
    cvOutcome = modelCrossValidation(
        :SVC,
        merge(default_svm, svmChoice),  # the search-space entry overrides the defaults
        (inputs, targets),
        crossValidationIndices,
    )
    push!(svm_results, (model=:SVC, hyperparams=svmChoice, results=cvOutcome))
end

results[:SVC] = svm_results


=== SVM experiment: kernel=linear C=0.1 ===

=== SVM experiment: kernel=linear C=1.0 ===

=== SVM experiment: kernel=linear C=10.0 ===

=== SVM experiment: kernel=rbf C=1.0 ===

=== SVM experiment: kernel=rbf C=10.0 ===

=== SVM experiment: kernel=sigmoid C=1.0 ===

=== SVM experiment: kernel=poly C=1.0 ===

=== SVM experiment: kernel=poly C=5.0 ===


8-element Vector{Any}:
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 0.1, "kernel" => "linear"), results = ((0.9252f0, 0.0096020885f0), (0.0748f0, 0.009602084f0), (0.9252f0, 0.0096020885f0), (0.97350633f0, 0.0036459977f0), (0.9255888f0, 0.008998138f0), (0.98270977f0, 0.001990787f0), (0.92350817f0, 0.010069727f0), Float32[332.0 3.2 3.4 1.4; 0.2 241.8 9.8 8.2; 18.8 9.0 170.6 1.6; 2.0 6.8 10.4 180.8]))
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 1.0, "kernel" => "linear"), results = ((0.9374f0, 0.00572712f0), (0.0626f0, 0.005727128f0), (0.9374f0, 0.00572712f0), (0.9799453f0, 0.002152039f0), (0.93695223f0, 0.005510538f0), (0.98477143f0, 0.0015064947f0), (0.9366585f0, 0.005720074f0), Float32[333.4 1.0 5.4 0.2; 0.2 244.4 7.6 7.8; 15.6 8.0 173.4 3.0; 1.0 5.6 7.2 186.2]))
 (model = :SVC, hyperparams = Dict{String, Any}("C" => 10.0, "kernel" => "linear"), results = ((0.94020003f0, 0.007085182f0), (0.0598f0, 0.0070851957f0), (0.94020003f0, 0.007085182f0), (0.98172456f0, 0.001

In [19]:
# Report every SVM run in the order it was recorded
foreach(results[:SVC]) do run
    printExperimentResult(run.model, run.hyperparams, run.results, label_names)
end


 Model: SVC
 Hyperparameters: Dict{String, Any}("C" => 0.1, "kernel" => "linear")
 Accuracy (mean)               : 0.9252
 Accuracy (std)                : 0.0096
 Error Rate (mean)             : 0.0748
 Error Rate (std)              : 0.0096
 Sensitivity/Recall (mean)     : 0.9252
 Sensitivity/Recall (std)      : 0.0096
 Specificity (mean)            : 0.9735
 Specificity (std)             : 0.0036
 PPV (mean)                    : 0.9256
 PPV (std)                     : 0.009
 NPV (mean)                    : 0.9827
 NPV (std)                     : 0.002
 F1 Score (mean)               : 0.9235
 F1 Score (std)                : 0.0101

Confusion Matrix:
Float32[332.0 3.2 3.4 1.4; 0.2 241.8 9.8 8.2; 18.8 9.0 170.6 1.6; 2.0 6.8 10.4 180.8]
┌───────────┬───────┬───────────┬──────────┬───────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m  Poor [0m│
├───────────┼───────┼───────────┼──────────┼───────┤
│[1m      Good [0m│ 332.0 │       3.2 │      3.4 │   

# Decision Trees

In [20]:
# Load the MLJ model type for DecisionTree's DecisionTreeClassifier and bind it to DTClassifier
# (verbosity=0 silences the loader's log output)
DTClassifier = MLJ.@load DecisionTreeClassifier pkg=DecisionTree verbosity=0

MLJDecisionTreeInterface.DecisionTreeClassifier

In [21]:
############# 3. DECISION TREES (6 depths) #############
# Settings shared by every tree experiment: a Mersenne Twister generator seeded with 1
default_dt = Dict("rng" => Random.MersenneTwister(1))

# One dictionary per candidate maximum depth
dt_search_space = [Dict("max_depth" => depth) for depth in (2, 3, 4, 5, 6, 8)]

6-element Vector{Dict{String, Int64}}:
 Dict("max_depth" => 2)
 Dict("max_depth" => 3)
 Dict("max_depth" => 4)
 Dict("max_depth" => 5)
 Dict("max_depth" => 6)
 Dict("max_depth" => 8)

In [22]:
########################
# 3. DECISION TREE GRID SEARCH
########################
# Cross-validate each max_depth and keep one (model, hyperparams, results) record per run
dt_results = []

for hp in dt_search_space
    println("\n=== Decision Tree experiment: max_depth=$(hp["max_depth"]) ===")
    # default_dt stores ONE mutable MersenneTwister. Handing that same object to every
    # configuration makes each result depend on how many random numbers the previous
    # experiments (or previous runs of this cell) consumed. A fresh generator with the
    # same seed per configuration makes every run independently reproducible.
    # NOTE(review): whether modelCrossValidation actually consumes "rng" is not visible
    # here — confirm against utils.jl.
    full_hp = merge(default_dt, hp, Dict("rng" => Random.MersenneTwister(1)))
    res = modelCrossValidation(:DecisionTreeClassifier, full_hp, (inputs, targets), crossValidationIndices)
    push!(dt_results, (model=:DT, hyperparams=hp, results=res))
end

results[:DT] = dt_results


=== Decision Tree experiment: max_depth=2 ===

=== Decision Tree experiment: max_depth=3 ===

=== Decision Tree experiment: max_depth=4 ===

=== Decision Tree experiment: max_depth=5 ===

=== Decision Tree experiment: max_depth=6 ===

=== Decision Tree experiment: max_depth=8 ===


6-element Vector{Any}:
 (model = :DT, hyperparams = Dict("max_depth" => 2), results = ((0.8222f0, 0.0045497343f0), (0.1778f0, 0.0045497245f0), (0.8222f0, 0.0045497343f0), (0.9438714f0, 0.0021794732f0), (0.7564937f0, 0.004453325f0), (0.96373355f0, 0.0021378451f0), (0.7859414f0, 0.0045425417f0), Float32[325.0 0.6 12.6 1.8; 1.8 216.4 31.0 10.8; 32.2 22.2 144.8 0.8; 0.8 22.6 40.6 136.0]))
 (model = :DT, hyperparams = Dict("max_depth" => 3), results = ((0.8837999f0, 0.0050695124f0), (0.11619999f0, 0.005069519f0), (0.8837999f0, 0.0050695124f0), (0.96868813f0, 0.0024017014f0), (0.8834666f0, 0.0053703277f0), (0.97072375f0, 0.0022297632f0), (0.88238287f0, 0.005320001f0), Float32[325.0 0.6 12.6 1.8; 1.2 227.0 18.8 13.0; 14.8 17.8 158.6 8.8; 0.6 10.0 16.2 173.2]))
 (model = :DT, hyperparams = Dict("max_depth" => 4), results = ((0.8948f0, 0.00831866f0), (0.10520001f0, 0.008318653f0), (0.8948f0, 0.00831866f0), (0.97100556f0, 0.0038301658f0), (0.8956172f0, 0.008752802f0), (0.9716833f0, 0.0023273032f

In [23]:
# Report every decision-tree run in the order it was recorded
foreach(results[:DT]) do run
    printExperimentResult(run.model, run.hyperparams, run.results, label_names)
end


 Model: DT
 Hyperparameters: Dict("max_depth" => 2)
 Accuracy (mean)               : 0.8222
 Accuracy (std)                : 0.0045
 Error Rate (mean)             : 0.1778
 Error Rate (std)              : 0.0045
 Sensitivity/Recall (mean)     : 0.8222
 Sensitivity/Recall (std)      : 0.0045
 Specificity (mean)            : 0.9439
 Specificity (std)             : 0.0022
 PPV (mean)                    : 0.7565
 PPV (std)                     : 0.0045
 NPV (mean)                    : 0.9637
 NPV (std)                     : 0.0021
 F1 Score (mean)               : 0.7859
 F1 Score (std)                : 0.0045

Confusion Matrix:
Float32[325.0 0.6 12.6 1.8; 1.8 216.4 31.0 10.8; 32.2 22.2 144.8 0.8; 0.8 22.6 40.6 136.0]
┌───────────┬───────┬───────────┬──────────┬───────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m  Poor [0m│
├───────────┼───────┼───────────┼──────────┼───────┤
│[1m      Good [0m│ 325.0 │       0.6 │     12.6 │   1.8 │
│[1m Hazardous 

# K-Nearest Neighbors

In [24]:
# Load the MLJ model type for NearestNeighborModels' KNNClassifier and bind it to kNNClassifier
# (verbosity=0 silences the loader's log output)
kNNClassifier = MLJ.@load KNNClassifier pkg=NearestNeighborModels verbosity=0

NearestNeighborModels.KNNClassifier

In [25]:
############# 4. kNN (6 values) #############
# One dictionary per candidate neighbourhood size: the odd values 1, 3, ..., 11
knn_search_space = [Dict("K" => neighbours) for neighbours in 1:2:11]

6-element Vector{Dict{String, Int64}}:
 Dict("K" => 1)
 Dict("K" => 3)
 Dict("K" => 5)
 Dict("K" => 7)
 Dict("K" => 9)
 Dict("K" => 11)

In [26]:
########################
# 4. KNN GRID SEARCH
########################
# Cross-validate each neighbourhood size and keep one record per run
knn_results = []

for knnChoice in knn_search_space
    println("\n=== kNN experiment: K=", knnChoice["K"], " ===")
    cvOutcome = modelCrossValidation(
        :KNeighborsClassifier,
        knnChoice,  # passed as-is: this model has no shared defaults to merge
        (inputs, targets),
        crossValidationIndices,
    )
    push!(knn_results, (model=:KNN, hyperparams=knnChoice, results=cvOutcome))
end

results[:KNN] = knn_results


=== kNN experiment: K=1 ===

=== kNN experiment: K=3 ===

=== kNN experiment: K=5 ===

=== kNN experiment: K=7 ===

=== kNN experiment: K=9 ===

=== kNN experiment: K=11 ===


6-element Vector{Any}:
 (model = :KNN, hyperparams = Dict("K" => 1), results = ((0.9122f0, 0.0056745023f0), (0.0878f0, 0.0056745037f0), (0.9122f0, 0.0056745023f0), (0.97318095f0, 0.0014626426f0), (0.9104317f0, 0.005780211f0), (0.97883034f0, 0.0013779422f0), (0.9107488f0, 0.0059098597f0), Float32[330.8 1.2 6.2 1.8; 0.2 238.6 13.0 8.2; 19.0 11.6 162.4 7.0; 1.0 7.8 10.8 180.4]))
 (model = :KNN, hyperparams = Dict("K" => 3), results = ((0.92399997f0, 0.0075166505f0), (0.076f0, 0.0075166477f0), (0.92399997f0, 0.0075166505f0), (0.9755238f0, 0.0017911857f0), (0.92290366f0, 0.0071387487f0), (0.98230803f0, 0.0011934639f0), (0.9225222f0, 0.007936818f0), Float32[332.6 1.0 4.4 2.0; 0.2 241.4 10.0 8.4; 18.4 9.6 167.2 4.8; 1.0 6.0 10.2 182.8]))
 (model = :KNN, hyperparams = Dict("K" => 5), results = ((0.92840004f0, 0.007956126f0), (0.0716f0, 0.0079561295f0), (0.92840004f0, 0.007956126f0), (0.9760127f0, 0.002605206f0), (0.92752886f0, 0.007603931f0), (0.98378193f0, 0.0013056949f0), (0.92684996f0, 0.00

In [27]:
# Report every kNN run in the order it was recorded
foreach(results[:KNN]) do run
    printExperimentResult(run.model, run.hyperparams, run.results, label_names)
end


 Model: KNN
 Hyperparameters: Dict("K" => 1)
 Accuracy (mean)               : 0.9122
 Accuracy (std)                : 0.0057
 Error Rate (mean)             : 0.0878
 Error Rate (std)              : 0.0057
 Sensitivity/Recall (mean)     : 0.9122
 Sensitivity/Recall (std)      : 0.0057
 Specificity (mean)            : 0.9732
 Specificity (std)             : 0.0015
 PPV (mean)                    : 0.9104
 PPV (std)                     : 0.0058
 NPV (mean)                    : 0.9788
 NPV (std)                     : 0.0014
 F1 Score (mean)               : 0.9107
 F1 Score (std)                : 0.0059

Confusion Matrix:
Float32[330.8 1.2 6.2 1.8; 0.2 238.6 13.0 8.2; 19.0 11.6 162.4 7.0; 1.0 7.8 10.8 180.4]
┌───────────┬───────┬───────────┬──────────┬───────┐
│[1m           [0m│[1m  Good [0m│[1m Hazardous [0m│[1m Moderate [0m│[1m  Poor [0m│
├───────────┼───────┼───────────┼──────────┼───────┤
│[1m      Good [0m│ 330.8 │       1.2 │      6.2 │   1.8 │
│[1m Hazardous [0m│   0.2

# Stacking Ensemble

In [25]:
# Model types for the ensemble. SVMClassifier is bound to the probabilistic SVC variant here.
SVMClassifier = @load ProbabilisticSVC pkg=LIBSVM verbosity=0
DTClassifier  = @load DecisionTreeClassifier pkg=DecisionTree verbosity=0
kNNClassifier = @load KNNClassifier pkg=NearestNeighborModels verbosity=0

# Hold-out split of the row indices.
# NOTE(review): holdOut is not defined in this notebook — presumably from utils.jl,
# returning (train, test) index collections with 0.2 as the test fraction; confirm.
trainIdx, testIdx = holdOut(size(df,1), 0.2)

trainingInputs  = inputs[trainIdx, :]
testInputs      = inputs[testIdx, :]

trainingTargets = targets[trainIdx]
testTargets     = targets[testIdx]

# X as DataFrame
train_df = DataFrame(trainingInputs, :auto)
test_df  = DataFrame(testInputs, :auto)

# y as categorical: convert the full target vector once and then index it, so the
# training and test vectors carry the same set of levels even if a class happens to
# be absent from one side of the split.
targets_cat = categorical(targets)
train_y_cat = targets_cat[trainIdx]
test_y_cat  = targets_cat[testIdx]

# ---- Base Models ----
# Declared as a NamedTuple so the order of the base learners is fixed by the source;
# a Dict has no guaranteed iteration order.
base_models_NT = (
    SVM = SVMClassifier(kernel=LIBSVM.Kernel.RadialBasis, cost=1.0, gamma=2.0),
    DT  = DTClassifier(max_depth=8, rng=Random.MersenneTwister(1)),
    kNN = kNNClassifier(K=7),
)

# String-keyed view of the same models, kept for code that looks them up by name
base_models_dict = Dict(String(name) => model for (name, model) in pairs(base_models_NT))

# ---- Stacking Model ----
stack_model = Stack(;
    metalearner = DTClassifier(max_depth=3, rng=Random.MersenneTwister(1)),
    resampling = CV(nfolds=5, shuffle=true, rng=123),  # deterministic
    measure = accuracy,
    base_models_NT...
)

# Train the stacking model on your train dataset
stack_mach = machine(stack_model, train_df, train_y_cat) |> MLJ.fit!

# Most probable class per test row (predict_mode replaces mode.(predict(...)))
y_pred = MLJ.predict_mode(stack_mach, test_df)
acc = MLJ.accuracy(y_pred, test_y_cat)
println("Stack ensemble accuracy = $(round(acc*100, digits=2)) %")

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(ProbabilisticStack(metalearner = DecisionTreeClassifier(max_depth = 3, …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:kNN, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:kNN, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:kNN, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:SVM, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:DT, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining 

Stack ensemble accuracy = 91.7 %
