In [1]:
include("utils/preprocessing.jl")
include("utils/model_evaluation.jl")
include("utils/data_loader.jl")
include("utils/visualization.jl")
include("utils/ml1_utils.jl")

evaluateAndPrintMetricsRanking (generic function with 1 method)

### Analysis of data ###

In [2]:
using Random
Random.seed!(123)

data = DataLoader.load_data_for_analysis("dataset\\star_classification.csv");
#Visualization.entry_visualization(data);

### Preprocessing ###

In [3]:
using DataFrames

data = DataLoader.load_data("dataset\\star_classification.csv");

# Preprocess the data

"""    This function does the following:
        - Balance the data using the undersampling method
        - Parse the data: chosing the correct columns for inputs and targets
        - Convert the input into an 2D array of floats
        - Normalize the inputs  ------> this should be done after the train-test split!!!!
"""

inputs, targets = Preprocessing.preprocess_data(data)

""" I've changed the preprocess_data function so it doesn't OneHotEncode the targets
    because it's not needed&advised for KNN, DT & SVM, only for ANN. 
    The OneHotEncoding for the ANN will be done in the modelCrossValidation function,
    which is called by the evaluateAndPrintMetricsRanking function.
"""

" I've changed the preprocess_data function so it doesn't OneHotEncode the targets\n    because it's not needed&advised for KNN, DT & SVM, only for ANN. \n    The OneHotEncoding for the ANN will be done in the modelCrossValidation function,\n    which is called by the evaluateAndPrintMetricsRanking function.\n"

In [4]:
println(typeof(inputs), typeof(targets))
println(size(inputs),size(targets))
println(targets[1:10])

Matrix{Float32}Vector{Any}
(56883, 5)(56883,)
Any["GALAXY", "GALAXY", "GALAXY", "GALAXY", "GALAXY", "GALAXY", "GALAXY", "GALAXY", "GALAXY", "GALAXY"]


In [5]:
"""Initial holdOut split of the data"""

N = size(inputs, 1)

# Split to train and test using the holdOut function
train_indices, test_indices = holdOut(N, 0.98)

# Extract training and testing data
train_inputs = inputs[train_indices, :]
train_targets = targets[train_indices]
test_inputs = inputs[test_indices, :]
test_targets = targets[test_indices]

# Check size of train and test sets
println("Train inputs: ", size(train_inputs))
println("Train targets: ", size(train_targets))
println("Test inputs: ", size(test_inputs))
println("Test targets: ", size(test_targets))

Train inputs: (39818, 5)
Train targets: (39818,)
Test inputs: (17065, 5)
Test targets: (17065,)


### Testing hyperparameters for each model ###

In [6]:
using ScikitLearn

@sk_import neural_network: MLPClassifier
@sk_import svm: SVC
@sk_import tree: DecisionTreeClassifier
@sk_import neighbors: KNeighborsClassifier

PyObject <class 'sklearn.neighbors._classification.KNeighborsClassifier'>

In [7]:
"""
Setting indices for the k-fold cross-validation
    we are about to do with the different models
"""
N=size(train_inputs,1)
k = 5 # number of folds
kFoldIndices = crossvalidation(N, k);

#### Decision Tree ####

In [8]:
# Define an array of hyperparameter dictionaries for the Decision Tree model
dtree_hyperparameters_array = [
    Dict("max_depth" => 3),
    Dict("max_depth" => 5),
    Dict("max_depth" => 10),
    Dict("max_depth" => 20),
    Dict("max_depth" => 50),
    Dict("max_depth" => 100) # Deeper trees can capture more detail but risk overfitting
]

# Call the function to evaluate the model using different sets of hyperparameters and print the ranking of metrics.
evaluateAndPrintMetricsRanking(:DecisionTree,dtree_hyperparameters_array, train_inputs, train_targets, kFoldIndices)


Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4
Training with set of hyperparameters 5
Training with set of hyperparameters 6

----- acc -----
Set of hyperparameters 3 -> mean: 0.782 Std. Dev.: 0.003
Set of hyperparameters 4 -> mean: 0.774 Std. Dev.: 0.003
Set of hyperparameters 5 -> mean: 0.764 Std. Dev.: 0.004
Set of hyperparameters 6 -> mean: 0.764 Std. Dev.: 0.004
Set of hyperparameters 2 -> mean: 0.697 Std. Dev.: 0.006
Set of hyperparameters 1 -> mean: 0.634 Std. Dev.: 0.004

----- sensitivity -----
Set of hyperparameters 3 -> mean: 0.782 Std. Dev.: 0.003
Set of hyperparameters 4 -> mean: 0.774 Std. Dev.: 0.003
Set of hyperparameters 5 -> mean: 0.764 Std. Dev.: 0.004
Set of hyperparameters 6 -> mean: 0.764 Std. Dev.: 0.004
Set of hyperparameters 2 -> mean: 0.697 Std. Dev.: 0.006
Set of hyperparameters 1 -> mean: 0.634 Std. Dev.: 0.004

----- specificity -----
Set of hyperpar

#### KNN ####

In [9]:
# Define an array of hyperparameter dictionaries for the kNN model
knn_hyperparameters_array = [
    Dict("n_neighbors" => 5),
    Dict("n_neighbors" => 10),
    Dict("n_neighbors" => 15),
    Dict("n_neighbors" => 20),
    Dict("n_neighbors" => 50),
    Dict("n_neighbors" => 100) # Large neighborhoods, smooths out predictions
]

# Call the function to evaluate the model using different sets of hyperparameters and print the ranking of metrics.
evaluateAndPrintMetricsRanking(:kNN,knn_hyperparameters_array, train_inputs, train_targets, kFoldIndices)

Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4
Training with set of hyperparameters 5
Training with set of hyperparameters 6

----- acc -----
Set of hyperparameters 2 -> mean: 0.822 Std. Dev.: 0.004
Set of hyperparameters 3 -> mean: 0.822 Std. Dev.: 0.003
Set of hyperparameters 1 -> mean: 0.822 Std. Dev.: 0.002
Set of hyperparameters 4 -> mean: 0.819 Std. Dev.: 0.003
Set of hyperparameters 5 -> mean: 0.805 Std. Dev.: 0.005
Set of hyperparameters 6 -> mean: 0.787 Std. Dev.: 0.005

----- sensitivity -----
Set of hyperparameters 2 -> mean: 0.822 Std. Dev.: 0.004
Set of hyperparameters 3 -> mean: 0.822 Std. Dev.: 0.003
Set of hyperparameters 1 -> mean: 0.822 Std. Dev.: 0.002
Set of hyperparameters 4 -> mean: 0.819 Std. Dev.: 0.003
Set of hyperparameters 5 -> mean: 0.805 Std. Dev.: 0.005
Set of hyperparameters 6 -> mean: 0.787 Std. Dev.: 0.005

----- specificity -----
Set of hyperpar

#### ANN ####

In [10]:
# Define an array of hyperparameter dictionaries for the ANN model
ann_hyperparameters_array = [
    # Two-layer architecture, moderate neurons
    Dict("architecture" => [50, 30], "activation" => "relu", "learning_rate" => 0.01, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10),

    # One-layer architecture, fewer neurons
    Dict("architecture" => [30], "activation" => "relu", "learning_rate" => 0.01, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10),

    # Two-layer, different activation function
    Dict("architecture" => [50, 30], "activation" => "tanh", "learning_rate" => 0.01, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10),

    # One-layer, lower learning rate
    Dict("architecture" => [30], "activation" => "relu", "learning_rate" => 0.001, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 2000, "repetitionsTraining" => 10),

    # Two-layer, higher learning rate
    Dict("architecture" => [50, 30], "activation" => "relu", "learning_rate" => 0.05, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10),

    # One-layer, logistic activation
    Dict("architecture" => [30], "activation" => "logistic", "learning_rate" => 0.01, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10),

    # Two-layer, more neurons, different activation
    Dict("architecture" => [70, 40], "activation" => "tanh", "learning_rate" => 0.01, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10),

    # One-layer, more neurons
    Dict("architecture" => [50], "activation" => "relu", "learning_rate" => 0.01, "validation_ratio" => 0.1, "n_iter_no_change" => 80, "max_iter" => 1000, "repetitionsTraining" => 10)
]

# Call the function to evaluate the model using different sets of hyperparameters and print the ranking of metrics.
evaluateAndPrintMetricsRanking(:ANN, ann_hyperparameters_array, train_inputs, train_targets, kFoldIndices)

Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4
Training with set of hyperparameters 5
Training with set of hyperparameters 6
Training with set of hyperparameters 7




Training with set of hyperparameters 8

----- acc -----
Set of hyperparameters 7 -> mean: 0.833 Std. Dev.: 0.003
Set of hyperparameters 1 -> mean: 0.832 Std. Dev.: 0.003
Set of hyperparameters 3 -> mean: 0.831 Std. Dev.: 0.001
Set of hyperparameters 5 -> mean: 0.819 Std. Dev.: 0.002
Set of hyperparameters 8 -> mean: 0.814 Std. Dev.: 0.004
Set of hyperparameters 2 -> mean: 0.807 Std. Dev.: 0.003
Set of hyperparameters 6 -> mean: 0.803 Std. Dev.: 0.004
Set of hyperparameters 4 -> mean: 0.796 Std. Dev.: 0.006

----- sensitivity -----
Set of hyperparameters 7 -> mean: 0.833 Std. Dev.: 0.003
Set of hyperparameters 1 -> mean: 0.832 Std. Dev.: 0.003
Set of hyperparameters 3 -> mean: 0.831 Std. Dev.: 0.001
Set of hyperparameters 5 -> mean: 0.819 Std. Dev.: 0.002
Set of hyperparameters 8 -> mean: 0.814 Std. Dev.: 0.004
Set of hyperparameters 2 -> mean: 0.807 Std. Dev.: 0.003
Set of hyperparameters 6 -> mean: 0.803 Std. Dev.: 0.004
Set of hyperparameters 4 -> mean: 0.796 Std. Dev.: 0.006

-----

#### SVM ####

In [12]:
svm_hyperparameters_array = [
    # Uses 'rbf' kernel, medium complexity with C=1.0, default polynomial degree, 'scale' for gamma 
    Dict("kernel" => "rbf", "degree" => 3, "C" => 1.0, "gamma" => "scale"),
    
    # Same 'rbf' kernel, increased penalty (C=10.0) for larger-margin separation, 'auto' gamma adjusts based on features
    Dict("kernel" => "rbf", "degree" => 3, "C" => 10.0, "gamma" => "auto"),
    
    # Same 'rbf' kernel, lower penalty (C=0.1) for a softer-margin, 'scale' gamma is default scaling
    Dict("kernel" => "rbf", "degree" => 3, "C" => 0.1, "gamma" => "scale"),

    # 'linear' kernel, suitable for less complex data
    Dict("kernel" => "linear", "C" => 0.1),
    
    # 'linear' kernel, not affected by 'degree' or 'gamma', with C=1.0 indicating a balance between margin and misclassification
    Dict("kernel" => "linear", "degree" => 3, "C" => 1.0, "gamma" => "auto"),

    # 'linear' kernel with a higher penalty, stricter margin
    Dict("kernel" => "linear", "C" => 10.0),
    
    # 'poly' kernel, polynomial degree is set twice by mistake, should only be 'degree' => 3, 'scale' gamma defaults to feature scale
    Dict("kernel" => "poly", "degree" => 3, "C" => 1.0, "gamma" => "scale"),
    
    # 'poly' kernel, increased polynomial degree (5) for higher model complexity, 'auto' gamma may overfit with high dimension
    Dict("kernel" => "poly", "degree" => 5, "C" => 1.0, "gamma" => "auto")
]


# Call the function to evaluate the model using different sets of hyperparameters and print the ranking of metrics.
evaluateAndPrintMetricsRanking(:SVM, svm_hyperparameters_array, train_inputs, train_targets, kFoldIndices)

Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4


LoadError: KeyError: key "degree" not found

In [None]:
"
using Plots

# Define the data for each model
ann_means = [0.947, 0.947, 0.925, 0.788, 0.948, 0.933]
ann_stds = [0.018, 0.04, 0.07, 0.097, 0.036, 0.039]
svm_means = [0.947, 0.947, 0.927, 0.953, 0.94, 0.4]
svm_stds = [0.03, 0.038, 0.092, 0.051, 0.043, 0.082]
dt_means = [0.927, 0.913, 0.913, 0.913, 0.913, 0.913]
dt_stds = [0.043, 0.045, 0.045, 0.045, 0.045, 0.045]
knn_means = [0.947, 0.947, 0.96, 0.94, 0.913, 0.507]
knn_stds = [0.038, 0.038, 0.015, 0.043, 0.104, 0.068]

# Create subplots for each model
p1 = bar(1:6, ann_means, yerr=ann_stds, title="ANN", legend=false)
p2 = bar(1:6, svm_means, yerr=svm_stds, title="SVM", legend=false)
p3 = bar(1:6, dt_means, yerr=dt_stds, title="Decision Tree", legend=false)
p4 = bar(1:6, knn_means, yerr=knn_stds, title="KNN", legend=false)

# Customize the y-axis and labels
for p in [p1, p2, p3, p4]
    ylabel!(p, "Accuracy")
    xlabel!(p, "Set of Hyperparameters")
end

# Combine the plots into one figure
plot(p1, p2, p3, p4, layout=(2,2), size=(800,600))
"

# ---------------------------------------------- #

In [None]:
"model = DecisionTreeClassifier(max_depth=10)

# Train the model on the entire dataset
fit!(model, train_inputs, train_targets)
predictions = predict(model, test_inputs)
"

In [None]:
### GOTTA CHANGE THIS TO TAKE THE INPUTS AS THEY ARE RIGHT NOW
"""
accuracy_value = accuracy(predictions, test_targets)
println("Accuracy: ", accuracy_value)
"""

"accuracy_value = accuracy(predictions, test_targets)\nprintln(\"Accuracy: \", accuracy_value)\n"