In [1]:
include("utils/preprocessing.jl")
include("utils/model_evaluation.jl")
include("utils/data_loader.jl")
include("utils/visualization.jl")
include("utils/ml1_utils.jl")

evaluateAndPrintMetricsRanking (generic function with 1 method)

## DESCRIPTION OF APPROACH ##

##### LOAD THE DATA #####

In [2]:
# import Pkg;
# Pkg.add("Random")
using Random

# Seed the global RNG so the random row selection done with `randperm` further
# down gives the same result on every run.
Random.seed!(123)

# Build the path with `joinpath` so the separator is right on every platform
# (a hard-coded "\\" only resolves on Windows).
data = DataLoader.load_data(joinpath("dataset", "star_classification.csv"));

##### (OPTIONAL) REDUCE THE DATASET #####

In [3]:
"""
    reduce_data(dataset, percentage_to_keep; target_col=size(dataset, 2))

Return a stratified random subsample of the rows of `dataset`.

Rows are grouped by the value stored in column `target_col` (the last column by
default). From every group of `n` rows, `ceil(n * percentage_to_keep)` rows are drawn
without replacement using the global RNG, so seed it with `Random.seed!` for
repeatable results.

# Arguments
- `dataset::AbstractMatrix`: one sample per row.
- `percentage_to_keep::Real`: fraction of each group to keep, in `[0, 1]`.

# Keywords
- `target_col::Integer`: index of the column holding the group label.

# Returns
A new matrix with the same columns and element type as `dataset`. Groups appear in
order of first occurrence of their label; rows inside a group are in random order.
A header row, if present, forms a group of its own and is therefore kept whenever
`percentage_to_keep > 0`.

# Throws
- `ArgumentError` if `percentage_to_keep` is outside `[0, 1]` or `target_col` is not a
  valid column index of `dataset`.
"""
function reduce_data(
    dataset::AbstractMatrix,
    percentage_to_keep::Real;
    target_col::Integer=size(dataset, 2),
)
    0 <= percentage_to_keep <= 1 || throw(
        ArgumentError("percentage_to_keep must be in [0, 1], got $percentage_to_keep"),
    )
    target_col in axes(dataset, 2) || throw(
        ArgumentError("target_col = $target_col is not a column of the dataset"),
    )

    targets = view(dataset, :, target_col)

    # Collect row indices only and slice the matrix once at the end. This avoids
    # re-copying the accumulated result on every iteration and keeps the element
    # type of `dataset` (no forced promotion to Float64, which would corrupt large
    # integer identifiers).
    kept_rows = Int[]
    for class in unique(targets)
        # `isequal` matches the grouping rule used by `unique`.
        class_rows = findall(t -> isequal(t, class), targets)

        # Round up so that every non-empty group keeps at least one row when
        # percentage_to_keep > 0.
        num_rows_to_keep = ceil(Int, length(class_rows) * percentage_to_keep)

        # Randomly select the subset of rows of this group.
        selection = randperm(length(class_rows))[1:num_rows_to_keep]
        append!(kept_rows, class_rows[selection])
    end

    return dataset[kept_rows, :]
end

# Keep 10% of the rows of every group that reduce_data finds.
# NOTE(review): reduce_data groups rows by the LAST column of `data`. The output
# displayed for this cell shows that the last column header is "fiber_ID", while
# the disabled check below reads the classes from column 14 — confirm that the
# stratification is done on the intended column.
reduced_data = reduce_data(data, 0.1)

# Disabled sanity check: counts how many rows of each class survived the
# reduction. It starts at row 2, presumably to skip the header row that the
# displayed output shows in row 1.
#= using StatsBase

# Extract the class column from the reduced dataset
classes = reduced_data[2:end, 14]

# Count the number of rows in each class
class_counts = countmap(classes)

# Print the counts
for (class, count) in class_counts
    println("Class $class: $count rows")
end =#

10432×18 Matrix{Any}:
                    "obj_ID"     "alpha"  …       "MJD"     "fiber_ID"
 1237668331490115584          217.823        54233       171
 1237678847184536320          329.41         55497       171
 1237662194520752384          167.451        54943       171
 1237679998237671936          348.675        57712       171
 1237666339725443584           15.6284    …  58100       171
 1237654030867366144          175.973        55677       171
 1237663782592315904           12.6412       55201       171
 1237667735043899392          180.531        54208       171
 1237669761184695552          331.729        56105       171
 1237664877804848128          148.53      …  58158       171
 1237651538715345152          202.775        54616       171
 1237665547826234112          238.54         55327       171
                   ⋮                      ⋱              
 1237663479256253184          327.55      …  55478       973
 1237651496835547392          127.034        55182      

##### PREPROCESSING THE DATASET #####

In [4]:
# preprocess_data(dataset, balancing_dataset, features)
# Called here with balancing switched off (`false`) and columns 4 through 8
# selected as the input features.
inputs, targets = Preprocessing.preprocess_data(reduced_data, false, [4,5,6,7,8]);

# The string below is a free-standing note describing preprocess_data; it is
# followed by a blank line, so it is not attached to any definition.
"""    This function does the following:
        - Optional balancing the data using the undersampling method
        - Parse the data: chosing the correct columns for inputs and targets
        - Convert the input into an 2D array of floats
"""

# Print first input and target
# `targets` is indexed with two subscripts, so the first target is shown as a
# one-element row rather than as a scalar.
println("First input: ", inputs[1, :])
println("First target: ", targets[1, :])

Size :(10431, 5)
First input: Float32[19.60844, 17.82744, 16.94823, 16.51685, 16.22224]
First target: Any["GALAXY"]


##### HOLDOUT #####

In [5]:
"""Initial holdOut split of the data"""

# Number of samples = number of rows of the input matrix
N = size(inputs, 1)

# Ask holdOut for the row indices of both partitions; 0.3 is the share passed
# for the test partition
train_indices, test_indices = holdOut(N, 0.3)

"""   This function does the following:
        - Split the data into train and test sets
        - Returns the indices of the train and test sets
"""

# Slice inputs and targets with the indices obtained above
train_inputs, test_inputs = inputs[train_indices, :], inputs[test_indices, :]
train_targets, test_targets = targets[train_indices], targets[test_indices]

# Report the shape of every partition
for (label, subset) in (
    "Train inputs" => train_inputs,
    "Train targets" => train_targets,
    "Test inputs" => test_inputs,
    "Test targets" => test_targets,
)
    println(label, ": ", size(subset))
end

Train inputs: (7302, 5)
Train targets: (7302,)
Test inputs: (3129, 5)
Test targets: (3129,)


##### NORMALIZATION #####

In [6]:
# Rescale both partitions with the "minmax" method.
# NOTE(review): the test set is normalized by its own, separate call, so it is
# presumably scaled with its own min/max rather than with the statistics of the
# training set — confirm what normalize_data computes. If that is the case, the
# training parameters should be reused for the test set so that both partitions
# share one scale and no test information enters the preprocessing.
train_inputs = Preprocessing.normalize_data(train_inputs, "minmax")
test_inputs = Preprocessing.normalize_data(test_inputs, "minmax")

"""    This function does the following:
        - Normalize the input data using the minmax method ("minmax")
            or the zero mean method ("zeromean")
"""

# Show the first normalized sample of each partition
println("First train inputs: ", train_inputs[1, :])
println("First test inputs: ", test_inputs[1, :])

First train inputs: Float32[0.36424, 0.45762557, 0.4814966, 0.4386007, 0.45649394]
First test inputs: Float32[0.43858442, 0.42200762, 0.4093346, 0.4031706, 0.37274083]


## Testing hyperparameters for each model ##

In [7]:
using ScikitLearn

@sk_import neural_network: MLPClassifier;
@sk_import svm: SVC;
@sk_import tree: DecisionTreeClassifier;
@sk_import neighbors: KNeighborsClassifier;

In [8]:
"""
Setting indices for the k-fold cross-validation
    we are about to do with the different models
"""
N=size(train_inputs,1) # number of training samples; re-binds the global `N`
# NOTE(review): the triple-quoted string directly precedes the assignment above
# without a blank line, so Julia presumably parses it as a docstring for `N`
# rather than as a plain note — confirm that this is intended.
k = 5 # number of folds
# The fold assignment is computed once here and the same `kFoldIndices` is passed
# to every model evaluation that follows.
kFoldIndices = crossvalidation(N, k);

##### DECISION TREE #####

In [9]:
# Define an array of hyperparameter dictionaries for the Decision Tree model
# One hyperparameter dictionary per candidate `max_depth`, ordered from shallow
# to very deep. Deeper trees can capture more detail but risk overfitting.
dtree_hyperparameters_array = [
    Dict("max_depth" => depth) for depth in (3, 5, 10, 20, 50, 100)
]

# Evaluate the Decision Tree with every dictionary on the shared folds and print
# the ranking of the metrics.
evaluateAndPrintMetricsRanking(
    :DecisionTree,
    dtree_hyperparameters_array,
    train_inputs,
    train_targets,
    kFoldIndices,
)

Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4
Training with set of hyperparameters 5
Training with set of hyperparameters 6

----- acc -----
Set of hyperparameters 3 -> mean: 0.801 Std. Dev.: 0.013
Set of hyperparameters 4 -> mean: 0.779 Std. Dev.: 0.009
Set of hyperparameters 5 -> mean: 0.778 Std. Dev.: 0.01
Set of hyperparameters 6 -> mean: 0.778 Std. Dev.: 0.01
Set of hyperparameters 2 -> mean: 0.758 Std. Dev.: 0.003
Set of hyperparameters 1 -> mean: 0.72 Std. Dev.: 0.004

----- sensitivity -----
Set of hyperparameters 3 -> mean: 0.801 Std. Dev.: 0.013
Set of hyperparameters 4 -> mean: 0.779 Std. Dev.: 0.009
Set of hyperparameters 5 -> mean: 0.778 Std. Dev.: 0.01
Set of hyperparameters 6 -> mean: 0.778 Std. Dev.: 0.01
Set of hyperparameters 2 -> mean: 0.758 Std. Dev.: 0.003
Set of hyperparameters 1 -> mean: 0.72 Std. Dev.: 0.004

----- specificity -----
Set of hyperparameter

##### kNN #####

In [10]:
# Define an array of hyperparameter dictionaries for the kNN model
# One hyperparameter dictionary per candidate `n_neighbors`, ordered from small
# to large neighborhoods. Large neighborhoods smooth out the predictions.
knn_hyperparameters_array = [
    Dict("n_neighbors" => neighbors) for neighbors in (5, 10, 15, 20, 50, 100)
]

# Evaluate kNN with every dictionary on the shared folds and print the ranking of
# the metrics.
evaluateAndPrintMetricsRanking(
    :kNN,
    knn_hyperparameters_array,
    train_inputs,
    train_targets,
    kFoldIndices,
)

Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4
Training with set of hyperparameters 5
Training with set of hyperparameters 6

----- acc -----
Set of hyperparameters 1 -> mean: 0.828 Std. Dev.: 0.008
Set of hyperparameters 2 -> mean: 0.822 Std. Dev.: 0.01
Set of hyperparameters 3 -> mean: 0.815 Std. Dev.: 0.009
Set of hyperparameters 4 -> mean: 0.809 Std. Dev.: 0.011
Set of hyperparameters 5 -> mean: 0.791 Std. Dev.: 0.007
Set of hyperparameters 6 -> mean: 0.783 Std. Dev.: 0.006

----- sensitivity -----
Set of hyperparameters 1 -> mean: 0.828 Std. Dev.: 0.008
Set of hyperparameters 2 -> mean: 0.822 Std. Dev.: 0.01
Set of hyperparameters 3 -> mean: 0.815 Std. Dev.: 0.009
Set of hyperparameters 4 -> mean: 0.809 Std. Dev.: 0.011
Set of hyperparameters 5 -> mean: 0.791 Std. Dev.: 0.007
Set of hyperparameters 6 -> mean: 0.783 Std. Dev.: 0.006

----- specificity -----
Set of hyperparam

##### SVM #####

In [11]:
# Hyperparameter sets for the SVM model: three RBF, three linear and two
# polynomial configurations.
# NOTE(review): the per-entry comments assume every Dict is forwarded unchanged to
# scikit-learn's SVC — confirm in evaluateAndPrintMetricsRanking. Under that
# assumption "degree" only matters for the 'poly' kernel and "gamma" is not used
# by the 'linear' kernel, so those keys are placeholders in the other entries.
svm_hyperparameters_array = [
    # 'rbf' kernel, moderate regularization (C=1.0), gamma="scale"; "degree" has no effect here
    Dict("kernel" => "rbf", "degree" => 3, "C" => 1.0, "gamma" => "scale"),
    
    # 'rbf' kernel, higher C=10.0 (misclassifications penalized more, i.e. weaker regularization), gamma="auto"
    Dict("kernel" => "rbf", "degree" => 3, "C" => 10.0, "gamma" => "auto"),
    
    # 'rbf' kernel, lower C=0.1 (stronger regularization, softer margin), gamma="scale"
    Dict("kernel" => "rbf", "degree" => 3, "C" => 0.1, "gamma" => "scale"),

    # 'linear' kernel with C=0.1; "degree" and "gamma" are not used by this kernel
    Dict("kernel" => "linear", "degree" => 5,  "C" => 0.1, "gamma" => "auto"),
    
    # 'linear' kernel with C=1.0; differs from the previous entry only through C
    Dict("kernel" => "linear", "degree" => 3, "C" => 1.0, "gamma" => "auto"),

    # 'linear' kernel with C=10.0, the weakest regularization of the three linear entries
    Dict("kernel" => "linear", "degree" => 7, "C" => 10.0, "gamma" => "scale"),
    
    # 'poly' kernel of degree 3, C=1.0, gamma="scale"
    Dict("kernel" => "poly", "degree" => 3, "C" => 1.0, "gamma" => "scale"),
    
    # 'poly' kernel of degree 5 (more flexible decision boundary), C=1.0, gamma="auto"
    Dict("kernel" => "poly", "degree" => 5, "C" => 1.0, "gamma" => "auto")
]


# Evaluate the SVM with every dictionary on the shared folds and print the ranking of the metrics.
evaluateAndPrintMetricsRanking(:SVM, svm_hyperparameters_array, train_inputs, train_targets, kFoldIndices)

Training with set of hyperparameters 1
Training with set of hyperparameters 2
Training with set of hyperparameters 3
Training with set of hyperparameters 4
Training with set of hyperparameters 5
Training with set of hyperparameters 6
Training with set of hyperparameters 7
Training with set of hyperparameters 8

----- acc -----
Set of hyperparameters 1 -> mean: 0.817 Std. Dev.: 0.009
Set of hyperparameters 2 -> mean: 0.785 Std. Dev.: 0.009
Set of hyperparameters 3 -> mean: 0.775 Std. Dev.: 0.007
Set of hyperparameters 6 -> mean: 0.758 Std. Dev.: 0.011
Set of hyperparameters 7 -> mean: 0.735 Std. Dev.: 0.007
Set of hyperparameters 5 -> mean: 0.707 Std. Dev.: 0.01
Set of hyperparameters 4 -> mean: 0.692 Std. Dev.: 0.007
Set of hyperparameters 8 -> mean: 0.592 Std. Dev.: 0.007

----- sensitivity -----
Set of hyperparameters 1 -> mean: 0.817 Std. Dev.: 0.009
Set of hyperparameters 2 -> mean: 0.785 Std. Dev.: 0.009
Set of hyperparameters 3 -> mean: 0.775 Std. Dev.: 0.007
Set of hyperparamete