In [1]:
include("utils/data_loader.jl")
include("utils/visualization.jl")
include("utils/preprocessing.jl")

Main.Preprocessing

## APPROACH 1: Without Balancing ##

In [2]:
# import Pkg;
# Pkg.add("Random")
using Random
# Seed the global RNG with a fixed value.
Random.seed!(123)

# Build the path with joinpath so the separator matches the host OS; a hard-coded
# "\\" only resolves as a directory separator on Windows.
data = DataLoader.load_data(joinpath("dataset", "star_classification.csv"));

##### Preprocessing and balancing data #####

In [3]:
# preprocess_data(dataset, balancing_dataset, normalization, features)
# NOTE(review): the signature above lists four parameters, but the call below passes
# three positional arguments — confirm against Preprocessing.preprocess_data which
# parameters `true` and the index list [4,5,6,7,8] are bound to.
inputs, targets = Preprocessing.preprocess_data(data, true, [4,5,6,7,8]);

# Print first input and target
println("First input: ", inputs[1, :])
println("First target: ", targets[1, :])

Size :(56883, 5)
First input: Float32[23.12955, 22.98013, 21.18687, 20.26653, 19.87306]
First target: Bool[1, 0, 0]


In [4]:
# Split to train and test
# train_inputs, train_targets, test_inputs, test_targets = DataLoader.split_data(inputs, targets, 0.8);
# Check size of train and test sets
# println("Train inputs: ", size(train_inputs))
# println("Train targets: ", size(train_targets))
# println("Test inputs: ", size(test_inputs))
# println("Test targets: ", size(test_targets))

In [5]:
# Normalize data
# Runs Preprocessing.normalize_data with the "minmax" option on the whole `inputs`
# matrix; the result is bound to `train_inputs`, which the hyperparameter-search
# cells receive as the input half of their dataset tuple.
# NOTE(review): if normalize_data derives its scaling statistics from every row it
# is given, the rows later held out by cross-validation also influence the scaling —
# confirm whether that is intended.
train_inputs = Preprocessing.normalize_data(inputs , "minmax");
println("Train inputs: ", train_inputs[1, :])
# println("Train targets: ", train_targets[1, :])

Train inputs: Float32[0.9990379, 0.99914044, 0.5812591, 0.47621873, 0.9991107]


In [6]:
# println("Train inputs: ", size(train_targets))
# Request cross-validation fold assignments for `targets` with 10 folds.
# NOTE(review): presumably one fold number per row of `targets` — confirm in
# Preprocessing.crossvalidation.
training_indices = Preprocessing.crossvalidation(targets, 10)
# Show the first 100 fold assignments as a quick sanity check.
println("Crossvalidation indices: ", training_indices[1:100])

Crossvalidation indices: [6, 4, 10, 7, 4, 7, 8, 1, 4, 9, 10, 10, 3, 5, 4, 4, 1, 6, 1, 4, 9, 1, 5, 5, 9, 5, 2, 10, 8, 6, 10, 4, 1, 2, 1, 2, 10, 9, 10, 6, 5, 8, 1, 10, 6, 9, 6, 2, 9, 1, 6, 9, 1, 3, 10, 7, 5, 2, 3, 7, 9, 4, 5, 8, 4, 2, 3, 5, 7, 9, 1, 5, 2, 8, 3, 1, 7, 6, 2, 3, 5, 9, 5, 1, 7, 2, 2, 9, 7, 2, 2, 1, 6, 5, 6, 1, 2, 4, 4, 5]


##### EVALUATING KNN'S DIFFERENT PARAMETERS #####

In [7]:
using ScikitLearn
using Statistics

@sk_import neighbors: KNeighborsClassifier

"""
    train_and_evaluate_knn_hyperparameters(k_values, trainingDataset, kFoldIndices)

Cross-validate a `KNeighborsClassifier` for every candidate in `k_values` and return
the candidate with the highest mean fold accuracy.

# Arguments
- `k_values`: candidate `n_neighbors` values; must not be empty.
- `trainingDataset`: `(inputs, targets)` tuple; rows of both matrices are selected
  with the same fold mask.
- `kFoldIndices`: fold number used to select rows; each fold in
  `1:maximum(kFoldIndices)` is held out once per candidate.

# Returns
The element of `k_values` with the best mean accuracy (the first one on ties).

# Throws
- `ArgumentError` if `k_values` is empty.
"""
function train_and_evaluate_knn_hyperparameters(k_values::AbstractVector{Int}, trainingDataset::Tuple{Matrix{Float32}, BitMatrix}, kFoldIndices::Array{Int64,1})
    # Fail early with a clear message instead of the generic error argmax/findmax
    # raises on an empty collection.
    isempty(k_values) &&
        throw(ArgumentError("k_values must contain at least one candidate"))

    numFolds = maximum(kFoldIndices)
    (inputs, targets) = trainingDataset

    testAccuracies = Float64[]
    sizehint!(testAccuracies, length(k_values))

    for k_val in k_values
        knn_model = KNeighborsClassifier(n_neighbors=k_val)

        fold_accuracies = Float64[]

        for numFold in 1:numFolds
            # Build the hold-out mask once per fold and derive the training mask
            # from it, rather than re-evaluating the comparison for every slice.
            isTestRow = kFoldIndices .== numFold
            isTrainRow = .!isTestRow

            trainingInputs = inputs[isTrainRow, :]
            testingInputs = inputs[isTestRow, :]
            trainingTargets = targets[isTrainRow, :]
            testingTargets = targets[isTestRow, :]

            _, trained_model = Preprocessing.oneVSall(knn_model, trainingInputs, trainingTargets)
            predictions = predict(trained_model, testingInputs)
            # NOTE(review): this compares element-wise and divides by the total number
            # of target entries; if the targets are one-hot rows this is a per-entry
            # score rather than per-sample accuracy — confirm against what `predict`
            # returns for the model produced by oneVSall.
            accuracy = sum(predictions .== testingTargets) / length(testingTargets)
            push!(fold_accuracies, accuracy)
        end

        mean_accuracy = mean(fold_accuracies)
        push!(testAccuracies, mean_accuracy)

        println("KNN with k=$k_val - Mean Accuracy: $mean_accuracy")
    end

    # findmax yields the best score and its position in one pass; the position
    # indexes k_values (it is not itself a k value).
    best_accuracy, best_position = findmax(testAccuracies)
    best_k = k_values[best_position]

    println("Best KNN Model - k=$best_k - Best Mean Accuracy: $best_accuracy")

    return best_k
end


train_and_evaluate_knn_hyperparameters (generic function with 1 method)

In [8]:
# Candidate n_neighbors values for the KNN hyperparameter search.
k_values_to_try = [3, 5, 7, 9, 11, 13]
# The search itself is currently disabled; uncomment the line below to run it.
# best_k = train_and_evaluate_knn_hyperparameters(k_values_to_try, (train_inputs, targets), training_indices)

6-element Vector{Int64}:
  3
  5
  7
  9
 11
 13

##### EVALUATING DECISION TREE'S HYPERPARAMETERS #####

In [9]:
using ScikitLearn
using Statistics

@sk_import tree: DecisionTreeClassifier

"""
    train_and_evaluate_decision_tree_hyperparameters(max_depth_values, trainingDataset, kFoldIndices)

Cross-validate a `DecisionTreeClassifier` for every candidate in `max_depth_values`
and return the candidate with the highest mean fold accuracy.

# Arguments
- `max_depth_values`: candidate `max_depth` values; must not be empty.
- `trainingDataset`: `(inputs, targets)` tuple; rows of both matrices are selected
  with the same fold mask.
- `kFoldIndices`: fold number used to select rows; each fold in
  `1:maximum(kFoldIndices)` is held out once per candidate.

# Returns
The element of `max_depth_values` with the best mean accuracy (the first one on ties).

# Throws
- `ArgumentError` if `max_depth_values` is empty.
"""
function train_and_evaluate_decision_tree_hyperparameters(max_depth_values::AbstractVector{Int}, trainingDataset::Tuple{Matrix{Float32}, BitMatrix}, kFoldIndices::Array{Int64,1})
    # Fail early with a clear message instead of the generic error argmax/findmax
    # raises on an empty collection.
    isempty(max_depth_values) &&
        throw(ArgumentError("max_depth_values must contain at least one candidate"))

    numFolds = maximum(kFoldIndices)
    (inputs, targets) = trainingDataset

    testAccuracies = Float64[]
    sizehint!(testAccuracies, length(max_depth_values))

    for max_depth_val in max_depth_values
        dt_model = DecisionTreeClassifier(max_depth=max_depth_val)

        fold_accuracies = Float64[]

        for numFold in 1:numFolds
            # Build the hold-out mask once per fold and derive the training mask
            # from it, rather than re-evaluating the comparison for every slice.
            isTestRow = kFoldIndices .== numFold
            isTrainRow = .!isTestRow

            trainingInputs = inputs[isTrainRow, :]
            testingInputs = inputs[isTestRow, :]
            trainingTargets = targets[isTrainRow, :]
            testingTargets = targets[isTestRow, :]

            _, trained_model = Preprocessing.oneVSall(dt_model, trainingInputs, trainingTargets)
            predictions = predict(trained_model, testingInputs)
            # NOTE(review): this compares element-wise and divides by the total number
            # of target entries; if the targets are one-hot rows this is a per-entry
            # score rather than per-sample accuracy — confirm against what `predict`
            # returns for the model produced by oneVSall.
            accuracy = sum(predictions .== testingTargets) / length(testingTargets)
            push!(fold_accuracies, accuracy)
        end

        mean_accuracy = mean(fold_accuracies)
        push!(testAccuracies, mean_accuracy)

        println("Decision Tree with max_depth=$max_depth_val - Mean Accuracy: $mean_accuracy")
    end

    # findmax yields the best score and its position in one pass; the position
    # indexes max_depth_values (it is not itself a depth).
    best_accuracy, best_position = findmax(testAccuracies)
    best_depth = max_depth_values[best_position]

    println("Best Decision Tree Model - max_depth=$best_depth - Best Mean Accuracy: $best_accuracy")

    return best_depth
end


train_and_evaluate_decision_tree_hyperparameters (generic function with 1 method)

In [10]:
# Candidate max_depth values for the decision-tree hyperparameter search.
max_depth_values_to_try = [3, 5, 7, 9, 11, 13]
# The search itself is currently disabled; uncomment the line below to run it.
# best_max_depth = train_and_evaluate_decision_tree_hyperparameters(max_depth_values_to_try, (train_inputs, targets), training_indices)

6-element Vector{Int64}:
  3
  5
  7
  9
 11
 13

##### EVALUATING SVM'S HYPERPARAMETERS #####

In [12]:
using ScikitLearn
using Statistics
using PyCall

@sk_import svm: SVC

"""
    train_and_evaluate_svm_hyperparameters(kernels, c_values, trainingDataset, kFoldIndices)

Cross-validate an `SVC` for every `(kernel, C)` combination and return the
combination with the highest mean fold accuracy.

# Arguments
- `kernels`: candidate kernel names; must not be empty.
- `c_values`: candidate `C` values; must not be empty.
- `trainingDataset`: `(inputs, targets)` tuple; rows of both matrices are selected
  with the same fold mask.
- `kFoldIndices`: fold number used to select rows; each fold in
  `1:maximum(kFoldIndices)` is held out once per combination.

# Returns
`(best_kernel, best_c)`. Combinations are evaluated kernel-major (all `c_values`
for the first kernel, then the next kernel), and the first best one wins ties.

# Throws
- `ArgumentError` if `kernels` or `c_values` is empty.

Per-fold tracing is emitted through `@debug`, so it stays silent unless debug
logging is enabled.
"""
function train_and_evaluate_svm_hyperparameters(kernels::AbstractVector{String}, c_values::AbstractVector{Float64}, trainingDataset::Tuple{Matrix{Float32}, BitMatrix}, kFoldIndices::Array{Int64,1})
    # Fail early with a clear message instead of the generic error argmax/findmax
    # raises on an empty collection.
    isempty(kernels) &&
        throw(ArgumentError("kernels must contain at least one candidate"))
    isempty(c_values) &&
        throw(ArgumentError("c_values must contain at least one candidate"))

    numFolds = maximum(kFoldIndices)
    (inputs, targets) = trainingDataset

    # `configurations[i]` is the (kernel, C) pair whose score is `testAccuracies[i]`;
    # keeping the pairs avoids reconstructing them from a flat index afterwards.
    configurations = Tuple{String, Float64}[]
    testAccuracies = Float64[]
    println("Kernels: $kernels")
    println("C values: $c_values")

    for kernel_val in kernels
        for c_val in c_values
            println("Training SVM with kernel=$kernel_val, C=$c_val")
            svc = SVC(C=c_val, kernel=kernel_val)

            fold_accuracies = Float64[]
            for numFold in 1:numFolds
                @debug "Starting fold" kernel_val c_val numFold

                # Build the hold-out mask once per fold and derive the training
                # mask from it.
                isTestRow = kFoldIndices .== numFold
                isTrainRow = .!isTestRow

                trainingInputs = inputs[isTrainRow, :]
                testingInputs = inputs[isTestRow, :]
                trainingTargets = targets[isTrainRow, :]
                testingTargets = targets[isTestRow, :]

                _, trained_model = Preprocessing.oneVSall(svc, trainingInputs, trainingTargets)
                @debug "Fold trained" kernel_val c_val numFold
                predictions = predict(trained_model, testingInputs)
                # NOTE(review): this compares element-wise and divides by the total
                # number of target entries; if the targets are one-hot rows this is a
                # per-entry score rather than per-sample accuracy — confirm against
                # what `predict` returns for the model produced by oneVSall.
                accuracy = sum(predictions .== testingTargets) / length(testingTargets)
                push!(fold_accuracies, accuracy)
            end

            mean_accuracy = mean(fold_accuracies)
            push!(configurations, (kernel_val, c_val))
            push!(testAccuracies, mean_accuracy)

            println("SVM with kernel=$kernel_val, C=$c_val - Mean Accuracy: $mean_accuracy")
        end
    end

    best_accuracy, best_position = findmax(testAccuracies)
    best_kernel, best_c = configurations[best_position]

    println("Best SVM Model - Kernel: $best_kernel, C: $best_c - Best Mean Accuracy: $best_accuracy")

    return best_kernel, best_c
end




train_and_evaluate_svm_hyperparameters (generic function with 1 method)

In [13]:
# Kernel names and C values to evaluate; the search cross-validates every
# (kernel, C) combination of the two lists.
kernels_to_try = ["rbf", "linear", "poly"]
c_values_to_try = [1.0, 20.0, 10.0, 5.0, 2.0, 50.0, 0.5, 0.1]

# Run the search on the normalized inputs with the fold assignments computed above;
# the function returns the best-scoring kernel and C value.
best_kernel, best_c = train_and_evaluate_svm_hyperparameters(kernels_to_try, c_values_to_try, (train_inputs, targets), training_indices)


Kernels: ["rbf", "linear", "poly"]
C values: [1.0, 20.0, 10.0, 5.0, 2.0, 50.0, 0.5, 0.1]
First check.
Second check.
Training SVM with kernel=rbf, C=1.0
Third check.

In [None]:
#= using ScikitLearn

@sk_import svm: SVC
@sk_import tree: DecisionTreeClassifier
@sk_import neighbors: KNeighborsClassifier

function train_model(estimators::AbstractArray{Symbol,1},
                    modelsHyperParameters::AbstractArray{Dict{String,Any},1},
                    trainingDataset::Tuple{Matrix{Float32}, BitMatrix},
                    kFoldIndices::Array{Int64,1})
    
    numFolds = maximum(kFoldIndices)
    (inputs, targets) = trainingDataset

    testAccuracies = Array{Float64}(undef, numFolds);

    for numFold in 1:numFolds
        trainingInputs = inputs[kFoldIndices .!= numFold, :];
        testingInputs = inputs[kFoldIndices .== numFold, :];
        trainingTargets = targets[kFoldIndices .!= numFold, :];
        testingTargets = targets[kFoldIndices .== numFold, :];
        
        #Define the models to train
        models = Dict("SVM" => SVC(probability=modelsHyperParameters[1]["probability"], kernel=modelsHyperParameters[1]["kernel"], C=modelsHyperParameters[1]["C"]), 
                "DT" => DecisionTreeClassifier(max_depth=modelsHyperParameters[2]["max_depth"]),
                "KNN" => KNeighborsClassifier(n_neighbors=modelsHyperParameters[3]["n_neighbors"][1]))

        # base_models = [name for name in keys(models)]
        # base_models = values(models)

        # test each model 
        model = values(models["KNN"])

        final_outputs, trained_model = Preprocessing.oneVSall(model, trainingInputs, trainingTargets)
        predictions = predict(trained_model, testingInputs)
        # println("Size of final_outputs: ", size(final_outputs))
        # println("Size of trainingTargets: ", size(trainingTargets))
        (Sensitivity, Specificity, PPV, NPV, F1, Accuracy, WeightedSensitivity, WeightedSpecificity, WeightedPPV, WeightedNPV, WeightedF1, WeightedAccuracy) = Preprocessing.confusionMatrix(predictions, testingTargets)
        println("NumFold: ", numFold)
        println("MacroAccuracy: ", WeightedAccuracy)
    end
end =#

In [None]:
#= modelsHyperParameters = Array{Dict{String, Any}}(undef, 3)
modelsHyperParameters[1] = Dict("probability" => true, "kernel" => "linear", "C" => 0.1)
modelsHyperParameters[2] = Dict("max_depth" => 4)
modelsHyperParameters[3] = Dict("n_neighbors" => [1, 3, 5, 7, 9, 11])

ensemble_accuracy = train_model([:SVM, :DT, :KNN], modelsHyperParameters, (inputs, targets), training_indices)=#