# Model definitions and testing

# Imports and declarations

This section contains all imports and declarations

In [1]:
# The next packages must be installed to run the solution
import Pkg; 
#Pkg.add("Flux")
#Pkg.add("RDatasets")
#Pkg.add("FeatureSelectors")
#Pkg.add("ScikitLearn")
#Pkg.add("WeightedPCA")
#Pkg.add("BetaML")
# Packages used to store models on disk and load them back
# Pkg.add("JLD")
# Pkg.add("HDF5")
# Pkg.add("PyCallJLD")
# Packages used to count the class distribution
# Pkg.add("DataStructures")
# Pkg.add("MLDataPattern")

In [2]:
# Import libraries
using Flux
using Flux.Losses
using DelimitedFiles
using Statistics
using Random
using ScikitLearn
using RDatasets
using FeatureSelectors
using JLD
using PyCallJLD
using DataStructures
using MLDataPattern

In [3]:
# Import ScikitLearn models
@sk_import svm:SVC
@sk_import tree:DecisionTreeClassifier
@sk_import linear_model:LogisticRegression
@sk_import neighbors: KNeighborsClassifier
@sk_import naive_bayes:GaussianNB 
@sk_import ensemble:VotingClassifier
@sk_import ensemble:StackingClassifier
@sk_import ensemble:BaggingClassifier
@sk_import decomposition:PCA

PyObject <class 'sklearn.decomposition._pca.PCA'>

In [4]:
# Legacy code done in previous practices
include("utils/practices_code.jl")
# Class that handle the model processing
include("utils/model_handler.jl")

LoadError: LoadError: UndefVarError: bool not defined
in expression starting at c:\Master\Machine Learning Final\finalpracticalml\utils\model_handler.jl:2

In [None]:
# Notebook-wide configuration.

# Switches selecting which model experiments are executed.
RUN_ANN_TEST = false
RUN_SVM_TEST = false
RUN_DT_TEST  = true
RUN_KNN_TEST = true

# File path associated with each model; passed to the matching test_*_Model call.
ANN_FILE_PATH = "dataset/models/ann.jld"
SVM_FILE_PATH = "dataset/models/svm.jld"
DT_FILE_PATH  = "dataset/models/dt.jld"
KNN_FILE_PATH = "dataset/models/knn.jld"
# Forwarded to the test_*_Model calls together with the path above.
UPDATE_FILE = false

# Dataset files.
ORIGINAL_DATASET      = "dataset/music_genre.csv"
CLEAN_DATASET         = "dataset/clean_music_genre.csv"
NUMERIC_CLEAN_DATASET = "dataset/numeric_clean_music_genre.csv"
KBEST_DATASET         = "dataset/kest_df.csv"

# Preprocessing switches.
CONVERT_STRING          = true   # convert the output classes to strings
ONE_HOT_ENCODING_OUTPUT = false  # one-hot encode the output classes
NORMALIZE_MIN_MAX       = true   # min-max normalize the inputs after splitting
USE_OVER_SAMPLE         = false  # balance the training set by oversampling
USE_UNDER_SAMPLE        = false  # balance the training set by undersampling
USE_PCA                 = false  # apply PCA to the inputs

# Value handed to holdOut when splitting train/test.
HOLD_OUT = 0.3
# Number of cross-validation folds.
FOLDS = 20

false

# Data preprocessing

This section contains the preprocessing of the data

In [None]:
# Read the comma-separated k-best dataset. Row 1 is skipped below
# (presumably a header row -- confirm against the CSV).
dataset = readdlm(KBEST_DATASET, ',');

println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2, :])

# Print the number of rows and then the number of columns.
foreach(println, size(dataset))

# Inputs are every column except the last; the last column holds the class.
train_x = dataset[2:end, 1:end-1]
train_y = dataset[2:end, end]

# Optionally convert the output classes to strings.
train_y = CONVERT_STRING ? string.(train_y) : train_y

# Optionally one-hot encode the output classes.
train_y = ONE_HOT_ENCODING_OUTPUT ? oneHotEncoding(train_y) : train_y

println("Inputs size: ", size(train_x))
for row in 1:2
    println("Sample of inputs: ", train_x[row, :])
end
println("Outputs size: ", size(train_y))
for row in 1:2
    println("Sample of Outputs: ", train_y[row])
end

Dataset original size: (17924, 13)
Sample of original dataset: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882, 5]
17924
13
Inputs size: (17923, 12)
Sample of inputs: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample of inputs: Any[54.0, 0.382, 0.814, 13.58626223890115, 0.0011, 0.569, 116.454, 251.733, 0, 0, 0.0406, 0.00401]
Outputs size: (17923,)
Sample of Outputs: 5
Sample of Outputs: 10


In [None]:
# Split the patterns into train and test subsets with the hold-out helper.
# `indexs[1]` selects the training rows and `indexs[2]` the test rows.
indexs = holdOut(size(train_x, 1), HOLD_OUT)

train_input = train_x[indexs[1], :]
train_output = vec(train_y[indexs[1], :])

test_input = train_x[indexs[2], :]
test_output = vec(train_y[indexs[2], :])

# Normalize after splitting, so the test data cannot affect the train data;
# the first contact between them should be at prediction time.
# NOTE(review): each subset is normalized on its own here; confirm whether the
# test set should instead reuse the parameters computed on the training set.
if NORMALIZE_MIN_MAX
    train_input = normalizeMinMax!(train_input)
    test_input = normalizeMinMax!(test_input)
end  # was `END`: Julia keywords are case-sensitive, so the block never closed

println("Size original input data: ", size(train_x))
println("Size original output data: ", size(train_y))

println("Size train input data: ", size(train_input))
println("Size train output data: ", size(train_output))

println("Size test input data: ", size(test_input))
println("Size test output data: ", size(test_output))

println("Sample original input data: ", train_x[1, :])
println("Sample train input data: ", train_input[1, :])
println("Sample test input data: ", test_input[1, :])

println("Unique Outputs: ", unique(train_y))

Size original input data: (17923, 12)
Size original output data: (17923,)
Size train input data: (12546, 12)
Size train output data: (12546,)
Size test input data: (5377, 12)
Size test output data: (5377,)
Sample original input data: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample train input data: [0.425531914893617, 0.40587665482725216, 0.8199963459258223, 0.7153069110103181, 0.07931726907630522, 0.6350440642820114, 0.4095655012603085, 0.12278214427098692, 0.0, 0.0, 0.056723716381418085, 4.943685786840785e-5]
Sample test input data: [0.29896907216494845, 0.8254553339115351, 0.5820763888192906, 0.7788889689146599, 0.13353413654618473, 0.8923060992026509, 0.344533493335619, 0.10596919580502202, 0.0, 0.0, 0.1945945945945946, 0.000486846672985156]
Unique Outputs: ["5", "10", "6", "2", "Other", "8", "9", "1"]


In [None]:
# Optionally balance the training set by oversampling.
if USE_OVER_SAMPLE
    # Report the training set as it is before resampling.
    println("Inputs size: ", size(train_input))
    println("Outputs size: ", size(train_output))
    println("Outputs Values: ", unique(train_output))
    println("Before balance:", counter(train_output))

    # The inputs are handed over transposed and transposed back afterwards.
    balanced_x, balanced_y = oversample((train_input', train_output))

    train_input = getobs(balanced_x')
    train_output = getobs(balanced_y)

    println("After balance:", counter(train_output))
end

In [None]:
# Optionally balance the training set by undersampling.
if USE_UNDER_SAMPLE
    # Report the training set as it is before resampling.
    println("Inputs size: ", size(train_input))
    println("Outputs size: ", size(train_output))
    println("Outputs Values: ", unique(train_output))
    println("Before balance:", counter(train_output))

    # The inputs are handed over transposed and transposed back afterwards.
    balanced_x, balanced_y = undersample((train_input', train_output))

    train_input = getobs(balanced_x')
    train_output = getobs(balanced_y)

    println("After balance:", counter(train_output))
end

In [None]:
# Optional dimensionality reduction with Principal Component Analysis.
if USE_PCA
    # Fit on the training inputs only; 0.95 is the fraction of variance to keep.
    pca = PCA(n_components=0.95)
    fit!(pca, train_input)

    # Project both subsets with the transformation learned from the training data.
    pca_train, pca_test = pca.transform(train_input), pca.transform(test_input)

    println("Train Patterns ", size(train_input), " -> ", size(pca_train))
    println("Test Patterns ", size(test_input), " -> ", size(pca_test))

    train_input, test_input = pca_train, pca_test

    # PCA at 95% variance suggests that 5 features carry noise and could be
    # removed from the input data. It is worth applying it and comparing the
    # results, maybe after finding the optimal parameters of the model.
end

In [None]:
# Build the cross-validation fold indices for the training patterns.
# Uses FOLDS from the constants cell; the previous name NUM_FOLDS is not
# defined anywhere in this notebook.
# Note: this reuses (overwrites) the `indexs` variable of the hold-out split.
indexs = crossvalidation(train_output, FOLDS)
kFoldIndices = convert(Vector{Int64}, indexs)

println(size(kFoldIndices))

(12546,)


# Model experimentation

This section contains all experimentation of the models

In [None]:
# Run the ANN experiments only when enabled through RUN_ANN_TEST.
if RUN_ANN_TEST
    test_ANN_Model(
        train_input, train_output,
        test_input, test_output,
        kFoldIndices, UPDATE_FILE, ANN_FILE_PATH,
    )
end

Parameters: Dict{Any, Any}("repetitionsTraining" => 3, "maxEpochs" => 500, "learningRate" => 0.01, "topology" => [8, 8, 8], "validationRatio" => 0, "maxEpochsVal" => 20, "minLoss" => 0.0, "transferFunctions" => [NNlib.logσ, NNlib.logσ, NNlib.logσ]) Accuracy: 0.1470788072299995 Fscore: 0.06397700566782863
Parameters: Dict{Any, Any}("repetitionsTraining" => 3, "maxEpochs" => 500, "learningRate" => 0.01, "topology" => [16, 12, 8], "validationRatio" => 0, "maxEpochsVal" => 20, "minLoss" => 0.0, "transferFunctions" => [NNlib.logσ, NNlib.logσ, NNlib.logσ]) Accuracy: 0.14833549602502627 Fscore: 0.06655158593484004
Parameters: Dict{Any, Any}("repetitionsTraining" => 3, "maxEpochs" => 500, "learningRate" => 0.01, "topology" => [32, 16, 8], "validationRatio" => 0, "maxEpochsVal" => 20, "minLoss" => 0.0, "transferFunctions" => [NNlib.logσ, NNlib.logσ, NNlib.logσ]) Accuracy: 0.14835986657979322 Fscore: 0.06685896083738449
Parameters: Dict{Any, Any}("repetitionsTraining" => 3, "maxEpochs" => 500, "

LoadError: UndefVarError: testOutputs not defined

In [None]:
# Run the SVM experiments only when enabled through RUN_SVM_TEST.
if RUN_SVM_TEST
    test_SVM_Model(
        train_input, train_output,
        test_input, test_output,
        kFoldIndices, UPDATE_FILE, SVM_FILE_PATH,
    )
end

Test results for SVM model: 
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 1, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.4229176534236273 Fscore: 0.3407648984437798
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 2, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.42618836098532886 Fscore: 0.34846228068931423
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 3, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.4273044114639554 Fscore: 0.3513293481100276
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 4, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.42866403626138216 Fscore: 0.3555102890031802
Parameters: Dict{Any, Any}("tol" => 0.001, "kern

In [None]:
# Run the decision tree experiments only when enabled through RUN_DT_TEST.
if RUN_DT_TEST
    test_DT_Model(
        train_input, train_output,
        test_input, test_output,
        kFoldIndices, UPDATE_FILE, DT_FILE_PATH,
    )
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3742877591704647 Fscore: 0.26968481003234757
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.36473533120032464 Fscore: 0.22300396003521353
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.32774655694918753 Fscore: 0.1713062660022087
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.27506838938921974 Fscore: 0.053932067243248574
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3809828701438388 Fscore: 0.293931

In [None]:
# Run the kNN experiments only when enabled through RUN_KNN_TEST.
if RUN_KNN_TEST
    test_KNN_Model(
        train_input, train_output,
        test_input, test_output,
        kFoldIndices, UPDATE_FILE, KNN_FILE_PATH,
    )
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3109185508144868 Fscore: 0.29249636917829636
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.28780936566523896 Fscore: 0.2625364079246324
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.28503276145974055 Fscore: 0.2741920229935021
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.34734598479564216 Fscore: 0.3150625765818593
Parameters: Dict{Any, Any}("n_neighbors" => 7, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3617033874435635 Fscore: 0.3203798143749458
Parameters: Dict{Any, Any}("n_neighbors" => 10, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3781196540675367 Fscore: 0.32801742145043744
Parameters: Dict{Any, Any}("n