# Model definitions and testing

# Imports and declarations

This section contains all imports and declarations

In [1]:
# The next packages must be installed to run the solution
import Pkg; 
#Pkg.add("Flux")
#Pkg.add("RDatasets")
#Pkg.add("FeatureSelectors")
#Pkg.add("ScikitLearn"))
#Pkg.add("WeightedPCA"))
#Pkg.add("BetaML")
# Packages used To store and load models in and from disk
# Pkg.add("JLD")
# Pkg.add("HDF5")
# Pkg.add("PyCallJLD")
# Package use to count distribution
# Pkg.add("DataStructures")
# Pkg.add("MLDataPattern")

In [2]:
# Import libraries
using Flux
using Flux.Losses
using DelimitedFiles
using Statistics
using Random
using ScikitLearn
using RDatasets
using FeatureSelectors
using JLD
using PyCallJLD
using DataStructures
using MLDataPattern

In [3]:
# Import ScikitLearn models
@sk_import svm:SVC
@sk_import tree:DecisionTreeClassifier
@sk_import neighbors:KNeighborsClassifier
@sk_import neural_network:MLPClassifier
@sk_import naive_bayes:GaussianNB 
@sk_import linear_model:LogisticRegression
@sk_import neighbors:NearestCentroid
@sk_import neighbors:RadiusNeighborsClassifier
@sk_import linear_model:RidgeClassifier

@sk_import ensemble:VotingClassifier
@sk_import ensemble:StackingClassifier
@sk_import ensemble:BaggingClassifier

@sk_import decomposition:PCA

PyObject <class 'sklearn.decomposition._pca.PCA'>

In [4]:
# Legacy code done in previous practices
include("utils/practices_code.jl")
# Class that handle the model processing
include("utils/model_handler.jl")

loadModel (generic function with 1 method)

In [5]:
# Constants
# Execute model test set
RUN_ANN_TEST = false
RUN_SVM_TEST = true
RUN_DT_TEST = true
RUN_KNN_TEST = true
RUN_MLP_TEST = true
RUN_GB_TEST = true
RUN_LR_TEST = true
RUN_NC_TEST = true
RUN_RN_TEST = true
RUN_RC_TEST = true
# Path of the models
ANN_FILE_PATH = "dataset/models/ann.jld"
SVM_FILE_PATH = "dataset/models/svm.jld"
DT_FILE_PATH = "dataset/models/dt.jld"
KNN_FILE_PATH = "dataset/models/knn.jld"
MLP_FILE_PATH = "dataset/models/mlp.jld"
GB_FILE_PATH = "dataset/models/gb.jld"
LR_FILE_PATH = "dataset/models/lr.jld"
NC_FILE_PATH = "dataset/models/nc.jld"
RN_FILE_PATH = "dataset/models/rn.jld"
RC_FILE_PATH = "dataset/models/rc.jld"
# Default transfer function for the ANN
ANN_DEFAULT_TRANSFER_FUNCTION=sigmoid
# Overwrite model in file
UPDATE_FILE = true
# Possible datasets to test
ORIGINAL_DATASET = "dataset/music_genre.csv"
CLEAN_DATASET = "dataset/clean_music_genre.csv"
NUMERIC_CLEAN_DATASET = "dataset/numeric_clean_music_genre.csv"
KBEST_DATASET = "dataset/kbest_df.csv"
# Which type of output to test
REGULAR_OUTPUT=true
ONE_HOT_ENCODING_OUTPUT=false
# Normalize values with min and max
NORMALIZE_MIN_MAX=true
# Use different types of samples
USE_OVER_SAMPLE=false
USE_UNDER_SAMPLE=false
USE_PCA=false
PCA_CONFIG=0.95
# Configuration to split the data for testing
HOLD_OUT=0.3
NUM_FOLDS=20

# Seed to make the experiment repeteables
Random.seed!(2)

TaskLocalRNG()

# Data preprocessing

This section contains the preprocessing of the data

In [6]:
# Load the dataset from disk (already pre processed)
dataset = readdlm(KBEST_DATASET,',');

# Show information of the dataset
println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2,:])
println(size(dataset,1))
println(size(dataset,2))

# Separate the features and the output of the dataset. Remove header.
train_x = dataset[2:size(dataset,1),1:size(dataset,2)-1]
train_y = dataset[2:size(dataset,1),size(dataset,2)]

# Convert to regular values the output classes
if REGULAR_OUTPUT
    train_y = string.(train_y)
end

# Convert to one hot encoding the output classes
if ONE_HOT_ENCODING_OUTPUT
    train_y = oneHotEncoding(train_y)
end


# Show information of the transformed dataset
println("Inputs size: ", size(train_x))
println("Sample of inputs: ", train_x[1,:])
println("Outputs size: ", size(train_y))
println("Unique Outputs: ", unique(train_y))

Dataset original size: (17924, 13)
Sample of original dataset: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882, 5]
17924
13
Inputs size: (17923, 12)
Sample of inputs: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample of inputs: Any[54.0, 0.382, 0.814, 13.58626223890115, 0.0011, 0.569, 116.454, 251.733, 0, 0, 0.0406, 0.00401]
Outputs size: (17923,)
Sample of Outputs: 5
Sample of Outputs: 10


In [7]:
# Using Hold Out function to split dataset into train and test
indexs = holdOut(size(train_x,1),HOLD_OUT)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
if NORMALIZE_MIN_MAX
    train_input = normalizeMinMax!(train_input)
    test_input = normalizeMinMax!(test_input)
end

# Show information about the splitted data
println("Size original input data: ", size(train_x))
println("Size original output data: ", size(train_y))

println("Size train input data: ", size(train_input))
println("Size train output data: ", size(train_output))

println("Size test input data: ", size(test_input))
println("Size test output data: ", size(test_output))

println("Sample original input data: ", train_x[1,:])
println("Sample train input data: ", train_input[1,:])
println("Sample test input data: ", test_input[1,:])

Size original input data: (17923, 12)
Size original output data: (17923,)
Size train input data: (12546, 12)
Size train output data: (12546,)
Size test input data: (5377, 12)
Size test output data: (5377,)
Sample original input data: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample train input data: [0.425531914893617, 0.40587665482725216, 0.8199963459258223, 0.7153069110103181, 0.07931726907630522, 0.6350440642820114, 0.4095655012603085, 0.12278214427098692, 0.0, 0.0, 0.056723716381418085, 4.943685786840785e-5]
Sample test input data: [0.29896907216494845, 0.8254553339115351, 0.5820763888192906, 0.7788889689146599, 0.13353413654618473, 0.8923060992026509, 0.344533493335619, 0.10596919580502202, 0.0, 0.0, 0.1945945945945946, 0.000486846672985156]
Unique Outputs: ["5", "10", "6", "2", "Other", "8", "9", "1"]


In [8]:
# If the over sample option is active
if USE_OVER_SAMPLE
    # Show inputs before aplying over sample
    println("Inputs size: ", size(train_input))
    println("Outputs size: ", size(train_output))

    # Balance the dataset
    balanced_x, balanced_y = oversample((train_input', train_output))

    # Show previous values
    println("Outputs Values: ", unique(train_output))
    println("Before balance:", counter(train_output))

    # Get and show over sampled dataset
    train_input = getobs(balanced_x')
    train_output = getobs(balanced_y)

    println("After balance:", counter(train_output))
end

In [9]:
# If the under sample option is active
if USE_UNDER_SAMPLE
    # Show inputs before aplying under sample
    println("Inputs size: ", size(train_input))
    println("Outputs size: ", size(train_output))

    # Balance the dataset
    balanced_x, balanced_y = undersample((train_input', train_output))

    # Show previous values
    println("Outputs Values: ", unique(train_output))
    println("Before balance:", counter(train_output))

    # Get and show under sampled dataset
    train_input = getobs(balanced_x')
    train_output = getobs(balanced_y)

    println("After balance:", counter(train_output))
end

In [10]:
# Apply Principal Component Analysis
if USE_PCA
    # PCA based on 95% variance, suggests that there are 5 features have noise and should be eliminated
    # Load the PCA configuration
    pca = PCA(PCA_CONFIG)

    # Fit the input dataset with the configuration
    fit!(pca, train_input)

    # Transform the dataset with the new features
    pca_train = pca.transform(train_input)
    pca_test = pca.transform(test_input)

    # Show the difference after applying PCA
    println("Train Patterns ", size(train_input), " -> ", size(pca_train))
    println("Test Patterns ", size(test_input), " -> ", size(pca_test))

    # Assing the PCA dataset to the testing
    train_input = pca_train
    test_input = pca_test
end

In [11]:
# Get the crossvalidation indexs for testing
indexs = crossvalidation(train_output, NUM_FOLDS)
kFoldIndices = convert(Vector{Int64}, indexs)

# Show the crossvalidation size
println(size(kFoldIndices))

(12546,)


# Model experimentation

This section contains all experimentation of the models

In [12]:
# Run the test case for Artificial Neural Network model
if RUN_ANN_TEST
    test_ANN_Model(train_input, train_output, test_input, test_output, kFoldIndices, ANN_DEFAULT_TRANSFER_FUNCTION, UPDATE_FILE, ANN_FILE_PATH)
end

In [13]:
# Run the test case for Support Vector Machine model
if RUN_SVM_TEST
    test_SVM_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, SVM_FILE_PATH)
end

Test results for SVM model: 
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 1, "kernel" => "rbf", "shrinking" => true, "probability" => true, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.4229176534236273 Fscore: 0.3407648984437798
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 2, "kernel" => "rbf", "shrinking" => true, "probability" => true, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.42618836098532886 Fscore: 0.34846228068931423
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 3, "kernel" => "rbf", "shrinking" => true, "probability" => true, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.4273044114639554 Fscore: 0.3513293481100276
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 4, "kernel" => "rbf", "shrinking" => true, "probability" => true, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.42866403626138216 Fscore: 0.3555102890031802
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGa

In [14]:
# Run the test case for Decision Tree model
if RUN_DT_TEST
    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3742877591704647 Fscore: 0.26968481003234757
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.36473533120032464 Fscore: 0.22300396003521353
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.32774655694918753 Fscore: 0.1713062660022087
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.27506838938921974 Fscore: 0.053932067243248574
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3809828701438388 Fscore: 0.293931

In [15]:
# Run the test case for K-Nearest Neighbor model
if RUN_KNN_TEST
    test_KNN_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3109185508144868 Fscore: 0.29249636917829636
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.28780936566523896 Fscore: 0.2625364079246324
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.28503276145974055 Fscore: 0.2741920229935021
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.34734598479564216 Fscore: 0.3150625765818593
Parameters: Dict{Any, Any}("n_neighbors" => 7, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3617033874435635 Fscore: 0.3203798143749458
Parameters: Dict{Any, Any}("n_neighbors" => 10, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3781196540675367 Fscore: 0.32801742145043744
Parameters: Dict{Any, Any}("n

In [16]:
# Run the test case for Multi-layer Perceptron model
if RUN_MLP_TEST
    test_MLP_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, MLP_FILE_PATH)
end

Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (8, 8, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.3996553099754661 Fscore: 0.3018327712413336
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (16, 12, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.41128398284638645 Fscore: 0.32925646285095966
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (32, 16, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.4135036144186244 Fscore: 0.33032371065991273
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (16, 4, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.40126693414786396 Fscore: 0.29658121230729695
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (24, 16, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.4124082828345168 

In [17]:
# Run the test case for Gaussian Naive Bayes model
if RUN_GB_TEST
    test_GB_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, GB_FILE_PATH)
end

Test results for GB model: 
Parameters: Dict{Any, Any}() Accuracy: 0.35972001138880694 Fscore: 0.2963248151549114
//////////////////////////////////////////
Best parameters: Dict{Any, Any}() Best accuracy: 0.35972001138880694
Test: Accuracy: 0.36470150641621724 Sensitivity: 0.3389075470404528 Specificity rate: 0.9049433103025836 FScore: 0.29540609983893595
Tot: 5377 Ok: 1961 Acc: 0.36470150641621724


In [18]:
# Run the test case for Logistic Regression model
if RUN_LR_TEST
    test_LR_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, LR_FILE_PATH)
end

Test results for LR model: 
Parameters: Dict{Any, Any}("max_iter" => 1000, "multi_class" => "multinomial") Accuracy: 0.4066647970835507 Fscore: 0.3261618555880831
Parameters: Dict{Any, Any}("max_iter" => 2000, "multi_class" => "multinomial") Accuracy: 0.4066647970835507 Fscore: 0.3261618555880831
Parameters: Dict{Any, Any}("max_iter" => 800, "multi_class" => "multinomial") Accuracy: 0.4066647970835507 Fscore: 0.3261618555880831
Parameters: Dict{Any, Any}("max_iter" => 500, "multi_class" => "multinomial") Accuracy: 0.4066647970835507 Fscore: 0.3261618555880831
//////////////////////////////////////////
Best parameters: Dict{Any, Any}("max_iter" => 1000, "multi_class" => "multinomial") Best accuracy: 0.4066647970835507
Test: Accuracy: 0.40654640133903663 Sensitivity: 0.34275728535898115 Specificity rate: 0.9071324615350144 FScore: 0.32473739612835734
Tot: 5377 Ok: 2186 Acc: 0.40654640133903663


In [19]:
# Run the test case for Nearest centroid model
if RUN_NC_TEST
    test_NC_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, NC_FILE_PATH)
end

Test results for NC model: 
Parameters: Dict{Any, Any}() Accuracy: 0.32900885861454465 Fscore: 0.28638589655828994
//////////////////////////////////////////
Best parameters: Dict{Any, Any}() Best accuracy: 0.32900885861454465
Test: Accuracy: 0.33327134089641064 Sensitivity: 0.34278435406991037 Specificity rate: 0.903170348041426 FScore: 0.28781542467962634
Tot: 5377 Ok: 1792 Acc: 0.33327134089641064


In [20]:
# Run the test case for Radius Neighbors model
if RUN_RN_TEST
    test_RN_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, RN_FILE_PATH)
end

Test results for RN model: 
Parameters: Dict{Any, Any}() Accuracy: 0.3208199101305804 Fscore: 0.12774313349970054
//////////////////////////////////////////
Best parameters: Dict{Any, Any}() Best accuracy: 0.3208199101305804
Test: Accuracy: 0.31653338292728284 Sensitivity: 0.18408523340849461 Specificity rate: 0.8847537718666479 FScore: 0.12648380808891338
Tot: 5377 Ok: 1702 Acc: 0.3165333829272829


In [21]:
# Run the test case for Ridge Regression model
if RUN_RC_TEST
    test_RC_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, RC_FILE_PATH)
end

Test results for RC model: 
Parameters: Dict{Any, Any}() Accuracy: 0.38952966068750655 Fscore: 0.26842682400929085
//////////////////////////////////////////
Best parameters: Dict{Any, Any}() Best accuracy: 0.38952966068750655
Test: Accuracy: 0.39185419378835784 Sensitivity: 0.30658590597688556 Specificity rate: 0.9035209226845062 FScore: 0.27596552328200097
Tot: 5377 Ok: 2107 Acc: 0.39185419378835784
