# Model definitions and testing

# Imports and declarations

This section contains all imports and declarations

In [1]:
# The next packages must be installed to run the solution
import Pkg; 
#Pkg.add("Flux")
#Pkg.add("RDatasets")
#Pkg.add("FeatureSelectors")
#Pkg.add("ScikitLearn"))
#Pkg.add("WeightedPCA"))
#Pkg.add("BetaML")
# Packages used To store and load models in and from disk
# Pkg.add("JLD")
# Pkg.add("HDF5")
# Pkg.add("PyCallJLD")
# Package use to count distribution
# Pkg.add("DataStructures")
# Pkg.add("MLDataPattern")

In [2]:
# Import libraries
using Flux
using Flux.Losses
using DelimitedFiles
using Statistics
using Random
using ScikitLearn
using RDatasets
using FeatureSelectors
using JLD
using PyCallJLD
using DataStructures
using MLDataPattern

In [3]:
# Import ScikitLearn models
@sk_import svm:SVC
@sk_import tree:DecisionTreeClassifier
@sk_import neighbors: KNeighborsClassifier
@sk_import neural_network : MLPClassifier

@sk_import linear_model:LogisticRegression
@sk_import naive_bayes:GaussianNB 
@sk_import ensemble:VotingClassifier
@sk_import ensemble:StackingClassifier
@sk_import ensemble:BaggingClassifier
@sk_import decomposition:PCA

PyObject <class 'sklearn.decomposition._pca.PCA'>

In [4]:
# Legacy code done in previous practices
include("utils/practices_code.jl")
# Class that handle the model processing
include("utils/model_handler.jl")

loadModel (generic function with 1 method)

In [5]:
# Constants
# Execute model test set
RUN_ANN_TEST = false
RUN_SVM_TEST = false
RUN_DT_TEST = false
RUN_KNN_TEST = false
RUN_MLP_TEST = false
ANN_FILE_PATH = "dataset/models/ann.jld"
ANN_DEFAULT_TRANSFER_FUNCTION=sigmoid
SVM_FILE_PATH = "dataset/models/svm.jld"
DT_FILE_PATH = "dataset/models/dt.jld"
KNN_FILE_PATH = "dataset/models/knn.jld"
MLP_FILE_PATH = "dataset/models/mlp.jld"
UPDATE_FILE = true
ORIGINAL_DATASET = "dataset/music_genre.csv"
CLEAN_DATASET = "dataset/clean_music_genre.csv"
NUMERIC_CLEAN_DATASET = "dataset/numeric_clean_music_genre.csv"
KBEST_DATASET = "dataset/kbest_df.csv"
CONVERT_STRING=true
ONE_HOT_ENCODING_OUTPUT=false
NORMALIZE_MIN_MAX=true
USE_OVER_SAMPLE=false
USE_UNDER_SAMPLE=false
USE_PCA=false
PCA_CONFIG=0.95
HOLD_OUT=0.3
NUM_FOLDS=20

Random.seed!(2)

TaskLocalRNG()

# Data preprocessing

This section contains the preprocessing of the data

In [6]:
# Load the dataset and normalize
dataset = readdlm(KBEST_DATASET,',');

println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2,:])

println(size(dataset,1))
println(size(dataset,2))

# Separate train_x and train_y
train_x = dataset[2:size(dataset,1),1:size(dataset,2)-1]
train_y = dataset[2:size(dataset,1),size(dataset,2)]

# Convert to string the output classes
if CONVERT_STRING
    train_y = string.(train_y)
end

# Convert to one hot encoding the output classes
if ONE_HOT_ENCODING_OUTPUT
    train_y = oneHotEncoding(train_y)
end

println("Inputs size: ", size(train_x))
println("Sample of inputs: ", train_x[1,:])
println("Sample of inputs: ", train_x[2,:])
println("Outputs size: ", size(train_y))
println("Sample of Outputs: ", train_y[1])
println("Sample of Outputs: ", train_y[2])

Dataset original size: (17924, 13)
Sample of original dataset: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882, 5]
17924
13
Inputs size: (17923, 12)
Sample of inputs: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample of inputs: Any[54.0, 0.382, 0.814, 13.58626223890115, 0.0011, 0.569, 116.454, 251.733, 0, 0, 0.0406, 0.00401]
Outputs size: (17923,)
Sample of Outputs: 5
Sample of Outputs: 10


In [7]:
# Using Hold Out function to split dataset into train and test
indexs = holdOut(size(train_x,1),HOLD_OUT)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
if NORMALIZE_MIN_MAX
    train_input = normalizeMinMax!(train_input)
    test_input = normalizeMinMax!(test_input)
end

println("Size original input data: ", size(train_x))
println("Size original output data: ", size(train_y))

println("Size train input data: ", size(train_input))
println("Size train output data: ", size(train_output))

println("Size test input data: ", size(test_input))
println("Size test output data: ", size(test_output))

println("Sample original input data: ", train_x[1,:])
println("Sample train input data: ", train_input[1,:])
println("Sample test input data: ", test_input[1,:])

println("Unique Outputs: ", unique(train_y))

Size original input data: (17923, 12)
Size original output data: (17923,)
Size train input data: (12546, 12)
Size train output data: (12546,)
Size test input data: (5377, 12)
Size test output data: (5377,)
Sample original input data: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample train input data: [0.425531914893617, 0.40587665482725216, 0.8199963459258223, 0.7153069110103181, 0.07931726907630522, 0.6350440642820114, 0.4095655012603085, 0.12278214427098692, 0.0, 0.0, 0.056723716381418085, 4.943685786840785e-5]
Sample test input data: [0.29896907216494845, 0.8254553339115351, 0.5820763888192906, 0.7788889689146599, 0.13353413654618473, 0.8923060992026509, 0.344533493335619, 0.10596919580502202, 0.0, 0.0, 0.1945945945945946, 0.000486846672985156]
Unique Outputs: ["5", "10", "6", "2", "Other", "8", "9", "1"]


In [8]:
if USE_OVER_SAMPLE
    println("Inputs size: ", size(train_input))
    println("Outputs size: ", size(train_output))

    balanced_x, balanced_y = oversample((train_input', train_output))

    println("Outputs Values: ", unique(train_output))
    println("Before balance:", counter(train_output))

    train_input = getobs(balanced_x')
    train_output = getobs(balanced_y)

    println("After balance:", counter(train_output))
end

In [9]:
if USE_UNDER_SAMPLE
    println("Inputs size: ", size(train_input))
    println("Outputs size: ", size(train_output))

    balanced_x, balanced_y = undersample((train_input', train_output))

    println("Outputs Values: ", unique(train_output))
    println("Before balance:", counter(train_output))

    train_input = getobs(balanced_x')
    train_output = getobs(balanced_y)

    println("After balance:", counter(train_output))
end

In [10]:
# Apply Principal Component Analysis
if USE_PCA
    pca = PCA(PCA_CONFIG)
    fit!(pca, train_input)

    pca_train = pca.transform(train_input)
    pca_test = pca.transform(test_input)

    println("Train Patterns ", size(train_input), " -> ", size(pca_train))
    println("Test Patterns ", size(test_input), " -> ", size(pca_test))

    train_input = pca_train
    test_input = pca_test

    # PCA based on 95% variance, suggests that there are 5 features have noise and should be eliminated
    # from the input data, it's worth to try apply it and compare, maybe after finding the optimaal
    # parameters of the model
end

In [11]:
indexs = crossvalidation(train_output, NUM_FOLDS)
kFoldIndices = convert(Vector{Int64}, indexs)

println(size(kFoldIndices))

(12546,)


# Model experimentation

This section contains all experimentation of the models

In [12]:
if RUN_ANN_TEST
    test_ANN_Model(train_input, train_output, test_input, test_output, kFoldIndices, ANN_DEFAULT_TRANSFER_FUNCTION, UPDATE_FILE, ANN_FILE_PATH)
end

In [13]:
if RUN_SVM_TEST
    test_SVM_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, SVM_FILE_PATH)
end

In [14]:
if RUN_DT_TEST
    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, DT_FILE_PATH)
end

In [15]:
if RUN_KNN_TEST
    test_KNN_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, KNN_FILE_PATH)
end

In [16]:
if RUN_MLP_TEST
    test_MLP_Model(train_input, train_output, test_input, test_output, kFoldIndices, UPDATE_FILE, MLP_FILE_PATH)
end

Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (8, 8, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.40579879583337564 Fscore: 0.3105394630210631
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (16, 12, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.4060428872492907 Fscore: 0.32215597156167486
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (32, 16, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.4098548855237817 Fscore: 0.3305187132124118
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (16, 4, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.40063708176710805 Fscore: 0.29135035268543696
Parameters: Dict{Any, Any}("maxEpochs" => 1000, "learningRate" => 0.01, "topology" => (24, 16, 8), "validationRatio" => 0.0, "activation" => "relu") Accuracy: 0.4115379393295383 F

In [17]:
#Agregar plot de los test

Agregar grilla con todas las pruebas