# Machine Learning - Final assignment

**Students:**
<hr>
Mutaz Abueisheh</br>
Marcelo Jose Ferrer</br>
Maximiliano Hormazábal Lagos</br>
Mohamed Aymen Merchaoui</br>

## Classification problem

Classify the dataset between this 10 classes.

0 = Acoustic/Folk</br>
1 = Alternative music</br>
2 = Blues</br>
3 = Bollywood</br>
4 = Country</br>
5 = Hip Hop</br>
6 = Indie</br>
7 = Instrumental</br>
8 = Metal</br>
9 = Pop</br>
10 = Rock</br>

https://www.kaggle.com/datasets/purumalgi/music-genre-classification

# Imports and declarations

This section contains all imports and declarations

In [1]:
# The next packages must be installed to run the solution
import Pkg; 
#Pkg.add("Flux")
#Pkg.add("RDatasets")
#Pkg.add("FeatureSelectors")
#Pkg.add("ScikitLearn"))
#Pkg.add("WeightedPCA"))
#Pkg.add("BetaML")
# Packages used To store and load models in and from disk
# Pkg.add("JLD")
# Pkg.add("HDF5")
# Pkg.add("PyCallJLD")

In [2]:
# Import libraries
using Flux
using Flux.Losses
using DelimitedFiles
using Statistics
using Random
using ScikitLearn
using RDatasets
using FeatureSelectors
using JLD
using PyCallJLD

In [3]:
# Import ScikitLearn models
@sk_import svm:SVC
@sk_import tree:DecisionTreeClassifier
@sk_import linear_model:LogisticRegression
@sk_import neighbors: KNeighborsClassifier
@sk_import naive_bayes:GaussianNB 
@sk_import ensemble:VotingClassifier
@sk_import ensemble:StackingClassifier
@sk_import ensemble:BaggingClassifier
@sk_import decomposition:PCA

PyObject <class 'sklearn.decomposition._pca.PCA'>

In [4]:
# Constants
# Execute model test set
RUN_ANN_TEST = false
RUN_SVM_TEST = true
RUN_DT_TEST = true
RUN_KNN_TEST = true
ANN_FILE_PATH = "dataset/models/ann.jld"
SVM_FILE_PATH = "dataset/models/svm.jld"
DT_FILE_PATH = "dataset/models/dt.jld"
KNN_FILE_PATH = "dataset/models/knn.jld"

"dataset/models/knn.jld"

In [5]:
# Include the code done in previous practices
include("utils/practices_code.jl")
# Class that handle the data processing
include("utils/data_handler.jl")
# Class that handle the model processing
include("utils/model_handler.jl")

loadModel (generic function with 1 method)

# Data preprocessing

This section contains the preprocessing of the data

In [6]:
# Load the dataset and normalize
dataset = readdlm("dataset/clean_music_genre.csv",',');

println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2,1:17])


Dataset original size: (17972, 17)
Sample of original dataset: Any["Bruno Mars", "That's What I Like (feat. Gucci Mane)", 52.2, 0.854, 0.564, 1.0, -8.630403119526399, 1, 0.03404, 0.0171, 0.00965882, 0.0849, 0.899, 134.071, 234.596, 4, 5]


In [37]:
# Load the dataset and normalize
dataset = readdlm("dataset/clean_music_genre.csv",',');

println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2,1:17])

# Remove line of headers, artist and song name. Separate train_x and train_y
train_x = dataset[2:size(dataset,1),3:16]
train_y = dataset[2:size(dataset,1),17]

# Transform columns to positive values
train_x = abs.(train_x)

# normalized_inputs = normalizeMinMax!(train_x) --> must be after splitting
# binary_outputs = oneHotEncoding(train_y) --> Done only for Ann on training

println("Inputs size: ", size(train_x))
println("Sample of inputs: ", train_x[1,1:14])
println("Outputs size: ", size(train_y))
println("Sample of Outputs: ", train_y[1])

Dataset original size: (17972, 17)
Sample of original dataset: Any["Bruno Mars", "That's What I Like (feat. Gucci Mane)", 52.2, 0.854, 0.564, 1.0, -8.630403119526399, 1, 0.03404, 0.0171, 0.00965882, 0.0849, 0.899, 134.071, 234.596, 4, 5]
Inputs size: (17971, 14)
Sample of inputs: Real[52.2, 0.854, 0.564, 1.0, 8.630403119526399, 1, 0.03404, 0.0171, 0.00965882, 0.0849, 0.899, 134.071, 234.596, 4]
Outputs size: (17971,)
Sample of Outputs: 5


In [39]:
# Using Hold Out function to split dataset into train and test
indexs = holdOut(size(train_x,1),0.2)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
train_input = normalizeMinMax!(train_input)
test_input = normalizeMinMax!(test_input)

println("Size original input data: ", size(train_x))
println("Size original output data: ", size(train_y))

println("Size train input data: ", size(train_input))
println("Size train output data: ", size(train_output))

println("Size test input data: ", size(test_input))
println("Size test output data: ", size(test_output))

println("Sample original input data: ", train_x[1,1:14])
println("Sample train input data: ", train_input[1,1:14])
println("Sample test input data: ", test_input[1,1:14])

Size original input data: (17971, 14)
Size original output data: (17971,)
Size train input data: (14377, 14)
Size train output data: (14377,)
Size test input data: (3594, 14)
Size test output data: (3594,)
Sample original input data: Real[52.2, 0.854, 0.564, 1.0, 8.630403119526399, 1, 0.03404, 0.0171, 0.00965882, 0.0849, 0.899, 134.071, 234.596, 4]
Sample train input data: [0.3645833333333333, 0.935442220787605, 0.6266190394059059, 0.3, 0.26251335945849663, 1.0, 0.07726161369193155, 0.04849397590361446, 0.00014017539446232097, 0.1218637992831541, 0.8687609796424511, 0.4855961240479257, 0.10945638869159291, 0.75]
Sample test input data: [0.4329896907216495, 0.2831819189009528, 0.4550736251627768, 0.4, 0.38778462112728623, 0.0, 0.05503685503685504, 0.6395582329317269, 1.301628679155151e-6, 0.14336868199972455, 0.330893883951908, 0.4933398946461923, 0.10401569074701209, 0.75]


# Model experimentation

This section contains all experimentation of the models

In [41]:
indexs = crossvalidation(train_output, 10)
kFoldIndices = convert(Vector{Int64}, indexs)

println(size(kFoldIndices))

(14377,)


### making 3 other datasets based on number of different selection of features

In [44]:

train_input1 = train_input[ :,[8,3,5,1,2,11,7]]
test_input1 = test_input[ :,[8,3,5,1,2,11,7]];

train_input2 = train_input[ :,[8,3,5,1,2,11,7,6,14,10,12]]
test_input2 = test_input[ :,[8,3,5,1,2,11,7,6,14,10,12]];

train_input3 = train_input[ :,[8,3,5,1]]
test_input3 = test_input[ :,[8,3,5,1]];

### first experementing on original data (all genres)

In [42]:
if RUN_DT_TEST
    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.371156629959888 Fscore: 0.22451584979832942
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.36406075595311593 Fscore: 0.20454830380099426
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3294842893986985 Fscore: 0.11805347106328021
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.2729366224767452 Fscore: 0.03898452223194872
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.37260843647408365 Fscore: 0.23146665

In [51]:
if RUN_DT_TEST
    test_DT_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.36614689993226246 Fscore: 0.21150160215879313
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.36406075595311593 Fscore: 0.20454830380099426
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3294842893986985 Fscore: 0.11805347106328021
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.2729366224767452 Fscore: 0.03898452223194872
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3655124553492225 Fscore: 0.2208521

In [50]:
if RUN_DT_TEST
    test_DT_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.37066901824603227 Fscore: 0.2165027991840786
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.36406075595311593 Fscore: 0.20454830380099426
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3294842893986985 Fscore: 0.11805347106328021
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.2729366224767452 Fscore: 0.03898452223194872
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3707273362135662 Fscore: 0.22446848

In [47]:
if RUN_DT_TEST
    test_DT_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.32342775208396224 Fscore: 0.18507846008774814
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.31682345989049343 Fscore: 0.16400192374975092
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.3106396937987316 Fscore: 0.1328364165577954
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.2729366224767452 Fscore: 0.03898452223194872
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.32976724916102074 Fscore: 0.1991031

In [52]:
if RUN_KNN_TEST
    test_KNN_Model(train_input, train_output, test_input, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.26249268083232075 Fscore: 0.2722590925859841
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.2300215688406421 Fscore: 0.24710705300749725
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.259787322078058 Fscore: 0.2524858270121835
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3014458619051005 Fscore: 0.2892874999332066
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.31257707405954954 Fscore: 0.29431774928774884
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.32232277904924655 Fscore: 0.3001050645133443
Parameters: Dict{Any, Any}("n_n

In [53]:
if RUN_KNN_TEST
    test_KNN_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.2611020490695901 Fscore: 0.27128194266391426
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.23146596030056804 Fscore: 0.2523286800696576
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.26047857109631745 Fscore: 0.254295379074484
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.2978966834296767 Fscore: 0.29233823320005997
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.31062658656709824 Fscore: 0.29697905276481656
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.32362897825895826 Fscore: 0.3055792479039726
Parameters: Dict{Any, Any}("n

In [49]:
if RUN_KNN_TEST
    test_KNN_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.2630450544269837 Fscore: 0.273595977463595
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.23217289534773342 Fscore: 0.2521249490216753
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.26695051621840726 Fscore: 0.26120691801685864
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3025520392921176 Fscore: 0.29624994136837335
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.313822380041226 Fscore: 0.30419892139680366
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.32738435644844255 Fscore: 0.31100843951855184
Parameters: Dict{Any, Any}("n_

In [48]:
if RUN_KNN_TEST
    test_KNN_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.21409536832440362 Fscore: 0.22397880466031267
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.19343911845855366 Fscore: 0.2100715096738896
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.22709467458913773 Fscore: 0.2195834506676762
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.2451039627735089 Fscore: 0.23489137241226357
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.2592970177415803 Fscore: 0.24312755266778258
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.26994304332054986 Fscore: 0.25061410518112887
Parameters: Dict{Any, Any}(

# making new dataset based on best 4 music genres only, based on correlation with target column

In [111]:

using DataFrames
n_dataset = DataFrame(dataset,:auto)

n_dataset = filter(row -> row.x17 in [6,8,9,10], n_dataset)
dataset_4 = Matrix(n_dataset)

train_x = dataset_4[:,3:16]
train_y = dataset_4[:,17]

train_x = abs.(train_x)

binary_outputs = oneHotEncoding(train_y);

In [125]:
indexs = holdOut(size(train_x,1),0.2)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
train_input = normalizeMinMax!(train_input)
test_input = normalizeMinMax!(test_input)

1483×14 Matrix{Float64}:
 0.404255  0.685818  0.646899  0.6  …  0.29099    0.340704  0.468825  0.75
 0.712766  0.437548  0.692168  0.1     0.557924   0.111694  0.313154  0.75
 0.659574  0.220037  0.965796  0.5     0.308926   0.267241  0.434504  0.5
 0.444681  0.369439  0.834012  0.3     0.243511   0.698415  0.419379  0.75
 0.548936  0.651763  0.836024  0.1     0.311036   0.555765  0.23268   0.75
 0.523404  0.51115   0.604648  0.4  …  0.664486   1.0       0.335918  0.75
 0.417021  0.827529  0.585534  0.5     0.332138   0.455141  0.291296  0.75
 0.457447  0.573767  0.995976  0.7     0.127453   0.34084   0.296331  0.75
 0.138298  0.36065   0.336049  0.6     0.164381   0.525167  0.363016  0.75
 0.819149  0.781391  0.768623  1.0     0.478793   0.434368  0.341792  0.75
 0.723404  0.851697  0.582516  0.8  …  0.8544     0.626786  0.31365   0.75
 0.410638  0.67703   0.660983  0.9     0.576915   0.283496  0.230624  0.75
 0.0       0.457322  0.978874  0.6     0.289935   0.506713  0.331526  0.75
 

In [8]:
indexs = holdOut(size(train_x,1),0.2)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
train_input = normalizeMinMax!(train_input)
test_input = normalizeMinMax!(test_input)


2378×14 Matrix{Float64}:
 0.43617   0.263783  0.745997  0.1  0.296963  …  0.500218  0.495396   0.75
 0.56383   0.430749  0.963857  0.9  0.108146     0.324042  0.187628   0.75
 0.478723  0.703048  0.375533  0.8  0.455025     0.451562  0.219876   0.75
 0.557447  0.443075  0.537172  0.3  0.383913     0.343986  0.303582   0.75
 0.606383  0.485657  0.899603  0.7  0.209574     0.567882  0.326575   0.75
 0.382979  0.411699  0.987952  0.7  0.131333  …  0.670656  0.301881   0.75
 0.585106  0.399372  0.942774  0.0  0.031632     0.56708   0.315852   0.75
 0.5       0.510309  0.516089  0.6  0.326397     0.50359   0.449437   0.5
 0.62766   0.781488  0.670699  1.0  0.249743     0.695869  0.403148   0.75
 0.525532  0.717615  0.681743  0.4  0.323114     0.529835  0.139318   0.75
 0.659574  0.262662  0.916671  0.8  0.324438  …  0.25934   0.396343   0.75
 0.725532  0.930524  0.784147  0.0  0.280631     0.438555  0.106365   0.75
 0.414894  0.661587  0.959841  0.0  0.199776     0.442202  0.434342   0.75
 

In [9]:
train_input1 = train_input[ :,[8,3,5,1,2,11,7]]
test_input1 = test_input[ :,[8,3,5,1,2,11,7]];

train_input2 = train_input[ :,[8,3,5,1,2,11,7,6,14,10,12]]
test_input2 = test_input[ :,[8,3,5,1,2,11,7,6,14,10,12]];

train_input3 = train_input[ :,[8,3,5,1]]
test_input3 = test_input[ :,[8,3,5,1]];

In [127]:
indexs = crossvalidation(train_output, 10)
kFoldIndices = convert(Vector{Int64}, indexs)

println(size(kFoldIndices))

(5933,)


In [128]:
if RUN_DT_TEST
    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49521294980726854 Fscore: 0.46587802630601144
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.48459435579592086 Fscore: 0.4315709792590755
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4565959536254435 Fscore: 0.3916137463111049
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4358700255367968 Fscore: 0.29599170388868634
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5054945462599771 Fscore: 0.480763401

In [129]:
if RUN_DT_TEST
    test_DT_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4973955477424343 Fscore: 0.45629993907305444
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.48205547330171516 Fscore: 0.42848747171538626
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4565959536254435 Fscore: 0.3916137463111049
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4358700255367968 Fscore: 0.29599170388868634
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49706025351965544 Fscore: 0.46214536

In [131]:
if RUN_DT_TEST
    test_DT_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4973955477424343 Fscore: 0.4563568504742827
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.48205547330171516 Fscore: 0.42848747171538626
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4565959536254435 Fscore: 0.3916137463111049
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4358700255367968 Fscore: 0.29599170388868634
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49840734350946264 Fscore: 0.464658636

In [133]:
if RUN_DT_TEST
    test_DT_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.46638095917381356 Fscore: 0.4264127867578111
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.462337706691191 Fscore: 0.4139153141020232
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4512058529976789 Fscore: 0.3702764275818998
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4358700255367968 Fscore: 0.29599170388868634
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4655326844462113 Fscore: 0.435189200318

In [11]:
size(train_input)

(9512, 14)

In [11]:
if RUN_ANN_TEST
    test_ANN_Model(train_input, train_output, test_input, test_output, kFoldIndices, ANN_FILE_PATH)
end

In [12]:
if RUN_SVM_TEST
    test_SVM_Model(train_input, train_output, test_input, test_output, kFoldIndices, SVM_FILE_PATH)
end

Testing on 4 genres - not balanced


In [12]:
if RUN_DT_TEST
    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49737760434993145 Fscore: 0.4180971916785152
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49632895331056837 Fscore: 0.39196318352467063
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4898062790087473 Fscore: 0.3893634396478904
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4149496341268469 Fscore: 0.1466305289030839
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5140973216528584 Fscore: 0.4442218451

In [16]:
if RUN_DT_TEST
    test_DT_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4976956094289527 Fscore: 0.40903287720361464
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49517349112569436 Fscore: 0.3907558473011929
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4898062790087473 Fscore: 0.3893634396478904
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4149496341268469 Fscore: 0.1466305289030839
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4996974929731196 Fscore: 0.42622712586

In [14]:
if RUN_DT_TEST
    test_DT_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.497905361401396 Fscore: 0.40856577901786456
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.49517349112569436 Fscore: 0.3907558473011929
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4898062790087473 Fscore: 0.3893634396478904
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4149496341268469 Fscore: 0.1466305289030839
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5000129503856338 Fscore: 0.424944991200

In [17]:
if RUN_DT_TEST
    test_DT_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.461527951627659 Fscore: 0.3993002273380303
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4423900621199065 Fscore: 0.321906520570836
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.43724841950534304 Fscore: 0.31769202650198103
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.4149496341268469 Fscore: 0.1466305289030839
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.471515133833468 Fscore: 0.40325721346137

In [18]:
if RUN_KNN_TEST
    test_KNN_Model(train_input, train_output, test_input, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4242109280515489 Fscore: 0.4163617309995359
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.36239365886668573 Fscore: 0.3724026837275095
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3950898354794194 Fscore: 0.38028001362803154
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4378845363890108 Fscore: 0.4269597243972655
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.44419037661096283 Fscore: 0.4311849637397329
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.46153171242430746 Fscore: 0.4437624599105455
Parameters: Dict{Any, Any}("n_n

In [19]:
if RUN_KNN_TEST
    test_KNN_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4135993613240315 Fscore: 0.4082475045547465
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3531409999305585 Fscore: 0.366773522048065
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.39120040532435035 Fscore: 0.382083438439652
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4339926743810496 Fscore: 0.42822680156549764
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4443993598326691 Fscore: 0.43512460227978067
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4552267644565452 Fscore: 0.442049069493285
Parameters: Dict{Any, Any}("n_neigh

In [20]:
if RUN_KNN_TEST
    test_KNN_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4158977754930933 Fscore: 0.41054471901553
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.35903850977471574 Fscore: 0.3715215934277032
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.39183441892091536 Fscore: 0.37930819979668595
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.43724941359162484 Fscore: 0.42919191557349723
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.4427102688227419 Fscore: 0.43265134111265463
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.454483828737529 Fscore: 0.4402794478481473
Parameters: Dict{Any, Any}("n_ne

In [21]:
if RUN_KNN_TEST
    test_KNN_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.37679504252480206 Fscore: 0.3736764619686862
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.32748717707925057 Fscore: 0.3393837618950219
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3572475976248189 Fscore: 0.34362701316415495
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.3982415417611779 Fscore: 0.39476312402902347
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.39813395557324477 Fscore: 0.393906066634974
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.41190928391765996 Fscore: 0.40297047102756123
Parameters: Dict{Any, Any}("n

# 3 best gernes only

In [22]:

using DataFrames
n_dataset = DataFrame(dataset,:auto)

n_dataset = filter(row -> row.x17 in [6,8,9], n_dataset)
dataset_3 = Matrix(n_dataset)



train_x = dataset_3[:,3:16]
train_y = dataset_3[:,17]

train_x = abs.(train_x)

binary_outputs = oneHotEncoding(train_y);

In [23]:
indexs = holdOut(size(train_x,1),0.2)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
train_input = normalizeMinMax!(train_input)
test_input = normalizeMinMax!(test_input)

1391×14 Matrix{Float64}:
 0.473118  0.378537  0.822246  0.9  …  0.133925   0.423731  0.549708  0.75
 0.268817  0.254019  0.871455  0.3     0.517322   0.711206  0.293683  0.75
 0.655914  0.80643   0.325139  0.9     0.656738   0.396538  0.432917  0.75
 0.473118  0.469097  0.941753  0.3     0.396916   0.284276  0.260986  0.75
 0.903226  0.990944  0.48582   0.1     0.301859   0.482853  0.382066  0.75
 0.344086  0.29477   0.784085  1.0  …  0.572243   0.752039  0.423525  0.75
 0.430108  0.526828  0.607335  1.0     0.127588   0.27744   0.423993  0.75
 0.526882  0.458909  0.595284  0.7     0.424377   0.280281  0.566475  0.75
 0.483871  0.439665  0.535029  0.8     0.555344   0.323883  0.342084  0.75
 0.36129   0.280054  0.986945  0.8     0.065695   0.356673  0.501831  0.75
 0.172043  0.59588   0.79011   0.5  …  0.391635   0.754145  0.62793   0.75
 0.494624  0.349106  0.868442  0.0     0.727503   0.589246  0.370833  0.75
 0.795699  0.82907   0.971881  0.6     0.780313   0.509231  0.513559  0.75


In [24]:
train_input1 = train_input[ :,[8,3,5,1,2,11,7]]
test_input1 = test_input[ :,[8,3,5,1,2,11,7]];

train_input2 = train_input[ :,[8,3,5,1,2,11,7,6,14,10,12]]
test_input2 = test_input[ :,[8,3,5,1,2,11,7,6,14,10,12]];

train_input3 = train_input[ :,[8,3,5,1]]
test_input3 = test_input[ :,[8,3,5,1]];


In [26]:
indexs = crossvalidation(train_output, 10)
kFoldIndices = convert(Vector{Int64}, indexs)

println(size(kFoldIndices))

(5565,)


In [27]:
if RUN_DT_TEST
    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6170774199809353 Fscore: 0.6114845840855985
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6007433544273227 Fscore: 0.5945373745474201
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5791647741412836 Fscore: 0.5905204088097947
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5511303565200382 Fscore: 0.4570351428127749
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.629850155635002 Fscore: 0.62668955048999

In [28]:
if RUN_DT_TEST
    test_DT_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6181729934414787 Fscore: 0.6130962435489098
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5892477046397924 Fscore: 0.5791096475052883
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5791647741412836 Fscore: 0.5905204088097947
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5511303565200382 Fscore: 0.4570351428127749
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6187090307468494 Fscore: 0.6177803865209

In [29]:
if RUN_DT_TEST
    test_DT_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6181729934414787 Fscore: 0.6131392130148327
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5892477046397924 Fscore: 0.5791096475052883
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5791647741412836 Fscore: 0.5905204088097947
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5511303565200382 Fscore: 0.4570351428127749
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6196057350673112 Fscore: 0.6170595517184

In [30]:
if RUN_DT_TEST
    test_DT_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, DT_FILE_PATH)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5996620038206434 Fscore: 0.6021350249684058
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5933744444732595 Fscore: 0.5951254525062122
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5694748230378928 Fscore: 0.5647059639706911
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.5511303565200382 Fscore: 0.4570351428127749
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.6019923836506453 Fscore: 0.6060303107291

In [31]:
if RUN_KNN_TEST
    test_KNN_Model(train_input, train_output, test_input, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5901512270764219 Fscore: 0.5995270461280988
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5259772165644618 Fscore: 0.5296989755544089
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5500405999778072 Fscore: 0.5570951945030405
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5701909918332937 Fscore: 0.5816132574194082
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.60235631337498 Fscore: 0.6093857925794423
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.597688090965747 Fscore: 0.6076422700802114
Parameters: Dict{Any, Any}("n_neighbor

In [32]:
if RUN_KNN_TEST
    test_KNN_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5908531753013845 Fscore: 0.6015533957484362
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5385739453716337 Fscore: 0.5431460524501714
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5620967534612913 Fscore: 0.5714286597037834
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5764669057017084 Fscore: 0.5884000682017363
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.6116787909331531 Fscore: 0.6190699813694878
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.6053896217266317 Fscore: 0.6169042365906637
Parameters: Dict{Any, Any}("n_neigh

In [33]:
if RUN_KNN_TEST
    test_KNN_Model(train_input2, train_output, test_input2, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5755715127246592 Fscore: 0.5865368092318408
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5247091998217216 Fscore: 0.5298194096258807
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5390900183329581 Fscore: 0.5493507296023422
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5595678169827514 Fscore: 0.5719341121683734
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5840160677669994 Fscore: 0.5919327527094559
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5816737416901575 Fscore: 0.5929260778976866
Parameters: Dict{Any, Any}("n_neigh

In [34]:
if RUN_KNN_TEST
    test_KNN_Model(train_input3, train_output, test_input3, test_output, kFoldIndices, KNN_FILE_PATH)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5635259512287053 Fscore: 0.5754279365836805
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5117756083153588 Fscore: 0.5127774994279755
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5250572411174599 Fscore: 0.5342690137184507
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5516476595660887 Fscore: 0.5637002796149341
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5725052205906588 Fscore: 0.5830913819283713
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.5705183974139436 Fscore: 0.5826723202430081
Parameters: Dict{Any, Any}("n_neigh

In [35]:
if RUN_SVM_TEST
    test_SVM_Model(train_input, train_output, test_input, test_output, kFoldIndices, SVM_FILE_PATH)
end

Test results for SVM model: 
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 1, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6530326337963249 Fscore: 0.6583055585636741
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 2, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6535683180339847 Fscore: 0.6582837001104405
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 10, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6395601534171511 Fscore: 0.6466919985504334
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 1, "kernel" => "linear", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6364870942907018 Fscore: 0.6425971870461833
Parameters: Dict{Any, Any}("tol" => 0.001, "ker

In [36]:
if RUN_SVM_TEST
    test_SVM_Model(train_input1, train_output, test_input1, test_output, kFoldIndices, SVM_FILE_PATH)
end

Test results for SVM model: 
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 1, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6557185432143862 Fscore: 0.6582823653686888
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 2, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6587799801528114 Fscore: 0.661393440716001
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 10, "kernel" => "rbf", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.6657944046450796 Fscore: 0.6680423518574263
Parameters: Dict{Any, Any}("tol" => 0.001, "kernelGamma" => 2, "C" => 1, "kernel" => "linear", "shrinking" => true, "probability" => false, "coef0" => 0.0, "kernelDegree" => 3) Accuracy: 0.615633065463438 Fscore: 0.6214421885736637
Parameters: Dict{Any, Any}("tol" => 0.001, "kerne

In [15]:
# Get the best models
#=best_SVM = loadModel(DT_FILE_PATH)
if isnothing(best_SVM)
    best_SVM = get_Best_SVM(train_input, train_output, kFoldIndices)
end=#
best_DT = loadModel(DT_FILE_PATH)
if isnothing(best_DT)
    best_DT = get_Best_DT(train_input, train_output, kFoldIndices)
end
best_KNN = loadModel(KNN_FILE_PATH)
if isnothing(best_KNN)
    best_KNN = get_Best_KNN(train_input, train_output, kFoldIndices)
end

In [16]:
#Define the models to train

#=models = Dict( "SVM" => SVC(probability=true), 
         "LR" =>LogisticRegression(),
         "DT"=> DecisionTreeClassifier(max_depth=4),
         "NB"=> GaussianNB())

base_models =  [ name for name in keys(models)]=#

In [17]:
# Perform the training for each model and calculate the test values (accuracy)
#=for key in keys(models)
    model = models[key]
    fit!(model,train_input, train_output)
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [18]:
#Define the metaclassifier based on the base_models
#=models["Ensemble (Hard Voting)"] = VotingClassifier(estimators = [(name,models[name]) for name in base_models], 
                                                   n_jobs=-1)
fit!(models["Ensemble (Hard Voting)"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [19]:
#=models["Ensemble (Soft Voting)"] = VotingClassifier(estimators = [(name,models[name]) for name in base_models], 
                                                   n_jobs=-1, voting="soft",weights=[1,2,2,1])
fit!(models["Ensemble (Soft Voting)"],train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [20]:
#=models["Ensemble (Stacking)"] = StackingClassifier(estimators=[(name,models[name]) for name in base_models],
    final_estimator=SVC(probability=true), n_jobs=-1)
fit!(models["Ensemble (Stacking)"], train_input, train_output)=#

In [21]:
#=for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [22]:
#=models["Bagging (SVC)"] = BaggingClassifier(base_estimator=SVC(),n_estimators=10, max_samples=0.50, n_jobs=-1)
fit!(models["Bagging (SVC)"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [23]:
#=@sk_import ensemble:(AdaBoostClassifier, GradientBoostingClassifier)

models["Ada"] = AdaBoostClassifier(n_estimators=30)
fit!(models["Ada"], train_input, train_output)

models["GTB"] = GradientBoostingClassifier(n_estimators=30, learning_rate=1.0, max_depth=2, random_state=0)
fit!(models["GTB"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#


In [24]:
#=@sk_import ensemble:RandomForestClassifier

models["RF"] = RandomForestClassifier(n_estimators=8, max_depth=nothing,
                                    min_samples_split=2, n_jobs=-1)
fit!(models["RF"], train_input, train_output)
    
for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [25]:
#=p = bar(y=1:60,models["RF"].feature_importances_, orientation=:horizontal, legend = false)
xlabel!(p,"Gini Gain")
ylabel!(p,"Fearure")
title!("Feature Importance")=#

In [26]:
#=using Pkg;
Pkg.add("XGBoost")=#

In [27]:
#=using XGBoost;

train_output_asNumber= Vector{Number}(train_output);

@assert train_output_asNumber isa Vector{Number}=#

In [28]:
#model = xgboost(train_input, 20, label = train_output_asNumber, eta = 1, max_depth = 6)

In [29]:
#=param = ["max_depth" => 2,
         "eta" => 1,
         "objective" => "binary:logistic"]
metrics = metrics = ["error", "auc"]
model = xgboost(train_input, 20, label = train_output_asNumber, param = param, metrics = metrics)

pred = predict(model, train_input)=#

In [30]:
#=using XGBoost: predict as predict_xgb

pred = predict_xgb(model, test_input)
print("Error of XGboost= ", sum((pred .> 0.5) .!= test_output) / float(size(pred)[1]), "\n")=#

In [31]:
#=feature_gain = map(x-> (x.fname,x.gain), importance(model))
feature, gain = first.(feature_gain), last.(feature_gain)

using Plots;

p = bar(gain, y=feature, orientation="h", legend=false)
xlabel!(p,"Gain")
ylabel!(p,"Feature")
title!("Feature Importance")=#