# Machine Learning - Final assignment

**Students:**
<hr>
Mutaz Abueisheh</br>
Marcelo Jose Ferrer</br>
Maximiliano Hormazábal Lagos</br>
Mohamed Aymen Merchaoui</br>

## Classification problem

Classify the dataset between this 10 classes.

0 = Acoustic/Folk</br>
1 = Alternative music</br>
2 = Blues</br>
3 = Bollywood</br>
4 = Country</br>
5 = Hip Hop</br>
6 = Indie</br>
7 = Instrumental</br>
8 = Metal</br>
9 = Pop</br>
10 = Rock</br>

https://www.kaggle.com/datasets/purumalgi/music-genre-classification

# Imports and declarations

This section contains all imports and declarations

In [1]:
# The next packages must be installed to run the solution
import Pkg; 
#Pkg.add("Flux")
#Pkg.add("ScikitLearn")
# Packages used To store and load models in and from disk
# Pkg.add("JLD")
# Pkg.add("HDF5")
# Pkg.add("PyCallJLD")

In [2]:
# Import libraries
using Flux
using Flux.Losses
using DelimitedFiles
using Statistics
using Random
using ScikitLearn
using JLD
using PyCallJLD

In [3]:
# Import ScikitLearn models
@sk_import svm:SVC
@sk_import tree:DecisionTreeClassifier
@sk_import neural_network : MLPClassifier
@sk_import neighbors: KNeighborsClassifier
@sk_import naive_bayes:GaussianNB 
@sk_import linear_model:LogisticRegression
@sk_import neighbors:NearestCentroid
@sk_import neighbors:RadiusNeighborsClassifier
@sk_import linear_model:RidgeClassifier

@sk_import ensemble:VotingClassifier
@sk_import ensemble:StackingClassifier
@sk_import ensemble:BaggingClassifier

PyObject <class 'sklearn.ensemble._bagging.BaggingClassifier'>

In [4]:
# Legacy code done in previous practices
include("utils/practices_code.jl")
# Class that handle the model processing
include("utils/model_handler.jl")

loadModel (generic function with 1 method)

# Constants

In [5]:
# Path of the models
ANN_FILE_PATH = "dataset/models/ann.jld"
SVM_FILE_PATH = "dataset/models/svm.jld"
DT_FILE_PATH = "dataset/models/dt.jld"
KNN_FILE_PATH = "dataset/models/knn.jld"
MLP_FILE_PATH = "dataset/models/mlp.jld"
GB_FILE_PATH = "dataset/models/gb.jld"
LR_FILE_PATH = "dataset/models/lr.jld"
NC_FILE_PATH = "dataset/models/nc.jld"
RN_FILE_PATH = "dataset/models/rn.jld"
RC_FILE_PATH = "dataset/models/rc.jld"
# Get the metrics for the basic models
RERUN_METRICS = true
# Configuration to split the data
HOLD_OUT=0.3
NUM_FOLDS=20

# Seed to make the experiment repeteables
Random.seed!(2)

TaskLocalRNG()

# Data preprocessing

This section contains the preprocessing of the data

In [6]:
# Load the dataset from disk (already pre processed)
dataset = readdlm("dataset/kbest_df.csv",',');

# Show information of the dataset
println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2,:])

println(size(dataset,1))
println(size(dataset,2))

# Separate the features and the output of the dataset. Remove header.
train_x = dataset[2:size(dataset,1),1:size(dataset,2)-1]
train_y = dataset[2:size(dataset,1),size(dataset,2)]

# Convert to regular values the output classes
train_y = string.(train_y)

# Show information of the transformed dataset
println("Inputs size: ", size(train_x))
println("Sample of inputs: ", train_x[1,:])
println("Outputs size: ", size(train_y))
println("Sample of Outputs: ", train_y[1])
println("Unique Outputs: ", unique(train_y))

Dataset original size: (17924, 13)
Sample of original dataset: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882, 5]
17924
13
Inputs size: (17923, 12)
Sample of inputs: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample of inputs: Any[54.0, 0.382, 0.814, 13.58626223890115, 0.0011, 0.569, 116.454, 251.733, 0, 0, 0.0406, 0.00401]
Outputs size: (17923,)
Sample of Outputs: 5
Sample of Outputs: 10


In [7]:
# Using Hold Out function to split dataset into train and test
indexs = holdOut(size(train_x,1),HOLD_OUT)

train_input = train_x[indexs[1],:]
train_output = vec(train_y[indexs[1],:])

test_input = train_x[indexs[2],:]
test_output = vec(train_y[indexs[2],:])

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
train_input = normalizeMinMax!(train_input)
test_input = normalizeMinMax!(test_input)

# Show information about the splitted data
println("Size original input data: ", size(train_x))
println("Size original output data: ", size(train_y))

println("Size train input data: ", size(train_input))
println("Size train output data: ", size(train_output))

println("Size test input data: ", size(test_input))
println("Size test output data: ", size(test_output))

println("Sample original input data: ", train_x[1,:])
println("Sample train input data: ", train_input[1,:])
println("Sample test input data: ", test_input[1,:])

Size original input data: (17923, 12)
Size original output data: (17923,)
Size train input data: (12546, 12)
Size train output data: (12546,)
Size test input data: (5377, 12)
Size test output data: (5377,)
Sample original input data: Any[52.2, 0.854, 0.564, 12.18585911937475, 0.0171, 0.899, 134.071, 234.596, 1, 0, 0.03404, 0.00965882]
Sample train input data: [0.425531914893617, 0.40587665482725216, 0.8199963459258223, 0.7153069110103181, 0.07931726907630522, 0.6350440642820114, 0.4095655012603085, 0.12278214427098692, 0.0, 0.0, 0.056723716381418085, 4.943685786840785e-5]
Sample test input data: [0.29896907216494845, 0.8254553339115351, 0.5820763888192906, 0.7788889689146599, 0.13353413654618473, 0.8923060992026509, 0.344533493335619, 0.10596919580502202, 0.0, 0.0, 0.1945945945945946, 0.000486846672985156]
Unique Outputs: ["5", "10", "6", "2", "Other", "8", "9", "1"]


In [8]:
# Get the crossvalidation indexs for testing
indexs = crossvalidation(train_output, NUM_FOLDS)
kFoldIndices = convert(Vector{Int64}, indexs)

# Show the crossvalidation size
println(size(kFoldIndices))

(12546,)


# Model training

This section contains all training of the best models

In [9]:
# Get the best model for Multi-layer Perceptron
best_MLP = loadModel(MLP_FILE_PATH)
# If model can not be loaded from disk, reload from code
if isnothing(best_MLP)
    best_MLP = get_Best_MLP(train_input, train_output, kFoldIndices)
end
# Get the best model for Support Vector Machine
best_SVM = loadModel(SVM_FILE_PATH)
if isnothing(best_SVM)
    best_SVM = get_Best_SVM(train_input, train_output, kFoldIndices)
end
# Get the best model for Decision Tree
best_DT = loadModel(DT_FILE_PATH)
if isnothing(best_DT)
    best_DT = get_Best_DT(train_input, train_output, kFoldIndices)
end
# Get the best model for K-Nearest Neighbor
best_KNN = loadModel(KNN_FILE_PATH)
if isnothing(best_KNN)
    best_KNN = get_Best_KNN(train_input, train_output, kFoldIndices)
end
# Get the best model for Gaussian Naive Bayes
best_GB = loadModel(GB_FILE_PATH)
if isnothing(best_GB)
    best_GB = get_Best_GB(train_input, train_output, kFoldIndices)
end
# Get the best model for Logistic Regression
best_LR = loadModel(LR_FILE_PATH)
if isnothing(best_LR)
    best_LR = get_Best_LR(train_input, train_output, kFoldIndices)
end
# Get the best model for Nearest centroid
best_NC = loadModel(NC_FILE_PATH)
if isnothing(best_NC)
    best_NC = get_Best_NC(train_input, train_output, kFoldIndices)
end
# Get the best model for Radius Neighbors
best_RN = loadModel(RN_FILE_PATH)
if isnothing(best_RN)
    best_RN = get_Best_RN(train_input, train_output, kFoldIndices)
end
# Get the best model for Ridge Regression
best_RC = loadModel(RC_FILE_PATH)
if isnothing(best_RC)
    best_RC = get_Best_RC(train_input, train_output, kFoldIndices)
end

In [10]:
# Predict with the test dataset to get the metrics of each model
if RERUN_METRICS
    testOutputs = predict(best_MLP, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Multi-layer Perceptron Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_SVM, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Support Vector Machine Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_DT, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Decision Tree Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_KNN, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("K-Nearest Neighbor Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_GB, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Gaussian Naive Bayes Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_LR, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Logistic Regression Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_NC, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Nearest centroid Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_RN, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Radius Neighbors Accuracy: ", metrics[1], " Fscore: ", metrics[2])

    testOutputs = predict(best_RC, test_input)
    metrics = confusionMatrix(testOutputs, test_output, weighted=false)
    println("Ridge Regression Accuracy: ", metrics[1], " Fscore: ", metrics[2])
end

Multi-layer Perceptron Accuracy: 0.41584526687744094 Fscore: 0.5841547331225591
Support Vector Machine Accuracy: 0.41807699460665804 Fscore: 0.581923005393342
Decision Tree Accuracy: 0.39464385344987907 Fscore: 0.6053561465501209
K-Nearest Neighbor Accuracy: 0.4015250139482983 Fscore: 0.5984749860517017
Gaussian Naive Bayes Accuracy: 0.36470150641621724 Fscore: 0.6352984935837828
Logistic Regression Accuracy: 0.40654640133903663 Fscore: 0.5934535986609634
Nearest centroid Accuracy: 0.33327134089641064 Fscore: 0.6667286591035894
Radius Neighbors Accuracy: 0.31653338292728284 Fscore: 0.6834666170727172
Ridge Regression Accuracy: 0.39185419378835784 Fscore: 0.6081458062116422


In [11]:
#Define the models to train

#=models = Dict( "SVM" => SVC(probability=true), 
         "LR" =>LogisticRegression(),
         "DT"=> DecisionTreeClassifier(max_depth=4),
         "NB"=> GaussianNB())

base_models =  [ name for name in keys(models)]=#

In [12]:
# Perform the training for each model and calculate the test values (accuracy)
#=for key in keys(models)
    model = models[key]
    fit!(model,train_input, train_output)
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [13]:
#Define the metaclassifier based on the base_models
#=models["Ensemble (Hard Voting)"] = VotingClassifier(estimators = [(name,models[name]) for name in base_models], 
                                                   n_jobs=-1)
fit!(models["Ensemble (Hard Voting)"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [14]:
#=models["Ensemble (Soft Voting)"] = VotingClassifier(estimators = [(name,models[name]) for name in base_models], 
                                                   n_jobs=-1, voting="soft",weights=[1,2,2,1])
fit!(models["Ensemble (Soft Voting)"],train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [15]:
#=models["Ensemble (Stacking)"] = StackingClassifier(estimators=[(name,models[name]) for name in base_models],
    final_estimator=SVC(probability=true), n_jobs=-1)
fit!(models["Ensemble (Stacking)"], train_input, train_output)=#

In [16]:
#=for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [17]:
#=models["Bagging (SVC)"] = BaggingClassifier(base_estimator=SVC(),n_estimators=10, max_samples=0.50, n_jobs=-1)
fit!(models["Bagging (SVC)"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [18]:
#=@sk_import ensemble:(AdaBoostClassifier, GradientBoostingClassifier)

models["Ada"] = AdaBoostClassifier(n_estimators=30)
fit!(models["Ada"], train_input, train_output)

models["GTB"] = GradientBoostingClassifier(n_estimators=30, learning_rate=1.0, max_depth=2, random_state=0)
fit!(models["GTB"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#


In [19]:
#=@sk_import ensemble:RandomForestClassifier

models["RF"] = RandomForestClassifier(n_estimators=8, max_depth=nothing,
                                    min_samples_split=2, n_jobs=-1)
fit!(models["RF"], train_input, train_output)
    
for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [20]:
#=p = bar(y=1:60,models["RF"].feature_importances_, orientation=:horizontal, legend = false)
xlabel!(p,"Gini Gain")
ylabel!(p,"Fearure")
title!("Feature Importance")=#

In [21]:
#=using Pkg;
Pkg.add("XGBoost")=#

In [22]:
#=using XGBoost;

train_output_asNumber= Vector{Number}(train_output);

@assert train_output_asNumber isa Vector{Number}=#

In [23]:
#model = xgboost(train_input, 20, label = train_output_asNumber, eta = 1, max_depth = 6)

In [24]:
#=param = ["max_depth" => 2,
         "eta" => 1,
         "objective" => "binary:logistic"]
metrics = metrics = ["error", "auc"]
model = xgboost(train_input, 20, label = train_output_asNumber, param = param, metrics = metrics)

pred = predict(model, train_input)=#

In [25]:
#=using XGBoost: predict as predict_xgb

pred = predict_xgb(model, test_input)
print("Error of XGboost= ", sum((pred .> 0.5) .!= test_output) / float(size(pred)[1]), "\n")=#

In [26]:
#=feature_gain = map(x-> (x.fname,x.gain), importance(model))
feature, gain = first.(feature_gain), last.(feature_gain)

using Plots;

p = bar(gain, y=feature, orientation="h", legend=false)
xlabel!(p,"Gain")
ylabel!(p,"Feature")
title!("Feature Importance")=#