# Machine Learning - Final assignment

**Students:**
<hr>
Mutaz Abueisheh</br>
Marcelo Jose Ferrer</br>
Maximiliano Hormazábal Lagos</br>
Mohamed Aymen Merchaoui</br>

## Classification problem

Classify the dataset between this 10 classes.

0 = Acoustic/Folk</br>
1 = Alternative music</br>
2 = Blues</br>
3 = Bollywood</br>
4 = Country</br>
5 = Hip Hop</br>
6 = Indie</br>
7 = Instrumental</br>
8 = Metal</br>
9 = Pop</br>
10 = Rock</br>

https://www.kaggle.com/datasets/purumalgi/music-genre-classification

# Imports and declarations

This section contains all imports and declarations

In [1]:
# The next packages must be installed to run the solution
import Pkg; 
#Pkg.add("Flux")
#Pkg.add("RDatasets")
#Pkg.add("FeatureSelectors")
#Pkg.add("ScikitLearn"))
#Pkg.add("WeightedPCA"))
# Pkg.add("BetaML")



In [2]:
# Import libraries
using Flux
using Flux.Losses
using DelimitedFiles
using Statistics
using Random
using ScikitLearn
using RDatasets
using FeatureSelectors

In [3]:
# Import ScikitLearn models
@sk_import svm:SVC
@sk_import tree:DecisionTreeClassifier
@sk_import linear_model:LogisticRegression
@sk_import neighbors: KNeighborsClassifier
@sk_import naive_bayes:GaussianNB 
@sk_import ensemble:VotingClassifier
@sk_import ensemble:StackingClassifier
@sk_import ensemble:BaggingClassifier
@sk_import decomposition:PCA

PyObject <class 'sklearn.decomposition._pca.PCA'>

In [4]:
# Constants
# Execute model test set
RUN_ANN_TEST = false
RUN_SVM_TEST = false
RUN_DT_TEST = true
RUN_KNN_TEST = true

true

In [5]:
# Include the code done in previous practices
include("utils/practices_code.jl")
# Class that handle the data processing
include("utils/data_handler.jl")
# Class that handle the model processing
include("utils/model_handler.jl")

evaluateModel (generic function with 1 method)

# Data preprocessing

This section contains the preprocessing of the data

In [6]:
# Load the dataset and normalize
dataset = readdlm("dataset/clean_music_genre.csv",',');

println("Dataset original size: ", size(dataset))
println("Sample of original dataset: ", dataset[2,1:17])

# Remove line of headers, artist and song name. Separate train_x and train_y
train_x = dataset[2:size(dataset,1),3:16]
train_y = dataset[2:size(dataset,1),17]

# Transform columns to positive values
train_x = abs.(train_x)

# normalized_inputs = normalizeMinMax!(train_x) --> must be after splitting
binary_outputs = oneHotEncoding(train_y)

println("Inputs size: ", size(train_x))
println("Sample of inputs: ", train_x[1,1:14])
println("Outputs size: ", size(train_y))
println("Sample of Outputs: ", train_y[1])

Dataset original size: (17972, 17)
Sample of original dataset: Any["Bruno Mars", "That's What I Like (feat. Gucci Mane)", 52.2, 0.854, 0.564, 1.0, -8.630403119526399, 1, 0.03404, 0.0171, 0.00965882, 0.0849, 0.899, 134.071, 234.596, 4, 5]
Inputs size: (17971, 14)
Sample of inputs: Real[52.2, 0.854, 0.564, 1.0, 8.630403119526399, 1, 0.03404, 0.0171, 0.00965882, 0.0849, 0.899, 134.071, 234.596, 4]
Outputs size: (17971,)
Sample of Outputs: 5


In [7]:
# Using Hold Out function to split dataset into train, test and validation
#indexs = holdOut(size(train_x,1),0.2,0.1)
indexs = holdOut(size(train_x,1),0.2)

train_input = train_x[indexs[1],:]
train_output = binary_outputs[indexs[1]]

test_input = train_x[indexs[2],:]
test_output = binary_outputs[indexs[2]]

#normalization after splitting, so test data cannot affect the train data and the first touch between them should be in predictions.
train_input = normalizeMinMax!(train_input)
test_input = normalizeMinMax!(test_input)


#validation_input = normalized_inputs[indexs[3],:]
#validation_output = binary_outputs[indexs[3]]

println("Size original input data: ", size(train_x))
println("Size original output data: ", size(binary_outputs))

println("Size train input data: ", size(train_input))
println("Size train output data: ", size(train_output))

println("Size test input data: ", size(test_input))
println("Size test output data: ", size(test_output))

#println("Size validation input data: ", size(validation_input))
#println("Size validation output data: ", size(validation_output))

Size original input data: (17971, 14)
Size original output data: (17971, 11)
Size train input data: (14377, 14)
Size train output data: (14377,)
Size test input data: (3594, 14)
Size test output data: (3594,)


In [8]:
# applying PCA, 
pca = PCA(0.95)
fit!(pca, train_input)

pca_train = pca.transform(train_input)
pca_test = pca.transform(test_input)

print("Train Patterns ", size(train_input), " -> ", size(pca_train))
print("Test Patterns ", size(test_input), " -> ", size(pca_test))

# PCA based on 95% variance, suggests that there are 5 features have noise and should be eliminated
# from the input data, it's worth to try apply it and compare, maybe after finding the optimaal
# parameters of the model

Train Patterns (14377, 14) -> (14377, 10)Test Patterns (3594, 14) -> (3594, 10)

In [9]:
# Feature Selection:
# this function can be used to the most K important features after normalization and splitting.
# for optimal number of K, I am still searching
#FeatureSelection(train_x,train_y,k)

# Model experimentation

This section contains all experimentation of the models

In [10]:
if RUN_ANN_TEST
    indexs = crossvalidation(train_output, 10)
    kFoldIndices = convert(Vector{Int64}, indexs)

    test_ANN_Model(train_input, train_output, test_input, test_output, kFoldIndices)
end

In [11]:
if RUN_SVM_TEST
    indexs = crossvalidation(train_output, 10)
    kFoldIndices = convert(Vector{Int64}, indexs)

    test_SVM_Model(train_input, train_output, test_input, test_output, kFoldIndices)
end

In [12]:
if RUN_DT_TEST
    indexs = crossvalidation(train_output, 10)
    kFoldIndices = convert(Vector{Int64}, indexs)

    test_DT_Model(train_input, train_output, test_input, test_output, kFoldIndices)
end

Test results for Decision tree model: 
Parameters: Dict{Any, Any}("max_depth" => 4, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.9286357085684033
Parameters: Dict{Any, Any}("max_depth" => 3, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.9238361193298898
Parameters: Dict{Any, Any}("max_depth" => 2, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.9248800574524075
Parameters: Dict{Any, Any}("max_depth" => 1, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.9202198890247123
Parameters: Dict{Any, Any}("max_depth" => 5, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" => 2) Accuracy: 0.930165659604163
Parameters: Dict{Any, Any}("max_depth" => 6, "random_state" => 1, "splitter" => "best", "criterion" => "gini", "min_samples_split" =

In [13]:
if RUN_KNN_TEST
    indexs = crossvalidation(train_output, 10)
    kFoldIndices = convert(Vector{Int64}, indexs)

    test_KNN_Model(train_input, train_output, test_input, test_output, kFoldIndices)
end

Test results for KNN model: 
Parameters: Dict{Any, Any}("n_neighbors" => 3, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.9188987062561763
Parameters: Dict{Any, Any}("n_neighbors" => 2, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.9212633432152249
Parameters: Dict{Any, Any}("n_neighbors" => 1, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.897267816682685
Parameters: Dict{Any, Any}("n_neighbors" => 4, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.9212630528560215
Parameters: Dict{Any, Any}("n_neighbors" => 5, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.9228632224257962
Parameters: Dict{Any, Any}("n_neighbors" => 6, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.9234195022662537
Parameters: Dict{Any, Any}("n_neighbors" => 7, "metric" => "nan_euclidean", "weights" => "uniform") Accuracy: 0.9237671106258889
Parameters: Dict{Any, Any}("n_neighbors" => 8, "metric" => "nan_eucli

In [14]:
#Define the models to train

models = Dict( "SVM" => SVC(probability=true), 
         "LR" =>LogisticRegression(),
         "DT"=> DecisionTreeClassifier(max_depth=4),
         "NB"=> GaussianNB())

base_models =  [ name for name in keys(models)]

4-element Vector{String}:
 "NB"
 "SVM"
 "LR"
 "DT"

In [15]:
# Perform the training for each model and calculate the test values (accuracy)
for key in keys(models)
    model = models[key]
    fit!(model,train_input, train_output)
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end

NB: 90.20589872008904 %
SVM: 91.68057874234836 %
LR: 92.18141346688927 %
DT: 91.90317195325542 %


In [16]:
#Define the metaclassifier based on the base_models
#=models["Ensemble (Hard Voting)"] = VotingClassifier(estimators = [(name,models[name]) for name in base_models], 
                                                   n_jobs=-1)
fit!(models["Ensemble (Hard Voting)"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [17]:
#=models["Ensemble (Soft Voting)"] = VotingClassifier(estimators = [(name,models[name]) for name in base_models], 
                                                   n_jobs=-1, voting="soft",weights=[1,2,2,1])
fit!(models["Ensemble (Soft Voting)"],train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [18]:
#=models["Ensemble (Stacking)"] = StackingClassifier(estimators=[(name,models[name]) for name in base_models],
    final_estimator=SVC(probability=true), n_jobs=-1)
fit!(models["Ensemble (Stacking)"], train_input, train_output)=#

In [19]:
#=for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [20]:
#=models["Bagging (SVC)"] = BaggingClassifier(base_estimator=SVC(),n_estimators=10, max_samples=0.50, n_jobs=-1)
fit!(models["Bagging (SVC)"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,train_input, train_output)
    println("$key: $(acc*100) %")
end=#

In [21]:
#=@sk_import ensemble:(AdaBoostClassifier, GradientBoostingClassifier)

models["Ada"] = AdaBoostClassifier(n_estimators=30)
fit!(models["Ada"], train_input, train_output)

models["GTB"] = GradientBoostingClassifier(n_estimators=30, learning_rate=1.0, max_depth=2, random_state=0)
fit!(models["GTB"], train_input, train_output)

for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#


In [22]:
#=@sk_import ensemble:RandomForestClassifier

models["RF"] = RandomForestClassifier(n_estimators=8, max_depth=nothing,
                                    min_samples_split=2, n_jobs=-1)
fit!(models["RF"], train_input, train_output)
    
for key in keys(models)
    model = models[key]
    acc = score(model,test_input, test_output)
    println("$key: $(acc*100) %")
end=#

In [23]:
#=p = bar(y=1:60,models["RF"].feature_importances_, orientation=:horizontal, legend = false)
xlabel!(p,"Gini Gain")
ylabel!(p,"Fearure")
title!("Feature Importance")=#

In [24]:
#=using Pkg;
Pkg.add("XGBoost")=#

In [25]:
#=using XGBoost;

train_output_asNumber= Vector{Number}(train_output);

@assert train_output_asNumber isa Vector{Number}=#

In [26]:
#=model = xgboost(train_input, 20, label = train_output_asNumber, eta = 1, max_depth = 6)#

LoadError: syntax: incomplete: unterminated multi-line comment #= ... =#

In [None]:
#=param = ["max_depth" => 2,
         "eta" => 1,
         "objective" => "binary:logistic"]
metrics = metrics = ["error", "auc"]
model = xgboost(train_input, 20, label = train_output_asNumber, param = param, metrics = metrics)

pred = predict(model, train_input)=#

In [None]:
#=using XGBoost: predict as predict_xgb

pred = predict_xgb(model, test_input)
print("Error of XGboost= ", sum((pred .> 0.5) .!= test_output) / float(size(pred)[1]), "\n")=#

In [None]:
#=feature_gain = map(x-> (x.fname,x.gain), importance(model))
feature, gain = first.(feature_gain), last.(feature_gain)

using Plots;

p = bar(gain, y=feature, orientation="h", legend=false)
xlabel!(p,"Gain")
ylabel!(p,"Feature")
title!("Feature Importance")=#