In [1]:
using MLBase
using Flux
using Flux: onehotbatch, argmax, crossentropy, throttle
using Plots
using DecisionTree
using Distances
using LIBSVM
using CSV
using DataFrames
using Random

In [2]:
data = CSV.read("penguins.csv",DataFrame)

Row,Column1,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex,year
Unnamed: 0_level_1,Int64,String15,String15,String7,String7,String3,String7,String7,Int64
1,1,Adelie,Torgersen,39.1,18.7,181,3750,male,2007
2,2,Adelie,Torgersen,39.5,17.4,186,3800,female,2007
3,3,Adelie,Torgersen,40.3,18,195,3250,female,2007
4,4,Adelie,Torgersen,,,,,,2007
5,5,Adelie,Torgersen,36.7,19.3,193,3450,female,2007
6,6,Adelie,Torgersen,39.3,20.6,190,3650,male,2007
7,7,Adelie,Torgersen,38.9,17.8,181,3625,female,2007
8,8,Adelie,Torgersen,39.2,19.6,195,4675,male,2007
9,9,Adelie,Torgersen,34.1,18.1,193,3475,,2007
10,10,Adelie,Torgersen,42,20.2,190,4250,,2007


In [3]:
data_r = DataFrame(
    species = data[:,:species],
    bill_length_mm = data[:,:bill_length_mm],
    bill_depth_mm = data[:,:bill_depth_mm],
    flipper_length_mm = data[:,:flipper_length_mm],
    body_mass_g = data[:,:body_mass_g]
)
filter!(row -> row[:body_mass_g] != "NA", data_r)
filter!(row -> row[:bill_length_mm] != "NA", data_r)
filter!(row -> row[:bill_depth_mm] != "NA", data_r)
filter!(row -> row[:flipper_length_mm] != "NA", data_r)


Row,species,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g
Unnamed: 0_level_1,String15,String7,String7,String3,String7
1,Adelie,39.1,18.7,181,3750
2,Adelie,39.5,17.4,186,3800
3,Adelie,40.3,18,195,3250
4,Adelie,36.7,19.3,193,3450
5,Adelie,39.3,20.6,190,3650
6,Adelie,38.9,17.8,181,3625
7,Adelie,39.2,19.6,195,4675
8,Adelie,34.1,18.1,193,3475
9,Adelie,42,20.2,190,4250
10,Adelie,37.8,17.1,186,3300


In [4]:
x = Matrix(data_r[:,2:5])
x = parse.(Float64,x)
labels = data_r[:,1]
labelsmap = labelmap(labels)
y = labelencode(labelsmap,labels)


342-element Vector{Int64}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 3
 3
 3
 3
 3
 3
 3
 3
 3

In [5]:
function perclass_splits(y,at)
    uids = unique(y)
    keepids = []
    for ui in uids
        curids = findall(y.==ui)
        rowids = randsubseq(curids, at)
        push!(keepids,rowids...)
    end
    return keepids
end
findaccuracy(predictedvals, groundtruthvals) = sum(predictedvals.==groundtruthvals)/length(groundtruthvals)
assing_class(predictedvalue) = argmin(abs.(predictedvalue .- [1,2,3]))
trainids = perclass_splits(y,0.7)
testids = setdiff(1:length(y),trainids)

104-element Vector{Any}:
   2
   4
   7
  12
  13
  14
  21
  24
  31
  33
   ⋮
 323
 325
 330
 333
 335
 337
 338
 340
 342

In [6]:
Xtrain = Float32.(x[trainids,:])
ytrain = y[trainids]
Xtest = Float32.(x[testids,:])
ytest = y[testids]

104-element Vector{Int64}:
 1
 1
 1
 1
 1
 1
 1
 1
 1
 1
 ⋮
 3
 3
 3
 3
 3
 3
 3
 3
 3

In [7]:
model_DT = DecisionTreeClassifier(max_depth=4)
DecisionTree.fit!(model_DT, Xtrain, ytrain)
pred_DT = DecisionTree.predict(model_DT,Xtest)
DT=findaccuracy(pred_DT,ytest)

0.9711538461538461

In [8]:
model_RF = RandomForestClassifier(n_trees = 20)
DecisionTree.fit!(model_RF,Xtrain, ytrain)
pred_RF = DecisionTree.predict(model_RF, Xtest)
RF=findaccuracy(pred_RF, ytest)

0.9807692307692307

In [9]:
model_SVM = svmtrain(Xtrain', ytrain,kernel=Kernel.Linear)
predictions_SVM, decision_values = svmpredict(model_SVM, Xtest')
println("Prediction: $predictions_SVM \nValues $decision_values ")
SVM = findaccuracy(predictions_SVM,ytest)

Prediction: [1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3] 
Values [5.575232265907125 6.6780733332678395 2.52714479753633 7.777953278663354 6.376785499200942 4.112483791465657 7.795584143296601 6.490727828041436 7.057790582691105 5.933177388430359 5.937834463937925 5.6084654353733825 8.767981519381301 3.9582601333211826 7.636113655444952 6.702937350998418 4.8134832034527975 9.247304573346646 7.9983173815242665 4.165214257937642 2.2960508371878134 2.565185705124186 3.9104482281662243 6.224278058082582 5.5312454406346845 1.3179613423242884 7.581662261782931 3.6790557821921723 8.208830089698331 1.1371377603325712 6.386968489660667 6.854092903592409 6.55500759037097 4.979055426473515 6.219066759961443 6.80697376692553 7.27318915

0.9711538461538461

In [10]:
y_train = onehotbatch(ytrain,unique(ytrain))
y_test = onehotbatch(ytest, unique(ytest))
data_train = Flux.DataLoader((Xtrain',y_train), shuffle=true, batchsize=64)
data_test = Flux.DataLoader((Xtest',y_test), shuffle=true, batchsize=64)

2-element DataLoader(::Tuple{LinearAlgebra.Adjoint{Float32, Matrix{Float32}}, OneHotArrays.OneHotMatrix{UInt32, Vector{UInt32}}}, shuffle=true, batchsize=64)
  with first element:
  (4×64 Matrix{Float32}, 3×64 OneHotMatrix(::Vector{UInt32}) with eltype Bool,)

In [11]:
function get_model()
    c = Chain(
        Dense(4,400,tanh),
        Dense(400,400,sigmoid),
        Dense(400,3),
        softmax
    )

end

model = get_model()

loss(x,y) = crossentropy(model(x),y)
opt = ADAM(0.0001, (0.9, 0.999))
evalcb = () -> @show(loss(Xtrain', y_train))
parameters = Flux.params(model)

Params([Float32[0.06598567 0.06311966 -0.088171996 -0.08848988; 0.07912348 0.09385517 0.0014165175 -0.11556347; … ; 0.1148015 -0.032974605 -0.004806818 0.052326392; 0.049403183 0.10212676 0.04271142 0.021910934], Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], Float32[-9.7921504f-5 -0.018894235 … -0.06411027 -0.08476334; 0.048861805 0.05909818 … -0.030438833 0.058043107; … ; 0.018324722 0.045638293 … 0.046573892 -0.045390617; 0.07245071 -0.06711934 … 0.030269234 -0.06233272], Float32[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0  …  0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0], Float32[-0.06666006 -0.121004164 … -0.10343177 0.08490923; 0.0846203 0.06213753 … -0.05734532 -0.075966984; 0.0231082 -0.11541543 … 0.0636733 -0.07094063], Float32[0.0, 0.0, 0.0]])

In [12]:
for i in 1:1000
    Flux.train!(loss, parameters, data_train, opt)
    l = loss(Xtrain', y_train)
    println("Epoch $i -> loss = $l")
end

Epoch 1 -> loss = 1.0343238
Epoch 2 -> loss = 1.0311456
Epoch 3 -> loss = 1.030017
Epoch 4 -> loss = 1.0204996
Epoch 5 -> loss = 1.0146968
Epoch 6 -> loss = 1.0140275
Epoch 7 -> loss = 1.009302
Epoch 8 -> loss = 1.0060306
Epoch 9 -> loss = 1.0027831
Epoch 10 -> loss = 1.0008583
Epoch 11 -> loss = 0.9978152
Epoch 12 -> loss = 0.9948191
Epoch 13 -> loss = 0.9913811
Epoch 14 -> loss = 0.98871624
Epoch 15 -> loss = 0.984845
Epoch 16 -> loss = 0.9803223
Epoch 17 -> loss = 0.9769881
Epoch 18 -> loss = 0.9727365
Epoch 19 -> loss = 0.9703556
Epoch 20 -> loss = 0.9652929
Epoch 21 -> loss = 0.9620106
Epoch 22 -> loss = 0.9575336
Epoch 23 -> loss = 0.9540409
Epoch 24 -> loss = 0.9495995
Epoch 25 -> loss = 0.94585776
Epoch 26 -> loss = 0.9414192
Epoch 27 -> loss = 0.9379773
Epoch 28 -> loss = 0.933271
Epoch 29 -> loss = 0.93287814
Epoch 30 -> loss = 0.9280434
Epoch 31 -> loss = 0.92199224
Epoch 32 -> loss = 0.91838324
Epoch 33 -> loss = 0.91459316
Epoch 34 -> loss = 0.91139776
Epoch 35 -> loss = 0

In [13]:
function loss_all(data_loader)
    sum([loss(x, y) for (x,y) in data_loader]) / length(data_loader) 
end

function acc(data_loader)
    f(x) = Flux.onecold(cpu(x))
    acces = [sum(f(model(x)) .== f(y)) / size(x,2)  for (x,y) in data_loader]
    sum(acces) / length(data_loader)
end

acc (generic function with 1 method)

In [14]:
println(
    "Accuracy:
    SVM = $SVM
    RandomForest = $RF
    DecisionTree = $DT"
)

Accuracy:
    SVM = 0.9711538461538461
    RandomForest = 0.9807692307692307
    DecisionTree = 0.9711538461538461


In [15]:
@show train_loss = loss_all(data_train)
@show test_loss = loss_all(data_test)
@show train_acc = acc(data_train)
@show test_acc = acc(data_test)

train_loss = loss_all(data_train) = 0.057488993f0
test_loss = loss_all(data_test) = 0.05692446f0
train_acc = acc(data_train) = 0.9774116847826086
test_acc = acc(data_test) = 0.9875


0.9875