# Machine Learning Using Julia

## Package environment

In [1]:
 
# Bring Pkg into scope explicitly so this cell also runs on a fresh kernel.
using Pkg

# List the packages (and versions) of the currently active project environment.
Pkg.status()

[32m[1m    Status[22m[39m `/media/halper/Seagate/projects/julia_iris_ml/julia-ml/Project.toml`
 [90m [336ed68f][39m[37m CSV v0.8.2[39m
 [90m [a93c6f00][39m[37m DataFrames v0.21.8[39m
 [90m [7806a523][39m[37m DecisionTree v0.10.10[39m
 [90m [38e38edf][39m[37m GLM v1.3.11[39m
 [90m [b1bec4e5][39m[37m LIBSVM v0.4.0[39m
 [90m [38d8eb38][39m[37m Lathe v0.0.9[39m
 [90m [add582a8][39m[37m MLJ v0.15.0[39m
 [90m [d491faf4][39m[37m MLJModels v0.13.1[39m
 [90m [3646fa90][39m[37m ScikitLearn v0.6.2[39m
 [90m [3eaba693][39m[37m StatsModels v0.6.15[39m


## Import dataset

In [2]:
using CSV, DataFrames

# Parse the iris CSV and materialise it as a DataFrame.
iris = CSV.File("../../backup_datasets/iris/iris.csv") |> DataFrame

# Keep an independent copy of the class labels; the label column is removed
# from `iris` later in the notebook.
species = copy(iris.species)

150-element PooledArrays.PooledArray{String,UInt32,1,Array{UInt32,1}}:
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 ⋮          
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"

## Preprocessing

### One-hot encoding

In [3]:
using Lathe

# One-hot encode the species labels. The table displayed after this cell shows
# `iris` carrying one Bool column per species (setosa, versicolor, virginica).
# NOTE(review): the returned value is kept under its original name, but it is
# not used anywhere in the visible part of the notebook.
scaled_feature = Lathe.preprocess.OneHotEncode(iris, :species)  # Perform OH encoding

# Drop the original label column in place. The original cell called `selectT!`,
# which DataFrames does not define; `select!` is the in-place column selector.
select!(iris, Not(:species))  # Remove original species column

first(iris, 5)


Row,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
,Float64,Float64,Float64,Float64,Bool,Bool,Bool
1,5.1,3.5,1.4,0.2,True,False,False
2,4.9,3.0,1.4,0.2,True,False,False
3,4.7,3.2,1.3,0.2,True,False,False
4,4.6,3.1,1.5,0.2,True,False,False
5,5.0,3.6,1.4,0.2,True,False,False


## Train/test split

In [4]:
using Random

# Each row index is kept for training independently with probability 0.75, so
# the sizes of the two partitions vary from run to run.
sample = randsubseq(1:nrow(iris), 0.75)

# Every row index that was not drawn for training goes to the test partition.
notsample = setdiff(1:nrow(iris), sample)

train = iris[sample, :]
test = iris[notsample, :]

# True labels of the held-out rows, used later to score the predictions.
y_test = species[notsample]

45-element PooledArrays.PooledArray{String,UInt32,1,Array{UInt32,1}}:
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 ⋮          
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"

## Using GLM

In [5]:
using DataFrames, GLM

# One-vs-rest setup: each species indicator column is regressed on the same four
# measurements with a binomial family and logit link.
fm_setosa = @formula(setosa ~ sepal_length + sepal_width + petal_length + petal_width)
fm_virginica = @formula(virginica ~ sepal_length + sepal_width + petal_length + petal_width)
fm_versicolor = @formula(versicolor ~ sepal_length + sepal_width + petal_length + petal_width)

# Fit one model per species on the training rows.
lm_setosa = fit(GeneralizedLinearModel, fm_setosa, train, Binomial(), LogitLink())
lm_virginica = fit(GeneralizedLinearModel, fm_virginica, train, Binomial(), LogitLink())
lm_versicolor = fit(GeneralizedLinearModel, fm_versicolor, train, Binomial(), LogitLink())

# Predict for the held-out rows with each model.
pred_setosa = predict(lm_setosa, test)
pred_virginica = predict(lm_virginica, test)
pred_versicolor = predict(lm_versicolor, test)

# One column per model, in the order setosa, virginica, versicolor.
preds = [pred_setosa pred_virginica pred_versicolor]

45×3 Array{Float64,2}:
 1.0          8.08672e-18  0.197724  
 1.0          1.03352e-18  0.0316617 
 1.0          4.96012e-19  0.0285771 
 1.0          1.59551e-18  0.0045917 
 1.0          2.41963e-18  0.0339515 
 1.0          2.21699e-17  0.0120309 
 1.0          2.17532e-19  0.0156983 
 1.0          3.07711e-16  0.0535158 
 1.0          3.4314e-18   0.334661  
 1.0          3.78321e-19  0.00762672
 1.0          2.07397e-19  0.00343455
 1.0          6.14976e-20  0.0501334 
 1.0          3.48849e-18  0.205872  
 ⋮                                   
 2.04075e-26  1.0          0.00871156
 2.08437e-22  0.999701     0.147322  
 5.83629e-25  0.999981     0.0224368 
 7.28646e-20  0.272367     0.941128  
 2.76556e-19  0.781956     0.395061  
 6.65951e-20  0.998721     0.358361  
 3.15841e-21  0.906307     0.915437  
 5.79976e-28  1.0          0.00995493
 9.79358e-20  0.865575     0.128183  
 1.99253e-26  0.999999     0.0583912 
 2.52873e-21  0.962528     0.537977  
 3.26321e-22  0.990161     

In [6]:
# Reclass by maximum predicted proba
#
# Each test row is labelled with the species whose model produced the largest
# prediction. Ties resolve in the order setosa, versicolor, virginica.
#
# The labels are appended with `push!`, which mutates the vector instead of
# rebinding the global name inside the loop. That keeps the cell working when
# it is run as a plain script (not only under notebook/REPL soft scope) and
# avoids copying the whole vector on every iteration.
preds_cat = String[]

# Iterate over the rows of the prediction matrix directly; no throwaway
# DataFrame is needed just to count them.
for i in axes(preds, 1)
    if pred_setosa[i] >= pred_virginica[i] && pred_setosa[i] >= pred_versicolor[i]
        push!(preds_cat, "setosa")
    elseif pred_versicolor[i] >= pred_virginica[i] && pred_versicolor[i] >= pred_setosa[i]
        push!(preds_cat, "versicolor")
    else
        push!(preds_cat, "virginica")
    end
end

In [7]:
# Compute accuracy of GLM
#
# Accuracy is the share of test rows whose predicted label equals the true one.
# `count` replaces the manual `correct += 1` loop: updating a global inside a
# top-level loop only works under notebook/REPL soft scope and fails when the
# same code is run as a script.

n = length(y_test)

# Number of rows where the true and predicted labels agree.
correct = count(y_test[i] == preds_cat[i] for i in 1:n)

println(correct / n)

0.9111111111111111
