# Machine Learning Using Julia

## Activate venv

In [62]:
using Pkg; Pkg.activate("julia-ml")
Pkg.status()

[32m[1m    Status[22m[39m `/media/halper/Seagate/projects/julia_iris_ml/julia-ml/Project.toml`
 [90m [336ed68f][39m[37m CSV v0.8.2[39m
 [90m [a93c6f00][39m[37m DataFrames v0.21.8[39m
 [90m [7806a523][39m[37m DecisionTree v0.10.10[39m
 [90m [38e38edf][39m[37m GLM v1.3.11[39m
 [90m [b1bec4e5][39m[37m LIBSVM v0.4.0[39m
 [90m [38d8eb38][39m[37m Lathe v0.0.9[39m
 [90m [add582a8][39m[37m MLJ v0.15.0[39m
 [90m [d491faf4][39m[37m MLJModels v0.13.1[39m
 [90m [3646fa90][39m[37m ScikitLearn v0.6.2[39m
 [90m [3eaba693][39m[37m StatsModels v0.6.15[39m


## Import dataset

In [63]:
using CSV, DataFrames

iris = DataFrame(CSV.File("../../backup_datasets/iris/iris.csv"))

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,species
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,String
1,5.1,3.5,1.4,0.2,setosa
2,4.9,3.0,1.4,0.2,setosa
3,4.7,3.2,1.3,0.2,setosa
4,4.6,3.1,1.5,0.2,setosa
5,5.0,3.6,1.4,0.2,setosa
6,5.4,3.9,1.7,0.4,setosa
7,4.6,3.4,1.4,0.3,setosa
8,5.0,3.4,1.5,0.2,setosa
9,4.4,2.9,1.4,0.2,setosa
10,4.9,3.1,1.5,0.1,setosa


## Preprocessing

### One-hot encoding

In [64]:
using Lathe

scaled_feature = Lathe.preprocess.OneHotEncode(iris, :species)  # Perform OH encoding

iris = select!(iris, Not([:species]))  # Remove original species column

first(iris, 5)


Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Bool,Bool,Bool
1,5.1,3.5,1.4,0.2,True,False,False
2,4.9,3.0,1.4,0.2,True,False,False
3,4.7,3.2,1.3,0.2,True,False,False
4,4.6,3.1,1.5,0.2,True,False,False
5,5.0,3.6,1.4,0.2,True,False,False


## Train/test-split

In [65]:
using Random

sample = randsubseq(1:size(iris, 1), 0.75)
train = iris[sample, :]
notsample = [i for i in 1:size(iris, 1) if isempty(searchsorted(sample, i))]
test = iris[notsample, :]

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Bool,Bool,Bool
1,5.0,3.4,1.5,0.2,true,false,false
2,5.4,3.7,1.5,0.2,true,false,false
3,4.3,3.0,1.1,0.1,true,false,false
4,5.8,4.0,1.2,0.2,true,false,false
5,5.7,4.4,1.5,0.4,true,false,false
6,5.7,3.8,1.7,0.3,true,false,false
7,5.2,3.4,1.4,0.2,true,false,false
8,4.8,3.1,1.6,0.2,true,false,false
9,5.4,3.4,1.5,0.4,true,false,false
10,4.9,3.1,1.5,0.1,true,false,false


## Using GLM

In [66]:
using DataFrames, GLM

fm_setosa = @formula(setosa ~ sepal_length + sepal_width + petal_length + petal_width)
lm_setosa = glm(fm_setosa, train, Binomial(), LogitLink())
pred_setosa = predict(lm_setosa, test)

fm_virginica = @formula(virginica ~ sepal_length + sepal_width + petal_length + petal_width)
lm_virginica = glm(fm_virginica, train, Binomial(), LogitLink())
pred_virginica = predict(lm_virginica, test)

fm_versicolor = @formula(versicolor ~ sepal_length + sepal_width + petal_length + petal_width)
lm_versicolor = glm(fm_versicolor, train, Binomial(), LogitLink())
pred_versicolor = predict(lm_versicolor, test)

preds = hcat(pred_setosa, pred_virginica, pred_versicolor)

35×3 Array{Float64,2}:
 1.0          7.03287e-22  0.109778  
 1.0          5.07423e-23  0.0477091 
 1.0          3.11227e-22  0.241125  
 1.0          2.85585e-25  0.0134553 
 1.0          8.00615e-24  0.00274587
 1.0          2.90138e-22  0.0353527 
 1.0          1.81037e-22  0.101869  
 1.0          1.3744e-20   0.267178  
 1.0          3.73467e-21  0.0669209 
 1.0          1.19166e-21  0.310731  
 1.0          2.86623e-22  0.046789  
 1.0          2.05629e-19  0.0272177 
 1.0          1.48217e-22  0.0366673 
 ⋮                                   
 8.38259e-26  0.999971     0.805315  
 7.86098e-24  0.998688     0.551626  
 2.84511e-24  0.999773     0.255739  
 6.45819e-33  1.0          0.84768   
 9.32514e-29  1.0          0.851339  
 8.72348e-21  0.895251     0.492338  
 3.36403e-20  0.733033     0.372272  
 1.54204e-23  0.977617     0.92204   
 6.21392e-23  0.998828     0.180904  
 4.3709e-28   0.999999     0.0426998 
 1.19385e-22  0.994818     0.227495  
 3.53573e-26  0.999932     

In [67]:
# Reclass by maximum predicted proba
preds_cat = String[];

for i in 1:nrow(DataFrame(preds))
    if pred_setosa[i] >= pred_virginica[i] && pred_setosa[i] >= pred_versicolor[i]
        preds_cat = vcat(preds_cat, "setosa")
    elseif pred_versicolor[i] >= pred_virginica[i] && pred_versicolor[i] >= pred_setosa[i]
        preds_cat = vcat(preds_cat, "versicolor")
    else
        preds_cat = vcat(preds_cat, "virginica")
    end
end

In [70]:
iris_orig

Unnamed: 0_level_0,sepal_length,sepal_width,petal_length,petal_width,setosa,versicolor,virginica
Unnamed: 0_level_1,Float64,Float64,Float64,Float64,Bool,Bool,Bool
1,5.1,3.5,1.4,0.2,true,false,false
2,4.9,3.0,1.4,0.2,true,false,false
3,4.7,3.2,1.3,0.2,true,false,false
4,4.6,3.1,1.5,0.2,true,false,false
5,5.0,3.6,1.4,0.2,true,false,false
6,5.4,3.9,1.7,0.4,true,false,false
7,4.6,3.4,1.4,0.3,true,false,false
8,5.0,3.4,1.5,0.2,true,false,false
9,4.4,2.9,1.4,0.2,true,false,false
10,4.9,3.1,1.5,0.1,true,false,false


In [69]:
# Compute accuracy of GLM

correct = 0

actual = iris_orig[notsample, :species]
n = length(actual)
for i in 1:n
    if actual[i] == preds_cat[i]
        correct += 1
    end
end

println(correct / n)

ArgumentError: ArgumentError: column name :species not found in the data frame