# Machine Learning Using Julia

## Activate the project environment

In [1]:
# Switch to the project environment stored in the "julia-ml" directory, then
# print the packages that environment declares.
using Pkg

Pkg.activate("julia-ml")
Pkg.status()

[32m[1m    Status[22m[39m `/media/halper/Seagate/projects/julia_iris_ml/julia-ml/Project.toml`
 [90m [336ed68f][39m[37m CSV v0.8.2[39m
 [90m [a93c6f00][39m[37m DataFrames v0.21.8[39m
 [90m [7806a523][39m[37m DecisionTree v0.10.10[39m
 [90m [38e38edf][39m[37m GLM v1.3.11[39m
 [90m [b1bec4e5][39m[37m LIBSVM v0.4.0[39m
 [90m [38d8eb38][39m[37m Lathe v0.0.9[39m
 [90m [add582a8][39m[37m MLJ v0.15.0[39m
 [90m [d491faf4][39m[37m MLJModels v0.13.1[39m
 [90m [3646fa90][39m[37m ScikitLearn v0.6.2[39m
 [90m [3eaba693][39m[37m StatsModels v0.6.15[39m


## Import dataset

In [2]:
using CSV, DataFrames

# Parse the iris CSV file and materialise it as a DataFrame.
iris = CSV.File("../../backup_datasets/iris/iris.csv") |> DataFrame

# Pull the `species` column out into its own vector. It is indexed later to
# obtain the true labels of the test rows, after the column itself has been
# removed from `iris`.
species = iris[:, :species]

150-element PooledArrays.PooledArray{String,UInt32,1,Array{UInt32,1}}:
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 "setosa"   
 ⋮          
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"
 "virginica"

## Preprocessing

### One-hot encoding

In [None]:
using Lathe

# One-hot encode the `species` column. The returned object is stored but not
# used again in the cells shown here; the model formulas further down read
# `setosa`, `versicolor` and `virginica` columns from `iris` itself, so this
# call is presumably relied on to add those columns in place.
# TODO confirm against the installed Lathe version.
scaled_feature = Lathe.preprocess.OneHotEncode(iris, :species)

# Remove the original categorical column; `select!` modifies `iris` directly.
select!(iris, Not(:species))

# Preview the first five rows of the resulting table.
first(iris, 5)


## Train/test split

In [None]:
using Random

# Training row indices: a random, ordered subsequence of all row indices in
# which each row is kept with probability 0.75. No seed is set here, so the
# split differs from run to run.
sample = randsubseq(1:size(iris, 1), 0.75)

# Test row indices: every row index that was not drawn for training, in
# ascending order.
notsample = setdiff(1:size(iris, 1), sample)

train = iris[sample, :]
test = iris[notsample, :]

# True labels of the test rows, taken from the `species` vector that was saved
# before the column was dropped from `iris`.
y_test = species[notsample]

## Using GLM

In [None]:
using DataFrames, GLM

# One-vs-rest setup: one binomial GLM with a logit link per species indicator
# column, each regressed on the same four measurements.

# Model formulas.
fm_setosa = @formula(setosa ~ sepal_length + sepal_width + petal_length + petal_width)
fm_virginica = @formula(virginica ~ sepal_length + sepal_width + petal_length + petal_width)
fm_versicolor = @formula(versicolor ~ sepal_length + sepal_width + petal_length + petal_width)

# Fit each model on the training rows.
lm_setosa = glm(fm_setosa, train, Binomial(), LogitLink())
lm_virginica = glm(fm_virginica, train, Binomial(), LogitLink())
lm_versicolor = glm(fm_versicolor, train, Binomial(), LogitLink())

# Predictions of each model for the test rows.
pred_setosa = predict(lm_setosa, test)
pred_virginica = predict(lm_virginica, test)
pred_versicolor = predict(lm_versicolor, test)

# Predictions side by side, one column per model.
# Column order: setosa, virginica, versicolor.
preds = [pred_setosa pred_virginica pred_versicolor]

In [None]:
# Reclassify each test row by its maximum predicted probability.

# Return the species label whose prediction is largest for one row.
# Ties are resolved in the order setosa, then versicolor, then virginica.
function _label_by_max(p_setosa, p_virginica, p_versicolor)
    if p_setosa >= p_virginica && p_setosa >= p_versicolor
        return "setosa"
    elseif p_versicolor >= p_virginica && p_versicolor >= p_setosa
        return "versicolor"
    else
        return "virginica"
    end
end

# Build the label vector in one pass with a typed comprehension. Compared with
# growing the vector through `vcat` inside a top-level loop, this avoids
# re-copying the vector on every iteration, does not construct a DataFrame just
# to count the rows of `preds`, and does not depend on the notebook's soft-scope
# rule for assigning to a global from inside a loop.
# The trailing `;` keeps the cell from echoing the vector.
preds_cat = String[
    _label_by_max(pred_setosa[i], pred_virginica[i], pred_versicolor[i])
    for i in axes(preds, 1)
];

In [None]:
# Compute accuracy of GLM: the fraction of test rows whose predicted label
# equals the true species.

n = length(y_test)

# `count` replaces the hand-written counting loop. Besides being shorter, it
# avoids updating a global from inside a top-level `for` loop, which only works
# under the soft-scope rule used by notebooks and the REPL.
correct = count(i -> y_test[i] == preds_cat[i], 1:n)

println(correct / n)