# CVD classification
---

In [1]:
using DataFrames, Statistics, LinearAlgebra
using Distributions, StatsBase, Random, MLJ, ShapML

include("src/test_s3.jl");

In [62]:
lpheno = S3Path("s3://envbran/methylation/GSE117064_pheno.arrow")
lmirna = S3Path("s3://envbran/methylation/GSE117064_mirna.arrow")

pheno = DataFrame(Arrow.Table(lpheno));
mirna = DataFrame(Arrow.Table(lmirna));

In [68]:
pheno = pheno[pheno.class_label .== 1,:];
pheno.diagnosis = Int64.(pheno.diagnosis)
mirna = mirna[:,vcat("rn",pheno.geo_accession)];
mirna.rn = "miRNA" .* string.(1:2565);
Tmirna = permutedims(mirna,1);

In [69]:
train, test = partition(collect(eachindex(Tmirna.miRNA1)), 0.8, shuffle=true, rng=111)
X = MLJ.table(Matrix{Float64}(Tmirna[:,2:2566]))
y = coerce(pheno.diagnosis, OrderedFactor);

In [70]:
# Create machine for Elastic-Net Regression 
Standardizer = @load Standardizer pkg=MLJModels
LogisticClassifier = @load LogisticClassifier pkg=MLJLinearModels;

import MLJModels ✔
import MLJLinearModels ✔


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 


In [87]:
model = LogisticClassifier(solver = MLJLinearModels.ProxGrad(max_iter = 10000),
                           penalty = :l1)

LogisticClassifier(
  lambda = 2.220446049250313e-16, 
  gamma = 0.0, 
  penalty = :l1, 
  fit_intercept = true, 
  penalize_intercept = false, 
  scale_penalty_with_samples = true, 
  solver = MLJLinearModels.ProxGrad
  accel: Bool false
  max_iter: Int64 10000
  tol: Float64 0.0001
  max_inner: Int64 100
  beta: Float64 0.8
  gram: Bool false
)

In [88]:
mach = machine(model, X, y) 

untrained Machine; caches model-specific representations of data
  model: LogisticClassifier(lambda = 2.220446049250313e-16, …)
  args: 
    1:	Source @760 ⏎ Table{AbstractVector{ScientificTypesBase.Continuous}}
    2:	Source @143 ⏎ AbstractVector{OrderedFactor{2}}


In [89]:
fit!(mach, rows = train)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(LogisticClassifier(lambda = 2.220446049250313e-16, …), …).
[36m[1m┌ [22m[39m[36m[1mInfo: [22m[39mSolver: MLJLinearModels.ProxGrad
[36m[1m│ [22m[39m  accel: Bool false
[36m[1m│ [22m[39m  max_iter: Int64 10000
[36m[1m│ [22m[39m  tol: Float64 0.0001
[36m[1m│ [22m[39m  max_inner: Int64 100
[36m[1m│ [22m[39m  beta: Float64 0.8
[36m[1m└ [22m[39m  gram: Bool false


trained Machine; caches model-specific representations of data
  model: LogisticClassifier(lambda = 2.220446049250313e-16, …)
  args: 
    1:	Source @760 ⏎ Table{AbstractVector{ScientificTypesBase.Continuous}}
    2:	Source @143 ⏎ AbstractVector{OrderedFactor{2}}


In [92]:
evaluate!(mach, resampling = CV(nfolds=10, rng=1234),measure = [accuracy])



PerformanceEvaluation object with these fields:
  model, measure, operation, measurement, per_fold,
  per_observation, fitted_params_per_fold,
  report_per_fold, train_test_rows, resampling, repeats
Extract:
┌────────────┬──────────────┬─────────────┬─────────┬───────────────────────────
│[22m measure    [0m│[22m operation    [0m│[22m measurement [0m│[22m 1.96*SE [0m│[22m per_fold                [0m ⋯
├────────────┼──────────────┼─────────────┼─────────┼───────────────────────────
│ Accuracy() │ predict_mode │ 0.968       │ 0.0227  │ [0.914, 0.971, 1.0, 1.0, ⋯
└────────────┴──────────────┴─────────────┴─────────┴───────────────────────────
[36m                                                                1 column omitted[0m


In [101]:
yhat = MLJ.predict_mode(mach, rows = test);
confusion_matrix(yhat, y[test])

          ┌─────────────┐
          │Ground Truth │
┌─────────┼──────┬──────┤
│Predicted│  0   │  1   │
├─────────┼──────┼──────┤
│    0    │  38  │  0   │
├─────────┼──────┼──────┤
│    1    │  0   │  31  │
└─────────┴──────┴──────┘


In [103]:
accuracy(yhat, y[test])

1.0