# Partial Least Squares (PLS)
---

In [1]:
using DataFrames, Statistics, LinearAlgebra
using Distributions, StatsBase, Random, MLJ, ShapML

include("src/test_s3.jl");

In [2]:
lpheno = S3Path("s3://envbran/methylation/GSE117064_pheno.arrow")
lmirna = S3Path("s3://envbran/methylation/GSE117064_mirna.arrow")

pheno = DataFrame(Arrow.Table(lpheno));
mirna = DataFrame(Arrow.Table(lmirna));

In [6]:
pheno = pheno[pheno.diagnosis .== 0,:];

In [7]:
mirna = mirna[:,vcat("rn",pheno.geo_accession)];
mirna.rn = "miRNA" .* string.(1:2565);
Tmirna = permutedims(mirna,1);

In [32]:
names_outcome = names(pheno)[[7,8,10,13]]

4-element Vector{String}:
 "bmi:ch1"
 "diastolic bp:ch1"
 "hb-a1c:ch1"
 "systolic bp:ch1"

In [34]:
pheno[:,"bmi"] = parse.(Float64, pheno[:,"bmi:ch1"])
pheno[:,"dia"] = parse.(Float64, pheno[:,"diastolic bp:ch1"])
pheno[:,"hba1c"] = parse.(Float64, pheno[:,"hb-a1c:ch1"])
pheno[:,"sys"] = parse.(Float64, pheno[:,"systolic bp:ch1"]);

In [35]:
train, test = partition(collect(eachindex(Tmirna.miRNA1)), 0.8, shuffle=true, rng=111)
X = MLJ.table(Matrix{Float64}(Tmirna[:,2:2566]))
y = MLJ.table(Matrix{Float64}(pheno[:,[:bmi,:dia,:sys,:hba1c]]))

Tables.MatrixTable{Matrix{Float64}} with 1612 rows, 4 columns, and schema:
 :x1  Float64
 :x2  Float64
 :x3  Float64
 :x4  Float64

In [51]:
PLSRegressor = @load PLSRegressor pkg=PartialLeastSquaresRegressor
Standardizer = @load Standardizer pkg=MLJModels

import PartialLeastSquaresRegressor ✔
import MLJModels ✔


[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mFor silent loading, specify `verbosity=0`. 


MLJModels.Standardizer

In [52]:
model = Standardizer |> PLSRegressor(n_factors=25)

DeterministicPipeline(
  standardizer = Standardizer(
        features = Symbol[], 
        ignore = false, 
        ordered_factor = false, 
        count = false), 
  pls_regressor = PLSRegressor(
        n_factors = 25), 
  cache = true)

In [58]:
mach = machine(model, X, y) 

untrained Machine; does not cache data
  model: DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …)
  args: 
    1:	Source @328 ⏎ Table{AbstractVector{ScientificTypesBase.Continuous}}
    2:	Source @044 ⏎ Table{AbstractVector{ScientificTypesBase.Continuous}}


In [59]:
fit!(mach, rows = train)

[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …), …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:standardizer, …).
[36m[1m[ [22m[39m[36m[1mInfo: [22m[39mTraining machine(:pls_regressor, …).


trained Machine; does not cache data
  model: DeterministicPipeline(standardizer = Standardizer(features = Symbol[], …), …)
  args: 
    1:	Source @328 ⏎ Table{AbstractVector{ScientificTypesBase.Continuous}}
    2:	Source @044 ⏎ Table{AbstractVector{ScientificTypesBase.Continuous}}


In [60]:
yhat = MLJ.predict(mach, rows = test);

In [61]:
DataFrame(yhat.x1,y.x1,

322-element view(::Matrix{Float64}, :, 1) with eltype Float64:
  -5.9431504361998275
   4.838824234375542
  -1.008601798180835
  -9.070610671885426
  -9.797413691837484
   6.998050982662823
   4.588240213325142
  16.256922003765794
  23.67444290925268
 -12.23287021649748
  -5.139870768216045
   4.975784055292242
 -14.647615727583226
   ⋮
 -12.984870139134074
   9.514224275843526
 -15.050671277217615
  18.615341259629723
   2.109695070830325
  -0.09753339911677983
   7.718318372294241
 -13.466290270164826
   0.17940946415842407
  -1.8196735664628119
  -5.523010618207759
   0.4413733165582552