In [None]:
import Pkg
Pkg.add("CSV")
Pkg.add("StatsBase")
Pkg.add("DataFrames")

## Settings

Load packages

In [None]:
using CSV, StatsBase, Statistics, DataFrames

Load generated datasets and combine them

In [None]:
# The files are read with `header = false`, i.e. the first row is data, not column names.
# `CSV.read` without a sink type is rejected by current CSV.jl releases; materializing a
# `CSV.File` into a `DataFrame` gives the same table and works across CSV.jl versions.
R = DataFrame(CSV.File("Data/matR.csv"; header = false)) # Reference
N = DataFrame(CSV.File("Data/matN.csv"; header = false)) # Negative control
PS = DataFrame(CSV.File("Data/matPS.csv"; header = false)) # Shifted
PR = DataFrame(CSV.File("Data/matPR.csv"; header = false)) # Rescaled
nothing # Do not display last element

In [None]:
# Stack the four tables row-wise into a single dataset; the order used here
# (R, N, PS, PR) is the order the per-row labels must follow.
dataset = vcat(
    R,  # Reference
    N,  # Negative control
    PS, # Shifted
    PR, # Rescaled
)

In [None]:
# One label per row of the stacked table, recording which dataset the row came from.
# Blocks are listed in the stacking order R, N, PS, PR, each as long as its source table.
origDataset = [
    fill("Reference", size(R, 1));
    fill("Negative control", size(N, 1));
    fill("Shifted control", size(PS, 1));
    fill("Rescaled control", size(PR, 1))
]

## Feature filtering

In [None]:
# Correlation cut-off passed as `threshold` to `decorrelate`: a feature is kept only if
# its absolute correlation with every previously kept feature is below this value.
FILT_MAX_CORR = 0.1 # Keep uncorrelated variables
# NOTE(review): not used in this section; presumably the number of dimensions of a later
# UMAP embedding — confirm against the rest of the notebook.
dimUMAP = 5

In [None]:
# Center and scale on control values: each feature has the median of its reference
# records subtracted and is divided by the MAD of its reference records.
# `normalize = true` is passed explicitly so the scaling uses the same MAD definition as
# the feature ranking further down, instead of relying on the library default.
transfNorm(x, y) = (x .- median(y)) ./ mad(y, normalize = true)
# Boolean mask selecting the rows labelled "Reference"
indRef = origDataset .== "Reference"
# Building a DataFrame from a vector of column vectors requires `:auto` in current
# DataFrames.jl; it generates the column names automatically (x1, x2, ...).
normDataset = DataFrame([transfNorm(col, col[indRef]) for col in eachcol(dataset)], :auto)
nothing # Do not display last element

In [None]:
# Rank features from largest to smallest normalized MAD, computed over all records.
# Since features were scaled so that mad(reference) = 1, this ranks features by how much
# more variable they are across all conditions than within the reference.
orderFt = sortperm(
    [mad(col, normalize = true) for col in eachcol(normDataset)];
    rev = true,
)

In [None]:
"""
    decorrelate(data; orderCol = nothing, threshold = 0.8)

Return the indices of a subset of columns of `data` such that no two selected columns
have an absolute pairwise correlation of `threshold` or more.

Columns are considered greedily in the priority order given by `orderCol`: the first
remaining column is kept, every remaining column whose absolute correlation with it is
not strictly below `threshold` is discarded, and the process repeats on what is left.

# Arguments
- `data`: table whose columns are compared; it must support `size(data, 2)` and
  `data[:, j]` (e.g. a `DataFrame` or a matrix).

# Keywords
- `orderCol`: column indices in decreasing priority. Defaults to left to right over all
  columns of `data`. The collection is copied and never modified.
- `threshold`: absolute correlation at or above which a column is discarded.

# Returns
A `Vector{Int}` of kept column indices, in the order they were selected.
"""
function decorrelate(data; orderCol = nothing, threshold = 0.8)
    # Work on a private copy: the candidate list is consumed below, and the caller's
    # ordering (or a default range, which cannot be popped from) must stay untouched.
    candidates = isnothing(orderCol) ? collect(1:size(data, 2)) : collect(orderCol)
    kept = Int[]
    while !isempty(candidates)
        # The highest-priority remaining column is always kept
        refFt = popfirst!(candidates)
        push!(kept, refFt)
        # Extract the reference column once per pass rather than once per comparison
        refCol = data[:, refFt]
        # Keep only candidates that are sufficiently uncorrelated with the reference.
        # A NaN correlation fails the `<` test, so such columns are discarded.
        filter!(ft -> abs(cor(refCol, data[:, ft])) < threshold, candidates)
    end
    return kept
end

In [None]:
# Select the features that are pairwise correlated below FILT_MAX_CORR, visiting them in
# the priority order given by `orderFt`.
uncorrFt = decorrelate(normDataset, orderCol = orderFt, threshold = FILT_MAX_CORR)
# Restrict the table to the selected columns. Explicit `[:, cols]` indexing is used
# because single-argument column indexing is not supported by current DataFrames.jl.
normDataset = normDataset[:, uncorrFt]