Note: all code is written in Julia (https://julialang.org/) and should be compatible with Julia v1.9 and later versions.

Code author: Janko Tackmann (jtackm@github)

# Abundance aggregation

In [None]:
###########################################################
# Functions to aggregate abundances per community cluster #
###########################################################

using Statistics, StatsBase, SparseArrays, AxisArrays, ProgressMeter

"""
    make_ccabund_matrix(cc_oabund_dict, cc_header, oid_header)

Convert an aggregated count dict (see `sum_abunds_per_otu_and_cclust` and
`convert_to_mean_abunds`) into a matrix whose rows follow `cc_header` (community
clusters) and whose columns follow `oid_header` (OTUs).

# Arguments
- `cc_oabund_dict`: dictionary mapping each community cluster ID to a collection of
  `OTU ID => abundance` pairs. It is indexed with every entry of `cc_header`, so a
  cluster missing from the dictionary raises a `KeyError`.
- `cc_header`: community cluster IDs, in the desired row order.
- `oid_header`: OTU IDs, in the desired column order.

# Returns
An `AxisArray` wrapping a dense `Float32` matrix of size
`(length(cc_header), length(oid_header))`. Cells without an entry in the dictionary
stay zero; OTUs that are not listed in `oid_header` are skipped.
"""
function make_ccabund_matrix(cc_oabund_dict, cc_header, oid_header)
    # Reverse lookup: OTU ID -> column position.
    col_of_oid = Dict(oid => col for (col, oid) in enumerate(oid_header))
    abund_mat = zeros(Float32, length(cc_header), length(oid_header))

    @showprogress for (row, cc) in enumerate(cc_header)
        for (oid, abund) in cc_oabund_dict[cc]
            col = get(col_of_oid, oid, nothing)
            # OTUs outside 'oid_header' have no column and are ignored.
            col === nothing && continue
            abund_mat[row, col] = abund
        end
    end

    return AxisArray(abund_mat, cc_header, oid_header)
end

"""
    sum_abunds_per_otu_and_cclust(otu_matn_T::AxisArray, cclust_map::Dict)

Sum the abundance of every OTU over all samples that belong to the same community
cluster.

# Arguments
- `otu_matn_T`: OTU table with OTUs as rows and samples as columns. Its `data` field is
  accessed through `nonzeros`, `rowvals` and `nzrange`, i.e. it has to be a sparse
  column-compressed matrix.
- `cclust_map`: dictionary with community cluster IDs as keys and collections of sample
  IDs as values. Sample IDs that do not occur in the table are ignored. If a sample ID
  is listed for several clusters, the cluster visited last wins.

# Returns
A dictionary mapping community cluster IDs to dictionaries `OTU ID => summed abundance`.
Only stored entries of the sparse table are included, and only clusters with at least
one sample in the table appear in the result.

# Throws
- `AssertionError` if a sample (column) of the table is not assigned to any cluster.
"""
function sum_abunds_per_otu_and_cclust(otu_matn_T::AxisArray{T}, cclust_map::Dict) where T<:Real
    oid_header = axisvalues(otu_matn_T)[1]
    sid_header = axisvalues(otu_matn_T)[2]
    n_samps = size(otu_matn_T, 2)

    sid_to_col = Dict(sid => i for (i, sid) in enumerate(sid_header))
    # Position i in 'cc_ids' is the integer code used for that cluster below.
    cc_ids = collect(keys(cclust_map))

    # cc_idx[s] = integer code of the cluster of sample column s (0 = not assigned yet).
    cc_idx = zeros(Int, n_samps)
    for (cc_i, cc) in enumerate(cc_ids)
        for sid in cclust_map[cc]
            if haskey(sid_to_col, sid)
                cc_idx[sid_to_col[sid]] = cc_i
            end
        end
    end

    n_unassigned = count(iszero, cc_idx)
    @assert n_unassigned == 0 "$(n_unassigned) sample(s) of the OTU table are not assigned to any community cluster in 'cclust_map'"

    # Accumulate on integer codes (cluster code => OTU row index => sum); IDs are
    # translated back only once at the end.
    accum_dict = Dict{Int,Dict{Int,T}}()
    A = otu_matn_T.data
    nzv = nonzeros(A)
    rvs = rowvals(A)

    @showprogress for s_i in 1:n_samps
        sub_acc_dict = get!(() -> Dict{Int,T}(), accum_dict, cc_idx[s_i])
        for j in nzrange(A, s_i)
            oid_i = rvs[j]
            sub_acc_dict[oid_i] = get(sub_acc_dict, oid_i, zero(T)) + nzv[j]
        end
    end

    # Translate integer codes back to the original cluster and OTU IDs.
    @time trans_accum_dict = Dict(
        cc_ids[cc_i] => Dict(oid_header[oid_i] => abund for (oid_i, abund) in sub_d)
        for (cc_i, sub_d) in accum_dict
    )
    return trans_accum_dict
end

"""
    convert_to_mean_abunds(accum_dict, cclust_map; prev_accum_dict=nothing)

Convert accumulated counts (see `sum_abunds_per_otu_and_cclust`) into averages.

Every accumulated abundance is divided by the number of samples listed for its
community cluster in `cclust_map`.

# Arguments
- `accum_dict`: dictionary mapping community cluster IDs to collections of
  `OTU ID => accumulated abundance` pairs.
- `cclust_map`: dictionary indexed by community cluster ID; the `length` of each entry
  is used as the cluster size. A cluster missing from it raises a `KeyError`.

# Keywords
- `prev_accum_dict`: accepted for backward compatibility; it is not used by the
  computation.

# Returns
A `Dict{String,Dict{String,Float64}}` mapping community cluster IDs to
`OTU ID => mean abundance`.
"""
function convert_to_mean_abunds(accum_dict, cclust_map; prev_accum_dict=nothing)
    # No precondition on 'prev_accum_dict' is checked here: the previous assertion
    # depended on a variable 'nz' that this function never defines.
    mean_accum_dict = Dict{String,Dict{String,Float64}}()
    @showprogress for (cc, sub_acc_dict) in accum_dict
        cc_size = length(cclust_map[cc])
        mean_accum_dict[cc] = Dict{String,Float64}(
            oid => abund / cc_size for (oid, abund) in sub_acc_dict
        )
    end
    return mean_accum_dict
end

# Generalism scores

In [None]:
####################################################################
# Functions to compute cross-environment habitat generalism scores #
####################################################################

using Statistics, StatsBase, SparseArrays, AxisArrays

"""
    mean_abunds_per_habitat(A_cc_mean_abunds::AxisArray, env_map::AbstractVector)

Compute per-environment mean abundances for each OTU, averaging over all community
clusters that carry the same environment label.

# Arguments
- `A_cc_mean_abunds`: average counts per OTU (columns) and community cluster (rows),
  aggregated from a full OTU table.
- `env_map`: vector with one environment label (e.g. animal, aquatic, soil, plant) per
  row of `A_cc_mean_abunds`, in row order.

# Returns
An `AxisArray` with one row per distinct environment (in order of first appearance in
`env_map`) and the column axis of `A_cc_mean_abunds`.
"""
function mean_abunds_per_habitat(A_cc_mean_abunds::AxisArray, env_map::AbstractVector)
    @assert size(A_cc_mean_abunds, 1) == length(env_map)

    abund_data = A_cc_mean_abunds.data
    otu_axis = axisvalues(A_cc_mean_abunds)[2]
    habitats = unique(env_map)

    # One 1 x n_otus row of column means per habitat, stacked in 'habitats' order.
    habitat_means = [
        mean(abund_data[env_map .== habitat, :], dims=1) for habitat in habitats
    ]

    return AxisArray(vcat(habitat_means...), habitats, otu_axis)
end

"""
    env_entropy_generalism(mean_abunds::AbstractVector)

Kernel function that computes an entropy-based generalism score for the environment
distribution `mean_abunds`.

The vector is divided by its sum and the result is passed to `entropy`.

# Arguments
- `mean_abunds`: average abundance of an OTU within each main environment
  (animal, aquatic, soil, plant).
"""
function env_entropy_generalism(mean_abunds::AbstractVector)
    total_abund = sum(mean_abunds)
    env_fractions = mean_abunds ./ total_abund
    return entropy(env_fractions)
end

"""
    env_entropy_generalism(A_cc_mean_abunds::AxisArray, env_map::AbstractVector)

Compute habitat generalism scores for each OTU in an aggregated OTU table, based on
average abundances per main environment.

# Arguments
- `A_cc_mean_abunds`: average counts per OTU (columns) and community cluster (rows),
  aggregated from a full OTU table.
- `env_map`: vector with one environment label (e.g. animal, aquatic, soil, plant) per
  row of `A_cc_mean_abunds`, in row order.

# Returns
A named tuple with
- `gen_scores`: `AxisArray` holding one generalism score per OTU, and
- `env_abunds`: the per-environment mean abundances returned by
  `mean_abunds_per_habitat`.
"""
function env_entropy_generalism(A_cc_mean_abunds::AxisArray, env_map::AbstractVector)
    @assert size(A_cc_mean_abunds, 1) == length(env_map)

    env_abunds = mean_abunds_per_habitat(A_cc_mean_abunds, env_map)
    otu_axis = axisvalues(env_abunds)[2]

    # Apply the per-OTU kernel to every column (one column = one OTU across habitats).
    score_row = mapslices(env_entropy_generalism, env_abunds.data, dims=1)
    gen_scores = AxisArray(vec(score_row), otu_axis)

    return (; gen_scores, env_abunds)
end