Note: all code is written in Julia (https://julialang.org/) and should be compatible with Julia v1.9 and later versions.

Code author: Janko Tackmann (jtackm@github)

In [None]:
###############################################################
# Functions to compute metadata keyword enrichment statistics #
###############################################################

using MultipleTesting, HypothesisTests, Statistics, StatsBase, ProgressMeter

"""
    test_fisher(group_vec1, group_vec2; pval_kws...)

Compute Fisher's exact test for two binary data groupings.

# Arguments
- `group_vec1`, `group_vec2`: equally long collections of `0`/`1` (or `Bool`) labels,
  one entry per observation.

# Keywords
- `pval_kws...`: additional keyword arguments forwarded to `pvalue` of the
  `FisherExactTest` result (for instance `tail=:both`, `tail=:left` or `tail=:right`).

# Returns
A tuple `(oddsr, pval, ctab)`:
- `oddsr`: sample odds ratio `(ctab[1, 1] * ctab[2, 2]) / (ctab[1, 2] * ctab[2, 1])`;
  `Inf` or `NaN` when the denominator is zero.
- `pval`: p-value of the test.
- `ctab`: 2x2 contingency table where `ctab[i + 1, j + 1]` counts the observations with
  `group_vec1 == i` and `group_vec2 == j`.

# Throws
- `DimensionMismatch` if the two groupings differ in length.
"""
function test_fisher(group_vec1, group_vec2; pval_kws...)
    # Without this check a longer second vector would be silently truncated
    length(group_vec1) == length(group_vec2) || throw(DimensionMismatch(
        "group vectors must have equal length, got $(length(group_vec1)) and $(length(group_vec2))"))

    ctab = zeros(Int, 2, 2)
    for (g1, g2) in zip(group_vec1, group_vec2)
        ctab[g1 + 1, g2 + 1] += 1
    end

    oddsr = (ctab[1, 1] * ctab[2, 2]) / (ctab[1, 2] * ctab[2, 1])

    # FisherExactTest takes the four cells in row-major order (a, b, c, d)
    test_res = FisherExactTest(ctab[1, 1], ctab[1, 2], ctab[2, 1], ctab[2, 2])
    return oddsr, pvalue(test_res; pval_kws...), ctab
end

"""
    get_keyword_counts(map_meta_df; keyw_col=:keyw, proportions=false, keyw_sep=",")

Compute keyword counts or frequencies from per-sample aggregated clean keywords.

Each keyword is counted at most once per sample, i.e. the count of a keyword is the
number of samples whose keyword string contains it.

# Arguments
- `map_meta_df`: DataFrame with per-sample keyword information stored in column `keyw_col`.

# Keywords
- `keyw_col`: name of the column holding the per-sample keyword strings.
- `proportions`: if `true`, divide every count by the sum of all counts.
- `keyw_sep`: string that separates per-sample keywords in `keyw_col`.

# Returns
A `Dict` mapping each keyword to its count (or to its proportion if `proportions=true`).
An empty column yields an empty `Dict`.
"""
function get_keyword_counts(map_meta_df; keyw_col=:keyw, proportions=false, keyw_sep=",")
    # Accumulate into a single dictionary instead of merging one fresh Dict per sample
    keyw_count_dict = Dict{SubString{String},Int}()
    for sample_keyws in map_meta_df[!, keyw_col]
        # Set de-duplicates keywords repeated within the same sample
        for kw in Set(split(sample_keyws, keyw_sep))
            keyw_count_dict[kw] = get(keyw_count_dict, kw, 0) + 1
        end
    end

    proportions || return keyw_count_dict

    counts_total = sum(values(keyw_count_dict))
    return Dict(k => v / counts_total for (k, v) in keyw_count_dict)
end

"""
    compute_differential_keywords(meta_df1, meta_df2; alpha=0.05, n_obs_min=0,
        test_fun=test_fisher, keyw_col=:keyw, fdr=true, pval_tail=:right,
        keyw_sep=",", test_kws...)

Compute keywords enriched in samples described by `meta_df1` compared to samples
described by `meta_df2`.

# Arguments
- `meta_df1`, `meta_df2`: DataFrames with per-sample keyword information stored as
  `keyw_sep`-separated strings in column `keyw_col`.

# Keywords
- `alpha`: threshold below which the (adjusted) p-value is flagged in column `signif`.
- `n_obs_min`: minimum number of samples (across both data sets) a keyword must be found
  in to be tested.
- `test_fun`: statistical test, called as
  `test_fun(kw_mask, group_mask; tail=pval_tail, test_kws...)` with two 0/1 vectors
  (presence of the keyword per sample, membership of the sample in `meta_df1`). It must
  return `(effect size, p-value)`, optionally followed by further values.
- `keyw_col`: name of the keyword column.
- `fdr`: if `true`, add Benjamini-Hochberg adjusted p-values as column `pval_adj` and use
  them for `signif` and for sorting.
- `pval_tail`: `:right` tests the keywords found in `meta_df1`, `:left` those found in
  `meta_df2`, `:both` those found in either; the value is also passed to `test_fun`.
- `keyw_sep`: string that separates per-sample keywords in `keyw_col`.
- `test_kws...`: additional keyword arguments forwarded to `test_fun`.

# Returns
A DataFrame with one row per tested keyword, sorted by the (adjusted) p-value, with
leading columns `keyword`, `stat`, `pval`, `pval_adj` (only if `fdr`), `signif`. For
`test_fun == test_fisher` the per-group keyword counts and fractions are appended
(`group1_*` refers to `meta_df1`, `group2_*` to `meta_df2`). If no keyword is tested, an
empty DataFrame with the leading columns is returned.

# Throws
- `ArgumentError` if `pval_tail` is not one of `:right`, `:left`, `:both`.
"""
function compute_differential_keywords(meta_df1, meta_df2; alpha=0.05, n_obs_min=0,
        test_fun=test_fisher, keyw_col=:keyw, fdr=true, pval_tail=:right, keyw_sep=",",
        test_kws...)
    # Fail fast: merely logging the problem would leave the keyword source undefined
    pval_tail in (:right, :left, :both) || throw(ArgumentError(
        "$pval_tail is not a valid tail distribution option (expected :right, :left or :both)"))

    comb_meta_df = vcat(meta_df1, meta_df2)
    group_mask = vcat(ones(Int, nrow(meta_df1)), zeros(Int, nrow(meta_df2)))

    # Choose set of keywords to test depending on hypothesis
    keyw_meta_df = if pval_tail == :right
        meta_df1
    elseif pval_tail == :left
        meta_df2
    else
        comb_meta_df
    end

    # In-place union so that an empty keyword source simply yields no keywords
    keyws = Set{SubString{String}}()
    for sample_keyws in keyw_meta_df[!, keyw_col]
        union!(keyws, split(sample_keyws, keyw_sep))
    end
    row_kws_vec = [Set(split(x, keyw_sep)) for x in comb_meta_df[!, keyw_col]]

    test_rows = NamedTuple[]
    @showprogress for kw in keyws
        kw_mask = [Int(kw in row_kws) for row_kws in row_kws_vec]

        # Skip keywords found in too few samples
        sum(kw_mask) < n_obs_min && continue

        test_res = test_fun(kw_mask, group_mask; tail=pval_tail, test_kws...)
        # Only the first two values are required from custom test functions
        stat, pval = test_res

        other_args = if test_fun == test_fisher
            # ctab rows: keyword absent/present; columns: meta_df2/meta_df1 samples
            ctab = test_res[3]
            group2_kw_count, group1_kw_count = ctab[2, 1], ctab[2, 2]
            group2_kw_frac = group2_kw_count / (ctab[1, 1] + ctab[2, 1])
            group1_kw_frac = group1_kw_count / (ctab[1, 2] + ctab[2, 2])
            (; group1_kw_count, group2_kw_count, group1_kw_frac, group2_kw_frac)
        else
            NamedTuple()
        end

        push!(test_rows, (; keyword=kw, stat, pval, other_args...))
    end

    cols_ord = fdr ? [:keyword, :stat, :pval, :pval_adj, :signif] :
                     [:keyword, :stat, :pval, :signif]

    # Nothing was tested (no keywords, or none passed `n_obs_min`): return an empty
    # result instead of failing on missing columns below
    if isempty(test_rows)
        empty_df = DataFrame(keyword=SubString{String}[], stat=Float64[], pval=Float64[])
        fdr && (empty_df[!, :pval_adj] = Float64[])
        empty_df[!, :signif] = Bool[]
        return select(empty_df, cols_ord)
    end

    test_df = DataFrame(test_rows)

    pval_col = if fdr
        test_df[!, :pval_adj] = adjust(test_df.pval, BenjaminiHochberg())
        :pval_adj
    else
        :pval
    end

    test_df[!, :signif] = test_df[!, pval_col] .< alpha
    sort!(test_df, pval_col)

    # Core columns first, followed by any test-specific columns
    return select(test_df, cols_ord, Not(cols_ord))
end