diff --git a/Cargo.toml b/Cargo.toml index aa869ad..a0dbad6 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "intc" -version = "0.1.4" +version = "0.2.0" edition = "2021" description = "An implementation of the *-INC method to calculate an empirical FDR for non-targeting controls in CRISPR screens " license = "MIT" diff --git a/src/fdr.rs b/src/fdr.rs index a4a49a8..9c0440b 100644 --- a/src/fdr.rs +++ b/src/fdr.rs @@ -1,6 +1,12 @@ -use crate::utils::{argsort, argsort_vec}; +use crate::utils::{argsort, argsort_vec, diagonal_product}; use ndarray::{Array1, Axis}; +#[derive(Clone, Copy, Debug)] +pub enum Direction { + Less, + Greater, +} + #[derive(Debug)] /// Result struct for False Discovery Rate and p-value threshold pub struct FdrResult { @@ -35,29 +41,48 @@ impl FdrResult { /// False Discovery Rate pub struct Fdr<'a> { pvalues: &'a Array1, + product: Array1, ntc_indices: &'a [usize], alpha: f64, + use_product: Option, } impl<'a> Fdr<'a> { /// Create a new FDR struct - pub fn new(pvalues: &'a Array1, ntc_indices: &'a [usize], alpha: f64) -> Self { + pub fn new( + pvalues: &'a Array1, + logfc: &'a Array1, + ntc_indices: &'a [usize], + alpha: f64, + use_product: Option, + ) -> Self { Self { pvalues, ntc_indices, alpha, + use_product, + product: diagonal_product(logfc, pvalues), } } /// Fit the FDR pub fn fit(&self) -> FdrResult { - let order = argsort(self.pvalues); + let values = match self.use_product { + Some(Direction::Less) => &self.product, + Some(Direction::Greater) => &self.product, + None => &self.pvalues, + }; + let order = match self.use_product { + Some(Direction::Less) => argsort(&self.product, true), + Some(Direction::Greater) => argsort(&self.product, false), + None => argsort(&self.pvalues, true), + }; let reorder = argsort_vec(&order); let is_ntc = Self::ntc_mask(self.ntc_indices, self.pvalues.len()); - let sorted_pvalues = self.pvalues.select(Axis(0), &order); + let sorted_values = values.select(Axis(0), &order); let sorted_ntc = is_ntc.select(Axis(0), &order); let sorted_fdr = Self::empirical_fdr(&sorted_ntc); - let threshold = Self::threshold(&sorted_pvalues, &sorted_fdr, self.alpha); + let threshold = Self::threshold(&sorted_values, &sorted_fdr, self.alpha); let unsorted_fdr = sorted_fdr.select(Axis(0), &reorder); FdrResult::new(unsorted_fdr, threshold) } @@ -87,11 +112,11 @@ impl<'a> Fdr<'a> { } /// Calculate the p-value threshold - fn threshold(pvalues: &Array1, fdr: &Array1, alpha: f64) -> f64 { + fn threshold(values: &Array1, fdr: &Array1, alpha: f64) -> f64 { let fdr_pval = fdr .iter() - .zip(pvalues.iter()) - .take_while(|(fdr, _pvalue)| *fdr <= &alpha) + .zip(values.iter()) + .take_while(|(fdr, _value)| *fdr <= &alpha) .reduce(|_x, y| y); if let Some(fp) = fdr_pval { *fp.1 @@ -109,27 +134,30 @@ mod testing { #[test] fn test_fdr() { let pvalues = array![0.1, 0.2, 0.3]; + let logfc = array![0.1, 0.2, 0.3]; let ntc_indices = vec![1]; let alpha = 0.1; - let fdr = Fdr::new(&pvalues, &ntc_indices, alpha).fit(); + let fdr = Fdr::new(&pvalues, &logfc, &ntc_indices, alpha, None).fit(); assert_eq!(fdr.fdr(), array![0.0, 0.5, 1. / 3.]); } #[test] fn test_fdr_unsorted() { let pvalues = array![0.2, 0.1, 0.3]; + let logfc = array![0.1, 0.2, 0.3]; let ntc_indices = vec![1]; let alpha = 0.1; - let fdr = Fdr::new(&pvalues, &ntc_indices, alpha).fit(); + let fdr = Fdr::new(&pvalues, &logfc, &ntc_indices, alpha, None).fit(); assert_eq!(fdr.fdr(), array![0.5, 1.0, 1. / 3.]); } #[test] fn test_fdr_unsorted_larger() { let pvalues = array![0.5, 0.1, 0.3, 0.4, 0.2, 0.6]; + let logfc = array![0.5, 0.1, 0.3, 0.4, 0.2, 0.6]; let ntc_indices = vec![3, 5]; let alpha = 0.1; - let fdr = Fdr::new(&pvalues, &ntc_indices, alpha).fit(); + let fdr = Fdr::new(&pvalues, &logfc, &ntc_indices, alpha, None).fit(); assert_eq!(fdr.fdr(), array![0.2, 0.0, 0.0, 0.25, 0.0, 1. / 3.]); } @@ -144,9 +172,10 @@ mod testing { fn test_fdr_threshold() { let m = 10; let pvalues = Array1::linspace(0.0, 1.0, m); + let logfc = Array1::linspace(0.0, 1.0, m); let ntc_indices = vec![4, 6, 9]; let alpha = 0.1; - let fdr = Fdr::new(&pvalues, &ntc_indices, alpha).fit(); + let fdr = Fdr::new(&pvalues, &logfc, &ntc_indices, alpha, None).fit(); assert_eq!(fdr.threshold(), 1. / 3.); } } diff --git a/src/inc.rs b/src/inc.rs index ec0a9c2..1e5a16c 100644 --- a/src/inc.rs +++ b/src/inc.rs @@ -1,8 +1,13 @@ use crate::{ encode::EncodeIndex, + fdr::Direction, + mwu::Alternative, rank_test::{pseudo_rank_test, rank_test}, result::IncResult, - utils::{build_pseudo_names, reconstruct_names, select_ranks, validate_token}, mwu::Alternative, + utils::{ + aggregate_fold_changes, build_pseudo_names, reconstruct_names, select_values, + validate_token, + }, }; use anyhow::Result; use ndarray::Array1; @@ -10,6 +15,7 @@ use ndarray::Array1; #[derive(Debug)] pub struct Inc<'a> { pvalues: &'a Array1, + logfc: &'a Array1, genes: &'a [String], token: &'a str, n_pseudo: usize, @@ -17,11 +23,14 @@ pub struct Inc<'a> { alpha: f64, alternative: Alternative, continuity: bool, + use_product: Option, + seed: u64, } impl<'a> Inc<'a> { pub fn new( pvalues: &'a Array1, + logfc: &'a Array1, genes: &'a [String], token: &'a str, n_pseudo: usize, @@ -29,9 +38,12 @@ impl<'a> Inc<'a> { alpha: f64, alternative: Alternative, continuity: bool, + use_product: Option, + seed: Option, ) -> Inc<'a> { Inc { pvalues, + logfc, genes, token, n_pseudo, @@ -39,40 +51,64 @@ impl<'a> Inc<'a> { alpha, alternative, continuity, + use_product, + seed: seed.unwrap_or(0), } } pub fn fit(&self) -> Result { let encoding = EncodeIndex::new(self.genes); let ntc_index = validate_token(&encoding.map, self.token)?; - let ntc_values = select_ranks(ntc_index, encoding.encoding(), self.pvalues); + let ntc_pvalues = select_values(ntc_index, encoding.encoding(), self.pvalues); + let ntc_logfcs = select_values(ntc_index, encoding.encoding(), self.logfc); let n_genes = encoding.map.len() - 1; + let gene_fc_map = aggregate_fold_changes(self.genes, self.logfc); + // run the rank test on all genes let (mwu_scores, mwu_pvalues) = rank_test( n_genes, ntc_index, encoding.encoding(), self.pvalues, - &ntc_values, + &ntc_pvalues, + Alternative::Less, + self.continuity, + ); + + // run the rank test on pseudo genes + let (pseudo_scores, pseudo_pvalues, pseudo_logfc) = pseudo_rank_test( + self.n_pseudo, + self.s_pseudo, + &ntc_pvalues, + &ntc_logfcs, self.alternative, self.continuity, + self.seed, ); - let (pseudo_scores, pseudo_pvalues) = - pseudo_rank_test(self.n_pseudo, self.s_pseudo, &ntc_values, self.alternative, self.continuity); // reconstruct the gene names let gene_names = reconstruct_names(encoding.map(), ntc_index); let pseudo_names = build_pseudo_names(self.n_pseudo); + // collect the gene fold changes + let gene_logfc = gene_names + .iter() + .map(|gene| gene_fc_map.get(gene).unwrap()) + .copied() + .collect::>(); + Ok(IncResult::new( gene_names, pseudo_names, mwu_scores, mwu_pvalues, + gene_logfc, pseudo_scores, pseudo_pvalues, + pseudo_logfc, self.alpha, + self.use_product, )) } } diff --git a/src/mwu.rs b/src/mwu.rs index fbc60e4..7757559 100644 --- a/src/mwu.rs +++ b/src/mwu.rs @@ -5,10 +5,9 @@ use statrs::{ }; use std::ops::Div; -/// Defines the alternative hypothesis +/// Defines the alternative hypothesis #[derive(Clone, Copy, Default, Debug)] pub enum Alternative { - /// The alternative hypothesis is that the first array is greater than the second array Greater, @@ -150,11 +149,11 @@ fn p_value(z_u: f64, alternative: Alternative) -> f64 { /// * `alternative` - The alternative hypothesis to test against /// * `use_continuity` - Whether to use continuity correction pub fn mann_whitney_u( - x: &Array1, - y: &Array1, - alternative: Alternative, - use_continuity: bool, - ) -> (f64, f64) { + x: &Array1, + y: &Array1, + alternative: Alternative, + use_continuity: bool, +) -> (f64, f64) { let (ranks_x, ranks_y) = merged_ranks(x, y); let nx = x.len() as f64; @@ -169,8 +168,8 @@ pub fn mann_whitney_u( #[cfg(test)] mod testing { - use crate::mwu::mann_whitney_u; use super::merged_ranks; + use crate::mwu::mann_whitney_u; use ndarray::{array, Array}; const EPSILON: f64 = 1e-10; @@ -236,14 +235,20 @@ mod testing { fn test_alt_u_statistic_greater() { let x = array![1., 2., 4.]; let y = array![3., 5., 6.]; - assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::Greater), 8.); + assert_eq!( + super::alt_u_statistic(&x, &y, super::Alternative::Greater), + 8. + ); } #[test] fn test_alt_u_statistic_two_sided() { let x = array![1., 2., 4.]; let y = array![3., 5., 6.]; - assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::TwoSided), 1.); + assert_eq!( + super::alt_u_statistic(&x, &y, super::Alternative::TwoSided), + 1. + ); } #[test] diff --git a/src/rank_test.rs b/src/rank_test.rs index 3e16c8d..c648dfe 100644 --- a/src/rank_test.rs +++ b/src/rank_test.rs @@ -1,6 +1,13 @@ -use crate::{mwu::{mann_whitney_u, Alternative}, utils::select_ranks}; +use crate::{ + mwu::{mann_whitney_u, Alternative}, + utils::select_values, +}; use ndarray::{Array1, Axis}; -use ndarray_rand::{rand_distr::Uniform, RandomExt}; +use ndarray_rand::{ + rand::{rngs::StdRng, SeedableRng}, + rand_distr::Uniform, + RandomExt, +}; /// Performs a rank test for each gene in the dataset. /// Returns a tuple of vectors containing the U and p-values for each gene. @@ -8,14 +15,14 @@ pub fn rank_test( n_genes: usize, ntc_index: usize, encoding: &[usize], - pvalues: &Array1, + test_values: &Array1, ntc_values: &Array1, alternative: Alternative, continuity: bool, ) -> (Vec, Vec) { (0..=n_genes) .filter(|x| *x != ntc_index) - .map(|x| select_ranks(x, encoding, pvalues)) + .map(|x| select_values(x, encoding, test_values)) .map(|x| mann_whitney_u(&x, ntc_values, alternative, continuity)) .unzip() } @@ -25,40 +32,72 @@ pub fn rank_test( pub fn pseudo_rank_test( n_pseudo: usize, s_pseudo: usize, - ntc_values: &Array1, + ntc_pvalues: &Array1, + ntc_logfcs: &Array1, alternative: Alternative, continuity: bool, -) -> (Vec, Vec) { + seed: u64, +) -> (Vec, Vec, Vec) { + let mut pseudo_scores = Vec::with_capacity(n_pseudo); + let mut pseudo_pvalues = Vec::with_capacity(n_pseudo); + let mut pseudo_logfc = Vec::with_capacity(n_pseudo); + let num_ntc = ntc_pvalues.len(); + let mut rng = StdRng::seed_from_u64(seed); + (0..n_pseudo) - .map(|_| Array1::random(s_pseudo, Uniform::new(0, ntc_values.len()))) + // generate array of random indices considered "in" the test group + .map(|_| Array1::random_using(s_pseudo, Uniform::new(0, num_ntc), &mut rng)) + // generate the complement list of indices considered "out" of the test group .map(|mask| { - let slice_mask = mask.as_slice().unwrap(); - let out_mask = (0..ntc_values.len()) + let slice_mask = mask.as_slice().unwrap().to_owned(); + let out_mask = (0..num_ntc) .filter(|x| !slice_mask.contains(x)) .collect::>(); - let in_group = ntc_values.select(Axis(0), slice_mask); - let out_group = ntc_values.select(Axis(0), &out_mask); - (in_group, out_group) + (slice_mask, out_mask) }) - .map(|(in_group, out_group)| mann_whitney_u(&in_group, &out_group, alternative, continuity)) - .unzip() + // subset the pvalues and logfc to the "in" and "out" groups + .map(|(in_mask, out_mask)| { + let in_group_pvalues = ntc_pvalues.select(Axis(0), &in_mask); + let in_group_logfcs = ntc_logfcs.select(Axis(0), &in_mask); + let out_group_pvalues = ntc_pvalues.select(Axis(0), &out_mask); + (in_group_pvalues, in_group_logfcs, out_group_pvalues) + }) + // calculate the U, p-values, and aggregate logfc for each pseudo gene + .for_each(|(ig_pvalues, ig_logfcs, og_pvalues)| { + let (score, pvalue) = mann_whitney_u(&ig_pvalues, &og_pvalues, alternative, continuity); + let logfc = ig_logfcs.mean().unwrap_or(0.0); + + pseudo_scores.push(score); + pseudo_pvalues.push(pvalue); + pseudo_logfc.push(logfc); + }); + + (pseudo_scores, pseudo_pvalues, pseudo_logfc) } #[cfg(test)] mod testing { use super::rank_test; - use ndarray::array; use crate::mwu::Alternative; + use ndarray::array; #[test] fn test_rank_test() { let n_genes = 2; let ntc_index = 0; let encoding = vec![0, 1, 2, 0, 1, 2, 0, 1, 2, 0]; - let pvalues = array![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.0]; + let test_values = array![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.0]; let ntc_values = array![0.7, 0.8, 0.9, 0.7, 0.8]; let alternative = Alternative::TwoSided; - let (u, p) = rank_test(n_genes, ntc_index, &encoding, &pvalues, &ntc_values, alternative, false); + let (u, p) = rank_test( + n_genes, + ntc_index, + &encoding, + &test_values, + &ntc_values, + alternative, + false, + ); assert_eq!(u, vec![3.0, 4.5]); assert_eq!(p, vec![0.17971249488715443, 0.37109336955630695]); } @@ -67,9 +106,19 @@ mod testing { fn test_pseudo_rank_test() { let n_pseudo = 2; let s_pseudo = 3; - let ntc_values = array![0.7, 0.7, 0.7, 0.7, 0.7, 0.7]; + let ntc_pvalues = array![0.7, 0.7, 0.7, 0.7, 0.7, 0.7]; + let ntc_logfcs = array![0.7, 0.7, 0.7, 0.7, 0.7, 0.7]; let alternative = Alternative::TwoSided; - let (_u, p) = super::pseudo_rank_test(n_pseudo, s_pseudo, &ntc_values, alternative, false); + let seed = 0; + let (_u, p, _l) = super::pseudo_rank_test( + n_pseudo, + s_pseudo, + &ntc_pvalues, + &ntc_logfcs, + alternative, + false, + seed, + ); assert_eq!(p, vec![1.0, 1.0]); } } diff --git a/src/result.rs b/src/result.rs index a218f1b..859cb71 100644 --- a/src/result.rs +++ b/src/result.rs @@ -1,4 +1,4 @@ -use crate::fdr::{Fdr, FdrResult}; +use crate::fdr::{Direction, Fdr, FdrResult}; use ndarray::Array1; #[derive(Debug)] @@ -7,6 +7,7 @@ pub struct IncResult { genes: Vec, u_scores: Array1, u_pvalues: Array1, + logfc: Array1, fdr: FdrResult, } impl IncResult { @@ -15,20 +16,25 @@ impl IncResult { pseudo_genes: Vec, gene_scores: Vec, gene_pvalues: Vec, + gene_logfc: Vec, pseudo_scores: Vec, pseudo_pvalues: Vec, + pseudo_logfc: Vec, alpha: f64, + use_product: Option, ) -> Self { let n_pseudo = pseudo_genes.len(); let genes = vec![genes, pseudo_genes].concat(); let u_scores = Array1::from_vec(vec![gene_scores, pseudo_scores].concat()); let u_pvalues = Array1::from_vec(vec![gene_pvalues, pseudo_pvalues].concat()); + let logfc = Array1::from_vec(vec![gene_logfc, pseudo_logfc].concat()); let ntc_indices = Self::create_ntc_indices(n_pseudo, genes.len()); - let fdr = Fdr::new(&u_pvalues, &ntc_indices, alpha).fit(); + let fdr = Fdr::new(&u_pvalues, &logfc, &ntc_indices, alpha, use_product).fit(); Self { genes, u_scores, u_pvalues, + logfc, fdr, } } @@ -59,6 +65,11 @@ impl IncResult { self.fdr.fdr() } + /// Get the log fold changes + pub fn logfc(&self) -> &Array1 { + &self.logfc + } + /// Get the p-value threshold pub fn threshold(&self) -> f64 { self.fdr.threshold() @@ -81,21 +92,27 @@ mod testing { .collect::>(); let gene_scores = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]; let gene_pvalues = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]; + let gene_logfc = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]; let pseudo_scores = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]; let pseudo_pvalues = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]; + let pseudo_logfc = vec![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]; let alpha = 0.05; let result = IncResult::new( genes, pseudo_genes, gene_scores, gene_pvalues, + gene_logfc, pseudo_scores, pseudo_pvalues, + pseudo_logfc, alpha, + None, ); assert_eq!(result.genes().len(), 20); assert_eq!(result.u_scores().len(), 20); assert_eq!(result.u_pvalues().len(), 20); + assert_eq!(result.logfc().len(), 20); assert_eq!(result.fdr().len(), 20); assert!(result.threshold() >= 0.); } diff --git a/src/utils.rs b/src/utils.rs index 7f2323d..a769528 100644 --- a/src/utils.rs +++ b/src/utils.rs @@ -1,6 +1,7 @@ use anyhow::{bail, Result}; use hashbrown::HashMap; -use ndarray::Array1; +use ndarray::{Array1, Axis}; +use std::hash::Hash; /// Validates the provided token is found one and only once in the gene set pub fn validate_token(encode_map: &HashMap, token: &str) -> Result { @@ -18,12 +19,12 @@ pub fn validate_token(encode_map: &HashMap, token: &str) -> Result< /// Select the ranks for a provided embedding. Applies a filter which selects all ranks /// for the current gene index -pub fn select_ranks(current_idx: usize, encodings: &[usize], ranks: &Array1) -> Array1 { +pub fn select_values(current_idx: usize, encodings: &[usize], values: &Array1) -> Array1 { encodings .iter() - .zip(ranks.iter()) + .zip(values.iter()) .filter(|(idx, _ranks)| **idx == current_idx) - .map(|(_, ranks)| *ranks) + .map(|(_, value)| *value) .collect() } @@ -41,12 +42,16 @@ pub fn build_pseudo_names(n_pseudo: usize) -> Vec { } /// Performs an argsort on a 1D ndarray and returns an array of indices -pub fn argsort(array: &Array1) -> Vec +pub fn argsort(array: &Array1, ascending: bool) -> Vec where T: PartialOrd, { let mut indices: Vec = (0..array.len()).collect(); - indices.sort_by(|&a, &b| array[a].partial_cmp(&array[b]).unwrap()); + if ascending { + indices.sort_by(|&a, &b| array[a].partial_cmp(&array[b]).unwrap()); + } else { + indices.sort_by(|&a, &b| array[b].partial_cmp(&array[a]).unwrap()); + } indices } @@ -60,6 +65,37 @@ where indices } +/// Calculates the diagonal product of fold changes and pvalues +pub fn diagonal_product(log2_fold_changes: &Array1, pvalues: &Array1) -> Array1 { + log2_fold_changes * pvalues.mapv(|x| -x.log10()) +} + +/// recovers the indices of all unique values in a vector and returns a hashmap of the unique values and their indices +/// # Arguments +/// * `vec` - the vector to be searched and hashed +/// ``` +pub fn unique_indices(vec: &[T]) -> HashMap> { + let mut map = HashMap::new(); + for (i, x) in vec.iter().enumerate() { + map.entry(x.clone()).or_insert(Vec::new()).push(i); + } + map +} + +pub fn aggregate_fold_changes( + gene_names: &[String], + fold_changes: &Array1, +) -> HashMap { + let idx_map = unique_indices(gene_names); + idx_map + .iter() + .map(|(k, v)| { + let fc = fold_changes.select(Axis(0), v).mean().unwrap(); + (k.clone(), fc) + }) + .collect() +} + #[cfg(test)] mod testing { use super::{argsort, argsort_vec}; @@ -70,7 +106,7 @@ mod testing { #[test] fn test_argsort_forward() { let array = array![1.0, 2.0, 3.0, 4.0, 5.0]; - let sorted = argsort(&array); + let sorted = argsort(&array, true); assert_eq!(sorted, vec![0, 1, 2, 3, 4]); assert_eq!( array.select(Axis(0), &sorted), @@ -81,7 +117,7 @@ mod testing { #[test] fn test_argsort_reverse() { let array = array![5.0, 4.0, 3.0, 2.0, 1.0]; - let sorted = argsort(&array); + let sorted = argsort(&array, true); assert_eq!(sorted, vec![4, 3, 2, 1, 0]); assert_eq!( array.select(Axis(0), &sorted), @@ -92,7 +128,7 @@ mod testing { #[test] fn test_reordering() { let pvalues = Array1::random(100, Uniform::new(0.0, 1.0)); - let order = argsort(&pvalues); + let order = argsort(&pvalues, true); let reorder = argsort_vec(&order); let sorted_pvalues = pvalues.select(Axis(0), &order); @@ -103,10 +139,10 @@ mod testing { } #[test] - fn test_select_ranks() { + fn test_select_values() { let encodings = vec![0, 0, 1, 1, 2, 2]; let ranks = array![0.1, 0.2, 0.3, 0.4, 0.5, 0.6]; - let selected = super::select_ranks(1, &encodings, &ranks); + let selected = super::select_values(1, &encodings, &ranks); assert_eq!(selected, array![0.3, 0.4]); }