diff --git a/Cargo.toml b/Cargo.toml index 9fba553..aa869ad 100644 --- a/Cargo.toml +++ b/Cargo.toml @@ -1,6 +1,6 @@ [package] name = "intc" -version = "0.1.3" +version = "0.1.4" edition = "2021" description = "An implementation of the *-INC method to calculate an empirical FDR for non-targeting controls in CRISPR screens " license = "MIT" diff --git a/src/inc.rs b/src/inc.rs index afbba33..ec0a9c2 100644 --- a/src/inc.rs +++ b/src/inc.rs @@ -2,7 +2,7 @@ use crate::{ encode::EncodeIndex, rank_test::{pseudo_rank_test, rank_test}, result::IncResult, - utils::{build_pseudo_names, reconstruct_names, select_ranks, validate_token}, + utils::{build_pseudo_names, reconstruct_names, select_ranks, validate_token}, mwu::Alternative, }; use anyhow::Result; use ndarray::Array1; @@ -15,6 +15,7 @@ pub struct Inc<'a> { n_pseudo: usize, s_pseudo: usize, alpha: f64, + alternative: Alternative, continuity: bool, } @@ -26,6 +27,7 @@ impl<'a> Inc<'a> { n_pseudo: usize, s_pseudo: usize, alpha: f64, + alternative: Alternative, continuity: bool, ) -> Inc<'a> { Inc { @@ -35,6 +37,7 @@ impl<'a> Inc<'a> { n_pseudo, s_pseudo, alpha, + alternative, continuity, } } @@ -52,10 +55,11 @@ impl<'a> Inc<'a> { encoding.encoding(), self.pvalues, &ntc_values, + self.alternative, self.continuity, ); let (pseudo_scores, pseudo_pvalues) = - pseudo_rank_test(self.n_pseudo, self.s_pseudo, &ntc_values, self.continuity); + pseudo_rank_test(self.n_pseudo, self.s_pseudo, &ntc_values, self.alternative, self.continuity); // reconstruct the gene names let gene_names = reconstruct_names(encoding.map(), ntc_index); diff --git a/src/mwu.rs b/src/mwu.rs index ad37ebe..fbc60e4 100644 --- a/src/mwu.rs +++ b/src/mwu.rs @@ -5,7 +5,29 @@ use statrs::{ }; use std::ops::Div; +/// Defines the alternative hypothesis +#[derive(Clone, Copy, Default, Debug)] +pub enum Alternative { + + /// The alternative hypothesis is that the first array is greater than the second array + Greater, + + /// The alternative hypothesis is that the first array is less than the second array + Less, + + /// The alternative hypothesis is that the first array is not equal to the second array + #[default] + TwoSided, +} + /// Concatenate two arrays, rank them, and return the rankings for each of the groups. +/// +/// # Arguments +/// * `x` - The first array +/// * `y` - The second array +/// +/// # Returns +/// * `ranks` - A tuple containing the rankings for each of the groups fn merged_ranks(x: &Array1, y: &Array1) -> (Array1, Array1) { let midpoint = x.len(); let joined = concatenate(Axis(0), &[x.view(), y.view()]) @@ -19,18 +41,59 @@ fn merged_ranks(x: &Array1, y: &Array1) -> (Array1, Array1) } /// Calculates the U-Statistic given an array of ranks +/// +/// # Arguments +/// * `array` - The array of ranks +/// +/// # Returns +/// * `u` - The U-Statistic fn u_statistic(array: &Array1) -> f64 { let s = array.sum(); let n = array.len() as f64; s - ((n * (n + 1.)) / 2.) } +/// Calculates the Mann-Whitney U-Statistic +/// +/// # Arguments +/// * `x` - The first group of ranks +/// * `y` - The second group of ranks +/// * `alternative` - The alternative hypothesis +/// +/// # Returns +/// * `u` - The U-Statistic +fn alt_u_statistic(ranks_x: &Array1, ranks_y: &Array1, alternative: Alternative) -> f64 { + match alternative { + Alternative::Less => u_statistic(ranks_x), + Alternative::Greater => u_statistic(ranks_y), + Alternative::TwoSided => { + let u_x = u_statistic(ranks_x); + let u_y = u_statistic(ranks_y); + u_x.min(u_y) + } + } +} + /// Calculates the U-Distribution Mean +/// +/// # Arguments +/// * `nx` - The number of elements in the first group +/// * `ny` - The number of elements in the second group +/// +/// # Returns +/// * `m_u` - The mean of the U-Distribution fn u_mean(nx: f64, ny: f64) -> f64 { (nx * ny) / 2. } /// Calculates the U-Distribution Standard Deviation +/// +/// # Arguments +/// * `nx` - The number of elements in the first group +/// * `ny` - The number of elements in the second group +/// +/// # Returns +/// * `s_u` - The standard deviation of the U-Distribution fn u_std(nx: f64, ny: f64) -> f64 { (nx * ny * (nx + ny + 1.)).div(12.).sqrt() } @@ -41,6 +104,15 @@ fn u_std(nx: f64, ny: f64) -> f64 { // Because SF is always used to calculate the p-value, we can always // _subtract_ 0.5 for the continuity correction. This always increases the // p-value to account for the rest of the probability mass _at_ q = U. +// +// # Arguments +// * `u` - The U-Statistic +// * `nx` - The number of elements in the first group +// * `ny` - The number of elements in the second group +// * `use_continuity` - Whether to use continuity correction +// +// # Returns +// * `z_u` - The z-score of the U-Statistic fn z_score(u: f64, nx: f64, ny: f64, use_continuity: bool) -> f64 { let m_u = u_mean(nx, ny); let s_u = u_std(nx, ny); @@ -50,29 +122,62 @@ fn z_score(u: f64, nx: f64, ny: f64, use_continuity: bool) -> f64 { } else { (u - m_u) / s_u } +} +/// Calculates the pvalue of the z-score and adjusts for the alternative hypothesis +/// if necessary. +/// +/// # Arguments +/// * `z_u` - The z-score of the U-Statistic +/// * `alternative` - The alternative hypothesis to test against +/// +/// # Returns +/// * `pvalue` - The p-value of the test +fn p_value(z_u: f64, alternative: Alternative) -> f64 { + let pvalue = Normal::new(0., 1.).unwrap().cdf(z_u); + match alternative { + Alternative::TwoSided => 2. * pvalue, + _ => pvalue, + } } -/// Performs the Mann-Whitney U Test otherwise known as the Rank-Sum Test to measure the -/// statistical difference between two values through their ranks. -pub fn mann_whitney_u(x: &Array1, y: &Array1, use_continuity: bool) -> (f64, f64) { - let (ranks_x, _ranks_y) = merged_ranks(x, y); +/// Performs the Mann-Whitney U Test to measure the statistical difference between +/// two groups through their rank values. +/// +/// # Arguments +/// * `x` - Array of values from the first group +/// * `y` - Array of values from the second group +/// * `alternative` - The alternative hypothesis to test against +/// * `use_continuity` - Whether to use continuity correction +pub fn mann_whitney_u( + x: &Array1, + y: &Array1, + alternative: Alternative, + use_continuity: bool, + ) -> (f64, f64) { + let (ranks_x, ranks_y) = merged_ranks(x, y); let nx = x.len() as f64; let ny = y.len() as f64; - let u_t = u_statistic(&ranks_x); + let u_t = alt_u_statistic(&ranks_x, &ranks_y, alternative); let z_u = z_score(u_t, nx, ny, use_continuity); + let p_v = p_value(z_u, alternative); - (u_t, Normal::new(0., 1.).unwrap().cdf(z_u)) + (u_t, p_v) } #[cfg(test)] mod testing { use crate::mwu::mann_whitney_u; - use super::merged_ranks; - use ndarray::{array, Array1}; + use ndarray::{array, Array}; + + const EPSILON: f64 = 1e-10; + + fn test_close(a: f64, b: f64) { + assert!((a - b).abs() < EPSILON); + } #[test] fn test_merged_ranks() { @@ -95,7 +200,9 @@ mod testing { #[test] fn test_u_statistic() { let x = array![1., 2., 4.]; + let y = array![3., 5., 6.]; assert_eq!(super::u_statistic(&x), 1.); + assert_eq!(super::u_statistic(&y), 8.); } #[test] @@ -119,10 +226,77 @@ mod testing { } #[test] - fn test_mann_whitney_u() { - let x = Array1::range(1., 100., 1.); - let y = Array1::range(100., 200., 1.); - let (_, pv) = mann_whitney_u(&x, &y, false); - assert!(pv - 1.87e-34 < 1e-30); + fn test_alt_u_statistic_less() { + let x = array![1., 2., 4.]; + let y = array![3., 5., 6.]; + assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::Less), 1.); + } + + #[test] + fn test_alt_u_statistic_greater() { + let x = array![1., 2., 4.]; + let y = array![3., 5., 6.]; + assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::Greater), 8.); + } + + #[test] + fn test_alt_u_statistic_two_sided() { + let x = array![1., 2., 4.]; + let y = array![3., 5., 6.]; + assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::TwoSided), 1.); + } + + #[test] + fn test_mann_whitney_u_twosided_continuity() { + let x = Array::range(0.0, 10.0, 1.0); + let y = Array::range(10.0, 20.0, 1.0); + let (u, p) = mann_whitney_u(&x, &y, super::Alternative::TwoSided, true); + assert_eq!(u, 0.); + test_close(p, 0.0001826717911243235); + } + + #[test] + fn test_mann_whitney_u_twosided_no_continuity() { + let x = Array::range(0.0, 10.0, 1.0); + let y = Array::range(10.0, 20.0, 1.0); + let (u, p) = mann_whitney_u(&x, &y, super::Alternative::TwoSided, false); + assert_eq!(u, 0.); + test_close(p, 0.00015705228423075119); + } + + #[test] + fn test_mann_whitney_u_less_continuity() { + let x = Array::range(0.0, 10.0, 1.0); + let y = Array::range(10.0, 20.0, 1.0); + let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Less, true); + assert_eq!(u, 0.); + test_close(p, 9.133589556216175e-5); + } + + #[test] + fn test_mann_whitney_u_less_no_continuity() { + let x = Array::range(0.0, 10.0, 1.0); + let y = Array::range(10.0, 20.0, 1.0); + let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Less, false); + assert_eq!(u, 0.); + test_close(p, 7.852614211537559e-05); + } + + #[test] + fn test_mann_whitney_u_greater_continuity() { + let x = Array::range(0.0, 10.0, 1.0); + let y = Array::range(10.0, 20.0, 1.0); + let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Greater, true); + assert_eq!(u, 100.); + test_close(p, 0.9999325785388173); + } + + #[test] + fn test_mann_whitney_u_greater_no_continuity() { + let x = Array::range(0.0, 10.0, 1.0); + let y = Array::range(10.0, 20.0, 1.0); + let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Greater, false); + assert_eq!(u, 100.); + test_close(p, 0.9999214738578847); } } diff --git a/src/rank_test.rs b/src/rank_test.rs index c6c352e..3e16c8d 100644 --- a/src/rank_test.rs +++ b/src/rank_test.rs @@ -1,4 +1,4 @@ -use crate::{mwu::mann_whitney_u, utils::select_ranks}; +use crate::{mwu::{mann_whitney_u, Alternative}, utils::select_ranks}; use ndarray::{Array1, Axis}; use ndarray_rand::{rand_distr::Uniform, RandomExt}; @@ -10,12 +10,13 @@ pub fn rank_test( encoding: &[usize], pvalues: &Array1, ntc_values: &Array1, + alternative: Alternative, continuity: bool, ) -> (Vec, Vec) { (0..=n_genes) .filter(|x| *x != ntc_index) .map(|x| select_ranks(x, encoding, pvalues)) - .map(|x| mann_whitney_u(&x, ntc_values, continuity)) + .map(|x| mann_whitney_u(&x, ntc_values, alternative, continuity)) .unzip() } @@ -25,6 +26,7 @@ pub fn pseudo_rank_test( n_pseudo: usize, s_pseudo: usize, ntc_values: &Array1, + alternative: Alternative, continuity: bool, ) -> (Vec, Vec) { (0..n_pseudo) @@ -38,7 +40,7 @@ pub fn pseudo_rank_test( let out_group = ntc_values.select(Axis(0), &out_mask); (in_group, out_group) }) - .map(|(in_group, out_group)| mann_whitney_u(&in_group, &out_group, continuity)) + .map(|(in_group, out_group)| mann_whitney_u(&in_group, &out_group, alternative, continuity)) .unzip() } @@ -46,6 +48,7 @@ pub fn pseudo_rank_test( mod testing { use super::rank_test; use ndarray::array; + use crate::mwu::Alternative; #[test] fn test_rank_test() { @@ -54,9 +57,10 @@ mod testing { let encoding = vec![0, 1, 2, 0, 1, 2, 0, 1, 2, 0]; let pvalues = array![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.0]; let ntc_values = array![0.7, 0.8, 0.9, 0.7, 0.8]; - let (u, p) = rank_test(n_genes, ntc_index, &encoding, &pvalues, &ntc_values, false); + let alternative = Alternative::TwoSided; + let (u, p) = rank_test(n_genes, ntc_index, &encoding, &pvalues, &ntc_values, alternative, false); assert_eq!(u, vec![3.0, 4.5]); - assert_eq!(p, vec![0.08985624744357722, 0.18554668477815348]); + assert_eq!(p, vec![0.17971249488715443, 0.37109336955630695]); } #[test] @@ -64,7 +68,8 @@ mod testing { let n_pseudo = 2; let s_pseudo = 3; let ntc_values = array![0.7, 0.7, 0.7, 0.7, 0.7, 0.7]; - let (_u, p) = super::pseudo_rank_test(n_pseudo, s_pseudo, &ntc_values, false); - assert_eq!(p, vec![0.5, 0.5]); + let alternative = Alternative::TwoSided; + let (_u, p) = super::pseudo_rank_test(n_pseudo, s_pseudo, &ntc_values, alternative, false); + assert_eq!(p, vec![1.0, 1.0]); } }