Merge pull request #9 from noamteyssier/7-specify-alternative-in-mann…

…-whitney-u-test 7 specify alternative in mann whitney u test
noamteyssier · Feb 2, 2023 · 47b7268 · 47b7268
2 parents 132621b + 6844880
commit 47b7268
Show file tree

Hide file tree

Showing 4 changed files with 206 additions and 23 deletions.
diff --git a/Cargo.toml b/Cargo.toml
@@ -1,6 +1,6 @@
 [package]
 name = "intc"
-version = "0.1.3"
+version = "0.1.4"
 edition = "2021"
 description = "An implementation of the *-INC method to calculate an empirical FDR for non-targeting controls in CRISPR screens "
 license = "MIT"

diff --git a/src/inc.rs b/src/inc.rs
@@ -2,7 +2,7 @@ use crate::{
     encode::EncodeIndex,
     rank_test::{pseudo_rank_test, rank_test},
     result::IncResult,
-    utils::{build_pseudo_names, reconstruct_names, select_ranks, validate_token},
+    utils::{build_pseudo_names, reconstruct_names, select_ranks, validate_token}, mwu::Alternative,
 };
 use anyhow::Result;
 use ndarray::Array1;
@@ -15,6 +15,7 @@ pub struct Inc<'a> {
     n_pseudo: usize,
     s_pseudo: usize,
     alpha: f64,
+    alternative: Alternative,
     continuity: bool,
 }
 
@@ -26,6 +27,7 @@ impl<'a> Inc<'a> {
         n_pseudo: usize,
         s_pseudo: usize,
         alpha: f64,
+        alternative: Alternative,
         continuity: bool,
     ) -> Inc<'a> {
         Inc {
@@ -35,6 +37,7 @@ impl<'a> Inc<'a> {
             n_pseudo,
             s_pseudo,
             alpha,
+            alternative,
             continuity,
         }
     }
@@ -52,10 +55,11 @@ impl<'a> Inc<'a> {
             encoding.encoding(),
             self.pvalues,
             &ntc_values,
+            self.alternative,
             self.continuity,
         );
         let (pseudo_scores, pseudo_pvalues) =
-            pseudo_rank_test(self.n_pseudo, self.s_pseudo, &ntc_values, self.continuity);
+            pseudo_rank_test(self.n_pseudo, self.s_pseudo, &ntc_values, self.alternative, self.continuity);
 
         // reconstruct the gene names
         let gene_names = reconstruct_names(encoding.map(), ntc_index);

diff --git a/src/mwu.rs b/src/mwu.rs
@@ -5,7 +5,29 @@ use statrs::{
 };
 use std::ops::Div;
 
+/// Defines the alternative hypothesis 
+#[derive(Clone, Copy, Default, Debug)]
+pub enum Alternative {
+
+    /// The alternative hypothesis is that the first array is greater than the second array
+    Greater,
+
+    /// The alternative hypothesis is that the first array is less than the second array
+    Less,
+
+    /// The alternative hypothesis is that the first array is not equal to the second array
+    #[default]
+    TwoSided,
+}
+
 /// Concatenate two arrays, rank them, and return the rankings for each of the groups.
+///
+/// # Arguments
+/// * `x` - The first array
+/// * `y` - The second array
+///
+/// # Returns
+/// * `ranks` - A tuple containing the rankings for each of the groups
 fn merged_ranks(x: &Array1<f64>, y: &Array1<f64>) -> (Array1<f64>, Array1<f64>) {
     let midpoint = x.len();
     let joined = concatenate(Axis(0), &[x.view(), y.view()])
@@ -19,18 +41,59 @@ fn merged_ranks(x: &Array1<f64>, y: &Array1<f64>) -> (Array1<f64>, Array1<f64>)
 }
 
 /// Calculates the U-Statistic given an array of ranks
+///
+/// # Arguments
+/// * `array` - The array of ranks
+///
+/// # Returns
+/// * `u` - The U-Statistic
 fn u_statistic(array: &Array1<f64>) -> f64 {
     let s = array.sum();
     let n = array.len() as f64;
     s - ((n * (n + 1.)) / 2.)
 }
 
+/// Calculates the Mann-Whitney U-Statistic
+///
+/// # Arguments
+/// * `x` - The first group of ranks
+/// * `y` - The second group of ranks
+/// * `alternative` - The alternative hypothesis
+///
+/// # Returns
+/// * `u` - The U-Statistic
+fn alt_u_statistic(ranks_x: &Array1<f64>, ranks_y: &Array1<f64>, alternative: Alternative) -> f64 {
+    match alternative {
+        Alternative::Less => u_statistic(ranks_x),
+        Alternative::Greater => u_statistic(ranks_y),
+        Alternative::TwoSided => {
+            let u_x = u_statistic(ranks_x);
+            let u_y = u_statistic(ranks_y);
+            u_x.min(u_y)
+        }
+    }
+}
+
 /// Calculates the U-Distribution Mean
+///
+/// # Arguments
+/// * `nx` - The number of elements in the first group
+/// * `ny` - The number of elements in the second group
+///
+/// # Returns
+/// * `m_u` - The mean of the U-Distribution
 fn u_mean(nx: f64, ny: f64) -> f64 {
     (nx * ny) / 2.
 }
 
 /// Calculates the U-Distribution Standard Deviation
+///
+/// # Arguments
+/// * `nx` - The number of elements in the first group
+/// * `ny` - The number of elements in the second group
+///
+/// # Returns
+/// * `s_u` - The standard deviation of the U-Distribution
 fn u_std(nx: f64, ny: f64) -> f64 {
     (nx * ny * (nx + ny + 1.)).div(12.).sqrt()
 }
@@ -41,6 +104,15 @@ fn u_std(nx: f64, ny: f64) -> f64 {
 // Because SF is always used to calculate the p-value, we can always
 // _subtract_ 0.5 for the continuity correction. This always increases the
 // p-value to account for the rest of the probability mass _at_ q = U.
+//
+// # Arguments
+// * `u` - The U-Statistic
+// * `nx` - The number of elements in the first group
+// * `ny` - The number of elements in the second group
+// * `use_continuity` - Whether to use continuity correction
+//
+// # Returns
+// * `z_u` - The z-score of the U-Statistic
 fn z_score(u: f64, nx: f64, ny: f64, use_continuity: bool) -> f64 {
     let m_u = u_mean(nx, ny);
     let s_u = u_std(nx, ny);
@@ -50,29 +122,62 @@ fn z_score(u: f64, nx: f64, ny: f64, use_continuity: bool) -> f64 {
     } else {
         (u - m_u) / s_u
     }
+}
 
+/// Calculates the pvalue of the z-score and adjusts for the alternative hypothesis
+/// if necessary.
+///
+/// # Arguments
+/// * `z_u` - The z-score of the U-Statistic
+/// * `alternative` - The alternative hypothesis to test against
+///
+/// # Returns
+/// * `pvalue` - The p-value of the test
+fn p_value(z_u: f64, alternative: Alternative) -> f64 {
+    let pvalue = Normal::new(0., 1.).unwrap().cdf(z_u);
+    match alternative {
+        Alternative::TwoSided => 2. * pvalue,
+        _ => pvalue,
+    }
 }
 
-/// Performs the Mann-Whitney U Test otherwise known as the Rank-Sum Test to measure the
-/// statistical difference between two values through their ranks.
-pub fn mann_whitney_u(x: &Array1<f64>, y: &Array1<f64>, use_continuity: bool) -> (f64, f64) {
-    let (ranks_x, _ranks_y) = merged_ranks(x, y);
+/// Performs the Mann-Whitney U Test to measure the statistical difference between
+/// two groups through their rank values.
+///
+/// # Arguments
+/// * `x` - Array of values from the first group
+/// * `y` - Array of values from the second group
+/// * `alternative` - The alternative hypothesis to test against
+/// * `use_continuity` - Whether to use continuity correction
+pub fn mann_whitney_u(
+        x: &Array1<f64>, 
+        y: &Array1<f64>, 
+        alternative: Alternative,
+        use_continuity: bool,
+    ) -> (f64, f64) {
+    let (ranks_x, ranks_y) = merged_ranks(x, y);
 
     let nx = x.len() as f64;
     let ny = y.len() as f64;
 
-    let u_t = u_statistic(&ranks_x);
+    let u_t = alt_u_statistic(&ranks_x, &ranks_y, alternative);
     let z_u = z_score(u_t, nx, ny, use_continuity);
+    let p_v = p_value(z_u, alternative);
 
-    (u_t, Normal::new(0., 1.).unwrap().cdf(z_u))
+    (u_t, p_v)
 }
 
 #[cfg(test)]
 mod testing {
     use crate::mwu::mann_whitney_u;
-
     use super::merged_ranks;
-    use ndarray::{array, Array1};
+    use ndarray::{array, Array};
+
+    const EPSILON: f64 = 1e-10;
+
+    fn test_close(a: f64, b: f64) {
+        assert!((a - b).abs() < EPSILON);
+    }
 
     #[test]
     fn test_merged_ranks() {
@@ -95,7 +200,9 @@ mod testing {
     #[test]
     fn test_u_statistic() {
         let x = array![1., 2., 4.];
+        let y = array![3., 5., 6.];
         assert_eq!(super::u_statistic(&x), 1.);
+        assert_eq!(super::u_statistic(&y), 8.);
     }
 
     #[test]
@@ -119,10 +226,77 @@ mod testing {
     }
 
     #[test]
-    fn test_mann_whitney_u() {
-        let x = Array1::range(1., 100., 1.);
-        let y = Array1::range(100., 200., 1.);
-        let (_, pv) = mann_whitney_u(&x, &y, false);
-        assert!(pv - 1.87e-34 < 1e-30);
+    fn test_alt_u_statistic_less() {
+        let x = array![1., 2., 4.];
+        let y = array![3., 5., 6.];
+        assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::Less), 1.);
+    }
+
+    #[test]
+    fn test_alt_u_statistic_greater() {
+        let x = array![1., 2., 4.];
+        let y = array![3., 5., 6.];
+        assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::Greater), 8.);
+    }
+
+    #[test]
+    fn test_alt_u_statistic_two_sided() {
+        let x = array![1., 2., 4.];
+        let y = array![3., 5., 6.];
+        assert_eq!(super::alt_u_statistic(&x, &y, super::Alternative::TwoSided), 1.);
+    }
+
+    #[test]
+    fn test_mann_whitney_u_twosided_continuity() {
+        let x = Array::range(0.0, 10.0, 1.0);
+        let y = Array::range(10.0, 20.0, 1.0);
+        let (u, p) = mann_whitney_u(&x, &y, super::Alternative::TwoSided, true);
+        assert_eq!(u, 0.);
+        test_close(p, 0.0001826717911243235);
+    }
+
+    #[test]
+    fn test_mann_whitney_u_twosided_no_continuity() {
+        let x = Array::range(0.0, 10.0, 1.0);
+        let y = Array::range(10.0, 20.0, 1.0);
+        let (u, p) = mann_whitney_u(&x, &y, super::Alternative::TwoSided, false);
+        assert_eq!(u, 0.);
+        test_close(p, 0.00015705228423075119);
+    }
+
+    #[test]
+    fn test_mann_whitney_u_less_continuity() {
+        let x = Array::range(0.0, 10.0, 1.0);
+        let y = Array::range(10.0, 20.0, 1.0);
+        let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Less, true);
+        assert_eq!(u, 0.);
+        test_close(p, 9.133589556216175e-5);
+    }
+
+    #[test]
+    fn test_mann_whitney_u_less_no_continuity() {
+        let x = Array::range(0.0, 10.0, 1.0);
+        let y = Array::range(10.0, 20.0, 1.0);
+        let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Less, false);
+        assert_eq!(u, 0.);
+        test_close(p, 7.852614211537559e-05);
+    }
+
+    #[test]
+    fn test_mann_whitney_u_greater_continuity() {
+        let x = Array::range(0.0, 10.0, 1.0);
+        let y = Array::range(10.0, 20.0, 1.0);
+        let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Greater, true);
+        assert_eq!(u, 100.);
+        test_close(p, 0.9999325785388173);
+    }
+
+    #[test]
+    fn test_mann_whitney_u_greater_no_continuity() {
+        let x = Array::range(0.0, 10.0, 1.0);
+        let y = Array::range(10.0, 20.0, 1.0);
+        let (u, p) = mann_whitney_u(&x, &y, super::Alternative::Greater, false);
+        assert_eq!(u, 100.);
+        test_close(p, 0.9999214738578847);
     }
 }
diff --git a/src/rank_test.rs b/src/rank_test.rs
@@ -1,4 +1,4 @@
-use crate::{mwu::mann_whitney_u, utils::select_ranks};
+use crate::{mwu::{mann_whitney_u, Alternative}, utils::select_ranks};
 use ndarray::{Array1, Axis};
 use ndarray_rand::{rand_distr::Uniform, RandomExt};
 
@@ -10,12 +10,13 @@ pub fn rank_test(
     encoding: &[usize],
     pvalues: &Array1<f64>,
     ntc_values: &Array1<f64>,
+    alternative: Alternative,
     continuity: bool,
 ) -> (Vec<f64>, Vec<f64>) {
     (0..=n_genes)
         .filter(|x| *x != ntc_index)
         .map(|x| select_ranks(x, encoding, pvalues))
-        .map(|x| mann_whitney_u(&x, ntc_values, continuity))
+        .map(|x| mann_whitney_u(&x, ntc_values, alternative, continuity))
         .unzip()
 }
 
@@ -25,6 +26,7 @@ pub fn pseudo_rank_test(
     n_pseudo: usize,
     s_pseudo: usize,
     ntc_values: &Array1<f64>,
+    alternative: Alternative,
     continuity: bool,
 ) -> (Vec<f64>, Vec<f64>) {
     (0..n_pseudo)
@@ -38,14 +40,15 @@ pub fn pseudo_rank_test(
             let out_group = ntc_values.select(Axis(0), &out_mask);
             (in_group, out_group)
         })
-        .map(|(in_group, out_group)| mann_whitney_u(&in_group, &out_group, continuity))
+        .map(|(in_group, out_group)| mann_whitney_u(&in_group, &out_group, alternative, continuity))
         .unzip()
 }
 
 #[cfg(test)]
 mod testing {
     use super::rank_test;
     use ndarray::array;
+    use crate::mwu::Alternative;
 
     #[test]
     fn test_rank_test() {
@@ -54,17 +57,19 @@ mod testing {
         let encoding = vec![0, 1, 2, 0, 1, 2, 0, 1, 2, 0];
         let pvalues = array![0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.0];
         let ntc_values = array![0.7, 0.8, 0.9, 0.7, 0.8];
-        let (u, p) = rank_test(n_genes, ntc_index, &encoding, &pvalues, &ntc_values, false);
+        let alternative = Alternative::TwoSided;
+        let (u, p) = rank_test(n_genes, ntc_index, &encoding, &pvalues, &ntc_values, alternative, false);
         assert_eq!(u, vec![3.0, 4.5]);
-        assert_eq!(p, vec![0.08985624744357722, 0.18554668477815348]);
+        assert_eq!(p, vec![0.17971249488715443, 0.37109336955630695]);
     }
 
     #[test]
     fn test_pseudo_rank_test() {
         let n_pseudo = 2;
         let s_pseudo = 3;
         let ntc_values = array![0.7, 0.7, 0.7, 0.7, 0.7, 0.7];
-        let (_u, p) = super::pseudo_rank_test(n_pseudo, s_pseudo, &ntc_values, false);
-        assert_eq!(p, vec![0.5, 0.5]);
+        let alternative = Alternative::TwoSided;
+        let (_u, p) = super::pseudo_rank_test(n_pseudo, s_pseudo, &ntc_values, alternative, false);
+        assert_eq!(p, vec![1.0, 1.0]);
     }
 }