working on unit tests

maxibor · Jun 20, 2019 · 078e549 · 078e549
1 parent 18e45c3
commit 078e549
Show file tree

Hide file tree

Showing 8 changed files with 190 additions and 6 deletions.
diff --git a/paper/paper.md b/paper/paper.md
@@ -1,5 +1,5 @@
 ---
-title: 'Sourcepredict: Prediction of metagenomic sample sources using machine learning algorithms'
+title: 'Sourcepredict: Prediction of metagenomic sample sources using dimension reduction followed by machine learning classification'
 tags:
   - microbiome
   - source tracking

diff --git a/sourcepredictlib/ml.py b/sourcepredictlib/ml.py
@@ -16,12 +16,18 @@
 from io import StringIO
 import umap
 import warnings
+import os
 import sys
-
 from collections import Counter
-from . import normalize
 
-from . import utils
+
+parentScriptDir = "/".join(os.path.dirname(
+    os.path.realpath(__file__)).split("/")[:-1])
+sys.path.append(parentScriptDir+"/sourcepredictlib")
+
+import normalize
+
+import utils
 
 
 class sourceunknown():
@@ -54,7 +60,7 @@ def __repr__(self):
         return(f'A sourceforest object of source {self.ref} and sink {self.tmp_sink}')
 
     def add_unknown(self, alpha, seed):
-        """Add unkown
+        """Add unknown samples
 
         Create unknown Samples from test sample
         N Random samples are created with N being average of class counts

diff --git a/sourcepredictlib/normalize.py b/sourcepredictlib/normalize.py
@@ -14,6 +14,8 @@ def RLE_normalize(pd_dataframe):
             colums as Samples, Rows as OTUs
     Returns:
         pandas DataFrame: RLE Normalized datafrane. Colums as Samples, Rows as OTUs
+    Example:
+        >>> RLE_normalize(pd.DataFrame)
     """
 
     step1 = pd_dataframe.apply(np.log, 0)
@@ -83,7 +85,7 @@ def gmpr_size_factor(col, ar):
     """Generate GMPR size factor
 
     Args:
-        col (list): individual columms of the numpy array
+        col (int): column index of the numpy array
         ar (numpy array): numpy array of OTU counts,
             colums as Samples, Rows as OTUs
     Returns:

diff --git a/sourcepredictlib/utils.py b/sourcepredictlib/utils.py
@@ -88,6 +88,9 @@ def check_norm(method):
         method(str): Normalization method
     Returns:
         str: capitalized normalization method name
+    Example:
+        >>> check_norm('rle')
+        'RLE'
     """
 
     methods = ['RLE', 'SUBSAMPLE', 'GMPR']
@@ -106,6 +109,9 @@ def check_embed(method):
         method(str): Embedding method
     Returns:
         str: capitalized embedding method name
+    Example:
+        >>> check_embed('tsne')
+        'TSNE'
     """
 
     methods = ['TSNE', 'UMAP', 'MDS']
@@ -124,6 +130,9 @@ def check_distance(method):
         method(str): distance method
     Returns:
         str: capitalized distance method name
+    Example:
+        >>> check_distance('Weighted_unifrac')
+        'weighted_unifrac'
     """
 
     methods = ['weighted_unifrac', 'unweighted_unifrac']
@@ -146,6 +155,9 @@ def check_gen_seed(seed, amin=1, amax=10000):
             Defaults to 10000.
     Returns:
         int: random seed sampled between 1 and 10000
+    Example:
+        >>> check_gen_seed(42)
+        42
     """
 
     if seed is None:
@@ -161,6 +173,9 @@ def plural(count):
         count(int): number of occurences
     Returns:
         str: '' or 's'
+    Example:
+        >>> plural(3)
+        's'
     """
 
     if count == 1:
@@ -178,6 +193,9 @@ def _get_basename(file_name):
         file_name(str): path to file
     Returns:
         str: file basename
+    Example:
+        >>> _get_basename('/path/to/myfile.txt')
+        'myfile'
     """
 
     if ("/") in file_name:

diff --git a/tests/test.py b/tests/test.py
@@ -0,0 +1,9 @@
+import sys
+import os
+parentScriptDir = "/".join(os.path.dirname(
+    os.path.realpath(__file__)).split("/")[:-1])
+sys.path.append(parentScriptDir+"/sourcepredictlib")
+
+print(sys.path)
+
+import ml
diff --git a/tests/test_ml.py b/tests/test_ml.py
@@ -0,0 +1,71 @@
+import sys
+import os
+import pandas as pd
+import random
+
+parentScriptDir = "/".join(os.path.dirname(
+    os.path.realpath(__file__)).split("/")[:-1])
+sys.path.append(parentScriptDir+"/sourcepredictlib")
+random.seed(42)
+
+import ml
+import utils
+
+
+def test_sourceunknown_init():
+    """Check attribute shapes of ml.sourceunknown built from ../data CSVs."""
+    # NOTE(review): this binds a local only, not the PYTHONHASHSEED env var;
+    # the hash() assertion below presumably needs it set at launch -- confirm.
+    PYTHONHASHSEED = 0
+    labels = os.path.dirname(os.path.abspath(
+        __file__)) + '/../data/modern_gut_microbiomes_labels.csv'
+    sources = os.path.dirname(os.path.abspath(
+        __file__))+'/../data/modern_gut_microbiomes_sources.csv'
+    sink_file = os.path.dirname(os.path.abspath(
+        __file__))+'/../data/test/dog_test_sample.csv'
+    sink = utils.split_sinks(sink_file)[0]
+    su = ml.sourceunknown(source=sources, sink=sink, labels=labels)
+
+    assert su.ref.shape == (5664, 432)
+    assert su.y.shape == (432,)
+    assert su.y_unk.shape == (432,)
+    assert su.tmp_sink.shape == (570, 1)
+    assert su.combined.shape == (5664, 433)
+    assert hash(str(su.combined)) == -1867655657877779130
+
+
+# def test_sourceunknown_add_unkown():
+#     PYTHONHASHSEED = 0
+#     labels = os.path.dirname(os.path.abspath(
+#         __file__)) + '/../data/modern_gut_microbiomes_labels.csv'
+#     sources = os.path.dirname(os.path.abspath(
+#         __file__))+'/../data/modern_gut_microbiomes_sources.csv'
+#     sink_file = os.path.dirname(os.path.abspath(
+#         __file__))+'/../data/test/dog_test_sample.csv'
+#     sink = utils.split_sinks(sink_file)[0]
+
+#     su = ml.sourceunknown(source=sources, sink=sink, labels=labels)
+#     su.add_unknown(alpha=0.1, seed=42)
+
+#     assert su.ref_u.shape == (570, 144)
+#     assert su.ref_u.dtypes ==
+#     assert hash(str(su.ref_u.columns)) == 5867343156924504419
+#     assert hash(str(su.ref_u.index)) == 4313932402357376923
+#     assert hash(str(su.ref_u_labs)) == -5906979299339891562
+
+
+# def test_sourceunknown_normalized():
+
+#     labels = os.path.dirname(os.path.abspath(
+#         __file__)) + '/data/modern_gut_microbiomes_labels.csv'
+#     sources = os.path.dirname(os.path.abspath(
+#         __file__))+'/data/modern_gut_microbiomes_sources.csv'
+#     sink_file = os.path.dirname(os.path.abspath(
+#         __file__))+'data/test/dog_test_sample.csv'
+#     sink = utils.split_sinks(sink_file)[0]
+
+#     su = ml.sourceunknown(source=sources, sink=sink, labels=labels)
+#     su.add_unknown(alpha=0.1, seed=42)
+#     su_rle = su.normalize(method='rle', threads=1)
+#     su_subsample = su.normalize(method='subsample', threads=1)
+#     su_gmpr = su.normalize(method='gmpr', threads=1)
diff --git a/tests/test_normalize.py b/tests/test_normalize.py
@@ -0,0 +1,62 @@
+import sys
+import os
+import numpy as np
+import pandas as pd
+
+parentScriptDir = "/".join(os.path.dirname(
+    os.path.realpath(__file__)).split("/")[:-1])
+sys.path.append(parentScriptDir+"/sourcepredictlib")
+
+import normalize
+
+
+def test_RLE():
+    """Check RLE_normalize output element-wise on a 3x3 count table."""
+    input_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    output_df = pd.DataFrame(
+        [[1.0, 2.0, 2.0], [5.0, 5.0, 5.0], [9.0, 8.0, 7.0]])
+
+    result = normalize.RLE_normalize(input_df)
+
+    # `.all().all()` reduces a frame to a single bool, so the old assertion
+    # only compared truthiness; compare the normalized values themselves.
+    assert np.allclose(np.asarray(result, dtype=float), output_df.values)
+
+
+def test_subsample():
+    """Test subsample normalization on a 3x3 count table."""
+    input_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    output_df = pd.DataFrame(
+        [[0.0, 0.0, 0.0], [4.0, 4.0, 4.0], [9.0, 9.0, 9.0]])
+
+    # NOTE(review): `.all().all()` reduces each frame to a single bool, so
+    # this compares truthiness only (output_df gives False: it has zeros),
+    # not the normalized values -- TODO compare element-wise once confirmed.
+    assert normalize.subsample_normalize_pd(
+        input_df).all().all() == output_df.all().all()
+
+
+def test_gmpr_size_factor():
+    """Check the GMPR size factor of column 1 of a 3x3 count array."""
+    input_ar = np.array([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    output = 1.0137003325955667
+
+    # Tolerance-based comparison: exact float equality can break across
+    # platforms or numpy versions.
+    assert np.isclose(normalize.gmpr_size_factor(col=1, ar=input_ar), output)
+
+
+def test_GMPR():
+    """Check GMPR_normalize output element-wise for second argument 1 and 2."""
+    input_df = pd.DataFrame([[1, 2, 3], [4, 5, 6], [7, 8, 9]])
+    output_df = pd.DataFrame([[1.2331060371652351, 1.9729696594643762, 2.4662120743304703],
+                              [4.932424148660941, 4.932424148660941,
+                               4.932424148660941],
+                              [8.631742260156646, 7.891878637857505, 7.398636222991411]])
+    expected = output_df.values
+
+    # `.all().all()` collapses a frame to a single bool, so the previous
+    # assertions passed for any all-nonzero result; compare values instead.
+    for second_arg in (1, 2):  # presumably a worker count -- TODO confirm
+        result = normalize.GMPR_normalize(input_df, second_arg)
+        assert np.allclose(np.asarray(result, dtype=float), expected)
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -0,0 +1,16 @@
+import sys
+import os
+
+parentScriptDir = "/".join(os.path.dirname(
+    os.path.realpath(__file__)).split("/")[:-1])
+sys.path.append(parentScriptDir+"/sourcepredictlib")
+
+import utils
+
+
+def test_checks():
+    """Check the utils argument validators on valid inputs."""
+    assert utils.check_norm('rle') == 'RLE'
+    assert utils.check_embed('tsne') == 'TSNE'
+    assert utils.check_distance('Weighted_unifrac') == 'weighted_unifrac'
+    assert utils.check_gen_seed(42) == 42
+    # isinstance() is the idiomatic type check (rather than `type(x) is int`).
+    assert isinstance(utils.check_gen_seed(seed=None), int)