-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Overhaul and improvements of viz, collections, and taxonomy
* Add missing comma from setup.py. Closes #173. * Enable CLI auth via env vars. Closes #154. * Change SystemExit Sentry reporting. Closes #153. * Enable autocomplete politely if possible. Closes #155. * Added PCoA (metric multidimensional scaling) plots. Closes #74. * Cluster samples/abundances in heatmaps. Closes #149. * Allow user to specify linkage for clustering. Closes #150. * Choose altair renderer more intelligently. Closes #145. * Return ResourceList instead of list. Closes #158 and #160. * Return SampleCollection from where() method. Closes #159. * Tags can be looked up more easily. Closes #130. * Fix non-deterministic test failure due to responses * Updated dependency versions to match * Updated CircleCI and tox * Additional docstrings * Bump for version v0.4.0 * Lint
- Loading branch information
1 parent
550158d
commit c922835
Showing
29 changed files
with
2,401 additions
and
1,908 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,120 +1,119 @@ | ||
import pandas as pd | ||
import skbio.diversity | ||
from skbio.tree import TreeNode | ||
|
||
from onecodex.exceptions import OneCodexException | ||
from onecodex.helpers import normalize_classifications, collate_classification_results | ||
from onecodex.taxonomy import generate_skbio_tree, prune_to_rank | ||
|
||
# Public API of this module.
__all__ = [
    'alpha_diversity', 'simpson', 'chao1',
    'beta_diversity', 'unifrac', 'jaccard', 'braycurtis', 'cityblock',
]

# Classification result fields the diversity helpers will accept.
ACCEPTABLE_FIELDS = ['abundance', 'readcount_w_children', 'readcount']
|
||
|
||
def alpha_counts(classification, field='readcount_w_children', rank='species'):
    """Extract per-taxon counts for a single classification at one rank.

    Returns a tuple ``(counts, ids)`` where ``ids`` holds a single label for
    the sample: its metadata name when set, otherwise its filename.
    """
    table = classification.results()['table']
    counts = [row[field] for row in table if row['rank'] == rank]

    metadata_name = classification.sample.metadata.name
    label = metadata_name if metadata_name else classification.sample.filename
    return (counts, [label])
|
||
|
||
def beta_counts(classifications, field='readcount_w_children', rank='species'):
    """Collate several classifications into integer count vectors at one rank.

    Returns ``(vectors, tax_ids, ids)``: one integer count vector per
    classification, the tax IDs labelling the vector columns, and the index
    identifying each classification.
    """
    normed, _ = normalize_classifications(classifications)
    df, tax_info = collate_classification_results(normed, field=field, rank=rank)

    # skbio wants plain integer counts, not numpy scalars/floats
    int_vectors = [[int(value) for value in row] for row in df.values.tolist()]
    return (int_vectors, df.columns.values.tolist(), df.index.values)
|
||
|
||
def alpha_diversity(classification, metric, ids=None,
                    field='readcount_w_children', rank='species', **kwargs):
    """Calculate the diversity within a community.

    Parameters
    ----------
    classification : a classification resource exposing ``results()``.
    metric : str
        Any alpha diversity metric name accepted by skbio.
    ids : optional
        Unused; the sample label is derived from the classification itself.
        Retained for interface compatibility.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    The scalar value of the metric for this classification.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, ids = alpha_counts(classification, field=field, rank=rank)
    return skbio.diversity.alpha_diversity(metric, counts, ids, **kwargs).values[0]
|
||
|
||
def beta_diversity(classifications, metric,
                   field='readcount_w_children', rank='species', **kwargs):
    """Calculate the diversity between 2 communities.

    Parameters
    ----------
    classifications : iterable of classification resources.
    metric : str
        Any beta diversity metric name accepted by skbio.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix
    """
    assert field in ACCEPTABLE_FIELDS
    # tax IDs are not needed for non-phylogenetic metrics
    counts, _, ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity(metric, counts, ids, **kwargs)
|
||
|
||
def simpson(classification, field='readcount_w_children', rank='species'):
    """
    An alpha diversity metric that takes into account the
    number of species present and their abundances.
    """
    assert field in ACCEPTABLE_FIELDS
    sample_counts, sample_ids = alpha_counts(classification, field=field, rank=rank)
    result = skbio.diversity.alpha_diversity('simpson', sample_counts, sample_ids)
    return result.values[0]
|
||
|
||
def chao1(classification, bias_corrected=True,
          field='readcount_w_children', rank='species'):
    """Calculate the Chao1 estimate of species richness (alpha diversity).

    Parameters
    ----------
    classification : a classification resource exposing ``results()``.
    bias_corrected : bool
        Use the bias-corrected form of the Chao1 estimator (passed to skbio).
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    The scalar Chao1 estimate for this classification.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, ids = alpha_counts(classification, field=field, rank=rank)
    return skbio.diversity \
        .alpha_diversity('chao1', counts, ids, bias_corrected=bias_corrected) \
        .values[0]
|
||
|
||
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    A beta diversity metric that takes into account the relative relatedness of community members.
    Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # accumulate a single taxonomy tree covering all classifications
    tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        tree = generate_skbio_tree(classification, existing_tree=tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    wrapped_root = TreeNode(name='fake root')
    wrapped_root.rank = 'no rank'
    wrapped_root.append(tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(wrapped_root, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=wrapped_root, otu_ids=tax_ids)
|
||
|
||
def jaccard(classifications, field='readcount_w_children', rank='species'):
    """Compute the Jaccard dissimilarity between two classifications.

    Parameters
    ----------
    classifications : iterable of classification resources.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix
    """
    assert field in ACCEPTABLE_FIELDS
    # tax IDs are not needed for this non-phylogenetic metric
    counts, _, ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity('jaccard', counts, ids)
|
||
|
||
def braycurtis(classifications, field='readcount_w_children', rank='species'):
    """Compute the Bray-Curtis dissimilarity between two classifications."""
    assert field in ACCEPTABLE_FIELDS
    vectors, _, sample_ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity('braycurtis', vectors, sample_ids)
|
||
|
||
def cityblock(classifications, field='readcount_w_children', rank='species'):
    """Compute the Manhattan distance between two classifications.

    Parameters
    ----------
    classifications : iterable of classification resources.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix
    """
    assert field in ACCEPTABLE_FIELDS
    # tax IDs are not needed for this non-phylogenetic metric
    counts, _, ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity('cityblock', counts, ids)
from onecodex.taxonomy import TaxonomyMixin | ||
|
||
|
||
class DistanceMixin(TaxonomyMixin):
    """Adds alpha/beta diversity and UniFrac calculations to a collection of
    classification results."""

    def alpha_diversity(self, metric='simpson', rank='auto'):
        """Calculate the diversity within a community.

        Parameters
        ----------
        metric : {'simpson', 'chao1'}
            The diversity metric to calculate.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        pandas.DataFrame, a distance matrix.

        Raises
        ------
        OneCodexException
            If the metric is unsupported or the collection holds normalized
            (relative abundance) data.
        """
        if metric not in ('simpson', 'chao1'):
            raise OneCodexException('For alpha diversity, metric must be one of: simpson, chao1')

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException('Alpha diversity requires unnormalized read counts.')

        df = self.to_df(rank=rank, normalize=False)

        # compute the metric once per classification (row of the dataframe)
        output = {
            'classification_id': df.index.tolist(),
            metric: [
                skbio.diversity.alpha_diversity(
                    metric,
                    df.loc[c_id].tolist(),
                    [c_id]
                ).values[0]
                for c_id in df.index
            ],
        }

        return pd.DataFrame(output).set_index('classification_id')

    def beta_diversity(self, metric='braycurtis', rank='auto'):
        """Calculate the diversity between two communities.

        Parameters
        ----------
        metric : {'jaccard', 'braycurtis', 'cityblock'}
            The distance metric to calculate.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the metric is unsupported or the collection holds normalized
            (relative abundance) data.
        """
        if metric not in ('jaccard', 'braycurtis', 'cityblock'):
            raise OneCodexException('For beta diversity, metric must be one of: jaccard, braycurtis, cityblock')

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException('Beta diversity requires unnormalized read counts.')

        df = self.to_df(rank=rank, normalize=False)

        # one count vector per classification, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]

        return skbio.diversity.beta_diversity(metric, counts, df.index.tolist())

    def unifrac(self, weighted=True, rank='auto'):
        """A beta diversity metric that takes into account the relative relatedness of community
        members. Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the collection holds normalized (relative abundance) data.
        """
        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException('UniFrac requires unnormalized read counts.')

        df = self.to_df(rank=rank, normalize=False)

        # one count vector per classification, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]

        tax_ids = df.keys().tolist()

        # build the taxonomy tree, then trim it down to the rank we compared at
        tree = self.tree_build()
        tree = self.tree_prune_rank(tree, rank=df.ocx_rank)

        # there's a bug (?) in skbio where it expects the root to only have
        # one child, so we do a little faking here
        from skbio.tree import TreeNode
        new_tree = TreeNode(name='fake root')
        new_tree.rank = 'no rank'
        new_tree.append(tree)

        # then finally run the calculation and return
        metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
        return skbio.diversity.beta_diversity(metric, counts, df.index.tolist(),
                                              tree=new_tree, otu_ids=tax_ids)
Oops, something went wrong.