More changes to merge

* Additional docstrings * Enforce oc_model subclasses OneCodexBase * Update CircleCI
onecodex · Jan 11, 2019 · 1e934f6 · 1e934f6
1 parent f346e38
commit 1e934f6
Show file tree

Hide file tree

Showing 6 changed files with 170 additions and 29 deletions.
diff --git a/.circleci/config.yml b/.circleci/config.yml
@@ -2,33 +2,53 @@ workflows:
   version: 2
   all-tests:
     jobs:
-      - test-py3
-      - test-py2
-      - test-minimal
-      - test-simplejson
+      - test-py37
+      - test-py35
+      - test-py27
+      - test-minimal-py34
+      - test-simplejson-py35
       - coverage
       - lint
 
 version: 2
 jobs:
-  test-py3:
+  test-py37:
     docker:
-      - image: circleci/python:3.6-stretch
+      - image: circleci/python:3.7-stretch
     steps:
       - checkout
       - run:
           name: Install Python deps in virtual environment
           command: |
             python3 -m venv venv
             . venv/bin/activate
+            pip install -q -U pip
             pip install --progress-bar=off numpy
             pip install --progress-bar=off .[all,testing]
       - run:
           name: Run tests
           command: |
             . venv/bin/activate
             py.test -v tests/
-  test-py2:
+  test-py35:
+    docker:
+      - image: circleci/python:3.5-stretch
+    steps:
+      - checkout
+      - run:
+          name: Install Python deps in virtual environment
+          command: |
+            python3 -m venv venv
+            . venv/bin/activate
+            pip install -q -U pip
+            pip install --progress-bar=off numpy
+            pip install --progress-bar=off .[all,testing]
+      - run:
+          name: Run tests
+          command: |
+            . venv/bin/activate
+            py.test -v tests/
+  test-py27:
     docker:
       - image: circleci/python:2.7-stretch
     steps:
@@ -38,39 +58,42 @@ jobs:
           command: |
             virtualenv venv
             . venv/bin/activate
+            pip install -q -U pip
             pip install --progress-bar=off numpy
             pip install --progress-bar=off .[all,testing]
       - run:
           name: Run tests
           command: |
             . venv/bin/activate
             py.test -v tests/
-  test-minimal:
+  test-minimal-py34:
     docker:
-      - image: circleci/python:3.6-stretch
+      - image: circleci/python:3.4-stretch
     steps:
       - checkout
       - run:
           name: Install Python deps in virtual environment
           command: |
             virtualenv venv
             . venv/bin/activate
+            pip install -q -U pip
             pip install --progress-bar=off .[testing]
       - run:
           name: Run tests
           command: |
             . venv/bin/activate
             py.test -v tests/
-  test-simplejson:
+  test-simplejson-py35:
     docker:
-      - image: circleci/python:3.6-stretch
+      - image: circleci/python:3.5-stretch
     steps:
       - checkout
       - run:
           name: Install Python deps in virtual environment
           command: |
             virtualenv venv
             . venv/bin/activate
+            pip install -q -U pip
             pip install --progress-bar=off numpy
             pip install --progress-bar=off simplejson
             pip install --progress-bar=off .[all,testing]
@@ -81,14 +104,15 @@ jobs:
             py.test -v tests/
   lint:
     docker:
-      - image: circleci/python:3.6-stretch
+      - image: circleci/python:3.5-stretch
     steps:
       - checkout
       - run:
           name: Install Python deps in virtual environment
           command: |
             virtualenv venv
             . venv/bin/activate
+            pip install -q -U pip
             pip install --progress-bar=off .[testing]
       - run:
           name: Run flake8
@@ -98,14 +122,15 @@ jobs:
             flake8 --ignore E501 --exclude onecodex/schemas/* tests/
   coverage:
     docker:
-      - image: circleci/python:3.6-stretch
+      - image: circleci/python:3.5-stretch
     steps:
       - checkout
       - run:
           name: Install Python deps in virtual environment
           command: |
             virtualenv venv
             . venv/bin/activate
+            pip install -q -U pip
             pip install --progress-bar=off numpy
             pip install --progress-bar=off .[all,testing]
       - run:

diff --git a/onecodex/helpers.py b/onecodex/helpers.py
@@ -7,6 +7,35 @@
 
 # force persistence of our additional taxonomy and metadata dataframe properties
 class ClassificationsDataFrame(pd.DataFrame):
+    """A subclassed `pandas.DataFrame` containing additional metadata pertinent to analysis of
+    One Codex Classifications results. These fields, once part of the DataFrame, will no longer be
+    updated when the contents of the associated `SampleCollection` change. In comparison, the
+    corresponding attributes `_rank`, `_field`, `taxonomy` and `metadata` in a `SampleCollection`
+    are re-generated whenever members of the `SampleCollection` are added or removed.
+
+    Methods from `AnalysisMixin`, such as `to_df`, are available via the `ocx` namespace. For
+    example, `ClassificationsDataFrame().ocx.to_df()`.
+
+    Parameters
+    ----------
+        ocx_rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
+            Analysis was restricted to abundances of taxa at the specified level.
+
+        ocx_field : {'readcount_w_children', 'readcount', 'abundance'}
+            Which field was used for the abundance/count of a particular taxon in a sample.
+
+            - 'readcount_w_children': total reads of this taxon and all its descendants
+            - 'readcount': total reads of this taxon
+            - 'abundance': genome size-normalized relative abundances, from shotgun sequencing
+
+        ocx_metadata : `pandas.DataFrame`
+            A DataFrame containing collated metadata fields for all samples in this analysis.
+
+        ocx_taxonomy : `pandas.DataFrame`
+            A DataFrame containing taxonomy information (i.e., id, name, rank, parent) for all taxa
+            referenced in this analysis.
+    """
+
     _metadata = ['ocx_rank', 'ocx_field', 'ocx_taxonomy', 'ocx_metadata']
 
     def __init__(self, data=None, index=None, columns=None, dtype=None, copy=False, ocx_rank=None,
@@ -34,6 +63,10 @@ def _constructor_sliced(self):
 
 
 class ClassificationsSeries(pd.Series):
+    """A subclassed `pandas.Series` containing additional metadata pertinent to analysis of
+    One Codex Classifications results. See the docstring for `ClassificationsDataFrame`.
+    """
+
     # 'name' is a piece of metadata specified by pd.Series--it's not ours
     _metadata = ['name', 'ocx_rank', 'ocx_field', 'ocx_taxonomy', 'ocx_metadata']
 
@@ -62,8 +95,18 @@ def _constructor_expanddim(self):
 
 
 class AnalysisMixin(VizPCAMixin, VizHeatmapMixin, VizMetadataMixin, VizDistanceMixin):
+    """Contains methods for analyzing Classifications results.
+
+    Notes
+    -----
+    Three DataFrames are required by most methods: collated counts, collated metadata, and taxonomy.
+    This data is obtained from either a `ClassificationsDataFrame` or a `SampleCollection`. Both
+    classes use this mixin. `AnalysisMixin` pulls additional methods in from `onecodex.distance`,
+    `onecodex.taxonomy`, and `onecodex.viz`.
+    """
+
     def _get_auto_rank(self, rank):
-        """Tries to figure out what rank we should use for analyses, mainly called by results()"""
+        """Tries to figure out what rank we should use for analyses"""
 
         if rank == 'auto':
             # if we're an accessor for a ClassificationsDataFrame, use its _rank property
@@ -78,14 +121,45 @@ def _get_auto_rank(self, rank):
             return rank
 
     def _guess_normalized(self):
-        # it's possible that the _results df has already been normalized, which can cause some
-        # methods to fail. we must guess whether this is the case
+        """Returns true if the collated counts in `self._results` appear to be normalized.
 
+        Notes
+        -----
+        It's possible that the _results df has already been normalized, which can cause some
+        methods to fail. This method lets us guess whether that's true and act accordingly.
+        """
         return bool((self._results.sum(axis=1).round(4) == 1.0).all())
 
     def _metadata_fetch(self, metadata_fields):
         """Takes a list of metadata fields, some of which can contain taxon names or taxon IDs, and
-        returns a DataFrame with magically transformed data that can be used for plotting.
+        returns a DataFrame with transformed data that can be used for plotting.
+
+        Notes
+        -----
+        Taxon names and IDs are transformed into the relative abundances of those taxa within their
+        own rank. For example, 'Bacteroides' will return the relative abundances of 'Bacteroides'
+        among all taxa of rank genus. Taxon IDs are stored as strings in `ClassificationsDataFrame`
+        and are coerced to strings if integers are given.
+
+        Metadata fields are returned as is, from the `self.metadata` DataFrame. If multiple metadata
+        fields are specified in a tuple, their values are joined as strings separated by underscore.
+        Multiple metadata fields in a tuple must all be categorical. That is, a numerical field and
+        a boolean cannot be joined, or the result would be something like '87.4_True'.
+
+        The 'Label' field name is transformed to '_display_name'. This lets us label points in plots
+        by the name generated for each sample in `SampleCollection._collate_metadata`.
+
+        Returns
+        -------
+        `pandas.DataFrame`
+            Columns are renamed (if applicable) metadata fields and rows are `Classifications.id`.
+            Elements are transformed values. Not all metadata fields will have been renamed, but will
+            be present in the below `dict` nonetheless.
+        `dict`
+            Keys are metadata fields and values are renamed metadata fields. This can be used to map
+            metadata fields which were passed to this function, to prettier names. For example, if
+            'bacteroid' is passed, it will be matched with the Bacteroides genus and renamed to
+            'Bacteroides (816)', which includes its taxon ID.
         """
         help_metadata = ', '.join(self.metadata.keys())
         magic_metadata = pd.DataFrame({'classification_id': self._results.index}) \
@@ -173,8 +247,8 @@ def _metadata_fetch(self, metadata_fields):
 
     def to_df(self, rank='auto', top_n=None, threshold=None, remove_zeros=True, normalize='auto',
               table_format='wide'):
-        """Takes the ClassificationsDataFrame associated with these samples, or SampleCollection, does some
-        filtering, and returns a ClassificationsDataFrame.
+        """Takes the ClassificationsDataFrame associated with these samples, or SampleCollection,
+        does some filtering, and returns a ClassificationsDataFrame copy.
 
         Parameters
         ----------
@@ -194,7 +268,7 @@ def to_df(self, rank='auto', top_n=None, threshold=None, remove_zeros=True, norm
 
         Returns
         -------
-        ClassificationsDataFrame
+        `ClassificationsDataFrame`
         """
 
         rank = self._get_auto_rank(rank)
@@ -266,6 +340,18 @@ def to_df(self, rank='auto', top_n=None, threshold=None, remove_zeros=True, norm
 
 @pd.api.extensions.register_dataframe_accessor('ocx')
 class OneCodexAccessor(AnalysisMixin):
+    """Accessor object allowing access of `AnalysisMixin` methods from the 'ocx' namespace of a
+    `ClassificationsDataFrame`.
+
+    Notes
+    -----
+    When instantiated, the accessor will prune the taxonomic tree back to contain only taxa
+    referenced in the classification results (i.e., self._results). Similarly, metadata is sliced
+    such that it contains only those `Classifications.id` in the results. This is because users may
+    filter or modify the classification results to remove classification results (i.e., rows) or
+    taxa (i.e., cols) from the `ClassificationsDataFrame` before accessing this namespace.
+    """
+
     def __init__(self, pandas_obj):
         # copy data from the ClassificationsDataFrame to a new instance of AnalysisMethods
         self.metadata = pandas_obj.ocx_metadata

diff --git a/onecodex/models/__init__.py b/onecodex/models/__init__.py
@@ -27,10 +27,22 @@ class AnalysisMixin(object):
 
 
 class ResourceList(object):
-    """
-    In OneCodexBase, when attributes are lists, actions performed on the returned lists are not
-    passed through to the underlying resource list. This class passes those actions through, and
-    will generally act like a list.
+    """Wrapper around lists of onecodex-wrapped potion objects.
+
+    Parameters
+    ----------
+    _resource : `list`
+        A list of potion objects, which are generally stored in `OneCodexBase._resource`.
+    oc_model : `OneCodexBase`
+        A class which inherits from `OneCodexBase`, for example, `models.Tags`.
+
+    Notes
+    -----
+    In OneCodexBase, when attributes are lists (e.g., `Samples.tags`), actions performed on the
+    returned lists are not passed through to the underlying potion object's list. This class passes
+    those actions through, and will generally act like a list.
+
+    See https://github.com/onecodex/onecodex/issues/40
     """
 
     def _update(self):
@@ -59,6 +71,10 @@ def _check_valid_resource(self, other, check_for_dupes=True):
                 raise OneCodexException('{} cannot contain duplicate objects'.format(self.__class__.__name__))
 
     def __init__(self, _resource, oc_model):
+        if not issubclass(oc_model, OneCodexBase):
+            raise ValueError("Expected object of type '{}', got '{}'"
+                             .format(OneCodexBase.__name__, oc_model.__name__))
+
         # turn potion Resource objects into OneCodex objects
         self._resource = _resource
         self._oc_model = oc_model
@@ -162,6 +178,15 @@ def remove(self, x):
 
 
 class SampleCollection(ResourceList, AnalysisMixin):
+    """A collection of `Samples` or `Classifications` objects with many methods for analysis of
+    classifications results.
+
+    Notes
+    -----
+    Inherits from `ResourceList` to provide a list-like API, and `AnalysisMixin` to provide relevant
+    analysis methods.
+    """
+
     def __init__(self, _resource, oc_model, skip_missing=True, label=None, field='auto'):
         self._kwargs = {'skip_missing': skip_missing,
                         'label': label,
@@ -376,8 +401,7 @@ def taxonomy(self):
         return self._cached['taxonomy']
 
     def to_otu(self, biom_id=None):
-        """
-        Converts a list of objects associated with a classification result into a `dict` resembling
+        """Converts a list of objects associated with a classification result into a `dict` resembling
         an OTU table.
 
         Parameters

diff --git a/onecodex/taxonomy.py b/onecodex/taxonomy.py
@@ -3,7 +3,8 @@
 
 class TaxonomyMixin(object):
     def tree_build(self):
-        """Build a tree from the taxonomy data present in this ResultsDataFrame or SampleCollection.
+        """Build a tree from the taxonomy data present in this `ClassificationsDataFrame` or
+        `SampleCollection`.
 
         Returns
         -------
@@ -79,6 +80,11 @@ def tree_prune_rank(self, tree, rank='species'):
         -------
         `skbio.tree.TreeNode`, the root of the tree where all tips are at the given rank, and all
         tips have a path back to the root node.
+
+        Examples
+        --------
+        tree_prune_rank(tree, 'species') will remove all subspecies/strain nodes and return a tree
+        containing all species-level nodes and higher.
         """
         if rank is None:
             return tree.copy()

diff --git a/tox.ini b/tox.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py27,py36,coverage,lint,minimal,simplejson
+envlist = py27,py35,py37,coverage,lint,minimal,simplejson
 
 [testenv]
 commands =

diff --git a/tox.setup.ini b/tox.setup.ini
@@ -1,5 +1,5 @@
 [tox]
-envlist = py27,py36
+envlist = py27,py35,py37
 
 [testenv]
 deps = numpy