Skip to content

Commit

Permalink
Overhaul and improvements of viz, collections, and taxonomy
Browse files Browse the repository at this point in the history
* Add missing comma from setup.py. Closes #173.
* Enable CLI auth via env vars. Closes #154.
* Change SystemExit Sentry reporting. Closes #153.
* Enable autocomplete politely if possible. Closes #155.
* Added PCoA (metric multidimensional scaling) plots. Closes #74.
* Cluster samples/abundances in heatmaps. Closes #149.
* Allow user to specify linkage for clustering. Closes #150.
* Choose altair renderer more intelligently. Closes #145.
* Return ResourceList instead of list. Closes #158 and #160.
* Return SampleCollection from where() method. Closes #159.
* Tags can be looked up more easily. Closes #130.
* Fix non-deterministic test failure due to responses
* Updated dependency versions to match
* Updated CircleCI and tox
* Additional docstrings
* Bump for version v0.4.0
* Lint
  • Loading branch information
polyatail authored and boydgreenfield committed Jan 15, 2019
1 parent 550158d commit c922835
Show file tree
Hide file tree
Showing 29 changed files with 2,401 additions and 1,908 deletions.
51 changes: 28 additions & 23 deletions .circleci/config.yml
Original file line number Diff line number Diff line change
Expand Up @@ -13,84 +13,88 @@ version: 2
jobs:
test-py3:
docker:
- image: circleci/python:3.4-stretch
- image: circleci/python:3.6.7-stretch
steps:
- checkout
- run:
name: Install Python deps in virtual environment
command: |
python3 -m venv venv
. venv/bin/activate
pip install numpy
pip install .[all,testing]
pip install -q -U pip
pip install --progress-bar=off numpy
pip install --progress-bar=off .[all,testing]
- run:
name: Run tests
command: |
. venv/bin/activate
py.test --ignore tests/test_minimal.py tests/
py.test -v tests/
test-py2:
docker:
- image: circleci/python:2.7-stretch
- image: circleci/python:2.7.15-stretch
steps:
- checkout
- run:
name: Install Python deps in virtual environment
command: |
virtualenv venv
. venv/bin/activate
pip install numpy
pip install .[all,testing]
pip install -q -U pip
pip install --progress-bar=off numpy
pip install --progress-bar=off .[all,testing]
- run:
name: Run tests
command: |
. venv/bin/activate
py.test --ignore tests/test_minimal.py tests/
py.test -v tests/
test-minimal:
docker:
- image: circleci/python:3.4-stretch
- image: circleci/python:3.4.9-stretch
steps:
- checkout
- run:
name: Install Python deps in virtual environment
command: |
virtualenv venv
. venv/bin/activate
pip install numpy
pip install .[testing]
pip install -q -U pip
pip install --progress-bar=off .[testing]
- run:
name: Run tests
command: |
. venv/bin/activate
py.test tests/test_cli.py tests/test_minimal.py tests/test_raven.py
py.test -v tests/
test-simplejson:
docker:
- image: circleci/python:3.4-stretch
- image: circleci/python:3.6.7-stretch
steps:
- checkout
- run:
name: Install Python deps in virtual environment
command: |
virtualenv venv
. venv/bin/activate
pip install numpy
pip install simplejson
pip install .[all,testing]
pip install -q -U pip
pip install --progress-bar=off numpy
pip install --progress-bar=off simplejson
pip install --progress-bar=off .[all,testing]
- run:
name: Run tests
command: |
. venv/bin/activate
py.test --ignore tests/test_minimal.py tests/
py.test -v tests/
lint:
docker:
- image: circleci/python:2.7-stretch
- image: circleci/python:3.6.7-stretch
steps:
- checkout
- run:
name: Install Python deps in virtual environment
command: |
virtualenv venv
. venv/bin/activate
pip install .[testing]
pip install -q -U pip
pip install --progress-bar=off .[testing]
- run:
name: Run flake8
command: |
Expand All @@ -99,20 +103,21 @@ jobs:
flake8 --ignore E501 --exclude onecodex/schemas/* tests/
coverage:
docker:
- image: circleci/python:3.4-stretch
- image: circleci/python:3.6.7-stretch
steps:
- checkout
- run:
name: Install Python deps in virtual environment
command: |
virtualenv venv
. venv/bin/activate
pip install numpy
pip install .[all,testing]
pip install -q -U pip
pip install --progress-bar=off numpy
pip install --progress-bar=off .[all,testing]
- run:
name: Run tests
command: |
. venv/bin/activate
py.test --cov-report=html --cov=onecodex --ignore tests/test_minimal.py tests/
py.test -v --cov-report=html --cov=onecodex tests/
coveralls
8 changes: 1 addition & 7 deletions onecodex/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@

from onecodex.lib.auth import BearerTokenAuth
from onecodex.models import _model_lookup
from onecodex.utils import ModuleAlias, get_raven_client, collapse_user
from onecodex.utils import get_raven_client, collapse_user
from onecodex.vendored.potion_client import Client as PotionClient
from onecodex.vendored.potion_client.converter import PotionJSONSchemaDecoder, PotionJSONDecoder, PotionJSONEncoder
from onecodex.vendored.potion_client.utils import upper_camel_case
Expand Down Expand Up @@ -93,12 +93,6 @@ def __init__(self, api_key=None,
self._raven_client = None
self._telemetry = False

# Try to import and copy key modules onto the Api object
for module_name in ['onecodex.viz', 'onecodex.distance']:
module = ModuleAlias(module_name)
if module._imported:
setattr(self, module._name, module)

def _fetch_account_email(self):
creds_file = os.path.expanduser('~/.onecodex')

Expand Down
233 changes: 116 additions & 117 deletions onecodex/distance.py
Original file line number Diff line number Diff line change
@@ -1,120 +1,119 @@
import pandas as pd
import skbio.diversity
from skbio.tree import TreeNode

from onecodex.exceptions import OneCodexException
from onecodex.helpers import normalize_classifications, collate_classification_results
from onecodex.taxonomy import generate_skbio_tree, prune_to_rank

__all__ = ['alpha_diversity', 'simpson', 'chao1',
'beta_diversity', 'unifrac', 'jaccard', 'braycurtis', 'cityblock']


ACCEPTABLE_FIELDS = ['abundance', 'readcount_w_children', 'readcount']


def alpha_counts(classification, field='readcount_w_children', rank='species'):
counts = [t[field] for t in classification.results()['table']
if t['rank'] == rank]
name = classification.sample.metadata.name \
if classification.sample.metadata.name else classification.sample.filename
ids = [name]
return (counts, ids)


def beta_counts(classifications, field='readcount_w_children', rank='species'):
normed_classifications, _ = normalize_classifications(classifications)
df, tax_info = collate_classification_results(normed_classifications, field=field, rank=rank)

tax_ids = df.columns.values.tolist()
vectors = df.values.tolist()
vectors = [[int(i) for i in row] for row in vectors]
ids = df.index.values
return (vectors, tax_ids, ids)


def alpha_diversity(classification, metric, ids=None,
field='readcount_w_children', rank='species', **kwargs):
"""Caculate the diversity within a community"""
assert field in ACCEPTABLE_FIELDS
counts, ids = alpha_counts(classification, field=field, rank=rank)
return skbio.diversity.alpha_diversity(metric, counts, ids, **kwargs).values[0]


def beta_diversity(classifications, metric,
field='readcount_w_children', rank='species', **kwargs):
"""Calculate the diversity between 2 communities"""
assert field in ACCEPTABLE_FIELDS
counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)
return skbio.diversity.beta_diversity(metric, counts, ids, **kwargs)


def simpson(classification, field='readcount_w_children', rank='species'):
"""
An alpha diversity metric that takes into account the
number of species present and their abundances.
"""
assert field in ACCEPTABLE_FIELDS
counts, ids = alpha_counts(classification, field=field, rank=rank)
return skbio.diversity.alpha_diversity('simpson', counts, ids).values[0]


def chao1(classification, bias_corrected=True,
field='readcount_w_children', rank='species'):
assert field in ACCEPTABLE_FIELDS
counts, ids = alpha_counts(classification, field=field, rank=rank)
return skbio.diversity \
.alpha_diversity('chao1', counts, ids, bias_corrected=bias_corrected) \
.values[0]


def unifrac(classifications, weighted=True,
field='readcount_w_children', rank='species', strict=False):
"""
A beta diversity metric that takes into account the relative relatedness of community members.
Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence
"""
assert field in ACCEPTABLE_FIELDS
counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

tree = None
for c in classifications:
if strict and c.job.id != classifications[0].job.id:
raise OneCodexException('All Classifications must have the same Job for Unifrac')
tree = generate_skbio_tree(c, existing_tree=tree)

# there's a bug (?) in skbio where it expects the root to only have
# one child, so we do a little faking here
new_tree = TreeNode(name='fake root')
new_tree.rank = 'no rank'
new_tree.append(tree)

# prune low-level nodes off the tree so the tips are what we're comparing
prune_to_rank(new_tree, rank=rank)

if weighted:
return skbio.diversity.beta_diversity('weighted_unifrac', counts, ids,
tree=new_tree, otu_ids=tax_ids)
else:
return skbio.diversity.beta_diversity('unweighted_unifrac', counts, ids,
tree=new_tree, otu_ids=tax_ids)


def jaccard(classifications, field='readcount_w_children', rank='species'):
"""Compute the Jaccard dissimilarity between two classifications."""
assert field in ACCEPTABLE_FIELDS
counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)
return skbio.diversity.beta_diversity('jaccard', counts, ids)


def braycurtis(classifications, field='readcount_w_children', rank='species'):
"""Compute the Bray-Curtis dissimilarity between two classifications."""
assert field in ACCEPTABLE_FIELDS
counts, _, ids = beta_counts(classifications, field=field, rank=rank)
return skbio.diversity.beta_diversity('braycurtis', counts, ids)


def cityblock(classifications, field='readcount_w_children', rank='species'):
"""Compute the Manhattan distance between two classifications."""
assert field in ACCEPTABLE_FIELDS
counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)
return skbio.diversity.beta_diversity('cityblock', counts, ids)
from onecodex.taxonomy import TaxonomyMixin


class DistanceMixin(TaxonomyMixin):
def alpha_diversity(self, metric='simpson', rank='auto'):
"""Caculate the diversity within a community.
Parameters
----------
metric : {'simpson', 'chao1'}
The diversity metric to calculate.
rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
Analysis will be restricted to abundances of taxa at the specified level.
Returns
-------
pandas.DataFrame, a distance matrix.
"""
if metric not in ('simpson', 'chao1'):
raise OneCodexException('For alpha diversity, metric must be one of: simpson, chao1')

# needs read counts, not relative abundances
if self._guess_normalized():
raise OneCodexException('Alpha diversity requires unnormalized read counts.')

df = self.to_df(rank=rank, normalize=False)

output = {
'classification_id': [],
metric: []
}

for c_id in df.index:
output['classification_id'].append(c_id)
output[metric].append(skbio.diversity.alpha_diversity(
metric,
df.loc[c_id].tolist(),
[c_id]
).values[0])

return pd.DataFrame(output).set_index('classification_id')

def beta_diversity(self, metric='braycurtis', rank='auto'):
"""Calculate the diversity between two communities.
Parameters
----------
metric : {'jaccard', 'braycurtis', 'cityblock'}
The distance metric to calculate.
rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
Analysis will be restricted to abundances of taxa at the specified level.
Returns
-------
skbio.stats.distance.DistanceMatrix, a distance matrix.
"""
if metric not in ('jaccard', 'braycurtis', 'cityblock'):
raise OneCodexException('For beta diversity, metric must be one of: jaccard, braycurtis, cityblock')

# needs read counts, not relative abundances
if self._guess_normalized():
raise OneCodexException('Beta diversity requires unnormalized read counts.')

df = self.to_df(rank=rank, normalize=False)

counts = []
for c_id in df.index:
counts.append(df.loc[c_id].tolist())

return skbio.diversity.beta_diversity(metric, counts, df.index.tolist())

def unifrac(self, weighted=True, rank='auto'):
"""A beta diversity metric that takes into account the relative relatedness of community
members. Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence.
Parameters
----------
weighted : `bool`
Calculate the weighted (True) or unweighted (False) distance metric.
rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
Analysis will be restricted to abundances of taxa at the specified level.
Returns
-------
skbio.stats.distance.DistanceMatrix, a distance matrix.
"""
# needs read counts, not relative abundances
if self._guess_normalized():
raise OneCodexException('UniFrac requires unnormalized read counts.')

df = self.to_df(rank=rank, normalize=False)

counts = []
for c_id in df.index:
counts.append(df.loc[c_id].tolist())

tax_ids = df.keys().tolist()

tree = self.tree_build()
tree = self.tree_prune_rank(tree, rank=df.ocx_rank)

# there's a bug (?) in skbio where it expects the root to only have
# one child, so we do a little faking here
from skbio.tree import TreeNode
new_tree = TreeNode(name='fake root')
new_tree.rank = 'no rank'
new_tree.append(tree)

# then finally run the calculation and return
if weighted:
return skbio.diversity.beta_diversity('weighted_unifrac', counts, df.index.tolist(),
tree=new_tree, otu_ids=tax_ids)
else:
return skbio.diversity.beta_diversity('unweighted_unifrac', counts, df.index.tolist(),
tree=new_tree, otu_ids=tax_ids)
Loading

0 comments on commit c922835

Please sign in to comment.