-
Notifications
You must be signed in to change notification settings - Fork 12
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Overhaul and improvements of viz, collections, and taxonomy
* Add missing comma from setup.py. Closes #173. * Enable CLI auth via env vars. Closes #154. * Change SystemExit Sentry reporting. Closes #153. * Enable autocomplete politely if possible. Closes #155. * Added PCoA (metric multidimensional scaling) plots. Closes #74. * Cluster samples/abundances in heatmaps. Closes #149. * Allow user to specify linkage for clustering. Closes #150. * Choose altair renderer more intelligently. Closes #145. * Return ResourceList instead of list. Closes #158 and #160. * Return SampleCollection from where() method. Closes #159. * Tags can be looked up more easily. Closes #130. * Fix non-deterministic test failure due to responses * Updated dependency versions to match * Updated CircleCI and tox * Additional docstrings * Bump for version v0.4.0 * Lint
- Loading branch information
1 parent
550158d
commit c922835
Showing
29 changed files
with
2,401 additions
and
1,908 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,120 +1,119 @@ | ||
import pandas as pd | ||
import skbio.diversity | ||
from skbio.tree import TreeNode | ||
|
||
from onecodex.exceptions import OneCodexException | ||
from onecodex.helpers import normalize_classifications, collate_classification_results | ||
from onecodex.taxonomy import generate_skbio_tree, prune_to_rank | ||
|
||
# Public API of this module.
__all__ = [
    'alpha_diversity', 'simpson', 'chao1',
    'beta_diversity', 'unifrac', 'jaccard', 'braycurtis', 'cityblock',
]

# Classification result fields the diversity helpers will accept.
ACCEPTABLE_FIELDS = ['abundance', 'readcount_w_children', 'readcount']
|
||
|
||
def alpha_counts(classification, field='readcount_w_children', rank='species'):
    """Extract per-taxon counts for a single classification at one rank.

    Returns a tuple ``(counts, ids)`` where ``ids`` holds a single label for
    the sample: its metadata name when set, otherwise its filename.
    """
    table = classification.results()['table']
    counts = [row[field] for row in table if row['rank'] == rank]

    metadata_name = classification.sample.metadata.name
    label = metadata_name if metadata_name else classification.sample.filename
    return (counts, [label])
|
||
|
||
def beta_counts(classifications, field='readcount_w_children', rank='species'):
    """Collate several classifications into integer count vectors at one rank.

    Returns ``(vectors, tax_ids, ids)``: one integer count vector per
    classification, the tax IDs labelling the vector columns, and the index
    identifying each classification.
    """
    normed, _ = normalize_classifications(classifications)
    df, tax_info = collate_classification_results(normed, field=field, rank=rank)

    # skbio wants plain integer counts, not numpy scalars/floats
    int_vectors = [[int(value) for value in row] for row in df.values.tolist()]
    return (int_vectors, df.columns.values.tolist(), df.index.values)
|
||
|
||
def alpha_diversity(classification, metric, ids=None,
                    field='readcount_w_children', rank='species', **kwargs):
    """Calculate the diversity within a community.

    Parameters
    ----------
    classification : a classification resource exposing ``results()``.
    metric : str
        Any alpha diversity metric name accepted by skbio.
    ids : optional
        Unused; the sample label is derived from the classification itself.
        Retained for interface compatibility.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    The scalar value of the metric for this classification.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, ids = alpha_counts(classification, field=field, rank=rank)
    return skbio.diversity.alpha_diversity(metric, counts, ids, **kwargs).values[0]
|
||
|
||
def beta_diversity(classifications, metric,
                   field='readcount_w_children', rank='species', **kwargs):
    """Calculate the diversity between 2 communities.

    Parameters
    ----------
    classifications : iterable of classification resources.
    metric : str
        Any beta diversity metric name accepted by skbio.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix
    """
    assert field in ACCEPTABLE_FIELDS
    # tax IDs are not needed for non-phylogenetic metrics
    counts, _, ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity(metric, counts, ids, **kwargs)
|
||
|
||
def simpson(classification, field='readcount_w_children', rank='species'):
    """
    An alpha diversity metric that takes into account the
    number of species present and their abundances.
    """
    assert field in ACCEPTABLE_FIELDS
    sample_counts, sample_ids = alpha_counts(classification, field=field, rank=rank)
    result = skbio.diversity.alpha_diversity('simpson', sample_counts, sample_ids)
    return result.values[0]
|
||
|
||
def chao1(classification, bias_corrected=True,
          field='readcount_w_children', rank='species'):
    """Calculate the Chao1 estimate of species richness (alpha diversity).

    Parameters
    ----------
    classification : a classification resource exposing ``results()``.
    bias_corrected : bool
        Use the bias-corrected form of the Chao1 estimator (passed to skbio).
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    The scalar Chao1 estimate for this classification.
    """
    assert field in ACCEPTABLE_FIELDS
    counts, ids = alpha_counts(classification, field=field, rank=rank)
    return skbio.diversity \
        .alpha_diversity('chao1', counts, ids, bias_corrected=bias_corrected) \
        .values[0]
|
||
|
||
def unifrac(classifications, weighted=True,
            field='readcount_w_children', rank='species', strict=False):
    """
    A beta diversity metric that takes into account the relative relatedness of community members.
    Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence
    """
    assert field in ACCEPTABLE_FIELDS
    counts, tax_ids, ids = beta_counts(classifications, field=field, rank=rank)

    # accumulate a single taxonomy tree covering all classifications
    tree = None
    for classification in classifications:
        if strict and classification.job.id != classifications[0].job.id:
            raise OneCodexException('All Classifications must have the same Job for Unifrac')
        tree = generate_skbio_tree(classification, existing_tree=tree)

    # there's a bug (?) in skbio where it expects the root to only have
    # one child, so we do a little faking here
    wrapped_root = TreeNode(name='fake root')
    wrapped_root.rank = 'no rank'
    wrapped_root.append(tree)

    # prune low-level nodes off the tree so the tips are what we're comparing
    prune_to_rank(wrapped_root, rank=rank)

    metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
    return skbio.diversity.beta_diversity(metric, counts, ids,
                                          tree=wrapped_root, otu_ids=tax_ids)
|
||
|
||
def jaccard(classifications, field='readcount_w_children', rank='species'):
    """Compute the Jaccard dissimilarity between two classifications.

    Parameters
    ----------
    classifications : iterable of classification resources.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix
    """
    assert field in ACCEPTABLE_FIELDS
    # tax IDs are not needed for this non-phylogenetic metric
    counts, _, ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity('jaccard', counts, ids)
|
||
|
||
def braycurtis(classifications, field='readcount_w_children', rank='species'):
    """Compute the Bray-Curtis dissimilarity between two classifications."""
    assert field in ACCEPTABLE_FIELDS
    vectors, _, sample_ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity('braycurtis', vectors, sample_ids)
|
||
|
||
def cityblock(classifications, field='readcount_w_children', rank='species'):
    """Compute the Manhattan distance between two classifications.

    Parameters
    ----------
    classifications : iterable of classification resources.
    field : {'abundance', 'readcount_w_children', 'readcount'}
        Which result field to use as the count.
    rank : str
        Taxonomic rank at which to compute the metric.

    Returns
    -------
    skbio.stats.distance.DistanceMatrix
    """
    assert field in ACCEPTABLE_FIELDS
    # tax IDs are not needed for this non-phylogenetic metric
    counts, _, ids = beta_counts(classifications, field=field, rank=rank)
    return skbio.diversity.beta_diversity('cityblock', counts, ids)
from onecodex.taxonomy import TaxonomyMixin | ||
|
||
|
||
class DistanceMixin(TaxonomyMixin):
    """Adds alpha/beta diversity and UniFrac calculations to a collection of
    classification results."""

    def alpha_diversity(self, metric='simpson', rank='auto'):
        """Calculate the diversity within a community.

        Parameters
        ----------
        metric : {'simpson', 'chao1'}
            The diversity metric to calculate.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        pandas.DataFrame, a distance matrix.

        Raises
        ------
        OneCodexException
            If the metric is unsupported or the collection holds normalized
            (relative abundance) data.
        """
        if metric not in ('simpson', 'chao1'):
            raise OneCodexException('For alpha diversity, metric must be one of: simpson, chao1')

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException('Alpha diversity requires unnormalized read counts.')

        df = self.to_df(rank=rank, normalize=False)

        # compute the metric once per classification (row of the dataframe)
        output = {
            'classification_id': df.index.tolist(),
            metric: [
                skbio.diversity.alpha_diversity(
                    metric,
                    df.loc[c_id].tolist(),
                    [c_id]
                ).values[0]
                for c_id in df.index
            ],
        }

        return pd.DataFrame(output).set_index('classification_id')

    def beta_diversity(self, metric='braycurtis', rank='auto'):
        """Calculate the diversity between two communities.

        Parameters
        ----------
        metric : {'jaccard', 'braycurtis', 'cityblock'}
            The distance metric to calculate.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the metric is unsupported or the collection holds normalized
            (relative abundance) data.
        """
        if metric not in ('jaccard', 'braycurtis', 'cityblock'):
            raise OneCodexException('For beta diversity, metric must be one of: jaccard, braycurtis, cityblock')

        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException('Beta diversity requires unnormalized read counts.')

        df = self.to_df(rank=rank, normalize=False)

        # one count vector per classification, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]

        return skbio.diversity.beta_diversity(metric, counts, df.index.tolist())

    def unifrac(self, weighted=True, rank='auto'):
        """A beta diversity metric that takes into account the relative relatedness of community
        members. Weighted UniFrac looks at abundances, unweighted UniFrac looks at presence.

        Parameters
        ----------
        weighted : `bool`
            Calculate the weighted (True) or unweighted (False) distance metric.
        rank : {'auto', 'kingdom', 'phylum', 'class', 'order', 'family', 'genus', 'species'}, optional
            Analysis will be restricted to abundances of taxa at the specified level.

        Returns
        -------
        skbio.stats.distance.DistanceMatrix, a distance matrix.

        Raises
        ------
        OneCodexException
            If the collection holds normalized (relative abundance) data.
        """
        # needs read counts, not relative abundances
        if self._guess_normalized():
            raise OneCodexException('UniFrac requires unnormalized read counts.')

        df = self.to_df(rank=rank, normalize=False)

        # one count vector per classification, in index order
        counts = [df.loc[c_id].tolist() for c_id in df.index]

        tax_ids = df.keys().tolist()

        # build the taxonomy tree, then trim it down to the rank we compared at
        tree = self.tree_build()
        tree = self.tree_prune_rank(tree, rank=df.ocx_rank)

        # there's a bug (?) in skbio where it expects the root to only have
        # one child, so we do a little faking here
        from skbio.tree import TreeNode
        new_tree = TreeNode(name='fake root')
        new_tree.rank = 'no rank'
        new_tree.append(tree)

        # then finally run the calculation and return
        metric = 'weighted_unifrac' if weighted else 'unweighted_unifrac'
        return skbio.diversity.beta_diversity(metric, counts, df.index.tolist(),
                                              tree=new_tree, otu_ids=tax_ids)
Oops, something went wrong.