Skip to content

Commit

Permalink
Adding NCBI taxonomy support
Browse files Browse the repository at this point in the history
  • Loading branch information
r78v10a07 committed Feb 4, 2022
1 parent bead64e commit c724fdc
Show file tree
Hide file tree
Showing 2 changed files with 118 additions and 50 deletions.
2 changes: 1 addition & 1 deletion docs/source/index.rst
Original file line number Diff line number Diff line change
Expand Up @@ -85,7 +85,7 @@ Citation
Help and Support
----------------

For query/questions regarding snakePipes, please write on biostars with the tag **#PM4NGS**
For query/questions regarding PM4NGS, please write veraalva@ncbi.nlm.nih.gov

For feature requests or bug reports, please open an issue on `our GitHub Repository <https://github.com/ncbi/pm4ngs>`__.

Expand Down
166 changes: 117 additions & 49 deletions src/pm4ngs/taxonomy.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,37 @@
import os
import pickle
import requests
import shutil
import tarfile
import tempfile
import shutil

import networkx as nx
import pandas
import requests


def parse_nodes_file(node_file, taxid):
with open(node_file, 'r') as fin:
for line in fin:
f = line.strip().split('\t|\t')
node = {}
node['id'] = f[0]
node['name'] = taxid[f[0]]['scientific name']
edge = ()
if f[1] != node['id']:
edge = (f[1], node['id'])
yield (f[0], node), edge


def parse_tax_name_file(name_file):
tax_id = {}
with open(name_file, 'r') as fin:
for line in fin:
line = line.strip()
f = line.split('\t|\t')
if f[0] not in tax_id:
tax_id[f[0]] = {}
tax_id[f[0]][f[3].replace('\t|', '').strip()] = f[1]
return tax_id


class Taxonomy:
Expand All @@ -16,15 +42,23 @@ def __init__(self, *args, **kwargs):
tax_pickle_file = kwargs.get('tax_pickle_file', None)
group_pickle_file = kwargs.get('group_pickle_file', None)
self.taxonomy_groups = kwargs.get('taxonomy_groups', {
'bacteria': {'taxid': '2', 'nodes': set(), 'sequences': set()},
'archaea': {'taxid': '2157', 'nodes': set(), 'sequences': set()},
'viridiplantae': {'taxid': '33090', 'nodes': set(), 'sequences': set()},
'fungi': {'taxid': '4751', 'nodes': set(), 'sequences': set()},
'arthropoda': {'taxid': '6656', 'nodes': set(), 'sequences': set()},
'chordata': {'taxid': '7711', 'nodes': set(), 'sequences': set()},
'metazoa': {'taxid': '33208', 'nodes': set(), 'sequences': set()},
'eukaryota': {'taxid': '2759', 'nodes': set(), 'sequences': set()},
'viruses': {'taxid': '10239', 'nodes': set(), 'sequences': set()},
'bacteria': {'taxid': '2', 'nodes': set(), 'sequences': set(), 'size': 0},
'archaea': {'taxid': '2157', 'nodes': set(), 'sequences': set(), 'size': 0},
'viridiplantae': {'taxid': '33090', 'nodes': set(), 'sequences': set(), 'size': 0},
'fungi': {'taxid': '4751', 'nodes': set(), 'sequences': set(), 'size': 0},
'arthropoda': {'taxid': '6656', 'nodes': set(), 'sequences': set(), 'size': 0},
'neoteleostei': {'taxid': '123365', 'nodes': set(), 'sequences': set(), 'size': 0},
'actinopterygii': {'taxid': '7898', 'nodes': set(), 'sequences': set(), 'size': 0},
'glires': {'taxid': '314147', 'nodes': set(), 'sequences': set(), 'size': 0},
'primates': {'taxid': '9443', 'nodes': set(), 'sequences': set(), 'size': 0},
'carnivora': {'taxid': '33554', 'nodes': set(), 'sequences': set(), 'size': 0},
'artiodactyla': {'taxid': '91561', 'nodes': set(), 'sequences': set(), 'size': 0},
'amphibia': {'taxid': '8292', 'nodes': set(), 'sequences': set(), 'size': 0},
'sauropsida': {'taxid': '8457', 'nodes': set(), 'sequences': set(), 'size': 0},
'sarcopterygii': {'taxid': '8287', 'nodes': set(), 'sequences': set(), 'size': 0},
'chordata': {'taxid': '7711', 'nodes': set(), 'sequences': set(), 'size': 0},
'eukaryota': {'taxid': '2759', 'nodes': set(), 'sequences': set(), 'size': 0},
'viruses': {'taxid': '10239', 'nodes': set(), 'sequences': set(), 'size': 0},
})
self.tax = nx.DiGraph()
if name_file and node_file:
Expand Down Expand Up @@ -80,33 +114,30 @@ def find_node(self, id):
return nodes[0], a
return None, None

def parse_nodes_file(self, node_file, taxid):
with open(node_file, 'r') as fin:
for line in fin:
f = line.strip().split('\t|\t')
node = {}
node['id'] = f[0]
node['name'] = taxid[f[0]]['scientific name']
edge = ()
if f[1] != node['id']:
edge = (f[1], node['id'])
yield (f[0], node), edge

def parse_tax_name_file(self, name_file):
tax_id = {}
with open(name_file, 'r') as fin:
for line in fin:
line = line.strip()
f = line.split('\t|\t')
if f[0] not in tax_id:
tax_id[f[0]] = {}
tax_id[f[0]][f[3].replace('\t|', '').strip()] = f[1]
return tax_id
def find_node_by_name(self, name):
for n in self.tax.nodes(data=True):
if n[1]['name'].casefold() == name.casefold():
return n
return None

def get_node_lineage(self, node):
a = ""
for i in nx.shortest_path(self.tax, source="1", target=node[0])[2:]:
ns = [y for x, y in self.tax.nodes(data=True) if y['id'] == i]
if ns:
if a:
a += "; "
a += ns[0]['name']
return a

def get_lineage_by_name(self, name):
n = self.find_node_by_name(name)
return self.get_node_lineage(n)

def create_taxonomy_graph(self, name_file, node_file):
tax_id = self.parse_tax_name_file(name_file)
tax_id = parse_tax_name_file(name_file)
print('Taxonomies: {}'.format(len(tax_id)))
entries = self.parse_nodes_file(node_file, tax_id)
entries = parse_nodes_file(node_file, tax_id)
nodes, edges = zip(*entries)
print('{} nodes created'.format(len(nodes)))
self.tax.add_nodes_from(nodes)
Expand All @@ -124,27 +155,64 @@ def get_taxonomy_group_nodes(self, taxid, to_exclude):
list(set(self.taxonomy_groups[taxid]['nodes']) - to_exclude)
print('{} with {} taxa'.format(taxid, len(self.taxonomy_groups[taxid]['nodes'])))

def add_sequences_sizefrom_gtax_idx(self, taxonomy_group):
if os.path.exists('{}.idx'.format(taxonomy_group)):
df = pandas.read_csv('{}.idx'.format(taxonomy_group), sep='\t', header=None)
print('{} sequences loaded from the index'.format(len(df)))
seq = {}
for g in df.groupby(2):
seq[str(g[0])] = {'sequences': set(g[1][0].unique()), 'size': g[1][3].sum()}
nx.set_node_attributes(self.tax, seq)

def create_taxonomy_groups(self):
inserted = set()
nodes = self.tax.nodes(data=True)
for k in self.taxonomy_groups:
if k != 'metazoa' and k != 'eukaryota':
self.taxonomy_groups[k]['nodes'] = \
self.get_successors(self.taxonomy_groups[k]['taxid'])
print('{} with {} taxa'.format(k, len(self.taxonomy_groups[k]['nodes'])))

to_exclude = set(list(self.taxonomy_groups['arthropoda']['nodes'])
+ list(self.taxonomy_groups['chordata']['nodes']))
self.get_taxonomy_group_nodes('metazoa', to_exclude)
to_exclude = set(
list(to_exclude) +
list(self.taxonomy_groups['viridiplantae']['nodes']) +
list(self.taxonomy_groups['fungi']['nodes']) +
list(self.taxonomy_groups['metazoa']['nodes']))
self.get_taxonomy_group_nodes('eukaryota', to_exclude)
self.taxonomy_groups[k]['nodes'] = self.get_successors(self.taxonomy_groups[k]['taxid']).difference(
inserted)
inserted.update(self.taxonomy_groups[k]['nodes'])
self.add_sequences_sizefrom_gtax_idx(k)
for node_id in self.taxonomy_groups[k]['nodes']:
node_id = str(node_id)
if 'size' in nodes[node_id]:
self.taxonomy_groups[k]['sequences'].update(nodes[node_id]['sequences'])
self.taxonomy_groups[k]['size'] += nodes[node_id]['size']

def create_pickle(self, tax_pickle_file, group_pickle_file):
print('Printing tax graph pickle file')
pickle.dump(self.tax, open(tax_pickle_file, "wb"))
print('Printing tax group pickle file')
pickle.dump(self.taxonomy_groups, open(group_pickle_file, "wb"))

def print_size(self, node_name, deep=1, step=1, min_size=10.0, min_size_child=50.0):
nodes = self.tax.nodes(data=True)
node = self.find_node_by_name(node_name)
size = 0
taxa = 0
for node_id in self.successors(node[1]['id']):
if 'sequences' in nodes[node_id]:
taxa += 1
size += nodes[node_id]['size']
size = size / 1e+9
if size >= min_size:
print('{}{} {} => {:.2f} GB'.format(' ' * step, node_name, node[1]['id'], size))
step += 1
if step < deep and size >= min_size_child:
for t in self.tax.successors(node[1]['id']):
if t != node[1]['id']:
self.print_size(self.find_node(t)[0]['name'], deep, step, min_size, min_size_child)

def resume(self):
data = []
nodes = self.tax.nodes(data=True)
for k in self.taxonomy_groups:
taxas = 0
for n in self.taxonomy_groups[k]['nodes']:
if 'sequences' in nodes[str(n)]:
taxas += 1
data.append([k, len(self.taxonomy_groups[k]['nodes']),
taxas,
len(self.taxonomy_groups[k]['sequences']),
round(self.taxonomy_groups[k]['size'] / 1e+9, 2)])
return pandas.DataFrame(data, columns=['Taxonomy', 'Taxas', 'Taxas with sequences', 'Sequences', 'Size (GB)'])

0 comments on commit c724fdc

Please sign in to comment.