From c724fdc4b23db4c2c0a57d168376a0b9cef9f51e Mon Sep 17 00:00:00 2001 From: Roberto Vera Alvarez Date: Fri, 4 Feb 2022 09:48:24 -0500 Subject: [PATCH] Adding NCBI taxonomy support --- docs/source/index.rst | 2 +- src/pm4ngs/taxonomy.py | 166 +++++++++++++++++++++++++++++------------ 2 files changed, 118 insertions(+), 50 deletions(-) diff --git a/docs/source/index.rst b/docs/source/index.rst index 56de45d..f233134 100644 --- a/docs/source/index.rst +++ b/docs/source/index.rst @@ -85,7 +85,7 @@ Citation Help and Support ---------------- -For query/questions regarding snakePipes, please write on biostars with the tag **#PM4NGS** +For query/questions regarding PM4NGS, please write veraalva@ncbi.nlm.nih.gov For feature requests or bug reports, please open an issue on `our GitHub Repository `__. diff --git a/src/pm4ngs/taxonomy.py b/src/pm4ngs/taxonomy.py index 3f052d0..c8da030 100644 --- a/src/pm4ngs/taxonomy.py +++ b/src/pm4ngs/taxonomy.py @@ -1,11 +1,37 @@ import os import pickle -import requests +import shutil import tarfile import tempfile -import shutil import networkx as nx +import pandas +import requests + + +def parse_nodes_file(node_file, taxid): + with open(node_file, 'r') as fin: + for line in fin: + f = line.strip().split('\t|\t') + node = {} + node['id'] = f[0] + node['name'] = taxid[f[0]]['scientific name'] + edge = () + if f[1] != node['id']: + edge = (f[1], node['id']) + yield (f[0], node), edge + + +def parse_tax_name_file(name_file): + tax_id = {} + with open(name_file, 'r') as fin: + for line in fin: + line = line.strip() + f = line.split('\t|\t') + if f[0] not in tax_id: + tax_id[f[0]] = {} + tax_id[f[0]][f[3].replace('\t|', '').strip()] = f[1] + return tax_id class Taxonomy: @@ -16,15 +42,23 @@ def __init__(self, *args, **kwargs): tax_pickle_file = kwargs.get('tax_pickle_file', None) group_pickle_file = kwargs.get('group_pickle_file', None) self.taxonomy_groups = kwargs.get('taxonomy_groups', { - 'bacteria': {'taxid': '2', 'nodes': set(), 'sequences': set()}, - 'archaea': {'taxid': '2157', 'nodes': set(), 'sequences': set()}, - 'viridiplantae': {'taxid': '33090', 'nodes': set(), 'sequences': set()}, - 'fungi': {'taxid': '4751', 'nodes': set(), 'sequences': set()}, - 'arthropoda': {'taxid': '6656', 'nodes': set(), 'sequences': set()}, - 'chordata': {'taxid': '7711', 'nodes': set(), 'sequences': set()}, - 'metazoa': {'taxid': '33208', 'nodes': set(), 'sequences': set()}, - 'eukaryota': {'taxid': '2759', 'nodes': set(), 'sequences': set()}, - 'viruses': {'taxid': '10239', 'nodes': set(), 'sequences': set()}, + 'bacteria': {'taxid': '2', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'archaea': {'taxid': '2157', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'viridiplantae': {'taxid': '33090', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'fungi': {'taxid': '4751', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'arthropoda': {'taxid': '6656', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'neoteleostei': {'taxid': '123365', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'actinopterygii': {'taxid': '7898', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'glires': {'taxid': '314147', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'primates': {'taxid': '9443', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'carnivora': {'taxid': '33554', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'artiodactyla': {'taxid': '91561', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'amphibia': {'taxid': '8292', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'sauropsida': {'taxid': '8457', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'sarcopterygii': {'taxid': '8287', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'chordata': {'taxid': '7711', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'eukaryota': {'taxid': '2759', 'nodes': set(), 'sequences': set(), 'size': 0}, + 'viruses': {'taxid': '10239', 'nodes': set(), 'sequences': set(), 'size': 0}, }) self.tax = nx.DiGraph() if name_file and node_file: @@ -80,33 +114,30 @@ def find_node(self, id): return nodes[0], a return None, None - def parse_nodes_file(self, node_file, taxid): - with open(node_file, 'r') as fin: - for line in fin: - f = line.strip().split('\t|\t') - node = {} - node['id'] = f[0] - node['name'] = taxid[f[0]]['scientific name'] - edge = () - if f[1] != node['id']: - edge = (f[1], node['id']) - yield (f[0], node), edge - - def parse_tax_name_file(self, name_file): - tax_id = {} - with open(name_file, 'r') as fin: - for line in fin: - line = line.strip() - f = line.split('\t|\t') - if f[0] not in tax_id: - tax_id[f[0]] = {} - tax_id[f[0]][f[3].replace('\t|', '').strip()] = f[1] - return tax_id + def find_node_by_name(self, name): + for n in self.tax.nodes(data=True): + if n[1]['name'].casefold() == name.casefold(): + return n + return None + + def get_node_lineage(self, node): + a = "" + for i in nx.shortest_path(self.tax, source="1", target=node[0])[2:]: + ns = [y for x, y in self.tax.nodes(data=True) if y['id'] == i] + if ns: + if a: + a += "; " + a += ns[0]['name'] + return a + + def get_lineage_by_name(self, name): + n = self.find_node_by_name(name) + return self.get_node_lineage(n) def create_taxonomy_graph(self, name_file, node_file): - tax_id = self.parse_tax_name_file(name_file) + tax_id = parse_tax_name_file(name_file) print('Taxonomies: {}'.format(len(tax_id))) - entries = self.parse_nodes_file(node_file, tax_id) + entries = parse_nodes_file(node_file, tax_id) nodes, edges = zip(*entries) print('{} nodes created'.format(len(nodes))) self.tax.add_nodes_from(nodes) @@ -124,22 +155,28 @@ def get_taxonomy_group_nodes(self, taxid, to_exclude): list(set(self.taxonomy_groups[taxid]['nodes']) - to_exclude) print('{} with {} taxa'.format(taxid, len(self.taxonomy_groups[taxid]['nodes']))) + def add_sequences_sizefrom_gtax_idx(self, taxonomy_group): + if os.path.exists('{}.idx'.format(taxonomy_group)): + df = pandas.read_csv('{}.idx'.format(taxonomy_group), sep='\t', header=None) + print('{} sequences loaded from the index'.format(len(df))) + seq = {} + for g in df.groupby(2): + seq[str(g[0])] = {'sequences': set(g[1][0].unique()), 'size': g[1][3].sum()} + nx.set_node_attributes(self.tax, seq) + def create_taxonomy_groups(self): + inserted = set() + nodes = self.tax.nodes(data=True) for k in self.taxonomy_groups: - if k != 'metazoa' and k != 'eukaryota': - self.taxonomy_groups[k]['nodes'] = \ - self.get_successors(self.taxonomy_groups[k]['taxid']) - print('{} with {} taxa'.format(k, len(self.taxonomy_groups[k]['nodes']))) - - to_exclude = set(list(self.taxonomy_groups['arthropoda']['nodes']) - + list(self.taxonomy_groups['chordata']['nodes'])) - self.get_taxonomy_group_nodes('metazoa', to_exclude) - to_exclude = set( - list(to_exclude) + - list(self.taxonomy_groups['viridiplantae']['nodes']) + - list(self.taxonomy_groups['fungi']['nodes']) + - list(self.taxonomy_groups['metazoa']['nodes'])) - self.get_taxonomy_group_nodes('eukaryota', to_exclude) + self.taxonomy_groups[k]['nodes'] = self.get_successors(self.taxonomy_groups[k]['taxid']).difference( + inserted) + inserted.update(self.taxonomy_groups[k]['nodes']) + self.add_sequences_sizefrom_gtax_idx(k) + for node_id in self.taxonomy_groups[k]['nodes']: + node_id = str(node_id) + if 'size' in nodes[node_id]: + self.taxonomy_groups[k]['sequences'].update(nodes[node_id]['sequences']) + self.taxonomy_groups[k]['size'] += nodes[node_id]['size'] def create_pickle(self, tax_pickle_file, group_pickle_file): print('Printing tax graph pickle file') @@ -147,4 +184,35 @@ def create_pickle(self, tax_pickle_file, group_pickle_file): print('Printing tax group pickle file') pickle.dump(self.taxonomy_groups, open(group_pickle_file, "wb")) + def print_size(self, node_name, deep=1, step=1, min_size=10.0, min_size_child=50.0): + nodes = self.tax.nodes(data=True) + node = self.find_node_by_name(node_name) + size = 0 + taxa = 0 + for node_id in self.successors(node[1]['id']): + if 'sequences' in nodes[node_id]: + taxa += 1 + size += nodes[node_id]['size'] + size = size / 1e+9 + if size >= min_size: + print('{}{} {} => {:.2f} GB'.format(' ' * step, node_name, node[1]['id'], size)) + step += 1 + if step < deep and size >= min_size_child: + for t in self.tax.successors(node[1]['id']): + if t != node[1]['id']: + self.print_size(self.find_node(t)[0]['name'], deep, step, min_size, min_size_child) + + def resume(self): + data = [] + nodes = self.tax.nodes(data=True) + for k in self.taxonomy_groups: + taxas = 0 + for n in self.taxonomy_groups[k]['nodes']: + if 'sequences' in nodes[str(n)]: + taxas += 1 + data.append([k, len(self.taxonomy_groups[k]['nodes']), + taxas, + len(self.taxonomy_groups[k]['sequences']), + round(self.taxonomy_groups[k]['size'] / 1e+9, 2)]) + return pandas.DataFrame(data, columns=['Taxonomy', 'Taxas', 'Taxas with sequences', 'Sequences', 'Size (GB)'])