In [1]:
import metagenompy
import numpy as np
from collections.abc import Iterable
import json

In [2]:
total_ncbi_networkx = metagenompy.generate_taxonomy_network(auto_download=False)

Parsing names: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 3442683/3442683 [00:03<00:00, 899096.74it/s]
Parsing nodes: 100%|████████████████████████████████████████████████████████████████████████████████████████████| 2344973/2344973 [00:14<00:00, 161949.70it/s]


In [3]:
def reveal_all_attributes_of_nx(temp_nx):
    '''
    shows all keys that are present at least once as "attributes" of a networkx
    '''
    attribute_set=set()
    for temp_node in temp_nx.nodes:
        #attribute_list.add(set(temp_nx.nodes[temp_node].keys()))
        #print(set(temp_nx.nodes[temp_node].keys()))
        attribute_set=attribute_set.union(temp_nx.nodes[temp_node].keys())
        #hold=input('hold')
    return attribute_set

In [4]:
a={1,2}
b={2,3}
a.union(b)
a

{1, 2}

In [5]:
all_attribute_set=reveal_all_attributes_of_nx(total_ncbi_networkx)

In [6]:
all_attribute_set

{'acronym',
 'authority',
 'blast_name',
 'common_name',
 'equivalent_name',
 'genbank_acronym',
 'genbank_common_name',
 'genbank_synonym',
 'in-part',
 'includes',
 'rank',
 'scientific_name',
 'synonym',
 'type_material'}

In [7]:
set_that_encapulates_node_id={
    'common_name',
    'genbank_common_name',
    'scientific_name'
}

In [8]:
len(total_ncbi_networkx.nodes)

2344973

In [9]:
def flatten(xs):
    '''
    given a list of elements (can contain arbitrarily nested lists)
    creates a generator? of flattned elements
    warning: strings will become lists of char
    '''
    for x in xs:
        if isinstance(x, Iterable) and not isinstance(x, (str, bytes)):
            yield from flatten(x)
        else:
            yield x

In [10]:
a=[1,2,3,[4]]
a='hello'
a=['hellow',[1,23,4]]

list(flatten(a))

['hellow', 1, 23, 4]

In [11]:
#see what the elements of each attribute look like
all_values_of_each_attribute={
    element:list() for element in all_attribute_set
}

for temp_node in total_ncbi_networkx.nodes:
    #print(temp_node)
    #print(total_ncbi_networkx.nodes[temp_node])
    for temp_attribute in total_ncbi_networkx.nodes[temp_node].keys():
        #print(np.asarray(total_ncbi_networkx.nodes[temp_node][temp_attribute]).flatten())
        #hold=input('hold')
        all_values_of_each_attribute[temp_attribute].append(
            total_ncbi_networkx.nodes[temp_node][temp_attribute]
        )
    #np.flatten(total_ncbi_networkx.nodes[a]

In [12]:
for element in all_values_of_each_attribute.keys():
    print(len(all_values_of_each_attribute[element]))
    print(element)
    print('-'*100)

10554
common_name
----------------------------------------------------------------------------------------------------
1150
acronym
----------------------------------------------------------------------------------------------------
29957
genbank_common_name
----------------------------------------------------------------------------------------------------
491203
authority
----------------------------------------------------------------------------------------------------
485
genbank_acronym
----------------------------------------------------------------------------------------------------
228
blast_name
----------------------------------------------------------------------------------------------------
56148
type_material
----------------------------------------------------------------------------------------------------
2344973
scientific_name
----------------------------------------------------------------------------------------------------
447
in-part
---------------------------

In [13]:
# for element in all_values_of_each_attribute.keys():
#     print(element)
#     hold=input('hold')
#     print(all_values_of_each_attribute[element][0:1000])
#     hold=input('hold')
    
    #print('-'*100)
    #hold=input('hold')

In [14]:
print(total_ncbi_networkx.nodes['2'])

{'rank': 'superkingdom', 'scientific_name': 'Bacteria', 'blast_name': 'bacteria', 'genbank_common_name': 'eubacteria', 'in-part': ['Monera', 'Procaryotae', 'Prokaryotae', 'Prokaryota', 'prokaryote', 'prokaryotes']}


In [15]:
list(flatten(total_ncbi_networkx.nodes['2']))

['rank', 'scientific_name', 'blast_name', 'genbank_common_name', 'in-part']

In [16]:
all_values_of_each_attribute

{'common_name': [['Scheibenbakterien', 'Scheibenbakterien Muller 1911'],
  'iron bacteria',
  'Lyme Disease Borrelia',
  'E. coli',
  'rickettsialpox',
  'agent of human granulocytic ehrlichiosis',
  'equine monocytic ehrlichiosis agent',
  ['Bacteroides-Cytophaga-Flexibacter group',
   'BCF group',
   'CFB group',
   'Cytophaga-Flexibacter-Bacteroides phylum'],
  'purple sulfur bacteria',
  'photosynthetic flexibacteria',
  ['blue-green bacteria', 'cyanophytes'],
  'prochlorophytes',
  ['purple bacteria and relatives',
   'purple bacteria',
   'purple non-sulfur bacteria',
   'purple photosynthetic bacteria and relatives',
   'purple photosynthetic bacteria'],
  ['low GC Gram+', 'low G+C Gram-positive bacteria'],
  ['anthrax bacterium', 'anthrax'],
  'Melanogaster sex ratio organism',
  'lemon',
  'Ichang papeda',
  ['apfelsine', 'naranja', 'navel orange', 'Valencia orange'],
  'eukaryotes',
  'rhodophytes',
  ['carageen', 'carrageen', 'Irish moss'],
  ['chrysomonads', 'chrysophytes']

In [17]:
all_values_of_each_attribute['scientific_name'][300000]

'Thiomonas sp. ML1-15'

In [18]:
def create_one_values_to_node_id_dict(temp_nx,temp_node,relevant_node_set):
    '''
    This takes a single node and returns a dict
    where the keys (probably many) are the nested values of the node
    and the value for each key is the node ID
    '''
    one_node_id_dict=dict()
    #we make scientific name the endpoint so that it works like the mesh hierarchies
    #scientific_name=temp_nx.nodes[temp_node][temp_attribute]
    for temp_attribute in relevant_node_set:
        #print(temp_attribute)
        if temp_attribute not in temp_nx.nodes[temp_node].keys():
            continue
        elif isinstance(temp_nx.nodes[temp_node][temp_attribute],str):
            #print(total_ncbi_networkx.nodes[temp_node][temp_attribute])
            one_node_id_dict[temp_nx.nodes[temp_node][temp_attribute]]=temp_node
        else:
            #print(set(flatten(total_ncbi_networkx.nodes[temp_node][temp_attribute])))
            temp_dict={
                element:temp_node for element in set(flatten(temp_nx.nodes[temp_node][temp_attribute]))
            }
            one_node_id_dict.update(temp_dict)
    return one_node_id_dict
    

In [19]:
create_one_values_to_node_id_dict(total_ncbi_networkx,'9606',set_that_encapulates_node_id)

{'man': '9606', 'Homo sapiens': '9606', 'human': '9606'}

In [20]:
total_ncbi_networkx.nodes['566']

{'rank': 'species',
 'synonym': ['Alma group 1',
  'API group 2',
  'CDC Enteric Group 1',
  'Enterobacter vulneris',
  'Escherichia vulneris'],
 'type_material': ['ATCC 33821',
  'CCUG 15715',
  'CDC 875-72',
  'CIP 103177',
  'DSM 4564',
  'HAMBI 1694',
  'JCM 1688',
  'LMG 7868',
  'LMG:7868',
  'NBRC 102420',
  'NCTC 12130'],
 'equivalent_name': 'Escherichia/Shigella vulneris',
 'authority': ['Escherichia vulneris Brenner et al. 1983',
  'Pseudescherichia vulneris (Brenner et al. 1983) Alnajar and Gupta 2017'],
 'scientific_name': 'Pseudescherichia vulneris'}

In [21]:
def create_all_attribute_to_node_id_dict(temp_nx,relevant_node_set):
    '''
    takes an entire networkx and features that help to differentiate nodes
    returns a dict of {attribute:node_id}
    '''
    total_feature_node_id_dict=dict()
    for i,temp_node in enumerate(temp_nx.nodes):
        #if i%100!=0:
        #    continue
        total_feature_node_id_dict.update(
            create_one_values_to_node_id_dict(temp_nx,temp_node,relevant_node_set)
        )
    #    if i >100:
    #        break
    #print(total_feature_node_id_dict)
    return total_feature_node_id_dict

In [22]:
my_dict=create_all_attribute_to_node_id_dict(total_ncbi_networkx,set_that_encapulates_node_id)

In [23]:
with open('small_ncbi_names.json', 'w') as fp:
    json.dump(my_dict, fp,indent=4)

In [24]:
#print(my_dict.keys())