In [1]:
import networkx as nx
import pandas as pd
from tqdm import tqdm
from IPython.display import display

In [2]:
data_path = "./nodes.dmp"

re_expression = "[|\t]+"

# only read the first three columns
df = pd.read_csv(
    data_path, sep=re_expression, header=None, engine="python", usecols=[0, 1, 2]
)

df.columns = ["Parent Node ID", "Node ID", "rank"]

In [3]:
df.head(30)

Unnamed: 0,Parent Node ID,Node ID,rank
0,1,1,no rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species
5,10,1706371,genus
6,11,1707,species
7,13,203488,genus
8,14,13,species
9,16,32011,genus


In [4]:
def build(df: pd.DataFrame) -> nx.DiGraph:
    G = nx.DiGraph()
    for _, row in tqdm(df.iterrows()):
        parent_node_id = row['Parent Node ID']
        node_id = row['Node ID']

        G.add_node(parent_node_id)
        G.add_node(node_id)
        
        G.add_edge(parent_node_id, node_id)
    return G

In [5]:
total_G = build(df)

2442791it [01:58, 20680.33it/s]


In [6]:
print(f"Is the graph a DAG: {nx.is_directed_acyclic_graph(total_G)}") # topological sorting to determine if DAG

Is the graph a DAG: False


In [10]:
def is_rooted(G: nx.DiGraph) -> bool:

    root_nodes = 0
    for node in G.nodes():
        in_degree = G.in_degree(node)
        if in_degree == 0:
            print("W")
            root_nodes += 1
        elif in_degree > 1:
            print("NCBI taxonomy is not rooted because a node has more than two parents")
            return False
    if root_nodes != 1:
        print(f"NCBI taxonomy is not rooted because it has {root nodes} roots")
        return False
        
    return True
is_rooted(total_G)

NCBI taxonomy is not rooted because a node has more than two parents


False

In [11]:
def is_dag(G: nx.DiGraph) -> bool:
    # Copy the graphs so that the modifications dont affect the original graph
    G_copy = G.copy()
    
    # Remove self-loop if it exists at the root (node 1)
    if G_copy.has_edge(1, 1):
        G_copy.remove_edge(1, 1)
    
    is_dag = nx.is_directed_acyclic_graph(G_copy)
    if not is_dag:
        return False
    else:
        print("NCBI taxonomy is DAG")
        return True
    
is_dag(total_G)

NCBI taxonomy is DAG


True

there is a cycle from node 1 to it self. It is however the only cylce in the graph, as if i remove this node, the graph becomes acyclic

In [29]:
print(f"There are {len(df['Node ID'])} nodes in the NCBI taxonomy")

There are 2442791 nodes in the NCBI taxonomy


In [8]:
standard_taxonomic_ranks = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]

display(df["rank"].value_counts()[standard_taxonomic_ranks])

rank
kingdom         13
phylum         291
class          461
order         1760
family        9907
genus       104194
species    1995381
Name: count, dtype: int64

In [26]:
filtered_df = df[df['rank'].isin(standard_taxonomic_ranks)]
filtered_G = build(filtered_df)

2112007it [04:47, 7357.23it/s]


## What is the name of the kingdom taxonomic rank in the NCBI taxonomy?

In [None]:
for idx, node in df[df['Parent Node ID'] == 1].iterrows():
    #parent_node_id = node['Parent Node ID']
    #print(node['Parent Node ID'])
    #print(df[df['Node ID'] == parent_node_id]['rank'])

Parent Node ID          1
Node ID                 1
rank              no rank
Name: 0, dtype: object


# Other

In [None]:
df['rank'].value_counts()

rank
species             1995381
no rank              233587
genus                104194
strain                45245
subspecies            27134
family                 9907
varietas               9243
subfamily              3203
tribe                  2304
order                  1760
subgenus               1740
isolate                1322
serotype               1235
clade                   915
superfamily             891
forma specialis         747
forma                   633
subtribe                582
section                 479
class                   461
suborder                373
species group           347
phylum                  291
subclass                166
serogroup               140
infraorder              130
species subgroup        129
superorder               57
subphylum                32
parvorder                26
genotype                 21
subsection               21
infraclass               19
biotype                  17
kingdom                  13
morph          

## How many nodes are there in the NCBI taxonomy, once restricted to the seven standard taxonomic ranks?

In [30]:
print(f"There are {len(filtered_df)} nodes in the restricted NCBI taxonomy")

There are 2112007 nodes in the restricted NCBI taxonomy
