In [40]:
import networkx as nx
import pandas as pd

data_path = "handins/handin4/nodes.dmp"

re_expression = "[|\t]+"

# only read the first three columns
df = pd.read_csv(
    data_path, sep=re_expression, header=None, engine="python", usecols=[0, 1, 2]
)

df.columns = ["Node ID", "Parent Node ID", "rank"]

df.head(10)

Unnamed: 0,Node ID,Parent Node ID,rank
0,1,1,no rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species
5,10,1706371,genus
6,11,1707,species
7,13,203488,genus
8,14,13,species
9,16,32011,genus


# 1. Store the data as a rooted tree

there is a cycle from node 1 to it self. It is however the only cylce in the graph, as if i remove this node, the graph becomes acyclic

In [10]:
# remove the first and only cyclic edge to make it a DAG
df = df[1:]

In [75]:
# Store the data as a graph
def build(df: pd.DataFrame) -> nx.DiGraph:
    G = nx.DiGraph()
    for _, row in df.iterrows():
        parent_node_id = row['Parent Node ID']
        node_id = row['Node ID']

        # Add nodes
        if not G.has_node(parent_node_id):
            G.add_node(parent_node_id, name=parent_node_id)
        
        G.add_node(node_id, name=node_id)
        
        # Add edges
        #if parent_node_id != node_id: # no cycles
        G.add_edge(parent_node_id, node_id)
    return G

total_G = build(df)
#total_G.number_of_nodes() # number of nodes in graph

In [80]:
total_G.remove_edge(1, 1) # remove the one cycle there is in the graph
print(nx.is_tree(total_G))
print(nx.is_directed_acyclic_graph(total_G))

True
True


# 2. Is the NBCI taxonomy a rooted tree, a directed acyclic graph, or a directed graph with cycles?

In [43]:
# Check if the tree is a rooted tree
nx.is_tree(total_G)

False

In [26]:
# topological sorting to determine if DAG
is_dag = nx.is_directed_acyclic_graph(total_G)
print(f"Is the graph a DAG: {is_dag}")

if is_dag:
    num_nodes = len(total_G.nodes())
    num_edges = len(total_G.edges())
    in_degrees = dict(total_G.in_degree())

    if num_edges == num_nodes - 1:  # property of trees
        root_count = sum(1 for node, in_degree in in_degrees.items() if in_degree == 1)
        if root_count == 1:
            print("The DAG is a rooted tree.")
        else:
            print("The graph is DAG but not a rooted tree (multiple roots).")
    else:
        print("The graph is a DAG but not a rooted tree (not enough edges).")
else:
    print("The graph is not a DAG.")

Is the graph a DAG: True
The DAG is a DAG but not a rooted tree (multiple roots).


# 3. how many nodes are there in the NCBI taxonomy?

In [39]:
print(f"There are {len(df['Node ID'])} nodes in the NCBI taxonomy")

There are 2442790 nodes in the NCBI taxonomy


# 4. Restric the rooted tree to the seven standard taxonomic ranks

the correct number of nodes is 2111999

In [None]:
standard_taxonomic_ranks = [
    "superkingdom", # is actually kingdom
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species",
]

non_standard_nodes = list(df[~df['rank'].isin(standard_taxonomic_ranks)]['Node ID'].values) # list of nodes ID with non-standard taxonomic rank

In [95]:
restricted_tree = total_G.copy()

for node_ID in non_standard_nodes:
    if (parent_node_ID := [c for c, _ in restricted_tree.in_edges(node_ID)]) and (
        (parent_node_ID) != node_ID
    ):
        parent_node_ID = parent_node_ID[0]
        
        # connect each child node from current node to the parent node of current node
        for child_node_ID in [child for _, child in restricted_tree.out_edges(node_ID)]:
            restricted_tree.add_edge(parent_node_ID, child_node_ID)

        # remove current node
        restricted_tree.remove_node(node_ID)
    else:
        continue

print(nx.is_tree(restricted_tree))
print(nx.is_directed_acyclic_graph(restricted_tree))
print(restricted_tree.number_of_nodes())

# 5.  What is the name of the kingdom taxonomic rank in the NCBI taxonomy?

In [24]:
df[df['rank'].isin(standard_taxonomic_ranks)]

Unnamed: 0,Node ID,Parent Node ID,rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species
5,10,1706371,genus
...,...,...,...
2442784,2978044,475815,species
2442785,2978049,651142,family
2442786,2978050,2978049,genus
2442787,2978051,2978049,genus


In [None]:
for idx, node in df[df['Parent Node ID'] == 1].iterrows():
    #parent_node_id = node['Parent Node ID']
    #print(node['Parent Node ID'])
    #print(df[df['Node ID'] == parent_node_id]['rank'])

Parent Node ID          1
Node ID                 1
rank              no rank
Name: 0, dtype: object


# Other