In [98]:
import networkx as nx
import pandas as pd
from tqdm import tqdm
from IPython.display import display

In [99]:
# Location of the taxonomy node dump parsed by this cell.
data_path = "./nodes.dmp"

# Field separator given to pandas as a regular expression: any run of one or
# more pipe and/or tab characters.
re_expression = "[|\t]+"

# Keep only the columns at positions 0, 1 and 2 of every record; a regex
# separator requires the Python parsing engine.
df = pd.read_csv(
    data_path,
    sep=re_expression,
    header=None,
    engine="python",
    usecols=[0, 1, 2],
)

# Swap the positional column labels for descriptive ones.
df.columns = ["Node ID", "Parent Node ID", "rank"]

In [100]:
def read_nodes(data_path):
    """Load the first three fields of a node dump file into a DataFrame.

    Records are split on any run of pipe and/or tab characters, the file is
    read without a header row, and only the columns at positions 0, 1 and 2
    are kept.

    Parameters
    ----------
    data_path : str or path-like
        Location of the node dump file to read.

    Returns
    -------
    pandas.DataFrame
        A frame whose columns are named "Node ID", "Parent Node ID" and "rank".
    """
    column_labels = ["Node ID", "Parent Node ID", "rank"]
    field_separator = "[|\t]+"

    # A regular-expression separator is only supported by the Python engine.
    frame = pd.read_csv(
        data_path,
        sep=field_separator,
        header=None,
        engine="python",
        usecols=[0, 1, 2],
    )
    frame.columns = column_labels

    return frame

# Node table (columns "Node ID", "Parent Node ID", "rank") that the lookup
# helpers in the following cells read as a global.
nodes_df = read_nodes("./nodes.dmp")


In [200]:
# Display the loaded node table as this cell's output.
nodes_df

Unnamed: 0,Node ID,Parent Node ID,rank
0,1,1,no rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species
...,...,...,...
2442786,2978050,2978049,genus
2442787,2978051,2978049,genus
2442788,2978052,2978049,genus
2442789,2978053,2978052,no rank


In [199]:
# Spot check: select the row(s) of the node table whose node ID is 2 and show
# them as the cell output.
a = nodes_df.loc[nodes_df['Node ID'] == 2]
a

Unnamed: 0,Node ID,Parent Node ID,rank
1,2,131567,superkingdom


In [102]:
[1642257,
 2645396,
 474171,
 7198,
 7197,
 41831,
 43787,
 7148,
 7147,
 33392,
 33340,
 7496,
 85512,
 50557,
 6960,
 197562,
 197563,
 6656,
 88770,
 1206794,
 33317,
 33213,
 6072,
 33208,
 33154,
 2759,
 131567,
 1]

['superkingdom',
 'clade',
 'kingdom',
 'phylum',
 'class',
 'family',
 'subfamily',
 'genus',
 'no rank',
 'species']

In [153]:
def read_mapping(mapping_path):
    """Parse a read-to-taxon mapping file into a dictionary.

    Every non-blank line is split on whitespace. The first token is the
    sequence ID and each remaining token is converted to an ``int`` tax ID.
    Blank or whitespace-only lines (for example a trailing empty line) are
    skipped instead of raising ``IndexError``.

    Parameters
    ----------
    mapping_path : str or path-like
        Location of the mapping file.

    Returns
    -------
    dict
        Maps each sequence ID (``str``) to the list of ``int`` tax IDs on its
        line. If a sequence ID occurs on several lines, the last line wins.

    Raises
    ------
    ValueError
        If a token after the sequence ID is not a valid integer.
    """
    mappings = {}
    with open(mapping_path, 'r') as f:
        for line in f:
            tokens = line.split()
            if not tokens:
                # Nothing on this line; there is no sequence ID to record.
                continue
            sequence_id, *tax_id_tokens = tokens
            mappings[sequence_id] = [int(token) for token in tax_id_tokens]
    return mappings

def find_lineage(tax_id):
    """Return the lineage of ``tax_id`` ordered from the root down to it.

    Starting at ``tax_id``, the global ``nodes_df`` table is consulted for the
    row with that node ID, its ``(Node ID, rank)`` pair is recorded, and the
    walk moves on to the row's parent. The walk ends once the node with ID 1
    has been recorded.

    Parameters
    ----------
    tax_id : int
        Node ID at which the walk starts.

    Returns
    -------
    list of tuple
        ``(node ID, rank)`` pairs, the entry for node 1 first and the entry
        for ``tax_id`` last.

    Raises
    ------
    IndexError
        If a node ID met during the walk has no row in ``nodes_df``.
    """
    leaf_to_root = []
    current_id = tax_id

    while True:
        matching_rows = nodes_df[nodes_df['Node ID'] == current_id]
        record = matching_rows.iloc[0]
        leaf_to_root.append((record['Node ID'], record['rank']))

        # Node 1 is recorded last and terminates the walk.
        if current_id == 1:
            break
        current_id = record['Parent Node ID']

    # The walk collected entries leaf first; callers expect root first.
    leaf_to_root.reverse()
    return leaf_to_root


In [155]:
mappings = read_mapping("./mapping.txt")

# For every sequence read, compute one root-to-node lineage per mapped tax ID,
# keeping the lineages in the same order as the tax IDs.
lineages = {}
for sequence_id, tax_ids in mappings.items():
    lineages[sequence_id] = [find_lineage(tax_id) for tax_id in tax_ids]

{'R00010': [[(1, 'no rank'), (131567, 'no rank'), (2759, 'superkingdom'), (33154, 'clade'), (33208, 'kingdom'), (6072, 'clade'), (33213, 'clade'), (33317, 'clade'), (1206794, 'clade'), (88770, 'clade'), (6656, 'phylum'), (197563, 'clade'), (197562, 'clade'), (6960, 'subphylum'), (50557, 'class'), (85512, 'clade'), (7496, 'subclass'), (33340, 'infraclass'), (33392, 'cohort'), (7147, 'order'), (7148, 'suborder'), (43787, 'infraorder'), (41831, 'superfamily'), (7197, 'family'), (7198, 'subfamily'), (474171, 'genus'), (2645396, 'no rank'), (1642257, 'species')], [(1, 'no rank'), (131567, 'no rank'), (2759, 'superkingdom'), (33154, 'clade'), (33208, 'kingdom'), (6072, 'clade'), (33213, 'clade'), (33317, 'clade'), (1206794, 'clade'), (88770, 'clade'), (6656, 'phylum'), (197563, 'clade'), (197562, 'clade'), (6960, 'subphylum'), (50557, 'class'), (85512, 'clade'), (7496, 'subclass'), (33340, 'infraclass'), (33392, 'cohort'), (7399, 'order'), (156408, 'no rank'), (828009, 'species')], [(1, 'no 

In [156]:
# Ranks (second element of each pair) along the first lineage of read R00060.
[entry[1] for entry in lineages['R00060'][0]]

['no rank',
 'superkingdom',
 'clade',
 'kingdom',
 'phylum',
 'class',
 'family',
 'subfamily',
 'genus',
 'no rank',
 'species']

In [234]:
def find_ancestors(node):
    """Return the chain of node IDs from ``node`` up to the node with ID 1.

    Each step replaces the current node with the "Parent Node ID" value of its
    row in the global ``nodes_df`` table.

    Parameters
    ----------
    node : int
        Node ID at which the chain starts.

    Returns
    -------
    list
        ``node`` first, then each successive parent, ending with 1. When
        ``node`` is already 1 the result is ``[1]``.

    Raises
    ------
    IndexError
        If a node ID in the chain has no row in ``nodes_df``.
    """
    chain = [node]
    while node != 1:
        is_current = nodes_df['Node ID'] == node
        node = nodes_df.loc[is_current, 'Parent Node ID'].values[0]
        chain.append(node)
    return chain

## What is the taxonomic rank of the LCA of the mapped nodes for each of the sequence reads?

In [192]:
def find_lca(nodes):
    """Return the lowest common ancestor (LCA) of ``nodes``.

    The ancestor chain of every node (the node itself first, node 1 last) is
    obtained from ``find_ancestors``. The LCA is the first entry of the first
    node's chain that also occurs in every other chain, i.e. the shared
    ancestor closest to the nodes.

    Node IDs carry no depth information, so the LCA must be chosen by its
    position in the chain; picking the numerically largest shared ID gives a
    wrong node whenever a shallower ancestor has a larger ID than a deeper one.

    NOTE: a later cell of this notebook defines another ``find_lca`` that
    takes ``(G, node_list)``; once that cell has run, this one-argument
    version is shadowed.

    Parameters
    ----------
    nodes : iterable of int
        Node IDs whose LCA is wanted.

    Returns
    -------
    int or None
        The LCA's node ID. A node counts as its own ancestor, so the LCA of a
        single node is that node. ``None`` is returned when ``nodes`` is empty
        or the chains share no node.
    """
    lists_of_ancestors = [find_ancestors(node) for node in nodes]
    if not lists_of_ancestors:
        return None

    common_ancestors = set(lists_of_ancestors[0])
    for ancestors in lists_of_ancestors[1:]:
        common_ancestors.intersection_update(ancestors)

    # The first chain is ordered leaf to root, so its first shared entry is
    # the deepest common ancestor.
    for candidate in lists_of_ancestors[0]:
        if candidate in common_ancestors:
            return candidate
    return None

In [193]:
# Report the rank of the LCA of the nodes mapped to each read. The rank is
# read from nodes_df, the table used by every other lookup in this notebook,
# rather than from the separately loaded scratch frame.
for read, nodes in mappings.items():
    lca = find_lca(nodes)
    lca_rank = nodes_df.loc[nodes_df['Node ID'] == lca, 'rank'].values[0]
    print(f"The LCA of the mapped nodes for read {read} is at rank {lca_rank}")

The LCA of the mapped nodes for read R00010 is at rank no rank
The LCA of the mapped nodes for read R00020 is at rank no rank
The LCA of the mapped nodes for read R00030 is at rank no rank
The LCA of the mapped nodes for read R00040 is at rank no rank
The LCA of the mapped nodes for read R00050 is at rank no rank
The LCA of the mapped nodes for read R00060 is at rank no rank
The LCA of the mapped nodes for read R00070 is at rank no rank
The LCA of the mapped nodes for read R00080 is at rank no rank
The LCA of the mapped nodes for read R00090 is at rank no rank
The LCA of the mapped nodes for read R00100 is at rank no rank


## Do the sequence reads come from archaea, bacteria, eukaryota, or viruses?

In [204]:
def find_origin_of_sequence_read(sequence_read_mappings):
    """Collect the superkingdom node IDs found in the lineages of a read.

    For each mapped tax ID the lineage returned by ``find_lineage`` is scanned
    and the ID of every entry whose rank is ``'superkingdom'`` is collected.
    The ID comes straight from the lineage entry; looking it up again in the
    node table would return the same value at the cost of a full table scan.

    Parameters
    ----------
    sequence_read_mappings : iterable of int
        Tax IDs mapped to one sequence read.

    Returns
    -------
    set
        Distinct node IDs of rank ``'superkingdom'`` across all lineages;
        empty when no lineage contains such an entry.
    """
    origins = set()  # distinct superkingdom node IDs
    for tax_id in sequence_read_mappings:
        for node_id, rank in find_lineage(tax_id):
            if rank == 'superkingdom':
                origins.add(node_id)
    return origins

# Display names for the superkingdom node IDs. The mapping has its own name so
# that the built-in ``dict`` type stays usable in later cells.
superkingdom_names = {2: 'Bacteria', 2157: 'Archaea', 2759: 'Eukaryota', 10239: 'Viruses'}


for read, tax_ids in mappings.items():
    origins = find_origin_of_sequence_read(tax_ids)
    # Sets have no defined iteration order; sort the names so the printed
    # list is the same on every run.
    origins = sorted(superkingdom_names[a] for a in origins)
    print(f"Sequence read {read} maps to superkingdoms: {origins}")

Sequence read R00010 maps to superkingdoms: ['Bacteria', 'Eukaryota']
Sequence read R00020 maps to superkingdoms: ['Bacteria', 'Viruses', 'Eukaryota']
Sequence read R00030 maps to superkingdoms: ['Bacteria', 'Archaea', 'Eukaryota']
Sequence read R00040 maps to superkingdoms: ['Bacteria', 'Eukaryota']
Sequence read R00050 maps to superkingdoms: ['Bacteria', 'Viruses', 'Eukaryota']
Sequence read R00060 maps to superkingdoms: ['Eukaryota', 'Bacteria', 'Viruses']
Sequence read R00070 maps to superkingdoms: ['Bacteria', 'Eukaryota']
Sequence read R00080 maps to superkingdoms: ['Bacteria', 'Viruses', 'Eukaryota']
Sequence read R00090 maps to superkingdoms: ['Bacteria', 'Viruses', 'Archaea', 'Eukaryota']
Sequence read R00100 maps to superkingdoms: ['Bacteria', 'Archaea', 'Eukaryota']


## Given a file nodes.dmp for the NCBI taxonomy and a file mapping.txt of mappings of sequence reads to NCBI taxonomic identifiers, write a Python script to build the LCA skeleton tree for each sequence read.

In [207]:
G = nx.DiGraph()

# Add one directed edge per row of the node table, pointing from the parent
# node to the node itself. Zipping the two ID columns yields the same edges,
# in the same order, as iterating row by row, without building a Series
# object for every row.
G.add_edges_from(zip(nodes_df['Parent Node ID'], nodes_df['Node ID']))

In [212]:
# Drop the self-loop on node 1, if present; remove_edges_from silently skips
# edges that are not in the graph, so no existence check is needed.
G.remove_edges_from([(1, 1)])
    
def find_lca(G, node_list):
    """Return the lowest common ancestor (LCA) of ``node_list`` in ``G``.

    Edges of ``G`` are followed backwards (child to parent) through
    ``G.predecessors`` to build each node's chain of ancestors, with the node
    itself first. The LCA is the first entry of the first node's chain that is
    present in every other chain.

    Node IDs say nothing about depth, so the LCA cannot be chosen with
    ``min``/``max`` over the IDs: the smallest shared ID is simply the root.
    A node counts as its own ancestor, so the LCA of ``[x]`` is ``x`` and the
    LCA of a node and one of its descendants is that node.

    Assumes every node has at most one parent other than itself (a tree); if a
    node has several, only the first one reported by ``G.predecessors`` is
    followed. TODO confirm that ``G`` is always a tree.

    Parameters
    ----------
    G : directed graph
        Graph whose edges point from parent to child. Only its
        ``predecessors(node)`` method is used.
    node_list : list
        Nodes of ``G`` whose LCA is wanted.

    Returns
    -------
    node or None
        The LCA, or ``None`` when ``node_list`` is empty or the nodes share no
        ancestor.
    """
    def chain_to_root(node):
        # Climb parent links; `visited` stops the climb at a self-loop or any
        # other cycle instead of looping forever.
        chain = [node]
        visited = {node}
        while True:
            parent = next(
                (p for p in G.predecessors(chain[-1]) if p not in visited),
                None,
            )
            if parent is None:
                return chain
            chain.append(parent)
            visited.add(parent)

    if not node_list:
        return None

    first_chain = chain_to_root(node_list[0])
    common_ancestors = set(first_chain)
    for node in node_list[1:]:
        common_ancestors.intersection_update(chain_to_root(node))

    # first_chain runs leaf to root, so its first shared entry is the deepest
    # common ancestor.
    return next((node for node in first_chain if node in common_ancestors), None)

# For each read, build the subgraph of G induced by node 1, the path from
# node 1 to the read's LCA, and the path from node 1 to every mapped node.
# NOTE(review): every path starts at node 1, so each subgraph contains node 1
# regardless of where the LCA lies - confirm this is the intended tree.
lca_skeleton_tree = {}
for read_id, node_list in mappings.items():
    lca = find_lca(G, node_list)
    if lca is None:
        continue

    nodes_to_include = {1}
    # The path to the LCA does not depend on the mapped node, so it is
    # computed once per read rather than once per mapped node.
    nodes_to_include.update(nx.shortest_path(G, source=1, target=lca))
    for node in node_list:
        nodes_to_include.update(nx.shortest_path(G, source=1, target=node))

    skeleton_tree = G.subgraph(nodes_to_include)

    lca_skeleton_tree[read_id] = skeleton_tree

##  How many nodes are there in the LCA skeleton tree for each of the sequence reads?

In [222]:
# Number of nodes in each read's skeleton tree, keyed by read ID.
num_nodes_per_read = {
    read_id: len(lca_tree.nodes)
    for read_id, lca_tree in lca_skeleton_tree.items()
}

print("Number of nodes in LCA skeleton tree for each sequence read:")
for read_id, num_nodes in num_nodes_per_read.items():
    print(f"{read_id}: {num_nodes} nodes")

Number of nodes in LCA skeleton tree for each sequence read:
R00010: 109 nodes
R00020: 193 nodes
R00030: 266 nodes
R00040: 308 nodes
R00050: 399 nodes
R00060: 407 nodes
R00070: 491 nodes
R00080: 553 nodes
R00090: 578 nodes
R00100: 631 nodes


## What is the taxonomic rank of the root of the LCA skeleton tree for each of the sequence reads?

In [251]:
# Rank of the root of each read's skeleton tree, keyed by read ID.
root_rank_per_read = {}

for read_id, lca_tree in lca_skeleton_tree.items():
    # The root is taken to be the first node that has no incoming edge.
    parentless_nodes = [
        node for node, in_degree in lca_tree.in_degree() if in_degree == 0
    ]
    root_node = parentless_nodes[0]

    # Rank recorded in the node table for that root.
    is_root_row = nodes_df['Node ID'] == root_node
    root_rank = nodes_df.loc[is_root_row, 'rank'].iloc[0]
    root_rank_per_read[read_id] = root_rank

print("Taxonomic rank of the root of the LCA skeleton tree for each sequence read:")
for read_id, root_rank in root_rank_per_read.items():
    print(f"{read_id}: {root_rank}")

Taxonomic rank of the root of the LCA skeleton tree for each sequence read:
R00010: no rank
R00020: no rank
R00030: no rank
R00040: no rank
R00050: no rank
R00060: no rank
R00070: no rank
R00080: no rank
R00090: no rank
R00100: no rank
