In [64]:
import pandas as pd

In [65]:
# Load the tab-separated PalmDB taxonomy table (one row per label, with ranks phylum..species)
palmdb = pd.read_table("../virus-watch-data/virus_ref/u_tax.tsv")
palmdb

Unnamed: 0,Label,phylum,class,order,family,genus,species
0,u9,Kitrinoviricota,Flasuviricetes,Amarillovirales,Flaviviridae,Flavivirus,West Nile virus
1,u10,Negarnaviricota,Monjiviricetes,Mononegavirales,Filoviridae,Ebolavirus,Zaire ebolavirus
2,u6,Negarnaviricota,Insthoviricetes,Articulavirales,Orthomyxoviridae,Betainfluenzavirus,Influenza B virus
3,u13,Negarnaviricota,Monjiviricetes,Mononegavirales,Rhabdoviridae,Lyssavirus,Rabies lyssavirus
4,u1,Pisuviricota,Pisoniviricetes,Nidovirales,Coronaviridae,Betacoronavirus,Severe acute respiratory syndrome-related coro...
...,...,...,...,...,...,...,...
296618,u296621,Lenarviricota,Allassoviricetes,Levivirales,.,.,.
296619,u296608,.,.,.,.,.,.
296620,u296622,Lenarviricota,Howeltoviricetes,Cryppavirales,.,.,.
296621,u296618,Pisuviricota,Pisoniviricetes,Picornavirales,Picornaviridae,Enterovirus,Enterovirus C


Set the phylum to "Unclassified" for all labels without any classification, and to "Unclassified sp." for labels where only the species is classified:

In [66]:
# Relabel rows that have no classification above species level.
# This is done with boolean masks and .loc instead of writing to the `row` objects
# yielded by iterrows(): those may be copies, in which case the assignment is
# silently lost and palmdb stays unchanged.
higher_rank_columns = ["phylum", "class", "order", "family", "genus"]

# True where every rank from phylum down to genus is the "." placeholder
no_higher_ranks = palmdb[higher_rank_columns].eq(".").all(axis=1)
# True where the species is the "." placeholder as well
species_missing = palmdb["species"].eq(".")

# Nothing classified at all
palmdb.loc[no_higher_ranks & species_missing, "phylum"] = "Unclassified"
# Only the species is known
palmdb.loc[no_higher_ranks & ~species_missing, "phylum"] = "Unclassified sp."

In [67]:
# Show the rows that were relabelled as having only a species-level classification
palmdb.loc[palmdb["phylum"].eq("Unclassified sp.")]

Unnamed: 0,Label,phylum,class,order,family,genus,species
742,u752,Unclassified sp.,.,.,.,.,Thika virus
779,u787,Unclassified sp.,.,.,.,.,Posavirus sp.
919,u929,Unclassified sp.,.,.,.,.,Craigies Hill virus
1047,u1056,Unclassified sp.,.,.,.,.,Nora virus
1050,u1060,Unclassified sp.,.,.,.,.,Beihai picorna-like virus 15
...,...,...,...,...,...,...,...
295780,u295783,Unclassified sp.,.,.,.,.,Beihai mollusks virus 1
296408,u296412,Unclassified sp.,.,.,.,.,Thika virus
296422,u296427,Unclassified sp.,.,.,.,.,Nora virus
296572,u296577,Unclassified sp.,.,.,.,.,Drosophila A virus


Make barplot showing number of viruses per phylum, class, order etc. and then overlay with how many of these can also be found in NCBI data.  
Alternative: Make taxonomic overview with tags for which are in NCBI data.

In [68]:
def df_to_dict(df):
    """
    Build a parent -> {child: branch_length} dictionary from a data frame of tree branches,
    as a first step towards converting it to Newick format.

    Each row of the data frame (df) is read as one complete branch of the tree, ordered
    from the root (leftmost column) to the leaf (rightmost column). Cells equal to "."
    are dropped before linking, so the remaining neighbours are connected directly.
    Every edge gets the same arbitrary branch length of 1.

    Returns the formatted dictionary.

    Source:
    https://stackoverflow.com/questions/65017040/how-to-convert-pandas-dataframe-to-dictionary-for-newick-format
    """
    # Arbitrary branch length shared by all edges
    edge_length = 1
    node_to_children = {}

    # Plain tuples without the index: only the column values take part in the branch
    for record in df.itertuples(index=False, name=None):
        # Skip the "." placeholders that mark a missing level
        lineage = [taxon for taxon in record if taxon != "."]

        # Walk over consecutive (parent, child) pairs along the branch
        for parent, child in zip(lineage, lineage[1:]):
            # Create the parent entry on first sight and register the child only once
            node_to_children.setdefault(parent, {}).setdefault(child, edge_length)

    return node_to_children


def newickify(node_to_children, root_node) -> str:
    """
    Render a parent -> {child: branch_length} dictionary as a Newick string.

    The tree is walked recursively starting at root_node. Nodes that are not keys of
    node_to_children are written as leaves; all others are written as
    "(child1,child2,...)name:length". The root itself gets the arbitrary length 1.

    Raises AssertionError if an internal node is reached a second time, or if some
    key of node_to_children is never reached from root_node.

    Source:
    https://stackoverflow.com/questions/50003007/how-to-convert-python-dictionary-to-newick-form-format
    """
    # Arbitrary branch length used for the root node
    root_length = 1

    # Internal nodes that have already been expanded
    expanded = set()

    def render(label, length: float) -> str:
        assert label not in expanded, "Error: The tree may not be circular!"

        if label in node_to_children:
            # Internal node: expand all children first, then append the node itself
            expanded.add(label)
            offspring = node_to_children[label]
            rendered_children = ",".join(
                render(child, child_length) for child, child_length in offspring.items()
            )
            return f"({rendered_children}){label}:{length}"

        # Leaf: just the name and its branch length
        return f"{label}:{length}"

    newick_string = render(root_node, root_length) + ";"

    # Ensure no entries in the dictionary were left unused
    assert expanded == set(node_to_children.keys()), "Error: some nodes aren't in the tree"

    return newick_string

Exclude label and species to create a clean tree:

In [69]:
# Keep only the taxonomic rank columns (dropping label and species) and
# prepend a constant "root" column so that every branch starts at the same node
rank_columns = ["phylum", "class", "order", "family", "genus"]
phylogeny_data_clean = palmdb.assign(root="root")[["root", *rank_columns]]

## Convert data frame to node_to_children dictionary
node_to_children = df_to_dict(phylogeny_data_clean)

## Convert dictionary to Newick format
nw_string = newickify(node_to_children, root_node="root")

In [70]:
# Save the Newick string to a text file in the working directory
with open("palmdb_newick.txt", "w") as newick_file:
    newick_file.write(nw_string)

Get species counts for each genus:

In [73]:
# Load the tree end nodes; header=2 takes the column names from the third line of the file.
# The instruction column is dropped right away.
gtn = pd.read_csv("tree_endnodes.csv", header=2).drop(
    columns="↓ ↓ INSERT YOUR DATA BELOW ↓ ↓"
)
gtn

Unnamed: 0,Tree node ID,Tree node label
0,Negevirus,Negevirus
1,Unclassified sp.,Unclassified sp.
2,Fusariviridae,Fusariviridae
3,Unclassified,Unclassified
4,Botybirnavirus,Botybirnavirus
...,...,...
301,Pasivirus,Pasivirus
302,Oscivirus,Oscivirus
303,Harkavirus,Harkavirus
304,Aalivirus,Aalivirus


Species counts in PalmDB database:

In [74]:
# For every tree end node, count the PalmDB rows in which any column equals that node
palmdb_sp_counts = [
    int(palmdb.isin([end_node]).any(axis=1).sum())
    for end_node in gtn["Tree node ID"].values
]

gtn["palmdb_sp_count"] = palmdb_sp_counts
gtn

Unnamed: 0,Tree node ID,Tree node label,palmdb_sp_count
0,Negevirus,Negevirus,57
1,Unclassified sp.,Unclassified sp.,1598
2,Fusariviridae,Fusariviridae,390
3,Unclassified,Unclassified,97106
4,Botybirnavirus,Botybirnavirus,11
...,...,...,...
301,Pasivirus,Pasivirus,152
302,Oscivirus,Oscivirus,13
303,Harkavirus,Harkavirus,2
304,Aalivirus,Aalivirus,3


Species counts in NCBI/RefSeq (getting counts from http://www.virusite.org/index.php on 2023/03/28):

In [101]:
import requests
import time

In [135]:
ncbi_sp_counts = []
for end_node in gtn["Tree node ID"].values:
    # Wait one second between requests to avoid hammering the server
    time.sleep(1)
    # Note: rows is set to 15000 to override the default maximum of 25 results per page
    response = requests.get(
        f"http://www.virusite.org/index.php?nav=browse&query1={end_node}&field1=virus.name&expand=false&search_nav=virus&sort=name&order=asc&rows=15000&page=1",
        # Without a timeout, requests waits forever on a stalled connection
        timeout=120,
    )
    # Fail loudly on HTTP errors instead of counting the rows of an error page
    response.raise_for_status()

    # The last table on the page is used as the result listing; its row count is the hit count
    result_table = pd.read_html(response.content)[-1]
    ncbi_sp_counts.append(len(result_table))

gtn["ncbi_sp_count"] = ncbi_sp_counts
gtn

Unnamed: 0,Tree node ID,Tree node label,palmdb_sp_count,ncbi_sp_count,palmdb_sp_count_log,ncbi_sp_count_log
0,Negevirus,Negevirus,57,8,4.043051,2.079442
1,Unclassified sp.,Unclassified sp.,1598,28,7.376508,3.218876
2,Fusariviridae,Fusariviridae,390,14,5.966147,2.639057
3,Unclassified,Unclassified,97106,3,11.483558,0.000000
4,Botybirnavirus,Botybirnavirus,11,3,2.397895,1.098612
...,...,...,...,...,...,...
301,Pasivirus,Pasivirus,152,1,5.023881,0.000000
302,Oscivirus,Oscivirus,13,2,2.564949,0.693147
303,Harkavirus,Harkavirus,2,1,0.693147,0.000000
304,Aalivirus,Aalivirus,3,1,1.098612,0.000000


Manually check the NCBI counts for the "Unclassified" (set to 0) and "Unclassified sp." (see below) end nodes:

In [136]:
# Number of Unclassified sp. for NCBI: query the site for "unclassified viruses" instead of
# the tree node name
r = requests.get(
    "http://www.virusite.org/index.php?nav=browse&query1=unclassified+viruses&field1=virus.name&expand=false&search_nav=virus&sort=name&order=asc&rows=15000&page=1",
    # Without a timeout, requests waits forever on a stalled connection
    timeout=120,
)
# Fail loudly on HTTP errors instead of counting the rows of an error page
r.raise_for_status()
df_list = pd.read_html(r.content)
# The last table on the page is used as the result listing
df = df_list[-1]
len(df)

313

In [140]:
# Overwrite the scraped NCBI count of the first "Unclassified sp." end node with the
# row count of the manual query result
unclassified_sp_row = gtn.index[gtn["Tree node ID"] == "Unclassified sp."][0]
gtn.loc[unclassified_sp_row, "ncbi_sp_count"] = len(df)

In [141]:
# Manually override the scraped NCBI count of the first "Unclassified" end node with 0
unclassified_row = gtn.index[gtn["Tree node ID"] == "Unclassified"][0]
gtn.loc[unclassified_row, "ncbi_sp_count"] = 0

In [142]:
# Display the table again to check the two manually overridden ncbi_sp_count values
gtn

Unnamed: 0,Tree node ID,Tree node label,palmdb_sp_count,ncbi_sp_count,palmdb_sp_count_log,ncbi_sp_count_log
0,Negevirus,Negevirus,57,8,4.043051,2.079442
1,Unclassified sp.,Unclassified sp.,1598,313,7.376508,3.218876
2,Fusariviridae,Fusariviridae,390,14,5.966147,2.639057
3,Unclassified,Unclassified,97106,0,11.483558,0.000000
4,Botybirnavirus,Botybirnavirus,11,3,2.397895,1.098612
...,...,...,...,...,...,...
301,Pasivirus,Pasivirus,152,1,5.023881,0.000000
302,Oscivirus,Oscivirus,13,2,2.564949,0.693147
303,Harkavirus,Harkavirus,2,1,0.693147,0.000000
304,Aalivirus,Aalivirus,3,1,1.098612,0.000000


Log numbers:

In [143]:
from numpy import *

In [144]:
# Natural log of both count columns. ma.log masks entries for which the log is undefined
# (counts of 0) and filled(0) writes 0 in their place, so those rows are kept.
# Note that a count of 1 also gives 0.
for count_column, log_column in (
    ("palmdb_sp_count", "palmdb_sp_count_log"),
    ("ncbi_sp_count", "ncbi_sp_count_log"),
):
    masked_log = ma.log(gtn[count_column].values)
    gtn[log_column] = masked_log.filled(0)

In [145]:
# Display the table with the recomputed log columns
gtn

Unnamed: 0,Tree node ID,Tree node label,palmdb_sp_count,ncbi_sp_count,palmdb_sp_count_log,ncbi_sp_count_log
0,Negevirus,Negevirus,57,8,4.043051,2.079442
1,Unclassified sp.,Unclassified sp.,1598,313,7.376508,5.746203
2,Fusariviridae,Fusariviridae,390,14,5.966147,2.639057
3,Unclassified,Unclassified,97106,0,11.483558,0.000000
4,Botybirnavirus,Botybirnavirus,11,3,2.397895,1.098612
...,...,...,...,...,...,...
301,Pasivirus,Pasivirus,152,1,5.023881,0.000000
302,Oscivirus,Oscivirus,13,2,2.564949,0.693147
303,Harkavirus,Harkavirus,2,1,0.693147,0.000000
304,Aalivirus,Aalivirus,3,1,1.098612,0.000000


In [146]:
# Difference between PalmDB and NCBI counts, on the raw scale and on the log scale
gtn["palmdb_sp_count-ncbi_sp_count"] = gtn["palmdb_sp_count"].sub(gtn["ncbi_sp_count"])
gtn["palmdb_sp_count_log-ncbi_sp_count_log"] = gtn["palmdb_sp_count_log"].sub(
    gtn["ncbi_sp_count_log"]
)

In [149]:
# Save the end-node table together with all count, log and difference columns
gtn.to_csv(path_or_buf="tree_endnodes_with_counts.csv")

Ended up plotting "ncbi_sp_count_log" and "palmdb_sp_count_log-ncbi_sp_count_log" (replaced all negative values with 0).

___