In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [4]:
# Load the filtered taxonomy annotation table; the first CSV column becomes the index.
df_annot_flt = pd.read_csv(
    filepath_or_buffer='./df_annot_flt_en.csv',
    index_col=0,
)
df_annot_flt

Unnamed: 0_level_0,meta_name,meta_rank,meta_WikiPage,meta_AlsoKnownAs,meta_ThumbnailURL,meta_LatinName,Nucleotide_Count,parent_taxid
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2842321,Kolmioviridae,family,,,,Kolmioviridae,,10239
39759,Deltavirus,genus,,,,Deltavirus,,2842321
12475,Hepatitis D virus,species,[Hepatitis D virus](https://en.wikipedia.org/w...,"Hepatitis D virus,Hepatitis D Virus,HDV",,Hepatitis delta virus,3860.0,39759
185752,Avsunviroidae,family,[Avsunviroidae](https://en.wikipedia.org/wiki/...,,,Avsunviroidae,,10239
185759,Pelamoviroid,genus,,,,Pelamoviroid,,185752
...,...,...,...,...,...,...,...,...
695564,Erebidae,family,[Erebidae](https://en.wikipedia.org/wiki/Erebi...,,https://upload.wikimedia.org/wikipedia/commons...,Erebidae,,7088
39465,Hyphantria,genus,[Hyphantria](https://en.wikipedia.org/wiki/Hyp...,,https://upload.wikimedia.org/wikipedia/commons...,Hyphantria,,695564
39466,Fall webworm,species,[Fall webworm](https://en.wikipedia.org/wiki/F...,Hyphantria cunea,https://upload.wikimedia.org/wikipedia/commons...,Hyphantria cunea,831.0,39465
289280,Arctia,genus,[Arctia](https://en.wikipedia.org/wiki/Arctia),,https://upload.wikimedia.org/wikipedia/commons...,Arctia,,695564


In [5]:
# Taxonomic ranks that are kept in a lineage; nodes with any other rank are skipped.
selected_ranks = {'species', 'genus', 'family', 'order', 'class', 'phylum', 'kingdom', 'superkingdom'}


def get_taxa_path(index: int, path=None, df_annot=None):
    """Collect the lineage of a taxon as a list of taxids, walking up to the root.

    Starting at ``index``, follows the ``parent_taxid`` column of ``df_annot``
    until the node with taxid 1 is reached. A node is recorded when its
    ``meta_rank`` is in ``selected_ranks``; the node with taxid 1 is always
    recorded and ends the walk.

    Parameters
    ----------
    index : int
        Taxid (index label of ``df_annot``) to start from.
    path : list, optional
        List to append to. It is extended in place and returned. When omitted,
        a fresh list is created for every call (the previous ``path=[]`` default
        was shared between calls, so results leaked from one call into the next).
    df_annot : pandas.DataFrame
        Table indexed by taxid with ``meta_rank`` and ``parent_taxid`` columns.

    Returns
    -------
    list of int
        Recorded taxids ordered from the starting taxon up to taxid 1.

    Raises
    ------
    ValueError
        If ``df_annot`` is not supplied, or the ``parent_taxid`` links form a cycle.
    KeyError
        If a taxid on the way up is missing from ``df_annot``.
    """
    if df_annot is None:
        raise ValueError('df_annot is required: pass the taxonomy annotation DataFrame')
    if path is None:
        path = []

    # Iterative walk instead of recursion: no recursion-depth limit on deep trees.
    # `visited` keeps a broken (cyclic) parent chain from looping forever.
    visited = set()
    while True:
        if index in visited:
            raise ValueError(f'cycle detected in parent_taxid links at taxid {index}')
        visited.add(index)

        cur_row = df_annot.loc[index]
        is_root = cur_row.name == 1
        if is_root or cur_row['meta_rank'] in selected_ranks:
            # Taxids only; for debugging, names/ranks can be looked up from df_annot.
            path.append(int(cur_row.name))

        if is_root:
            return path
        index = cur_row['parent_taxid']

# Spot-check the traversal on a single taxid.
get_taxa_path(9606, path=[], df_annot=df_annot_flt)

[9606, 9605, 9604, 9443, 40674, 7711, 33208, 2759, 1]

In [6]:
# For every species row, collect its lineage (without the trailing root taxid)
# and remember every taxid seen on the way up, root included.
linages = []
used_taxids = []
species_rows = df_annot_flt.loc[df_annot_flt['meta_rank'] == 'species']
for species_taxid in species_rows.index:
    full_path = get_taxa_path(species_taxid, [], df_annot_flt)
    linages.append(full_path[:-1])
    used_taxids.extend(full_path)
len(used_taxids)

65743

In [7]:
# Taxa abundancies in the raw EN tree: how often each taxid occurs across all
# collected species lineages.
taxa_abundancy_raw = (
    pd.Series(used_taxids)
    .value_counts()
    .rename('taxa_abundancy')
    .rename_axis('taxid')
)
# Attach the counts to the Latin name / rank columns, most frequent taxa first.
taxa_abundancy = (
    df_annot_flt[['meta_LatinName', 'meta_rank']]
    .join(taxa_abundancy_raw)
    .sort_values('taxa_abundancy', ascending=False)
)
taxa_abundancy.loc[taxa_abundancy['meta_rank'] == 'superkingdom']

Unnamed: 0_level_0,meta_LatinName,meta_rank,taxa_abundancy
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2759,Eukaryota,superkingdom,5351
2,Bacteria,superkingdom,2073
10239,Viruses,superkingdom,155
2157,Archaea,superkingdom,27


In [8]:
# Keep only the lineages whose last (top-most) taxid is 10239 (Viruses).
linages_vir = list(filter(lambda lineage: lineage[-1] == 10239, linages))

In [9]:
# Genome-type label per phylum, keyed as '<LatinName>__<taxid>' to match the
# labels used in the lineage table.
# NOTE(review): '+', '-' and 'ds' presumably mean positive-sense, negative-sense
# and double-stranded RNA — confirm.
phylum2type = dict([
    ('Pisuviricota__2732408', '+'),
    ('Lenarviricota__2732407', '+'),
    ('Kitrinoviricota__2732406', '+'),
    ('Negarnaviricota__2497569', '-'),
    ('Duplornaviricota__2732405', 'ds'),
])

In [10]:
# Build one single-row frame per viral lineage: the row label is the first taxid
# of the lineage, the columns are the ranks, the cells are '<LatinName>__<taxid>'.
lineage_frames = []
for lineage in reversed(linages_vir):
    row_label = lineage[0]
    rank_to_label = {}
    for node_taxid in lineage:
        node_rank = taxa_abundancy.loc[node_taxid, 'meta_rank']
        node_latin = taxa_abundancy.loc[node_taxid, 'meta_LatinName']
        rank_to_label[node_rank] = node_latin + f'__{node_taxid}'
    lineage_frames.append(pd.DataFrame({row_label: rank_to_label}).T)

# Stack the rows, attach nucleotide counts, label each phylum with its genome
# type, drop rows whose phylum has no entry in phylum2type, largest counts first.
df_vir_linages = (
    pd.concat(lineage_frames)
    .join(df_annot_flt[['Nucleotide_Count']])
    .astype({'Nucleotide_Count': int})
    .assign(Type=lambda frame: frame.phylum.map(phylum2type))
    .rename_axis('taxid')
    .dropna(subset=['Type'])
    .sort_values('Nucleotide_Count', ascending=False)
)
df_vir_linages.to_csv('./vir_linages.csv')
df_vir_linages

Unnamed: 0_level_0,species,genus,family,order,class,phylum,kingdom,superkingdom,Nucleotide_Count,Type
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
694009,Severe acute respiratory syndrome-related coro...,Betacoronavirus__694002,Coronaviridae__11118,Nidovirales__76804,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,8959662,+
11320,Influenza A virus__11320,Alphainfluenzavirus__197911,Orthomyxoviridae__11308,Articulavirales__2499411,Insthoviricetes__2497577,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,1117536,-
11103,Hepacivirus C__11103,Hepacivirus__11102,Flaviviridae__11050,Amarillovirales__2732545,Flasuviricetes__2732462,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,273723,+
11983,Norwalk virus__11983,Norovirus__142786,Caliciviridae__11974,Picornavirales__464095,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,60009,+
12637,Dengue virus__12637,Flavivirus__11051,Flaviviridae__11050,Amarillovirales__2732545,Flasuviricetes__2732462,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,50000,+
...,...,...,...,...,...,...,...,...,...,...
33760,Prune dwarf virus__33760,Ilarvirus__12316,Bromoviridae__39740,Martellivirales__2732544,Alsuviricetes__2732461,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,556,+
12844,Sweet potato feathery mottle virus__12844,Potyvirus__12195,Potyviridae__39729,Patatavirales__2732550,Stelpaviricetes__2732507,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,538,+
12169,Potato virus S__12169,Carlavirus__12163,Betaflexiviridae__675068,Tymovirales__675063,Alsuviricetes__2732461,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,527,+
2560743,Schmallenberg orthobunyavirus__2560743,Orthobunyavirus__11572,Peribunyaviridae__1980416,Bunyavirales__1980410,Ellioviricetes__2497576,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,522,-


In [11]:
# Direct children of taxid 2732396 (Orthornavirae) in the annotation table.
df_annot_flt.loc[df_annot_flt['parent_taxid'].eq(2732396)]

Unnamed: 0_level_0,meta_name,meta_rank,meta_WikiPage,meta_AlsoKnownAs,meta_ThumbnailURL,meta_LatinName,Nucleotide_Count,parent_taxid
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
10993,Birnaviridae,family,[Birnaviridae](https://en.wikipedia.org/wiki/B...,,https://upload.wikimedia.org/wikipedia/commons...,Birnaviridae,,2732396
2732407,Lenarviricota,phylum,[Lenarviricota](https://en.wikipedia.org/wiki/...,,https://upload.wikimedia.org/wikipedia/commons...,Lenarviricota,,2732396
2732405,Duplornaviricota,phylum,[Duplornaviricota](https://en.wikipedia.org/wi...,,,Duplornaviricota,,2732396
2497569,Negative-strand RNA virus,phylum,[Negative-strand RNA virus](https://en.wikiped...,,https://upload.wikimedia.org/wikipedia/commons...,Negarnaviricota,,2732396
2732406,Kitrinoviricota,phylum,[Kitrinoviricota](https://en.wikipedia.org/wik...,,,Kitrinoviricota,,2732396
2732408,Pisuviricota,phylum,[Pisuviricota](https://en.wikipedia.org/wiki/P...,,https://upload.wikimedia.org/wikipedia/commons...,Pisuviricota,,2732396


In [13]:
# Number of selected viral species per phylum.
df_vir_linages['phylum'].value_counts()

phylum
Pisuviricota__2732408        39
Kitrinoviricota__2732406     34
Negarnaviricota__2497569     15
Duplornaviricota__2732405     6
Lenarviricota__2732407        1
Name: count, dtype: int64

In [14]:
# Number of selected viral species per genome-type label.
df_vir_linages['Type'].value_counts()

Type
+     74
-     15
ds     6
Name: count, dtype: int64

## Select species for dataset

In [30]:
# All species labelled 'ds'.
ds_sp = df_vir_linages.loc[df_vir_linages['Type'].eq('ds')]
ds_sp

Unnamed: 0_level_0,species,genus,family,order,class,phylum,kingdom,superkingdom,Nucleotide_Count,Type
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
38170,Avian orthoreovirus__38170,Orthoreovirus__10882,Reoviridae__10880,Reovirales__2732541,Resentoviricetes__2732459,Duplornaviricota__2732405,Orthornavirae__2732396,Viruses__10239,4363,ds
1157337,Piscine orthoreovirus__1157337,Orthoreovirus__10882,Reoviridae__10880,Reovirales__2732541,Resentoviricetes__2732459,Duplornaviricota__2732405,Orthornavirae__2732396,Viruses__10239,2478,ds
351073,Mammalian orthoreovirus__351073,Orthoreovirus__10882,Reoviridae__10880,Reovirales__2732541,Resentoviricetes__2732459,Duplornaviricota__2732405,Orthornavirae__2732396,Viruses__10239,2366,ds
40054,Epizootic hemorrhagic disease virus__40054,Orbivirus__10892,Reoviridae__10880,Reovirales__2732541,Resentoviricetes__2732459,Duplornaviricota__2732405,Orthornavirae__2732396,Viruses__10239,1773,ds
10990,Rice black streaked dwarf virus__10990,Fijivirus__10988,Reoviridae__10880,Reovirales__2732541,Resentoviricetes__2732459,Duplornaviricota__2732405,Orthornavirae__2732396,Viruses__10239,1149,ds
77763,Banna virus__77763,Seadornavirus__208294,Reoviridae__10880,Reovirales__2732541,Resentoviricetes__2732459,Duplornaviricota__2732405,Orthornavirae__2732396,Viruses__10239,506,ds


In [31]:
# All species labelled '-'.
neg_sp = df_vir_linages.loc[df_vir_linages['Type'].eq('-')]
neg_sp

Unnamed: 0_level_0,species,genus,family,order,class,phylum,kingdom,superkingdom,Nucleotide_Count,Type
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11320,Influenza A virus__11320,Alphainfluenzavirus__197911,Orthomyxoviridae__11308,Articulavirales__2499411,Insthoviricetes__2497577,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,1117536,-
162145,Human metapneumovirus__162145,Metapneumovirus__162387,Pneumoviridae__11244,Mononegavirales__11157,Monjiviricetes__2497574,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,12076,-
12814,Respiratory syncytial virus__12814,,Pneumoviridae__11244,Mononegavirales__11157,Monjiviricetes__2497574,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,4243,-
186538,Zaire ebolavirus__186538,Ebolavirus__186536,Filoviridae__11266,Mononegavirales__11157,Monjiviricetes__2497574,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,3862,-
1980486,Puumala orthohantavirus__1980486,Orthohantavirus__1980442,Hantaviridae__1980413,Bunyavirales__1980410,Ellioviricetes__2497576,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,3641,-
11620,Lassa mammarenavirus__11620,Mammarenavirus__1653394,Arenaviridae__11617,Bunyavirales__1980410,Ellioviricetes__2497576,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,2777,-
31604,Small ruminant morbillivirus__31604,Morbillivirus__11229,Paramyxoviridae__11158,Mononegavirales__11157,Monjiviricetes__2497574,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,2280,-
1933309,Oropouche orthobunyavirus__1933309,Orthobunyavirus__11572,Peribunyaviridae__1980416,Bunyavirales__1980410,Ellioviricetes__2497576,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,2158,-
1980917,Salmonid novirhabdovirus__1980917,Novirhabdovirus__186778,Rhabdoviridae__11270,Mononegavirales__11157,Monjiviricetes__2497574,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,1026,-
12331,Rice stripe tenuivirus__12331,Tenuivirus__12329,Phenuiviridae__1980418,Bunyavirales__1980410,Ellioviricetes__2497576,Negarnaviricota__2497569,Orthornavirae__2732396,Viruses__10239,803,-


In [32]:
# '+' species: the table is sorted by Nucleotide_Count (descending), so this
# skips the single largest '+' entry and keeps the next 16 rows.
# NOTE(review): the 1:17 window is a hand-picked choice — confirm the rationale.
positive_mask = df_vir_linages['Type'].eq('+')
pos_sp = df_vir_linages.loc[positive_mask].iloc[1:17]
pos_sp

Unnamed: 0_level_0,species,genus,family,order,class,phylum,kingdom,superkingdom,Nucleotide_Count,Type
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1
11103,Hepacivirus C__11103,Hepacivirus__11102,Flaviviridae__11050,Amarillovirales__2732545,Flasuviricetes__2732462,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,273723,+
11983,Norwalk virus__11983,Norovirus__142786,Caliciviridae__11974,Picornavirales__464095,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,60009,+
12637,Dengue virus__12637,Flavivirus__11051,Flaviviridae__11050,Amarillovirales__2732545,Flasuviricetes__2732462,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,50000,+
138948,Enterovirus A__138948,Enterovirus__12059,Picornaviridae__12058,Picornavirales__464095,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,44425,+
28344,Porcine reproductive and respiratory syndrome ...,,Arteriviridae__76803,Nidovirales__76804,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,41373,+
11082,West Nile virus__11082,Flavivirus__11051,Flaviviridae__11050,Amarillovirales__2732545,Flasuviricetes__2732462,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,23308,+
694014,Avian coronavirus__694014,Gammacoronavirus__694013,Coronaviridae__11118,Nidovirales__76804,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,18179,+
12110,Foot-and-mouth disease virus__12110,Aphthovirus__12109,Picornaviridae__12058,Picornavirales__464095,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,15462,+
138950,Enterovirus C__138950,Enterovirus__12059,Picornaviridae__12058,Picornavirales__464095,Pisoniviricetes__2732506,Pisuviricota__2732408,Orthornavirae__2732396,Viruses__10239,15075,+
1678143,Orthohepevirus A__1678143,Orthohepevirus__1678141,Hepeviridae__291484,Hepelivirales__2732543,Alsuviricetes__2732461,Kitrinoviricota__2732406,Orthornavirae__2732396,Viruses__10239,11182,+


In [35]:
# Combine the three selections into the final dataset.
dataset_sp = pd.concat([ds_sp, neg_sp, pos_sp])

# Full table with lineage columns, indexed by taxid.
dataset_sp.to_csv('./species.csv')

# Plain list of taxids, one per line, no header and no index column.
# `header=False` is the documented way to suppress the header row; the previous
# `header=None` only worked because None happens to be falsy.
dataset_sp.index.to_series().to_csv('./taxids.txt', header=False, index=False)