In [1]:
import pandas as pd
import numpy as np

In [2]:
# Load the semicolon-delimited taxonomy table; the trailing expression makes
# the notebook display its column labels.
taxonomy = pd.read_csv(filepath_or_buffer="taxonomy.csv", delimiter=";")
taxonomy.columns

Index(['acc', 'tax_id', 'rank', 'name', 'total_count', 'self_count', 'ilevel',
       'ileft', 'iright'],
      dtype='object')

## Group by SRR

### Retrieve accessions

In [3]:
# Distinct values of the "acc" column, in order of first appearance.
accessions = taxonomy.loc[:, "acc"].unique()

### Retrieve names for taxonomy ranks

In [4]:
def get_all_unique_names_by_rank(taxonomy_rank: str):
    """Return the distinct `name` values of the rows whose rank equals `taxonomy_rank`.

    Reads the module-level `taxonomy` DataFrame. The comparison is a plain
    equality test on the "rank" column, so a rank that matches no row yields
    an empty result. Names are returned in order of first appearance.
    """
    rank_matches = taxonomy["rank"] == taxonomy_rank
    matching_names = taxonomy.loc[rank_matches, "name"]
    return matching_names.unique()

In [11]:
ranks = taxonomy["rank"].unique()
# Derive the rank -> unique-names lookup from the ranks actually present in the
# data instead of a hand-maintained list, so a rank that shows up in
# taxonomy.csv can never be missing from `names`.
# Keys are str(rank), which stores a missing rank (NaN) under the key "nan".
names = {str(rank): get_all_unique_names_by_rank(rank) for rank in ranks}
ranks[0]

'subspecies'

### Generate a taxon abundance table for each rank (one row per accession)

In [27]:
def abundance_table_by_rank(rank):
    """Build a sample-by-taxon abundance table for one taxonomy rank.

    Reads the module-level `taxonomy` DataFrame and `accessions` array.

    Parameters
    ----------
    rank
        A value of the "rank" column. A missing value (NaN) selects the rows
        that have no rank.

    Returns
    -------
    pandas.DataFrame
        One row per entry of `accessions` (index named "Sample", labels are
        ``str(acc)``) and one column per distinct `name` found at that rank,
        in order of first appearance. Cells hold the summed `total_count`;
        accession/name pairs with no row are 0. If no row has the requested
        rank, the table has zero columns.

    Notes
    -----
    Rows whose `name` is missing are not counted, because `pivot_table`
    drops NaN keys.
    """
    # NaN never compares equal to itself, so unranked rows must be selected
    # with isna() rather than ==.
    if pd.isna(rank):
        rank_mask = taxonomy["rank"].isna()
    else:
        rank_mask = taxonomy["rank"] == rank
    rank_rows = taxonomy.loc[rank_mask, ["acc", "name", "total_count"]]

    sample_ids = pd.Index([str(acc) for acc in accessions], name="Sample")
    if rank_rows.empty:
        return pd.DataFrame(index=sample_ids)

    # A single pivot replaces growing the frame with concat inside a loop.
    # aggfunc="sum" also merges rows that share a name within one accession,
    # which would otherwise produce duplicate column labels.
    counts = rank_rows.pivot_table(
        index="acc",
        columns="name",
        values="total_count",
        aggfunc="sum",
        fill_value=0,
    )
    # pivot_table sorts both axes and omits accessions without rows at this
    # rank; restore the original ordering and add the missing rows as zeros.
    counts = counts.reindex(
        index=accessions,
        columns=rank_rows["name"].dropna().unique(),
        fill_value=0,
    )
    counts.index = sample_ids
    counts.columns.name = None
    return counts

In [28]:
# Write one ';'-separated abundance table per rank. Spaces in the rank label
# become underscores in the file name (e.g. "species group" -> species_group_data.csv).
for rank in ranks:
    file_stem = str(rank).replace(" ", "_")
    tmp = abundance_table_by_rank(rank)
    tmp.to_csv(f"{file_stem}_data.csv", sep=";")

subspecies


Unnamed: 0,Gorilla gorilla gorilla,Odobenus rosmarus divergens,Fragaria vesca subsp. vesca,Cucurbita pepo subsp. pepo,Perognathus longimembris pacificus,Aegilops tauschii subsp. tauschii,Colobus angolensis palliatus,Aquila chrysaetos chrysaetos,Musa acuminata subsp. malaccensis,Canis lupus familiaris,...,Culex pipiens pallens,Terrapene carolina triunguis,Peromyscus maniculatus bairdii,Olea europaea subsp. europaea,Saimiri boliviensis boliviensis,Beta vulgaris subsp. vulgaris,Hordeum vulgare subsp. vulgare,Ceratotherium simum simum,Daucus carota subsp. sativus,Pediculus humanus corporis
0,42449,0,0,0,0,2,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,0,0,0,0,0,7,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,7,0,0,0,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
207,8496,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
208,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
209,24429,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
210,9988,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
