In [3]:
import pandas as pd
import numpy as np

In [11]:
taxonomy = pd.read_csv("../taxonomy/taxonomy.csv", sep=";")
taxonomy = taxonomy.drop(['tax_id','ilevel','ileft', 'iright'], axis=1)
taxonomy.columns

Index(['acc', 'rank', 'name', 'total_count', 'self_count'], dtype='object')

## Group by SRR

### Retrieve accessions

In [5]:
accessions = taxonomy["acc"].unique()

### Retrieve names for taxonomy ranks

In [6]:
def get_all_unique_names_by_rank(taxonomy_rank: str):
    return taxonomy[taxonomy["rank"]==taxonomy_rank]["name"].unique()

In [7]:
ranks = taxonomy["rank"].unique()
names = {
	"subspecies" : get_all_unique_names_by_rank("subspecies"),
	"nan" : get_all_unique_names_by_rank("nan"),
	"order" : get_all_unique_names_by_rank("order"),
	"infraclass" : get_all_unique_names_by_rank("infraclass"),
	"genus" : get_all_unique_names_by_rank("genus"),
	"species" : get_all_unique_names_by_rank("species"),
	"tribe" : get_all_unique_names_by_rank("tribe"),
	"family" : get_all_unique_names_by_rank("family"),
	"superfamily" : get_all_unique_names_by_rank("superfamily"),
	"parvorder" : get_all_unique_names_by_rank("parvorder"),
	"superorder" : get_all_unique_names_by_rank("superorder"),
	"subfamily" : get_all_unique_names_by_rank("subfamily"),
	"superkingdom" : get_all_unique_names_by_rank("superkingdom"),
	"subclass" : get_all_unique_names_by_rank("subclass"),
	"class" : get_all_unique_names_by_rank("class"),
	"infraorder" : get_all_unique_names_by_rank("infraorder"),
	"superclass" : get_all_unique_names_by_rank("superclass"),
	"suborder" : get_all_unique_names_by_rank("suborder"),
	"subtribe" : get_all_unique_names_by_rank("subtribe"),
	"subphylum" : get_all_unique_names_by_rank("subphylum"),
	"subgenus" : get_all_unique_names_by_rank("subgenus"),
	"cohort" : get_all_unique_names_by_rank("cohort"),
	"subcohort" : get_all_unique_names_by_rank("subcohort"),
	"species subgroup" : get_all_unique_names_by_rank("species subgroup"),
	"species group" : get_all_unique_names_by_rank("species group"),
	"varietas" : get_all_unique_names_by_rank("varietas"),
	"phylum" : get_all_unique_names_by_rank("phylum"),
	"kingdom" : get_all_unique_names_by_rank("kingdom"),
	"subkingdom" : get_all_unique_names_by_rank("subkingdom")
}

### Generate rank taxonomy abundance table per accessions

In [18]:
def abundance_table_by_rank(rank):
    result = pd.DataFrame(columns=names[str(rank)])
    result.insert(0, "Sample", np.nan)
    for acc in accessions:
        tmp = taxonomy[(taxonomy["acc"]==acc) & (taxonomy["rank"]==rank)][["name", "total_count"]].set_index("name").T
        tmp.insert(0, "Sample", str(acc))
        print(tmp)
        break
        result = pd.concat([result, tmp])
    result = result.fillna(0)
    result = result.set_index("Sample")
#     return result

In [19]:
abundance_table_by_rank('species')

name              Sample  Homo sapiens  Pongo abelii  Aegilops tauschii   
total_count  SRR15595321       9264410         42252                  2  \

name         Nomascus leucogenys  Gorilla gorilla  Pan troglodytes   
total_count                 7798            42449            39049  \

name         Erythranthe guttata  Pan paniscus  
total_count                    5         47672  


In [11]:
for rank in ranks:
    tmp = abundance_table_by_rank(rank)
    tmp.to_csv(f"../taxonomy/{'_'.join(str(rank).split(' '))}_data.csv", sep=";")