In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [2]:
# https://github.com/zyxue/ncbitax2lin/blob/master/ncbitax2lin/data_io.py

def load_names(names_file: str) -> pd.DataFrame:
    """
    Read an NCBI ``names.dmp`` dump and keep only the scientific names.

    The file is split on ``|``; the text fields are whitespace-stripped and
    only rows whose ``name_class`` equals ``"scientific name"`` are kept.

    Returns a DataFrame with the columns ``tax_id``, ``name_txt`` and
    ``unique_name`` and a fresh 0..n-1 index.
    """
    column_names = ["tax_id", "name_txt", "unique_name", "name_class"]
    raw = pd.read_csv(
        names_file, sep="|", header=None, index_col=False, names=column_names
    )

    # Fields keep the padding that surrounds the "|" separator; strip it off.
    text_columns = ("name_txt", "unique_name", "name_class")
    stripped = raw.assign(**{col: raw[col].str.strip() for col in text_columns})

    is_scientific = stripped["name_class"] == "scientific name"
    scientific = stripped[is_scientific].reset_index(drop=True)
    return scientific.drop(columns="name_class")


def load_nodes(nodes_file: str) -> pd.DataFrame:
    """
    Read an NCBI ``nodes.dmp`` dump into a DataFrame.

    The file is split on ``|`` into the thirteen columns listed below; the
    text columns ``rank``, ``embl_code`` and ``comments`` are
    whitespace-stripped, all other columns are returned as parsed.
    """
    node_columns = [
        "tax_id",
        "parent_tax_id",
        "rank",
        "embl_code",
        "division_id",
        "inherited_div_flag",
        "genetic_code_id",
        "inherited_GC__flag",
        "mitochondrial_genetic_code_id",
        "inherited_MGC_flag",
        "GenBank_hidden_flag",
        "hidden_subtree_root_flag",
        "comments",
    ]
    raw = pd.read_csv(
        nodes_file, sep="|", header=None, index_col=False, names=node_columns
    )

    # Only the free-text columns need the separator padding stripped.
    stripped = {col: raw[col].str.strip() for col in ("rank", "embl_code", "comments")}
    return raw.assign(**stripped)

## Load data

In [3]:
# Keep only the node columns used below and index the table by tax_id.
_c = ["tax_id", "parent_tax_id", "rank", "genetic_code_id"]
nodes = (
    load_nodes("../data/nt_taxid/taxdmp/nodes.dmp")
    .loc[:, _c]
    .set_index('tax_id')
)
nodes.head()

Unnamed: 0_level_0,parent_tax_id,rank,genetic_code_id
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,no rank,1
2,131567,superkingdom,11
6,335928,genus,11
7,6,species,11
9,32199,species,11


In [4]:
# Scientific names only (load_names filters on name_class), indexed by tax_id.
names = load_names('../data/nt_taxid/taxdmp/names.dmp').set_index('tax_id')
names

Unnamed: 0_level_0,name_txt,unique_name
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,root,
2,Bacteria,Bacteria <bacteria>
6,Azorhizobium,
7,Azorhizobium caulinodans,
9,Buchnera aphidicola,
...,...,...
3127371,Clematicissus simsiana,
3127489,Fluctibacter,
3127503,Chicitaea,
3127506,Candidatus Thalassospirochaeta,


In [6]:
# Per-taxid lineage table (one column per rank), indexed by tax_id.
# low_memory=False makes pandas infer each column's dtype from the whole file
# instead of chunk by chunk. NOTE(review): the saved output of this cell echoes
# the read_csv line, which looks like the tail of a pandas warning (presumably a
# mixed-type DtypeWarning on the sparse rank columns) - confirm it is gone.
linages = pd.read_csv(
    '../data/nt_taxid/ncbi_lineages_2024-04-20.csv.gz',
    index_col=0,
    low_memory=False,
)
linages

  linages = pd.read_csv('../data/nt_taxid/ncbi_lineages_2024-04-20.csv.gz', index_col=0)


Unnamed: 0_level_0,superkingdom,phylum,class,order,family,genus,species,biotype,clade,clade1,...,subphylum,subsection,subspecies,subtribe,superclass,superfamily,superorder,superphylum,tribe,varietas
tax_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1,,,,,,,,,,,...,,,,,,,,,,
2,Bacteria,,,,,,,,,,...,,,,,,,,,,
6,Bacteria,Pseudomonadota,Alphaproteobacteria,Hyphomicrobiales,Xanthobacteraceae,Azorhizobium,,,,,...,,,,,,,,,,
7,Bacteria,Pseudomonadota,Alphaproteobacteria,Hyphomicrobiales,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans,,,,...,,,,,,,,,,
9,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3127371,Eukaryota,Streptophyta,Magnoliopsida,Vitales,Vitaceae,Clematicissus,Clematicissus simsiana,,Embryophyta,Tracheophyta,...,Streptophytina,,,,,,,,Ampelopsideae,
3127489,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Alteromonadaceae,Fluctibacter,,,,,...,,,,,,,,,,
3127503,Eukaryota,Ascomycota,Lecanoromycetes,Sarrameanales,Sarrameanaceae,Chicitaea,,,Opisthokonta,saccharomyceta,...,Pezizomycotina,,,,,,,,,
3127506,Bacteria,Spirochaetota,Spirochaetia,Spirochaetales,Spirochaetaceae,Candidatus Thalassospirochaeta,,,,,...,,,,,,,,,,


In [5]:
# Number of nt sequences per taxid; keep the taxids with more than 9 sequences.
taxid2cnt = pd.read_csv("../data/nt_taxid/taxid2nseqs.csv", index_col=0)
taxid2cnt_flt = taxid2cnt.loc[taxid2cnt["nseqs"].gt(9)]
taxid2cnt_flt

Unnamed: 0_level_0,nseqs
taxid,Unnamed: 1_level_1
0,1297
7,56
9,1222
14,10
17,54
...,...
908352,146
929453,102
940851,28
948766,27


## Assign parent taxid

1. Keep only species that have at least 10 sequences in the nt database.
2. Extract the lineages for them.
3. Use the family as the parent for species that have a family-level ancestor.
4. For the others, use a higher level that includes more than ... descendants (in the code below, the genus is used when it is available).

In [49]:
# Lineage rows of the taxids that passed the sequence-count filter (adds nseqs).
linages_flt = linages.join(taxid2cnt_flt, how='inner')

# Taxids ranked "species" among the filtered lineages ...
species_rank_taxids = nodes.loc[nodes['rank'] == 'species'].index
indexes_species = linages_flt.index.intersection(species_rank_taxids).values
# ... plus SARS-CoV-2 and the Influenza A/B viruses, which are kept even though
# their nodes are not ranked as species.
indexes_species = np.append(indexes_species, [2697049, 11320, 11520])

linages_sp = linages_flt.loc[indexes_species]
# Every selected taxid must have a species name in its lineage.
assert linages_sp['species'].notna().all()

print(linages_sp.shape)
linages_sp.head()

(215260, 70)


Unnamed: 0,superkingdom,phylum,class,order,family,genus,species,biotype,clade,clade1,...,subsection,subspecies,subtribe,superclass,superfamily,superorder,superphylum,tribe,varietas,nseqs
7,Bacteria,Pseudomonadota,Alphaproteobacteria,Hyphomicrobiales,Xanthobacteraceae,Azorhizobium,Azorhizobium caulinodans,,,,...,,,,,,,,,,56
9,Bacteria,Pseudomonadota,Gammaproteobacteria,Enterobacterales,Erwiniaceae,Buchnera,Buchnera aphidicola,,,,...,,,,,,,,,,1222
14,Bacteria,Dictyoglomota,Dictyoglomia,Dictyoglomales,Dictyoglomaceae,Dictyoglomus,Dictyoglomus thermophilum,,,,...,,,,,,,,,,10
17,Bacteria,Pseudomonadota,Betaproteobacteria,Nitrosomonadales,Methylophilaceae,Methylophilus,Methylophilus methylotrophus,,,,...,,,,,,,,,,54
23,Bacteria,Pseudomonadota,Gammaproteobacteria,Alteromonadales,Shewanellaceae,Shewanella,Shewanella colwelliana,,,,...,,,,,,,,,,63


In [50]:
# Sanity check: among the taxids that passed the count filter but were NOT selected
# above, show the 30 with the most sequences, to judge how much is lost.
# The positional columns are superkingdom..family, species and nseqs (see output).
linages_flt[~linages_flt.index.isin(indexes_species)].sort_values('nseqs').iloc[:, [0,1,2,3,4,6,-1]].tail(30)

Unnamed: 0,superkingdom,phylum,class,order,family,species,nseqs
31138,Eukaryota,Chordata,,Testudines,Emydidae,Trachemys scripta,47981
3664,Eukaryota,Streptophyta,Magnoliopsida,Cucurbitales,Cucurbitaceae,Cucurbita pepo,49462
481459,Eukaryota,Chordata,Actinopteri,Perciformes,Gasterosteidae,Gasterosteus aculeatus,50121
2991368,Eukaryota,Chordata,,Testudines,Emydidae,Malaclemys terrapin,50318
947029,Eukaryota,Chordata,Aves,Passeriformes,Corvidae,Aphelocoma californica,50342
81972,Eukaryota,Streptophyta,Magnoliopsida,Brassicales,Brassicaceae,Arabidopsis lyrata,52859
39432,Eukaryota,Chordata,Mammalia,Primates,Cebidae,Saimiri boliviensis,53066
214687,Eukaryota,Streptophyta,Magnoliopsida,Zingiberales,Musaceae,Musa acuminata,53405
79200,Eukaryota,Streptophyta,Magnoliopsida,Apiales,Apiaceae,Daucus carota,55044
77932,Eukaryota,Chordata,Mammalia,Perissodactyla,Rhinocerotidae,Diceros bicornis,56405


In [51]:
# Common case: the lineage has a family, so the outgroup is searched among family relatives.
has_family = linages_sp['family'].notna()
linages_with_fam = linages_sp[has_family]

# No family label: fall back to genus relatives when a genus is available.
linages_wihout_fam = linages_sp[~has_family]
has_genus = linages_wihout_fam['genus'].notna()
linages_wihout_fam_with_genus = linages_wihout_fam[has_genus]

# Neither family nor genus: these species (taxids) are not included.
linages_wihout_fam_without_genus = linages_wihout_fam[~has_genus]

(len(linages_wihout_fam_with_genus), len(linages_wihout_fam), len(linages_with_fam))

(1409, 5434, 209826)

In [52]:
# Sanity check: the 20 most sequence-rich species that are dropped because their
# lineage has neither a family nor a genus.
# The positional columns are superkingdom..family, species and nseqs (see output).
linages_wihout_fam_without_genus.sort_values('nseqs').iloc[:, [0,1,2,3,4,6,-1]].tail(20)

Unnamed: 0,superkingdom,phylum,class,order,family,species,nseqs
152509,Bacteria,Bacteroidota,,,,uncultured Bacteroidetes bacterium,30603
29281,Archaea,Thermoproteota,,,,uncultured crenarchaeote,33090
1211,Bacteria,Cyanobacteriota,,,,uncultured cyanobacterium,34500
152507,Bacteria,Actinomycetota,Actinomycetes,,,uncultured Actinomycetes bacterium,37024
86473,Bacteria,Pseudomonadota,Gammaproteobacteria,,,uncultured Gammaproteobacteria bacterium,37045
91750,Bacteria,Pseudomonadota,Alphaproteobacteria,,,uncultured Alphaproteobacteria bacterium,37175
2832643,Viruses,Uroviricota,Caudoviricetes,,,Caudoviricetes sp.,39585
241622,Eukaryota,Mucoromycota,,,,uncultured Glomeromycotina,50826
164851,Bacteria,,,,,uncultured soil bacterium,74535
358574,,,,,,uncultured microorganism,110347


### Set outgroup taxid

In [53]:
# Plain-dict lookup tables built from `nodes` (indexed by tax_id).
# taxid2ndesc: how many nodes name this taxid as their parent, i.e. direct children
# only, not all descendants. NOTE(review): not used by any visible cell.
taxid2ndesc = nodes.parent_tax_id.value_counts().to_dict()
# taxid2parent: tax_id -> parent_tax_id
taxid2parent = nodes.parent_tax_id.to_dict()
# taxid2rank: tax_id -> rank string (e.g. "species", "genus", "family")
taxid2rank = nodes['rank'].to_dict()

In [54]:
def get_taxid_of_specific_rank(taxid: int, rank='family', rank_of=None, parent_of=None):
    """
    Walk up the taxonomy from ``taxid`` and return the first taxid whose rank is ``rank``.

    The starting ``taxid`` itself is checked first, then its parent, and so on.

    Parameters
    ----------
    taxid : int
        Taxid to start from.
    rank : str
        Rank to look for (default ``'family'``).
    rank_of : mapping, optional
        ``taxid -> rank``. Defaults to the notebook-level ``taxid2rank``.
    parent_of : mapping, optional
        ``taxid -> parent taxid``. Defaults to the notebook-level ``taxid2parent``.

    Returns
    -------
    The matching taxid, or ``None`` when the walk reaches the root (taxid 1),
    hits a taxid with no known parent, or runs into a parent cycle.
    """
    if rank_of is None:
        rank_of = taxid2rank
    if parent_of is None:
        parent_of = taxid2parent

    # Iterative walk: no recursion-depth limit, and `visited` stops the loop on
    # malformed data where a node is (directly or indirectly) its own ancestor.
    visited = set()
    current = taxid
    while True:
        if rank_of.get(current) == rank:
            return current
        visited.add(current)

        parent = parent_of.get(current)
        # taxid 1 is the root (its own parent); it is never inspected as a parent.
        if parent is None or parent == 1 or parent in visited:
            return None
        current = parent

In [55]:
# Spot check: walk up from taxid 11520 to its genus-rank ancestor.
get_taxid_of_specific_rank(11520, 'genus')

197912

In [56]:
# Names of the selected species, each with the taxid of its family-rank ancestor
# (or genus-rank ancestor for the species whose lineage has no family).
species_with_fam = names.loc[linages_with_fam.index].assign(
    parent_taxid=lambda df: df.index.to_series().apply(
        get_taxid_of_specific_rank, rank='family'
    )
)
species_with_gen = names.loc[linages_wihout_fam_with_genus.index].assign(
    parent_taxid=lambda df: df.index.to_series().apply(
        get_taxid_of_specific_rank, rank='genus'
    )
)

# No missing value is allowed in either table (names or parent taxids).
assert not species_with_fam.isna().any().any()
assert not species_with_gen.isna().any().any()

In [57]:
# Validation: every parent assigned to the family group should have rank "family".
species_with_fam.parent_taxid.map(taxid2rank).value_counts()

family    209826
Name: parent_taxid, dtype: int64

In [58]:
# Validation: every parent assigned to the genus group should have rank "genus".
species_with_gen.parent_taxid.map(taxid2rank).value_counts()

genus    1409
Name: parent_taxid, dtype: int64

In [59]:
# Stack the family-parent and genus-parent tables into one taxid -> (name, parent) table.
species_full = (
    pd.concat([species_with_fam, species_with_gen])
    .drop(columns='unique_name')
    .rename_axis('taxid')
)
# species_full = species_full.join(nodes[['genetic_code_id']])
species_full

Unnamed: 0_level_0,name_txt,parent_taxid
taxid,Unnamed: 1_level_1,Unnamed: 2_level_1
7,Azorhizobium caulinodans,335928
9,Buchnera aphidicola,1903409
14,Dictyoglomus thermophilum,203488
17,Methylophilus methylotrophus,32011
23,Shewanella colwelliana,267890
...,...,...
3080365,Crustaphytum fragile,2479497
3081338,Quasiconcha sp. ZYZ-2023a,707788
3095211,Tricholomopsis depressa,151293
3095998,Arxiella longispora,1577851


In [60]:
# Persist the table (index `taxid`, columns name_txt and parent_taxid) as a gzip-compressed
# CSV; pandas infers the compression from the .gz extension.
species_full.to_csv('../data/nt_taxid/species.csv.gz')

In [115]:
# species_full.genetic_code_id.value_counts()

1     196694
11     13696
12       333
6        281
4        159
10        43
26        16
31         4
29         3
16         2
30         1
Name: genetic_code_id, dtype: int64

## Legacy

In [6]:
# Number of taxonomy nodes per rank.
nodes['rank'].value_counts()

species             2101850
no rank              240472
genus                108761
strain                46271
subspecies            29259
family                10356
varietas               9894
subfamily              3270
tribe                  2363
order                  1900
subgenus               1822
isolate                1304
serotype               1232
clade                   959
superfamily             898
forma specialis         755
forma                   685
subtribe                586
section                 534
class                   531
suborder                374
species group           359
phylum                  311
subclass                169
serogroup               153
infraorder              135
species subgroup        134
superorder               57
subsection               41
subphylum                31
parvorder                26
genotype                 22
infraclass               19
biotype                  17
kingdom                  13
morph               

In [18]:
# `nodes` is indexed by tax_id in the "Load data" section, so tax_id is not a column
# there and selecting it directly raises KeyError on a fresh run. Turn the index back
# into a column first; if tax_id is already a column the frame is used as-is.
nodes_tree = (nodes if 'tax_id' in nodes.columns else nodes.reset_index())[
    ['tax_id', 'parent_tax_id', 'rank']
]

In [19]:
nodes_tree

Unnamed: 0,tax_id,parent_tax_id,rank
0,1,1,no rank
1,2,131567,superkingdom
2,6,335928,genus
3,7,6,species
4,9,32199,species
...,...,...,...
2565603,3127371,165295,species
2565604,3127489,72275,genus
2565605,3127503,388443,genus
2565606,3127506,137,genus


In [29]:
def _tax_id_as_column(frame: pd.DataFrame) -> pd.DataFrame:
    """Return ``frame`` with ``tax_id`` as a regular column (unchanged if it already is one)."""
    return frame if 'tax_id' in frame.columns else frame.reset_index()


# `nodes` and `names` are indexed by tax_id in the "Load data" section. Merging them as-is
# and selecting `list(names.columns)` would leave tax_id out of the columns, and the
# later `to_csv(..., index=False)` exports would then lose the taxid entirely.
# Normalising both inputs yields the columns tax_id, name_txt, unique_name, parent_tax_id.
_species_nodes = _tax_id_as_column(nodes[nodes['rank'] == 'species'])
_names_flat = _tax_id_as_column(names)
species = pd.merge(_species_nodes, _names_flat, on='tax_id')[
    list(_names_flat.columns) + ['parent_tax_id']
]

In [31]:
names

Unnamed: 0,tax_id,name_txt,unique_name
0,1,root,
1,2,Bacteria,Bacteria <bacteria>
2,6,Azorhizobium,
3,7,Azorhizobium caulinodans,
4,9,Buchnera aphidicola,
...,...,...,...
2565603,3127371,Clematicissus simsiana,
2565604,3127489,Fluctibacter,
2565605,3127503,Chicitaea,
2565606,3127506,Candidatus Thalassospirochaeta,


In [36]:
species

Unnamed: 0,tax_id,name_txt,unique_name,parent_tax_id
0,7,Azorhizobium caulinodans,,6
1,9,Buchnera aphidicola,,32199
2,11,Cellulomonas gilvus,,1707
3,14,Dictyoglomus thermophilum,,13
4,17,Methylophilus methylotrophus,,16
...,...,...,...,...
2101845,3127163,Russula sp. SA2,,2602424
2101846,3127164,Russula sp. TAS8,,2602424
2101847,3127342,Potentilla glaucophylla,,23204
2101848,3127370,Clematicissus granulosa,,165295


In [37]:
# Species whose unique_name is not the empty string.
un = species.loc[species['unique_name'].ne('')]

In [39]:
un#[un.tax_id > 1000]

Unnamed: 0,tax_id,name_txt,unique_name,parent_tax_id
6626,29375,Lacrimispora xylanolytica,Lacrimispora xylanolytica <Lacrimispora xylano...,2719231
9840,36929,Crassostrea belcheri,Crassostrea belcheri <Crassostrea belcheri>,6564
16629,49494,Canistrum giganteum,Canistrum giganteum <Canistrum giganteum>,49493
28825,71170,earthworms,earthworms <earthworms>,71169
32438,76643,Abronia aurita,Abronia aurita <lizards & snakes>,76641
...,...,...,...,...
2025660,2975071,Vector pCV-zCas9-puro,Vector pCV-zCas9-puro <NCBI:txid2975071>,29278
2029724,2980510,Actinauge abyssorum,Actinauge abyssorum <Actinauge abyssorum>,478387
2029996,2980905,uncultured Hymenoscyphus,uncultured Hymenoscyphus <NCBI:txid2980905>,261668
2083511,3077842,Eleutherodactylus petersi,Eleutherodactylus petersi <Eleutherodactylus p...,122123


In [43]:
# Drop unique_name and rename the parent column.
# This cell overwrites `species` with a transformed copy of itself, so it must be safe
# to re-run: errors='ignore' keeps the drop from raising KeyError the second time, and
# rename already ignores missing labels.
# NOTE(review): the parent of a species node is presumably not always a genus (ranks
# such as "subgenus" and "species group" exist in `nodes`) - confirm the column name.
species = (
    species
    .drop(columns='unique_name', errors='ignore')
    .rename(columns={'parent_tax_id': "genus_tax_id"})
)

In [65]:
# Export species names without unnamed ("sp." / "spp.") entries and without
# cell lines, xenografts, metagenomes and samples.
#
# Series.str.contains treats its pattern as a regular expression by default, so the
# previous pattern 'sp.' matched "sp" followed by ANY character and wrongly excluded
# ordinary names such as "Lacrimispora xylanolytica". The dot is now escaped and the
# token anchored at a word boundary.
_name_txt = species['name_txt']
_is_unnamed_species = _name_txt.str.contains(r'\bspp?\.', regex=True)
_is_non_organism = _name_txt.str.contains(
    'cell line|xenograft|metagenome|sample', regex=True
)
species[~_is_unnamed_species & ~_is_non_organism].to_csv('names_ex_sp.csv', index=False)

In [66]:
# Export species names (unnamed "sp." entries included) after removing cell lines,
# xenografts, metagenomes and samples.
excluded_terms = ['cell line', 'xenograft', 'metagenome', 'sample']
# One alternation pattern instead of four chained negated masks.
is_excluded = species.name_txt.str.contains('|'.join(excluded_terms))
species[~is_excluded].to_csv('names.csv', index=False)

In [2]:
# NOTE(review): redundant re-import (pandas is already imported in the first cell);
# together with the execution count this suggests the cell was run in a fresh kernel.
import pandas as pd

# NOTE(review): the file has a .zip extension but is read as gzip, and no visible cell
# writes 'names.zip' - presumably a manually compressed copy of one of the CSVs
# exported above; confirm its origin and rename it to match its real format.
spec = pd.read_csv('names.zip', compression='gzip')

In [7]:
spec

Unnamed: 0,tax_id,name_txt,genus_tax_id
0,7,Azorhizobium caulinodans,6
1,9,Buchnera aphidicola,32199
2,11,Cellulomonas gilvus,1707
3,14,Dictyoglomus thermophilum,13
4,17,Methylophilus methylotrophus,16
...,...,...,...
785414,3127091,Pulchroboletus erubescens,1511087
785415,3127110,Jumillera viridis,3127107
785416,3127342,Potentilla glaucophylla,23204
785417,3127370,Clematicissus granulosa,165295
