In [2]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

In [19]:
# https://github.com/zyxue/ncbitax2lin/blob/master/ncbitax2lin/data_io.py

def load_names(names_file: str) -> pd.DataFrame:
    """
    Read a ``names.dmp`` dump and return its scientific-name rows.

    The file is split on ``|``; the three text fields are stripped of
    surrounding whitespace, only rows whose ``name_class`` equals
    ``"scientific name"`` are kept, and the ``name_class`` column is then
    removed.

    Args:
        names_file: path (or anything ``pandas.read_csv`` accepts) of names.dmp.

    Returns:
        DataFrame with columns ``tax_id``, ``name_txt`` and ``unique_name``
        and a fresh 0..n-1 index.
    """
    column_names = ["tax_id", "name_txt", "unique_name", "name_class"]
    raw = pd.read_csv(
        names_file,
        sep="|",
        header=None,
        index_col=False,
        names=column_names,
    )

    # Every column except tax_id is text and still carries the padding that
    # surrounded the "|" separators.
    stripped = raw.assign(
        **{col: raw[col].str.strip() for col in column_names[1:]}
    )

    is_scientific = stripped["name_class"] == "scientific name"
    return (
        stripped[is_scientific]
        .reset_index(drop=True)
        .drop(columns="name_class")
    )


def load_nodes(nodes_file: str) -> pd.DataFrame:
    """
    Read a ``nodes.dmp`` dump into a DataFrame with named columns.

    The file is split on ``|`` into thirteen columns; the text columns
    ``rank``, ``embl_code`` and ``comments`` are stripped of surrounding
    whitespace. All other columns are returned as parsed by pandas.

    Args:
        nodes_file: path (or anything ``pandas.read_csv`` accepts) of nodes.dmp.

    Returns:
        DataFrame with one row per line of the dump.
    """
    column_names = [
        "tax_id",
        "parent_tax_id",
        "rank",
        "embl_code",
        "division_id",
        "inherited_div_flag",
        "genetic_code_id",
        "inherited_GC__flag",
        "mitochondrial_genetic_code_id",
        "inherited_MGC_flag",
        "GenBank_hidden_flag",
        "hidden_subtree_root_flag",
        "comments",
    ]
    raw = pd.read_csv(
        nodes_file,
        sep="|",
        header=None,
        index_col=False,
        names=column_names,
    )

    # Only these columns hold free text that needs its padding removed.
    text_columns = ("rank", "embl_code", "comments")
    return raw.assign(**{col: raw[col].str.strip() for col in text_columns})

In [21]:
# Parse the nodes dump with load_nodes and preview the first rows.
nodes = load_nodes("./nodes.dmp")
nodes.head()

Unnamed: 0,tax_id,parent_tax_id,rank,embl_code,division_id,inherited_div_flag,genetic_code_id,inherited_GC__flag,mitochondrial_genetic_code_id,inherited_MGC_flag,GenBank_hidden_flag,hidden_subtree_root_flag,comments
0,1,1,no rank,,8,0,1,0,0,0,0,0,
1,2,131567,superkingdom,,0,0,11,0,0,0,0,0,
2,6,335928,genus,,0,1,11,1,0,1,0,0,code compliant
3,7,6,species,AC,0,1,11,1,0,1,1,0,code compliant; specified
4,9,32199,species,BA,0,1,11,1,0,1,1,0,code compliant; specified


In [23]:
# Count how many nodes carry each rank label, most frequent first.
nodes['rank'].value_counts()

species             2101850
no rank              240472
genus                108761
strain                46271
subspecies            29259
family                10356
varietas               9894
subfamily              3270
tribe                  2363
order                  1900
subgenus               1822
isolate                1304
serotype               1232
clade                   959
superfamily             898
forma specialis         755
forma                   685
subtribe                586
section                 534
class                   531
suborder                374
species group           359
phylum                  311
subclass                169
serogroup               153
infraorder              135
species subgroup        134
superorder               57
subsection               41
subphylum                31
parvorder                26
genotype                 22
infraclass               19
biotype                  17
kingdom                  13
morph               

In [28]:
# Parse the names dump; load_names keeps only the "scientific name" rows.
names = load_names('./names.dmp')

In [29]:
# Attach the scientific name to every node of rank "species" (inner join on
# tax_id) and keep the name columns plus the parent node id.
species = (
    nodes.loc[nodes['rank'] == 'species']
    .merge(names, on='tax_id')
    .loc[:, [*names.columns, 'parent_tax_id']]
)

In [31]:
# Inspect the scientific-name table.
names

Unnamed: 0,tax_id,name_txt,unique_name
0,1,root,
1,2,Bacteria,Bacteria <bacteria>
2,6,Azorhizobium,
3,7,Azorhizobium caulinodans,
4,9,Buchnera aphidicola,
...,...,...,...
2565603,3127371,Clematicissus simsiana,
2565604,3127489,Fluctibacter,
2565605,3127503,Chicitaea,
2565606,3127506,Candidatus Thalassospirochaeta,


In [36]:
# Inspect the species table built by the merge.
species

Unnamed: 0,tax_id,name_txt,unique_name,parent_tax_id
0,7,Azorhizobium caulinodans,,6
1,9,Buchnera aphidicola,,32199
2,11,Cellulomonas gilvus,,1707
3,14,Dictyoglomus thermophilum,,13
4,17,Methylophilus methylotrophus,,16
...,...,...,...,...
2101845,3127163,Russula sp. SA2,,2602424
2101846,3127164,Russula sp. TAS8,,2602424
2101847,3127342,Potentilla glaucophylla,,23204
2101848,3127370,Clematicissus granulosa,,165295


In [37]:
# Species whose unique_name is not the empty string.
un = species.loc[species.unique_name.ne('')]

In [39]:
# Inspect the species that carry a non-empty unique_name.
un  # exploratory variant left disabled: un[un.tax_id > 1000]

Unnamed: 0,tax_id,name_txt,unique_name,parent_tax_id
6626,29375,Lacrimispora xylanolytica,Lacrimispora xylanolytica <Lacrimispora xylano...,2719231
9840,36929,Crassostrea belcheri,Crassostrea belcheri <Crassostrea belcheri>,6564
16629,49494,Canistrum giganteum,Canistrum giganteum <Canistrum giganteum>,49493
28825,71170,earthworms,earthworms <earthworms>,71169
32438,76643,Abronia aurita,Abronia aurita <lizards & snakes>,76641
...,...,...,...,...
2025660,2975071,Vector pCV-zCas9-puro,Vector pCV-zCas9-puro <NCBI:txid2975071>,29278
2029724,2980510,Actinauge abyssorum,Actinauge abyssorum <Actinauge abyssorum>,478387
2029996,2980905,uncultured Hymenoscyphus,uncultured Hymenoscyphus <NCBI:txid2980905>,261668
2083511,3077842,Eleutherodactylus petersi,Eleutherodactylus petersi <Eleutherodactylus p...,122123


In [43]:
# Drop the unique_name column and relabel the parent id as genus_tax_id.
# errors='ignore' keeps this cell idempotent: it reassigns `species`, so on a
# second run the column is already gone and a plain drop() would raise
# KeyError. rename() already ignores missing labels by default.
# NOTE(review): the parent of a species node is not guaranteed to be of rank
# "genus" (the rank counts also list e.g. "species group", "subgenus" and
# "no rank") — confirm that "genus_tax_id" is an acceptable label.
species = (
    species
    .drop(columns='unique_name', errors='ignore')
    .rename(columns={'parent_tax_id': "genus_tax_id"})
)

In [65]:
# Export species names with informal / non-organism entries removed.
#
# regex=False makes every pattern a literal substring. Series.str.contains()
# interprets its argument as a regular expression by default, so the "." in
# 'sp.' matched ANY character and the filter dropped every name containing
# "sp" followed by one more character (e.g. "Lacrimispora xylanolytica"),
# not just the "Genus sp. XYZ" placeholders.
# NOTE(review): the literal 'sp.' still matches inside "subsp." — tighten the
# pattern if such names should be kept.
species[
    (~species.name_txt.str.contains('sp.', regex=False)) &
    (~species.name_txt.str.contains('cell line', regex=False)) &
    (~species.name_txt.str.contains('xenograft', regex=False)) &
    (~species.name_txt.str.contains('metagenome', regex=False)) &
    (~species.name_txt.str.contains('sample', regex=False))
].to_csv('names_ex_sp.csv', index=False)

In [66]:
# Export species names, dropping only cell-line, xenograft, metagenome and
# sample entries (names containing 'sp.' are deliberately kept in this file).
species.loc[
    ~(
        species.name_txt.str.contains('cell line')
        | species.name_txt.str.contains('xenograft')
        | species.name_txt.str.contains('metagenome')
        | species.name_txt.str.contains('sample')
    )
].to_csv('names.csv', index=False)

In [2]:
import pandas as pd

# Read the exported species table back from a compressed file.
# NOTE(review): no cell visible here writes 'names.zip' (the export cells
# write 'names.csv' and 'names_ex_sp.csv'), so it was presumably produced
# outside the notebook — confirm its provenance and which export it holds.
# compression='gzip' is given explicitly, so pandas decodes the file as gzip
# regardless of its '.zip' extension; the file must therefore contain gzip data.
spec = pd.read_csv('names.zip', compression='gzip')

In [7]:
# Inspect the frame read back from names.zip.
spec

Unnamed: 0,tax_id,name_txt,genus_tax_id
0,7,Azorhizobium caulinodans,6
1,9,Buchnera aphidicola,32199
2,11,Cellulomonas gilvus,1707
3,14,Dictyoglomus thermophilum,13
4,17,Methylophilus methylotrophus,16
...,...,...,...
785414,3127091,Pulchroboletus erubescens,1511087
785415,3127110,Jumillera viridis,3127107
785416,3127342,Potentilla glaucophylla,23204
785417,3127370,Clematicissus granulosa,165295
