All GBIF backbone versions are available at https://hosted-datasets.gbif.org/datasets/backbone/

In [1]:
import pandas as pd

See https://api.gbif.org/v1/species/5386/synonyms?limit=50

For an ID with rank RANK, the accepted ID seems to be under RANK_key
e.g. if ID1 is a synonym of ID2, rank RANK, both should have the same RANK_key that indicates which one is considered "accepted"

In [2]:
year = "2022" # "2017" # 

In [3]:
path = f"gbif-backbone-{year}.tsv"
raw = pd.read_csv(open(path, "r"), sep="\t", dtype=str, names=[
    "id",
    "status",
    "rank",
    "kingdom_key",
    "phylum_key",
    "class_key",
    "order_key",
    "family_key",
    "genus_key",
    "species_key",
    "name_id",
    "scientific_name",
    "canonical_name"
])
raw.head(1)

Unnamed: 0,id,status,rank,kingdom_key,phylum_key,class_key,order_key,family_key,genus_key,species_key,name_id,scientific_name,canonical_name
0,id,status,rank,kingdom_key,phylum_key,class_key,order_key,family_key,genus_key,species_key,name_id,scientific_name,canonical_name


In [4]:
ranks = [
    "kingdom",
    "phylum",
    "class",
    "order",
    "family",
    "genus",
    "species"
]

id_map = raw[["id", "canonical_name"]].set_index("id")["canonical_name"].str.lower().to_dict()

In [5]:
table = raw[raw["rank"] == "SPECIES"][["id"]].copy()
for rank in ranks:
    key = f"{rank}_key"
    table[rank] = raw[key].map(lambda x: id_map[x] if x in id_map else "")
table = table.set_index("id")

In [18]:
table.to_csv(f"gbif-backbone-{year}-dereferenced.tsv", sep="\t")

In [6]:
table

Unnamed: 0_level_0,kingdom,phylum,class,order,family,genus,species
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
1029123,animalia,arthropoda,ostracoda,podocopida,cyprididae,heterocypris,heterocypris reptans
1029054,animalia,arthropoda,ostracoda,podocopida,cyprididae,chlamydotheca,chlamydotheca azteca
1030157,animalia,arthropoda,ostracoda,podocopida,cushmanideidae,pontocythere,pontocythere sclerochilus
1030395,animalia,arthropoda,ostracoda,podocopida,entocytheridae,entocythere,entocythere reddelli
1030508,animalia,arthropoda,ostracoda,podocopida,entocytheridae,ankylocythere,ankylocythere chipola
...,...,...,...,...,...,...,...
2527087,fungi,basidiomycota,agaricomycetes,agaricales,marasmiaceae,hydropus,hydropus marginellus
8015505,fungi,basidiomycota,agaricomycetes,agaricales,tricholomataceae,melanoleuca,melanoleuca odorifera
5240325,fungi,basidiomycota,agaricomycetes,agaricales,amanitaceae,amanita,amanita phalloides
2545555,fungi,basidiomycota,agaricomycetes,polyporales,polyporaceae,lentinus,lentinus squarrosulus


In [37]:
len(table[(table["kingdom"] == "fungi") * (table["rank"] == "species")])

311304

In [27]:
sum(table[table["kingdom"] == "fungi"]["family"].value_counts() > 0)

2306

In [15]:
raw["id"].value_counts()

id
ACCEPTED               4116297
SYNONYM                2656200
HOMOTYPIC_SYNONYM       408709
DOUBTFUL                302915
HETEROTYPIC_SYNONYM     164760
PROPARTE_SYNONYM          2973
Name: count, dtype: int64

In [17]:
2656200 + 408709 + 164760 + 2973

3232642

In [9]:
raw["rank"].value_counts()

0           FAMILY
1           FAMILY
2           FAMILY
3          SPECIES
4          SPECIES
            ...   
7651849    SPECIES
7651850    SPECIES
7651851    SPECIES
7651852    SPECIES
7651853    SPECIES
Name: rank, Length: 7651854, dtype: object