In [1]:
import pandas as pd
import os
import sys
import dotenv
import pygbif.species as gbif
import pytaxize.gn as gn
import numpy as np
sys.path.append(os.getcwd())

dotenv.load_dotenv(".env")
data_dir = os.getenv("DATA_PATH")
# data_dir = "G:\GITAR\PubData"


# set data dir to "G:\GITAR\PubData"
# data_dir = r"G:\GITAR\PubData\\"

# Bring in new species lists


invasive_all_source = pd.read_csv( data_dir + "\species lists\invasive_all_source.csv", dtype={"usageKey": str})
gbif_backbone = pd.read_csv( data_dir + "\GBIF data\GBIF_backbone_invasive.csv", dtype={"usageKey": str})
cabi_gbif = pd.read_csv( data_dir + "\species lists\gbif_matched\cabi_gbif.csv", dtype={"usageKey": str})
eppo_gbif = pd.read_csv( data_dir + "\species lists\gbif_matched\eppo_gbif.csv", dtype={"usageKey": str})
sinas_gbif = pd.read_csv( data_dir + "\species lists\gbif_matched\sinas_gbif.csv", dtype={"usageKey": str})
daisie_gbif = pd.read_csv( data_dir + "\species lists\gbif_matched\daisie_gbif.csv", dtype={"usageKey": str})
data_files = [cabi_gbif, eppo_gbif, daisie_gbif]

  eppo_gbif = pd.read_csv( data_dir + "\species lists\gbif_matched\eppo_gbif.csv", dtype={"usageKey": str})


In [7]:
#make column 'species' in sinasGBIF that copies taxonSINAS
sinas_gbif['species'] = sinas_gbif['taxonSINAS']
#rename sinas_gbif matchtype to matchType
sinas_gbif.rename(columns={'matchtype': 'matchType'}, inplace=True)
#if matchType is NA set to ""
sinas_gbif['matchType'] = sinas_gbif['matchType'].replace(np.nan, '', regex=True)

data_files = [sinas_gbif]

In [14]:
test = [cabi_gbif.head(50).copy()]


In [44]:
import pandas as pd
from tqdm import tqdm
from tenacity import retry, stop_after_attempt, wait_fixed, retry_if_exception_type
from requests.exceptions import HTTPError, Timeout
from pygbif import species


# Retry decorator to handle HTTP and timeout errors
@retry(
    stop=stop_after_attempt(5),
    wait=wait_fixed(5),
    retry=retry_if_exception_type((HTTPError, Timeout)),
)
def get_species_name_backbone(taxon, strict):
    return species.name_backbone(taxon, verbose=True, strict=strict)


def strip_author_name(taxon):
    # Return the first two words of the species name string
    return " ".join(taxon.split()[:2])


def check_gbif_tax(dat):
    # Initialize new columns
    dat["scientificName"] = None
    dat["Taxon"] = dat["Taxon_orig"]
    dat["GBIFstatus"] = "Missing"
    dat["GBIFmatchtype"] = None
    dat["GBIFnote"] = None
    dat["GBIFstatus_Synonym"] = None
    dat["species"] = None
    dat["genus"] = None
    dat["family"] = None
    dat["class"] = None
    dat["order"] = None
    dat["phylum"] = None
    dat["kingdom"] = None
    dat["GBIFtaxonRank"] = None
    dat["GBIFusageKey"] = None
    dat["note"] = None

    # Determine taxlist based on columns available
    if "kingdom_user" in dat.columns:
        taxlist_lifeform = dat[["Taxon", "kingdom_user"]].drop_duplicates()
        taxlist = taxlist_lifeform["Taxon"].unique()
    elif "Author" in dat.columns:
        taxlist = (
            dat[["Taxon", "Author"]]
            .drop_duplicates()
            .apply(lambda x: " ".join(x), axis=1)
            .unique()
        )
    else:
        taxlist = dat["Taxon"].unique()

    n_taxa = len(taxlist)

    mismatches = pd.DataFrame(columns=["Taxon", "status", "matchType"])

    # Initialize progress bar
    tqdm.pandas()

    for j in tqdm(range(n_taxa), desc="Processing taxa"):
        taxon = taxlist[j]
        ind_tax = dat.index[dat["Taxon_orig"] == taxon]
        taxon = (
            taxon.replace(" sp.", " ")
            .replace(" spp.", " ")
            .replace(" .f ", " ")
            .replace(" .var", "")
        )
        try:
            db_all = get_species_name_backbone(taxon, strict=True)
        except (HTTPError, Timeout):
            print(f"Failed to retrieve data for {taxon} after 5 attempts. Skipping.")
            continue
        db = {k: v for k, v in db_all.items() if k != "alternatives"}
        alternatives = db_all.get("alternatives", [])
        if (
            db.get("status") == "ACCEPTED" and db.get("matchType") == "EXACT"
        ):  # exact match
            dat.loc[ind_tax, "Taxon"] = db.get("canonicalName")
            dat.loc[ind_tax, "scientificName"] = db.get("scientificName")
            dat.loc[ind_tax, "GBIFstatus"] = db.get("status")
            dat.loc[ind_tax, "GBIFmatchtype"] = db.get("matchType")
            dat.loc[ind_tax, "GBIFtaxonRank"] = db.get("rank")
            dat.loc[ind_tax, "GBIFusageKey"] = db.get("usageKey")

            dat.loc[ind_tax, "species"] = db.get("species")
            dat.loc[ind_tax, "genus"] = db.get("genus")
            dat.loc[ind_tax, "family"] = db.get("family")
            dat.loc[ind_tax, "class"] = db.get("class")
            dat.loc[ind_tax, "order"] = db.get("order")
            dat.loc[ind_tax, "phylum"] = db.get("phylum")
            dat.loc[ind_tax, "kingdom"] = db.get("kingdom")
            dat.loc[ind_tax, "note"] = "Exact match"
            continue

        elif (
            db.get("status") == "SYNONYM"
            and db.get("matchType") == "EXACT"
            and ("species" in db or "genus" in db)
        ):
            dat.loc[ind_tax, "GBIFstatus"] = db.get("status")
            dat.loc[ind_tax, "GBIFmatchtype"] = db.get("matchType")
            dat.loc[ind_tax, "GBIFtaxonRank"] = db.get("rank")
            dat.loc[ind_tax, "GBIFusageKey"] = db.get("usageKey")

            if any(
                alt.get("status") == "ACCEPTED" and alt.get("matchType") == "EXACT"
                for alt in alternatives
            ):
                accepted_alt = next(
                    alt
                    for alt in alternatives
                    if alt.get("status") == "ACCEPTED"
                    and alt.get("matchType") == "EXACT"
                )
                dat.loc[ind_tax, "scientificName"] = accepted_alt.get("scientificName")
                dat.loc[ind_tax, "Taxon"] = accepted_alt.get("canonicalName")

                dat.loc[ind_tax, "species"] = accepted_alt.get("species")
                dat.loc[ind_tax, "genus"] = accepted_alt.get("genus")
                dat.loc[ind_tax, "family"] = accepted_alt.get("family")
                dat.loc[ind_tax, "class"] = accepted_alt.get("class")
                dat.loc[ind_tax, "order"] = accepted_alt.get("order")
                dat.loc[ind_tax, "phylum"] = accepted_alt.get("phylum")
                dat.loc[ind_tax, "kingdom"] = accepted_alt.get("kingdom")
                dat.loc[ind_tax, "GBIFstatus_Synonym"] = "ACCEPTED"
                dat.loc[ind_tax, "usageKey"] = accepted_alt.get("usageKey")
                dat.loc[ind_tax, "GBIFstatus"] = "ACCEPTED"
                dat.loc[ind_tax, "note"] = "Synonym with accepted alt"
                continue

            elif db.get("rank") == "SPECIES":
                dat.loc[ind_tax, "Taxon"] = db.get("species")
                dat.loc[ind_tax, "GBIFstatus"] = db.get("status")
                dat.loc[ind_tax, "GBIFmatchtype"] = db.get("matchType")
                dat.loc[ind_tax, "GBIFtaxonRank"] = db.get("rank")
                dat.loc[ind_tax, "GBIFusageKey"] = db.get("usageKey")
                dat.loc[ind_tax, "note"] = "Synonym with no accepted alt, species rank"

                try:
                    db_all_2 = get_species_name_backbone(
                        dat.loc[ind_tax, "Taxon"].iloc[0], strict=True
                    )
                except (HTTPError, Timeout):
                    print(
                        f"Failed to retrieve data for {dat.loc[ind_tax, 'Taxon'].iloc[0]} after 5 attempts. Skipping."
                    )
                    continue

                db_2 = db_all_2

                if db_2.get("matchType") == "EXACT":
                    dat.loc[ind_tax, "scientificName"] = db_2.get("scientificName")
                    dat.loc[ind_tax, "GBIFstatus_Synonym"] = db_2.get("status")
                    dat.loc[ind_tax, "species"] = db_2.get("species")
                    dat.loc[ind_tax, "genus"] = db_2.get("genus")
                    dat.loc[ind_tax, "family"] = db_2.get("family")
                    dat.loc[ind_tax, "class"] = db_2.get("class")
                    dat.loc[ind_tax, "order"] = db_2.get("order")
                    dat.loc[ind_tax, "phylum"] = db_2.get("phylum")
                    dat.loc[ind_tax, "kingdom"] = db_2.get("kingdom")
                    dat.loc[ind_tax, "note"] = (
                        "Synonym with no accepted alt, species rank, exact match"
                    )
            elif db.get("rank") == "GENUS":
                dat.loc[ind_tax, "Taxon"] = db.get("genus")
                dat.loc[ind_tax, "GBIFstatus"] = db.get("status")
                dat.loc[ind_tax, "GBIFmatchtype"] = db.get("matchType")
                dat.loc[ind_tax, "GBIFtaxonRank"] = db.get("rank")
                dat.loc[ind_tax, "GBIFusageKey"] = db.get("usageKey")
                dat.loc[ind_tax, "note"] = "Synonym with no accepted alt, genus rank"
                try:
                    db_all_2 = get_species_name_backbone(
                        dat.loc[ind_tax, "Taxon"].iloc[0], strict=True
                    )
                except (HTTPError, Timeout):
                    print(
                        f"Failed to retrieve data for {dat.loc[ind_tax, 'Taxon'].iloc[0]} after 5 attempts. Skipping."
                    )
                    continue
        elif db.get("status") == "DOUBTFUL" and db.get("matchType") == "EXACT":
            dat.loc[ind_tax, "GBIFstatus"] = db.get("status")
            dat.loc[ind_tax, "GBIFmatchtype"] = db.get("matchType")
            dat.loc[ind_tax, "GBIFtaxonRank"] = db.get("rank")
            dat.loc[ind_tax, "GBIFusageKey"] = db.get("usageKey")
            dat.loc[ind_tax, "note"] = "Doubtful record"

            # Try again by stripping author name
            try:
                db_all_2 = get_species_name_backbone(
                    strip_author_name(taxon), strict=True
                )
            except (HTTPError, Timeout):
                print(
                    f"Failed to retrieve data for {strip_author_name(taxon)} after 5 attempts. Skipping."
                )
                continue

            db_2 = db_all_2

            if db_2.get("matchType") == "EXACT":
                dat.loc[ind_tax, "scientificName"] = db_2.get("scientificName")
                dat.loc[ind_tax, "GBIFstatus_Synonym"] = db_2.get("status")
                dat.loc[ind_tax, "species"] = db_2.get("species")
                dat.loc[ind_tax, "genus"] = db_2.get("genus")
                dat.loc[ind_tax, "family"] = db_2.get("family")
                dat.loc[ind_tax, "class"] = db_2.get("class")
                dat.loc[ind_tax, "order"] = db_2.get("order")
                dat.loc[ind_tax, "phylum"] = db_2.get("phylum")
                dat.loc[ind_tax, "kingdom"] = db_2.get("kingdom")
                dat.loc[ind_tax, "note"] = (
                    "Doubtful record, exact match after stripping author name"
                )
            continue
        else:
            dat.loc[ind_tax, "note"] = "No match found"
            mismatch_entry = {
                "Taxon": taxon,
                "status": db.get("status"),
                "matchType": db.get("matchType"),
            }
            mismatches = pd.concat(
                [mismatches, pd.DataFrame([mismatch_entry])], ignore_index=True
            )

    return dat, mismatches

In [45]:
# cycle through data files - collect all from species column where matchType is blank, NA or "HIGHERRANK" then make a dataframe for check_gbif_tax function with species names called species_to_recheck with Taxon and Taxon_orig cols

specie_unfound = []
for file in data_files:
    for index, row in file.iterrows():
        if row['matchType'] in ["", "NA", "HIGHERRANK"]:
            specie_unfound.append(row['species'])
# drop duplicates
specie_unfound = list(set(specie_unfound))

check_species_df = pd.DataFrame(specie_unfound, columns=["Taxon"])
# add Taxon_orig that duplicates Taxon col
check_species_df['Taxon_orig'] = check_species_df['Taxon']
matched_species, unmatched = check_gbif_tax(check_species_df)
# make dc into dataframe
matched_species = pd.DataFrame(matched_species)


def update_GBIFstatus(row):
    if row["GBIFstatus"] == "Missing" and row["GBIFstatus_Synonym"] != None:
        row["GBIFstatus"] = row["GBIFstatus_Synonym"]
    elif row["GBIFstatus"] == None and row["GBIFstatus_Synonym"] != None:
        row["GBIFstatus"] = row["GBIFstatus_Synonym"]
    return row


matched_species = matched_species.apply(update_GBIFstatus, axis=1)
# write matched_species to csv as previously_unmatched_species.csv
matched_species.to_csv(
    data_dir + "\species lists\previously_unmatched_species_gbif_match_sinas.csv"
)

Processing taxa: 100%|██████████| 1852/1852 [40:34<00:00,  1.31s/it]


In [106]:
data_files[0] = temp

In [107]:
# combine data_files and matched specis. Add taxonomy columns to data_files.
# load matched_species from csv
# matched_species = pd.read_csv(data_dir + "\species lists\previously_unmatched_species_gbif_match.csv")
gbif_backbone
for file in data_files:

    if "kingdom" not in file.columns:
        file["kingdom"] = None
        file["phylum"] = None
        file["class"] = None
        file["order"] = None
        file["family"] = None

        file["genus"] = None
        file["GBIFstatus"] = None
        file["GBIFtaxonRank"] = None
        file["taxonomic_species"] = None
    if "canonicalName" not in file.columns:
        file["canonicalName"] = None
    #if usageKey col is not string, set to string and remove any .0 from floats
    if file['usageKey'].dtype != 'str':
        file['usageKey'] = file['usageKey'].astype(str)
        file['usageKey'] = file['usageKey'].str.replace('.0', '')
    
    for index, row in file.iterrows():
     
        skip_uk = 0
        if row['matchType'] in ["", "NA", "HIGHERRANK"]:
            # search for species in matched_species
            search = matched_species[matched_species['Taxon_orig'] == row['species']]
            if search.empty:
                print(row['species'])
                # break
                print(row)
                continue
            elif search.iloc[0]['GBIFstatus'] == "Missing" or search.iloc[0]['GBIFstatus'] == None:
                print("making XX")
                if file.at[index, "matchType"] == "HIGHERRANK":
                    print("HIGHERRANK")
                    print(file.at[index, "species"])
                try:
                    file.at[index, "usageKey"] = "XX" + row['species'].replace(" ", "_")
                    print('creating UK' + "XX" + row['species'].replace(" ", "_"))
                except AttributeError:
                    print(row['species'])
                    pass
            
             
                continue
            else:
                skip_uk = 1
                search = search.iloc[0]
            file.at[index, "kingdom"] = search['kingdom']
            file.at[index, "phylum"] = search['phylum']
            file.at[index, "class"] = search['class']
            file.at[index, "order"] = search['order']
            file.at[index, "family"] = search['family']
            file.at[index, "genus"] = search['genus']
            file.at[index, "usageKey"] = search['GBIFusageKey']
            file.at[index, "canonicalName"] = search['Taxon']
            file.at[index, "scientificName"] = search['scientificName']
            # file.at[index, "GBIFstatus"] = search['GBIFstatus']
            file.at[index, "matchType"] = search['GBIFmatchtype']
            file.at[index, "rank"] = search['GBIFtaxonRank']
            file.at[index, "taxonomic_species"] = search['species']
        else: 
            # match to gbif_backbone
            search = gbif_backbone[gbif_backbone['usageKey'] == row['usageKey']]

            if search.empty:
                # print(row['usageKey'])
                # break
                # print(row)
                # print(row["species"])
                print('no match to gbif backbone')
                # print(row['usageKey'])
                if (
                    pd.isnull(file.at[index, "usageKey"])
                    or file.at[index, "usageKey"] == ""
                    or row['usageKey'] == "nan"
                ):

                    print("making XX")
                    if file.at[index, "matchType"] == "HIGHERRANK":
                        print("HIGHERRANK")
                        print(file.at[index, "species"])
                    try:
                        file.at[index, "usageKey"] = "XX" + row['species'].replace(" ", "_")
                        print('creating UK' + "XX" + row['species'].replace(" ", "_"))
                    except AttributeError:
                        print(row['species'])
                        pass
               
                continue
            else:
                print('foo')
                search = search.iloc[0]
                # print(search)
                file.at[index, "kingdom"] = search['kingdom']
                file.at[index, "phylum"] = search['phylum']
                file.at[index, "class"] = search['class']
                file.at[index, "order"] = search['order']
                file.at[index, "family"] = search['family']
                file.at[index, "genus"] = search['genus']
                file.at[index, "gbif_species"] = search['species']
                # if cannonical name blank
                if pd.isnull(row['canonicalName']):
                    file.at[index, "canonicalName"] = search['species']
                    file.at[index, "scientificName"] = search['scientificName']
                    # file.at[index, "GBIFstatus"] = search['GBIFstatus']
                    file.at[index, "matchType"] = search['taxonomicStatus']
                    file.at[index, "rank"] = search['taxonRank']
                    file.at[index, "taxonomic_species"] = search['species']
            # print file at index
            # print(file.at[index, "kingdom"])
            # if usageKey is na

        if pd.isnull(file.at[index, "usageKey"]) or file.at[index, "matchType"] == "HIGHERRANK" or file.at[index, "usageKey"] == "NA" or file.at[index, "usageKey"] == "" or pd.isnull(row['usageKey']):
            print("making XX")
            if file.at[index, "matchType"] == "HIGHERRANK":
                print("HIGHERRANK")
                print(file.at[index, "species"])
            try:
                file.at[index, "usageKey"] = "XX" + row['species'].replace(" ", "_")
                print('creating UK' + "XX" + row['species'].replace(" ", "_"))
            except AttributeError:
                print(row['species'])
        elif pd.isnull(row['usageKey'] ) or row['usageKey'] == "" or row['usageKey'] == "NaN" or row['usageKey'] == 'nan':
            print("NA")
            print(row['species'])
            print(row['usageKey'])
            print(file.at[index, "usageKey"])
            print(file.at[index, "species"])
        else:
            print(row['usageKey'])
            print('broke')

foo
2501066
broke
foo
2500461
broke
foo
2500685
broke
foo
4411080
broke
foo
2308887
broke
foo
4411124
broke
foo
4411119
broke
foo
7389097
broke
foo
8256670
broke
foo
8168527
broke
foo
8274448
broke
foo
7849089
broke
foo
2307544
broke
foo
4410938
broke
foo
6135528
broke
foo
9087002
broke
foo
9225458
broke
foo
4410208
broke
foo
2307610
broke
foo
9457976
broke
no match to gbif backbone
foo
2307609
broke
no match to gbif backbone
foo
4410206
broke
foo
4410206
broke
foo
2307755
broke
foo
2307759
broke
foo
2307758
broke
foo
2307758
broke
foo
2307756
broke
foo
9562076
broke
foo
8784979
broke
foo
4410818
broke
foo
4410778
broke
foo
2307757
broke
foo
2307760
broke
foo
8950917
broke
foo
4410234
broke
foo
7893065
broke
foo
11964820
broke
foo
9675087
broke
foo
2308786
broke
foo
8290434
broke
foo
7725323
broke
foo
8041190
broke
foo
7772922
broke
foo
7772922
broke
no match to gbif backbone
foo
8452410
broke
foo
5739805
broke
foo
8772214
broke
foo
9524019
broke
foo
9715336
broke
foo
9715336
broke
foo

In [108]:
data_files[0]

Unnamed: 0,taxonSINAS,usageKey,scientificName,taxonRank,matchType,Date,New,species,kingdom,phylum,class,order,family,genus,GBIFstatus,GBIFtaxonRank,taxonomic_species,gbif_species,canonicalName,rank
0,Paratenuisentis ambiguus,2501066,"Paratenuisentis ambiguus (Van Cleave, 1921)",SPECIES,ACCEPTED,2024-04-16,False,Paratenuisentis ambiguus,Animalia,Acanthocephala,Eoacanthocephala,Neoechinorhynchida,Tenuisentidae,Paratenuisentis,,,Paratenuisentis ambiguus,Paratenuisentis ambiguus,Paratenuisentis ambiguus,SPECIES
1,Acanthocephalus anguillae,2500461,"Acanthocephalus anguillae (Müller, 1780)",SPECIES,ACCEPTED,2024-04-16,False,Acanthocephalus anguillae,Animalia,Acanthocephala,Palaeacanthocephala,Echinorhynchida,Echinorhynchidae,Acanthocephalus,,,Acanthocephalus anguillae,Acanthocephalus anguillae,Acanthocephalus anguillae,SPECIES
2,Pomphorhynchus laevis,2500685,"Pomphorhynchus laevis (Zoega, 1776)",SPECIES,ACCEPTED,2024-04-16,True,Pomphorhynchus laevis,Animalia,Acanthocephala,Palaeacanthocephala,Echinorhynchida,Pomphorhynchidae,Pomphorhynchus,,,Pomphorhynchus laevis,Pomphorhynchus laevis,Pomphorhynchus laevis,SPECIES
3,Erpobdella testacea,4411080,"Erpobdella testacea (Savigny, 1822)",SPECIES,ACCEPTED,2024-04-16,False,Erpobdella testacea,Animalia,Annelida,Clitellata,Arhynchobdellida,Erpobdellidae,Erpobdella,,,Erpobdella testacea,Erpobdella testacea,Erpobdella testacea,SPECIES
4,Hirudo medicinalis,2308887,"Hirudo medicinalis Linnaeus, 1758",SPECIES,ACCEPTED,2024-04-16,False,Hirudo medicinalis,Animalia,Annelida,Clitellata,Arhynchobdellida,Hirudinidae,Hirudo,,,Hirudo medicinalis,Hirudo medicinalis,Hirudo medicinalis,SPECIES
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47200,Zizania latifolia (Griseb.) Turcz. ex Stapf,2703231,Zizania latifolia (Griseb.) Hance ex F.Muell.,,EXACT,2024-04-16,True,Zizania latifolia (Griseb.) Turcz. ex Stapf,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Zizania,,,Zizania latifolia,,Zizania latifolia (Griseb.) Turcz. ex Stapf,SPECIES
47201,Zizania latifolia Turcz. ex Stapf,2703231,Zizania latifolia (Griseb.) Hance ex F.Muell.,,EXACT,2024-04-16,True,Zizania latifolia Turcz. ex Stapf,Plantae,Tracheophyta,Liliopsida,Poales,Poaceae,Zizania,,,Zizania latifolia,,Zizania latifolia Turcz. ex Stapf,SPECIES
47202,Ziziphus xylopyrus Hochst. ex A.Rich.,3877691,Ziziphus xylopyrus (Retz.) Willd.,,EXACT,2024-04-16,True,Ziziphus xylopyrus Hochst. ex A.Rich.,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rhamnaceae,Ziziphus,,,Ziziphus xylopyrus,,Ziziphus xylopyrus Hochst. ex A.Rich.,SPECIES
47203,"Zonitoides nitida (O.F.Müller, 1774)","XXZonitoides_nitida_(O.F.Müller,_1774)",,,,2024-04-16,True,"Zonitoides nitida (O.F.Müller, 1774)",,,,,,,,,,,,


In [110]:
#show data files[0] where usageKey contains "XX" and remove na
test = data_files[0][data_files[0]['usageKey'].str.contains("XX", na=False)]
test

Unnamed: 0,taxonSINAS,usageKey,scientificName,taxonRank,matchType,Date,New,species,kingdom,phylum,class,order,family,genus,GBIFstatus,GBIFtaxonRank,taxonomic_species,gbif_species,canonicalName,rank
45354,Abutilon aurantiacum,XXAbutilon_aurantiacum,,,,2024-04-16,True,Abutilon aurantiacum,,,,,,,,,,,,
45355,Abutilon coccineum hort.,XXAbutilon_coccineum_hort.,,,,2024-04-16,True,Abutilon coccineum hort.,,,,,,,,,,,,
45356,Abutilon darwinii x pictum,XXAbutilon_darwinii_x_pictum,,,,2024-04-16,True,Abutilon darwinii x pictum,,,,,,,,,,,,
45357,Abutilon hybridum Hort.,XXAbutilon_hybridum_Hort.,,,,2024-04-16,True,Abutilon hybridum Hort.,,,,,,,,,,,,
45359,Abutilon megapotamicum x pictum,XXAbutilon_megapotamicum_x_pictum,,,,2024-04-16,True,Abutilon megapotamicum x pictum,,,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
47196,"Zaptychopsis buschi (H.C. Küster, 1844)","XXZaptychopsis_buschi_(H.C._Küster,_1844)",,,,2024-04-16,True,"Zaptychopsis buschi (H.C. Küster, 1844)",,,,,,,,,,,,
47197,Zelkova carpinifolia (Pall.) C. Koch,XXZelkova_carpinifolia_(Pall.)_C._Koch,,,,2024-04-16,True,Zelkova carpinifolia (Pall.) C. Koch,,,,,,,,,,,,
47198,Zika virus (ZIKV),XXZika_virus_(ZIKV),,,,2024-04-16,True,Zika virus (ZIKV),,,,,,,,,,,,
47203,"Zonitoides nitida (O.F.Müller, 1774)","XXZonitoides_nitida_(O.F.Müller,_1774)",,,,2024-04-16,True,"Zonitoides nitida (O.F.Müller, 1774)",,,,,,,,,,,,


In [109]:
#data_file[0] where usage key is na
nauk = data_files[0][pd.isnull(data_files[0]['usageKey'])]
nauk

Unnamed: 0,taxonSINAS,usageKey,scientificName,taxonRank,matchType,Date,New,species,kingdom,phylum,class,order,family,genus,GBIFstatus,GBIFtaxonRank,taxonomic_species,gbif_species,canonicalName,rank


In [127]:
cabi_gbif_match = data_files[0]
eppo_gbif_match = data_files[1]
daisie_gbif_match = data_files[2]

#write to csv 
cabi_gbif_match.to_csv(data_dir + "\species lists\gbif_matched\cabi_gbif_matched.csv")
eppo_gbif_match.to_csv(data_dir + "\species lists\gbif_matched\eppo_gbif_matched.csv")
daisie_gbif_match.to_csv(data_dir + "\species lists\gbif_matched\daisie_gbif_matched.csv")


In [113]:
sinas_gbif_match = data_files[0]
sinas_gbif_match.to_csv(data_dir + "\species lists\gbif_matched\sinas_gbif_matched.csv")


In [54]:
test = gbif.name_backbone("Lycopersicon esculentum", strict=True, verbose=True)

In [30]:
# backbone where usageKey = 5060876

bb = gbif_backbone[gbif_backbone["usageKey"] == 3112707]

bb

Unnamed: 0,usageKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,classKey,order,orderKey,family,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory


In [111]:
row

species               albuca
codeCABI            44007809
source                   NaN
usageKey               XXalb
scientificName           NaN
canonicalName            NaN
rank                     NaN
confidence               NaN
matchType                NaN
Date              2023-05-04
New                    False
kingdom                     
phylum                      
class                       
order                       
family                      
genus                       
Name: 12099, dtype: object

In [12]:
gbif.name_lookup("Lycopersicon esculentum", rank="SPECIES")["results"][0]

{'key': 159142211,
 'nameKey': 6530446,
 'datasetKey': '02fff9ea-e629-458d-b99e-cb7dc9deae01',
 'nubKey': 2930181,
 'parentKey': 159142210,
 'parent': 'Lycopersicon',
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Solanales',
 'family': 'Solanaceae',
 'genus': 'Lycopersicon',
 'species': 'Lycopersicon esculentum',
 'kingdomKey': 159140699,
 'phylumKey': 159140751,
 'classKey': 159141015,
 'orderKey': 159142130,
 'familyKey': 159142176,
 'genusKey': 159142210,
 'speciesKey': 159142211,
 'scientificName': 'Lycopersicon esculentum',
 'canonicalName': 'Lycopersicon esculentum',
 'authorship': '',
 'accordingTo': 'Taxonomía tomada de: Species 2000 & ITIS Catalogue of Life: April 2013; Estado taxonómico tomado de: http://www.iplantcollaborative.org/',
 'nameType': 'SCIENTIFIC',
 'taxonomicStatus': 'ACCEPTED',
 'rank': 'SPECIES',
 'origin': 'SOURCE',
 'numDescendants': 0,
 'numOccurrences': 0,
 'habitats': [],
 'nomenclaturalStatus': [],
 'threatStatuses': [],
 'descriptions': [

In [21]:
def get_sp_id(sp):
    sp_dict = gbif.name_backbone(sp)
    if "speciesKey" in sp_dict:
        return sp_dict["speciesKey"], sp_dict["species"]
    else:
        return None, None

In [24]:
gbif.name_backbone(name="Solanum verbascifolium")

{'usageKey': 2928997,
 'scientificName': 'Solanum L.',
 'canonicalName': 'Solanum',
 'rank': 'GENUS',
 'status': 'ACCEPTED',
 'confidence': 97,
 'matchType': 'HIGHERRANK',
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Solanales',
 'family': 'Solanaceae',
 'genus': 'Solanum',
 'kingdomKey': 6,
 'phylumKey': 7707728,
 'classKey': 220,
 'orderKey': 1176,
 'familyKey': 7717,
 'genusKey': 2928997,
 'synonym': False,
 'class': 'Magnoliopsida'}

In [22]:
get_synonyms(get_sp_id("Lycopersicon esculentum"))

HTTPError: 400 Client Error: Bad Request for url: https://api.gbif.org/v1/species/(None,%20None)/synonyms?limit=100

In [23]:
get_sp_id("Lycopersicon esculentum")

(None, None)

In [16]:
def get_synonyms(sp_key):
    nu = gbif.name_usage(sp_key, data="synonyms")["results"]
    if len(nu) > 0:
        return [i["canonicalName"] for i in nu if "canonicalName" in i]
    else:
        return []

In [25]:
 search = gbif.name_backbone(name=row['species'])

            if search['matchType'] == "NONE":
                search = gbif.name_lookup(row['species'], rank="SPECIES")
            if 'count' in search and search['count'] == 0:
                print('gn resolving' + row['species'])
                other_sources = gn.resolve(row['species'])
                try:
                    matched_name = other_sources[0][0]
                    search = gbif.name_lookup(matched_name, rank="SPECIES")
                    if search['count'] == 0:
                        print('still no luck')
                        continue
                    search = search['results'][0]
                    try:
                        row['kingdom'] = search['kingdom']
                    except KeyError:
                        pass
                    try:
                        row['phylum'] = search['phylum']
                    except KeyError:
                        pass
                    try:
                        row['class'] = search['class']
                    except KeyError:
                        pass
                    try:

                        row['order'] = search['order']
                    except KeyError:
                        pass
                    try:
                        row['family'] = search['family']
                    except KeyError:
                        pass
                    try:
                        row['genus'] = search['genus']
                        row['usageKey'] = search['usageKey']
                        #row['matchType'] = search['matchType']
                        row['canonicalName'] = search['canonicalName']
                        row['scientificName'] = search['scientificName']
                        print("changed row" + row['species'] + " to " + search['scientificName'] + " with usageKey " + search['usageKey'])
                    except KeyError:
                        print('HR KeyError 1')
                        continue
                except IndexError:
                    print("HR IndexError")
                    continue

            else:
                if 'results' in search:
                    search = search['results'][0]

                try:
                    row['kingdom'] = search['kingdom']
                except KeyError:
                    pass
                try:
                    row['phylum'] = search['phylum']
                except KeyError:
                    pass
                try:
                    row['class'] = search['class']
                except KeyError:
                    pass
                try:

                    row['order'] = search['order']
                except KeyError:
                    pass
                try:
                    row['family'] = search['family']
                except KeyError:
                    pass
                try:
                    row['usageKey'] = search['usageKey']
                except KeyError:
                    try: 
                        row['usageKey'] = search['speciesKey']
                    except KeyError:
                        pass
                try:
                    row['genus'] = search['genus']
                    
                    #row['matchType'] = search['matchType']
                    row['canonicalName'] = search['canonicalName']
                    row['scientificName'] = search['scientificName']
                    print("changed row" + row['species'] + " to " + search['scientificName'] + " with usageKey ")
                except KeyError:
                    print('HR KeyErro 2')
                    
                    continue

IndentationError: unexpected indent (39153588.py, line 3)

In [81]:
search3['count']

KeyError: 'count'

In [64]:
test = gbif.name_lookup("Trientalis europaea", rank="SPECIES")["results"][0]
test

{'key': 206098550,
 'nameKey': 11447999,
 'datasetKey': '2c38cf8a-f981-4dfb-bc9d-dd2b6fc792ed',
 'nubKey': 3169295,
 'parentKey': 206113510,
 'parent': 'Plantae',
 'kingdom': 'Plantae',
 'species': 'Trientalis europaea',
 'kingdomKey': 206113510,
 'speciesKey': 206098550,
 'scientificName': 'Trientalis europaea',
 'canonicalName': 'Trientalis europaea',
 'authorship': '',
 'nameType': 'SCIENTIFIC',
 'taxonomicStatus': 'ACCEPTED',
 'rank': 'SPECIES',
 'origin': 'SOURCE',
 'numDescendants': 0,
 'numOccurrences': 0,
 'habitats': [],
 'nomenclaturalStatus': [],
 'threatStatuses': [],
 'descriptions': [],
 'vernacularNames': [],
 'higherClassificationMap': {'206113510': 'Plantae'},
 'synonym': False}

In [93]:
test2 = gbif.name_suggest(q="Athyrium alpestre")

In [105]:
test2[2]

{'key': 8282577,
 'nameKey': 1166416,
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Polypodiales',
 'family': 'Athyriaceae',
 'genus': 'Pseudathyrium',
 'species': 'Pseudathyrium alpestre',
 'kingdomKey': 6,
 'phylumKey': 7707728,
 'classKey': 7228684,
 'orderKey': 392,
 'familyKey': 4922264,
 'genusKey': 7289277,
 'speciesKey': 4017779,
 'parent': 'Pseudathyrium',
 'parentKey': 7289277,
 'nubKey': 8282577,
 'scientificName': 'Athyrium alpestre (Hoppe) Rylands ex T.Moore',
 'canonicalName': 'Athyrium alpestre',
 'rank': 'SPECIES',
 'status': 'HETEROTYPIC_SYNONYM',
 'higherClassificationMap': {'6': 'Plantae',
  '7707728': 'Tracheophyta',
  '7228684': 'Polypodiopsida',
  '392': 'Polypodiales',
  '4922264': 'Athyriaceae',
  '7289277': 'Pseudathyrium',
  '4017779': 'Pseudathyrium alpestre'},
 'synonym': True,
 'class': 'Polypodiopsida'}

In [106]:
test3 = gbif.name_backbone(name="Athyrium alpestre", strict=True)
test3

{'usageKey': 4922264,
 'scientificName': 'Athyriaceae',
 'canonicalName': 'Athyriaceae',
 'rank': 'FAMILY',
 'status': 'ACCEPTED',
 'confidence': 98,
 'matchType': 'HIGHERRANK',
 'kingdom': 'Plantae',
 'phylum': 'Tracheophyta',
 'order': 'Polypodiales',
 'family': 'Athyriaceae',
 'kingdomKey': 6,
 'phylumKey': 7707728,
 'classKey': 7228684,
 'orderKey': 392,
 'familyKey': 4922264,
 'synonym': False,
 'class': 'Polypodiopsida'}

In [33]:
#make a list of all species in the invasive_all_source list that do not have a usageKey and have a canonicalName beginning with "xx" or "XX"
xxlist = []
for index, row in invasive_all_source.iterrows():

#if usageKey begins with "xx" or "XX" a
    if row['usageKey'].startswith("xx") or row['usageKey'].startswith("XX"):
      
        if pd.notnull(row['speciesCABI']):
            species = row['speciesCABI']
        elif pd.notnull(row['speciesASFR']):
            species = row['speciesASFR']
        
        elif pd.notnull(row['speciesEPPO']):
            species = row['speciesEPPO']
        elif pd.notnull(row['speciesDAISIE']):
            species = row['speciesDAISIE']
        else:
            species = "NA"
        xxlist.append(species)

In [13]:
invasive_all_source
#add columns to invasive_all_source for kingdom, phylum, class, order, family, genus
invasive_all_source["kingdom"] = ""
invasive_all_source["phylum"] = ""
invasive_all_source["class"] = ""
invasive_all_source["order"] = ""
invasive_all_source["family"] = ""
invasive_all_source["genus"] = ""


Unnamed: 0,speciesCABI,codeCABI,usageKey,invasiveCABI,speciesASFR,source,speciesEPPO,codeEPPO,invasiveEPPO,speciesDAISIE,codeDAISIE,scientificName,canonicalName,rank
0,Fusarium solani,81903.0,5251899,True,,,,,,,,Fusarium solani W.C.Snyder,Fusarium solani,SPECIES
1,Acridotheres cristatellus,97370105.0,2489010,False,Acridotheres cristatellus,ASFR,,,,Acridotheres cristatellus,50310.0,"Acridotheres cristatellus (Linnaeus, 1758)",Acridotheres cristatellus,SPECIES
2,Macrorhynchia philippina,53061963.0,2266916,False,Macrorhynchia philippina,ASFR,,,,Macrorhynchia philippina,100544.0,"Macrorhynchia philippina Kirchenpauer, 1872",Macrorhynchia philippina,SPECIES
3,Peronospora aquilegiicola,12131388.0,2583264,True,,,Peronospora sp.,PEROSP,False,,,Peronospora Corda,Peronospora,GENUS
4,Peronospora aquilegiicola,12131388.0,2583264,True,,,Peronospora aquilegiicola,PEROAQ,False,,,Peronospora Corda,Peronospora,GENUS
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
52989,,,3221935,,,,,,,Brenneria quercina,901589.0,"Brenneria quercina (Hildebrand & Schroth, 1967...",Brenneria quercina,SPECIES
52990,,,3221940,,,,,,,Erwinia alni,901591.0,"Erwinia alni Surico et al., 1996",Erwinia alni,SPECIES
52991,,,3221944,,,,,,,Erwinia salicis,901593.0,"Erwinia salicis (Day, 1924) Chester, 1939",Erwinia salicis,SPECIES
52992,,,2591578,,,,,,,Myriellina cydoniae,901601.0,Myriellina cydoniae (Desm.) Höhn.,Myriellina cydoniae,SPECIES
