In [1]:
import pandas as pd
import json
import requests

from collections import Counter, defaultdict
from lxml import etree
from time import sleep
from math import floor

In [2]:
# Load the JSON file that maps input-file labels to their locations on disk
with open("paths.json", mode="r") as paths_file:
    filepaths = json.load(paths_file)

# Get names and identifiers for matching

## GBIF dataset - Get current accepted names and taxonIDs

Use option `dtype=str` to avoid interpreting taxonIDs as numeric

In [3]:
# Read the GBIF Backbone taxonomy as a tab-separated table.
# Every column is kept as a string so that taxonIDs are never parsed as numbers.
gbif_backbone = pd.read_csv(
    filepaths["gbif_backbone"],
    sep="\t",
    dtype=str,
    na_values=None,
)

# acceptedNameUsageID is blank for currently accepted taxa.
# Logically it should equal taxonID, so fill it in here; this keeps the
# downstream merges on acceptedNameUsageID simple.
is_accepted = gbif_backbone['taxonomicStatus'] == 'accepted'
gbif_backbone.loc[is_accepted, 'acceptedNameUsageID'] = gbif_backbone.loc[is_accepted, 'taxonID']

In [4]:
# Raw dataset: tab-separated, with all columns read as strings
dataset = pd.read_csv(
    filepaths["dataset"],
    sep="\t",
    dtype=str,
    na_values=None,
)

In [5]:
# Preview the first rows of the raw dataset
dataset.head()

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,classKey,order,orderKey,family,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory
0,2722299,Carex ×pieperiana Junge,2722300,Carex ×ruedtii Kneuck.,2,SPECIES,SYNONYM,Plantae,6,Tracheophyta,...,196,Poales,1369,Cyperaceae,7708,Carex,2721893,Carex ruedtii,2722300,NE
1,2874569,Cucumis sativus L.,2874569,Cucumis sativus L.,33,SPECIES,ACCEPTED,Plantae,6,Tracheophyta,...,220,Cucurbitales,7224005,Cucurbitaceae,6634,Cucumis,2874568,Cucumis sativus,2874569,NE
2,2888292,Pyrola media Sw.,2888292,Pyrola media Sw.,249,SPECIES,ACCEPTED,Plantae,6,Tracheophyta,...,220,Ericales,1353,Ericaceae,2505,Pyrola,2888249,Pyrola media,2888292,NE
3,2996064,Rubus rhytidophyllus H.E.Weber,2996064,Rubus rhytidophyllus H.E.Weber,50,SPECIES,ACCEPTED,Plantae,6,Tracheophyta,...,220,Rosales,691,Rosaceae,5015,Rubus,2988638,Rubus rhytidophyllus,2996064,NE
4,2998929,Rubus kiesewetteri Henker,2998929,Rubus kiesewetteri Henker,16,SPECIES,ACCEPTED,Plantae,6,Tracheophyta,...,220,Rosales,691,Rosaceae,5015,Rubus,2988638,Rubus kiesewetteri,2998929,NE


Genus Ammophila in the dataset was wrongly annotated as the homonymous genus of insects when the dataset was ingested by GBIF.

Issue has been reported: https://github.com/gbif/portal-feedback/issues/4666

In [6]:
# Count dataset rows per kingdom to spot taxa annotated outside Plantae
dataset.value_counts('kingdom').to_frame()

Unnamed: 0_level_0,count
kingdom,Unnamed: 1_level_1
Plantae,7207
Animalia,1
incertae sedis,1


In [7]:
# Show the rows whose kingdom is anything other than Plantae
dataset.query('kingdom != "Plantae"')

Unnamed: 0,taxonKey,scientificName,acceptedTaxonKey,acceptedScientificName,numberOfOccurrences,taxonRank,taxonomicStatus,kingdom,kingdomKey,phylum,...,classKey,order,orderKey,family,familyKey,genus,genusKey,species,speciesKey,iucnRedListCategory
1722,0,incertae sedis,,,19304,KINGDOM,,incertae sedis,0,,...,,,,,,,,,,
3161,1346141,"Ammophila W.Kirby, 1798",1346141.0,"Ammophila W.Kirby, 1798",485,GENUS,ACCEPTED,Animalia,1,Arthropoda,...,216.0,Hymenoptera,1457.0,Sphecidae,4352.0,Ammophila,1346141.0,,,


In [8]:
# Merge on taxonID: look up every species-rank name of the dataset in the GBIF backbone
species_names = dataset.loc[dataset['taxonRank'] == 'SPECIES', ['taxonKey', 'scientificName']]
backbone_columns = [
    'taxonID',
    'parentNameUsageID',
    'scientificName',
    'scientificNameAuthorship',
    'canonicalName',
    'acceptedNameUsageID',
    'taxonomicStatus',
    'kingdom', 'phylum', 'class', 'order', 'family', 'genus',
]
# Both sides carry a 'scientificName' column, so the merge suffixes them _x (dataset)
# and _y (backbone); the rename below gives them explicit names.
dataset_merged = species_names.merge(
    gbif_backbone[backbone_columns],
    how='left',
    left_on='taxonKey',
    right_on='taxonID',
).rename(
    columns={
        'scientificName_y': 'scientificName',
        'taxonKey': 'dataset_taxonID',
        'scientificName_x': 'dataset_scientificName',
    }
)

In [9]:
# Get accepted scientific name: look up the backbone name that each
# acceptedNameUsageID points to and attach it as 'acceptedName'
accepted_names = gbif_backbone[['taxonID', 'scientificName']].rename(
    columns={'taxonID': 'acceptedNameUsageID', 'scientificName': 'acceptedName'}
)
dataset_merged = dataset_merged.merge(
    accepted_names,
    on="acceptedNameUsageID",
    how="left",
)

In [10]:
# Tally the taxonomicStatus of the merged names; dropna=False keeps missing values in the tally
# NaN: taxonID not present in the current GBIF backbone taxonomy
# Doubtful: no accepted name in current GBIF backbone taxonomy
dataset_merged.value_counts('taxonomicStatus', dropna=False).to_frame()

Unnamed: 0_level_0,count
taxonomicStatus,Unnamed: 1_level_1
accepted,4873
synonym,396
homotypic synonym,367
doubtful,105
heterotypic synonym,85
,50


In [11]:
# Drop doubtful and missing taxa: keep only rows that resolve to an accepted name ID
has_accepted_id = dataset_merged['acceptedNameUsageID'].notna()
dataset_merged_curr = dataset_merged[has_accepted_id]

In [12]:
# Export TSV file with fields required for name matching
name_match_columns = ['scientificName', 'family', 'taxonID']
dataset_merged_curr[name_match_columns].to_csv(
    "name-match/dataset_merged_curr.tsv",
    sep="\t",
    index=False,
    quoting=None,
)

In [13]:
# Preview the names that will be passed to name matching
dataset_merged_curr.head()

Unnamed: 0,dataset_taxonID,dataset_scientificName,taxonID,parentNameUsageID,scientificName,scientificNameAuthorship,canonicalName,acceptedNameUsageID,taxonomicStatus,kingdom,phylum,class,order,family,genus,acceptedName
0,2722299,Carex ×pieperiana Junge,2722299,2721893,Carex pieperiana Junge,Junge,Carex pieperiana,2722300,synonym,Plantae,Tracheophyta,Liliopsida,Poales,Cyperaceae,Carex,Carex ruedtii Kneuck.
1,2874569,Cucumis sativus L.,2874569,2874568,Cucumis sativus L.,L.,Cucumis sativus,2874569,accepted,Plantae,Tracheophyta,Magnoliopsida,Cucurbitales,Cucurbitaceae,Cucumis,Cucumis sativus L.
2,2888292,Pyrola media Sw.,2888292,2888249,Pyrola media Sw.,Sw.,Pyrola media,2888292,accepted,Plantae,Tracheophyta,Magnoliopsida,Ericales,Ericaceae,Pyrola,Pyrola media Sw.
3,2996064,Rubus rhytidophyllus H.E.Weber,2996064,2988638,Rubus rhytidophyllus H.E.Weber,H.E.Weber,Rubus rhytidophyllus,2996064,accepted,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rosaceae,Rubus,Rubus rhytidophyllus H.E.Weber
4,2998929,Rubus kiesewetteri Henker,2998929,2988638,Rubus kiesewetteri Henker,Henker,Rubus kiesewetteri,2998929,accepted,Plantae,Tracheophyta,Magnoliopsida,Rosales,Rosaceae,Rubus,Rubus kiesewetteri Henker


## NCBI Taxonomy - get species level taxa within Viridiplantae

Get list of species-level taxa within Viridiplantae for name-matching.

 * Limit the list to Viridiplantae to avoid spurious matches with homonyms in other kingdoms.
 * Include synonyms because there may be differences in which names are accepted between GBIF and NCBI.
 * Include the authors if available to disambiguate e.g. nomina invalida

In [14]:
# Build `shortlist`: NCBI taxonID (str) -> dict describing each taxon within Viridiplantae.
# Keys filled in below: 'lineage' (always), 'rank' (only for species),
# 'canonicalName' and 'scientificName' (list of names with authorship; species only).
shortlist = {}
# Faster to use taxidlineage from new_taxdump than to compute lineages locally
with open(filepaths["ncbi_taxidlineage"], 'r') as fh_in:
    for line in fh_in:
        spl = line.split("\t|\t")
        if len(spl) > 1:
            lineage = spl[1].split(" ")
            if "33090" in lineage: # Viridiplantae taxonID
                shortlist[spl[0]] = { 'lineage' : lineage }

# Flag the shortlisted taxa that have rank 'species'
with open(filepaths["ncbi_nodes"], 'r') as fh_in:
    for line in fh_in:
        spl = line.split("\t|\t")
        if len(spl) > 3:
            if spl[0] in shortlist and spl[2] == "species":
                shortlist[spl[0]]['rank'] = 'species'

# Collect names for the shortlisted species
with open(filepaths["ncbi_names"], 'r') as fh_in:
    for line in fh_in:
        # Strip the row terminator even when the last line has no trailing newline
        spl = line.rstrip("\n").removesuffix("\t|").split("\t|\t")
        if len(spl) < 4:
            continue # Malformed or blank line
        entry = shortlist.get(spl[0])
        if entry is None or entry.get('rank') != 'species':
            continue
        if spl[3] == 'scientific name': # Should only be one
            entry['canonicalName'] = spl[1]
        elif spl[3] == 'authority':
            # There can be more than one 'authority' for a given taxonID
            # because synonyms share the same NCBI taxonID
            entry.setdefault('scientificName', []).append(spl[1])

with open('name-match/ncbi_viridiplantae.tsv', 'w') as fh_out:
    fh_out.write("scientificName\ttaxonID\n") # Header line
    for taxid, entry in shortlist.items():
        if entry.get('rank') != 'species':
            continue
        names = list(entry.get('scientificName', []))
        # Also list the canonical name as a backup, in case the author of the accepted
        # name is not available (Example: Ricinus communis L., taxonID NCBI:3988,
        # not found in names.dmp file!)
        if 'canonicalName' in entry:
            names.append(entry['canonicalName'])
        if not names:
            # Warn only when the taxon really has neither kind of name
            print("Warning: no scientificName or canonicalName for taxon " + taxid)
        for name in names:
            fh_out.write("\t".join([name, taxid]))
            fh_out.write("\n")

# Find matches with gndiff

Name matching between scientific names (including authors) from GBIF and the NCBI Taxonomy (subset of taxa belonging to Viridiplantae), with authors where available.

[Gndiff](https://github.com/gnames/gndiff) is a tool from the Global Names Architecture project, which allows comparison of Linnean scientific names between two text tables. There are other GNA tools available, including a lookup service (Gnfinder) but we decided on Gndiff because we wanted a tool that could be used offline with a specific version of the databases of interest, in order to keep analysis reproducible.

In [15]:
%%bash
# Round 1 of name matching. Remove every double-quote character from the NCBI name
# list in place, then run gndiff with the GBIF names as the first table and the NCBI
# names as the second, saving the result as JSON.
# NOTE(review): `sed -i` without a backup suffix is GNU sed syntax - confirm on macOS/BSD.
sed -i 's/\"//g' name-match/ncbi_viridiplantae.tsv # Strip quote chars
gndiff --format pretty name-match/dataset_merged_curr.tsv name-match/ncbi_viridiplantae.tsv > name-match/round_1.json

In [16]:
def classify_gndiff_result(match):
    """Classify name matching results from gndiff

    Names from GBIF used as source, names NCBI taxonomy as reference. Gndiff performs name
    matching (including author if available) and classifies matches as "Exact", "Partial",
    or "Fuzzy". However, the "Exact" match only takes the binomen into account, so the
    author fields may be missing or mismatched. We wish to be able to detect homonyms so
    the author field is important to us.

    Only the first (top) reference record is considered.

    Parameters
    ----------
    match : dict
        Individual match results in list 'Matches' from gndiff results. Results reported
        in json format and parsed by python json.load

    Returns
    -------
    dict
        With the following keys: `gbif_speciesName`, `gbif_taxonID` (from `sourceRecord`),
        `ncbi_speciesName`, `ncbi_taxonID` (from the top entry of `referenceRecords`;
        absent for `no_hit`, empty strings for `partial`), and the hit classified under
        `status` (possible values: `exact_match`, `author_mismatch`, `noauthor`, `fuzzy`,
        `partial`, `no_hit`). `status` is left unset if the top hit has a `matchType`
        other than "Exact", "Fuzzy" or "Partial...".
    """
    # Read everything from the `match` argument, never from a module-level variable
    source = match['sourceRecord']
    rec = {
        'gbif_speciesName' : source['name'],
        'gbif_taxonID' : source['id'],
    }

    references = match.get('referenceRecords')
    if not references: # key missing, null, or empty list
        rec['status'] = 'no_hit'
        return rec

    # At least one hit - if multiple hits, top hit should have best score
    best = references[0]
    rec['ncbi_speciesName'] = best['name']
    rec['ncbi_taxonID'] = best['id']
    match_type = best['matchType']

    if match_type == 'Exact':
        if 'authors' in source and 'authors' in best:
            # Author names are present on both sides: compare them ignoring order
            if set(source['authors']) == set(best['authors']):
                rec['status'] = 'exact_match'
            else:
                rec['status'] = 'author_mismatch'
        else:
            # One or both sets of author names absent
            rec['status'] = 'noauthor'
    elif match_type == 'Fuzzy':
        rec['status'] = 'fuzzy'
    elif match_type.startswith("Partial"):
        # If best hit is only a partial match, e.g. only genus matches,
        # then it is discarded
        rec['status'] = 'partial'
        rec['ncbi_speciesName'] = ""
        rec['ncbi_taxonID'] = ""
    return rec

In [17]:
# Read results from gndiff
with open('name-match/round_1.json', 'r') as fh:
    hits1 = json.load(fh)

# Classify status of results by our criteria
# NOTE(review): the loop variable is left named `i` on purpose; renaming it is only
# safe if classify_gndiff_result does not read a module-level `i` - confirm before changing.
hits1_parsed = []
for i in hits1['Matches']:
    rec = classify_gndiff_result(i)
    hits1_parsed.append(rec)

# Convert to data frame
# Keys missing from a record (e.g. no NCBI name for a 'no_hit') become empty strings
hits1_df = pd.DataFrame(hits1_parsed).fillna('')

In [18]:
# Tally the first-round results by match status
hits1_df.value_counts("status", dropna=False).to_frame()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
exact_match,3526
partial,1330
author_mismatch,373
noauthor,265
no_hit,190
fuzzy,37


In [19]:
# Merge matched names with the original table.
# Only hits that carry an NCBI taxonID are kept; the right join keeps one row per such hit.
round1_columns = [
    'dataset_scientificName',
    'dataset_taxonID',
    'acceptedNameUsageID',
    'acceptedName',
    'gbif_speciesName',
    'gbif_taxonID',
    'taxonomicStatus',
    'ncbi_speciesName',
    'ncbi_taxonID',
    'status',
]
hits1_with_ncbi = hits1_df[hits1_df['ncbi_taxonID'] != '']
round1_matched = dataset_merged_curr.merge(
    hits1_with_ncbi,
    how='right',
    left_on="taxonID",
    right_on="gbif_taxonID",
)[round1_columns]

In [20]:
# Number of rows matched in the first round
round1_matched.shape[0]

4201

## For names with no hits, get synonyms and do matching again

Matches in the NCBI taxonomy were not found for about a fifth of the GBIF names in the first round. It is possible that some of those species are known under a different synonym in the NCBI database.

From the GBIF database, we search for known synonyms of each name by finding other names with the same `acceptedNameUsageID`, and then match these synonyms against the NCBI list.

Note that this assumes that the synonyms represent equivalent taxonomic concepts and that synonymy is transitive, which may not be true!

In [21]:
# Names with no hits in the first round of matching (no NCBI taxonID recorded),
# joined back to the dataset table to recover their accepted-name identifiers
hits1_without_ncbi = hits1_df[hits1_df['ncbi_taxonID'] == '']
round1_nohits = hits1_without_ncbi.merge(
    dataset_merged_curr,
    left_on="gbif_taxonID",
    right_on="taxonID",
)[['dataset_scientificName', 'dataset_taxonID', 'acceptedNameUsageID', 'acceptedName']]

In [22]:
# Get the synonyms from the GBIF Backbone Taxonomy: every backbone name that
# shares an acceptedNameUsageID with an unmatched name
backbone_name_usages = gbif_backbone[['taxonID', 'scientificName', 'acceptedNameUsageID', 'taxonomicStatus']]
round1_nohits_synonyms = round1_nohits.merge(
    backbone_name_usages,
    on="acceptedNameUsageID",
)

In [23]:
# Tally the candidate synonyms by their taxonomic status in the backbone
round1_nohits_synonyms.value_counts("taxonomicStatus", dropna=False).to_frame()

Unnamed: 0_level_0,count
taxonomicStatus,Unnamed: 1_level_1
synonym,11228
homotypic synonym,1542
accepted,1512
heterotypic synonym,1475
proparte synonym,29


In [24]:
# Write to CSV for Gndiff (tab-separated: name, then GBIF taxonID)
synonym_columns = ['scientificName', 'taxonID']
round1_nohits_synonyms[synonym_columns].to_csv(
    'name-match/synonyms.tsv',
    sep="\t",
    index=False,
    quoting=None,
)

In [25]:
%%bash
# Round 2 of name matching: run gndiff with the GBIF synonyms as the first table and
# the NCBI Viridiplantae names as the second, saving the result as JSON.
gndiff --format pretty name-match/synonyms.tsv name-match/ncbi_viridiplantae.tsv > name-match/round_2.json

In [26]:
# Read results from gndiff
with open('name-match/round_2.json', 'r') as fh:
    hits2 = json.load(fh)

# Classify status of results by our criteria
# NOTE(review): the loop variable is left named `i` on purpose; renaming it is only
# safe if classify_gndiff_result does not read a module-level `i` - confirm before changing.
hits2_parsed = []
for i in hits2['Matches']:
    rec = classify_gndiff_result(i)
    hits2_parsed.append(rec)

# Convert to data frame
# Keys missing from a record (e.g. no NCBI name for a 'no_hit') become empty strings
hits2_df = pd.DataFrame(hits2_parsed).fillna('')

In [27]:
# Majority of synonyms do not have a hit
# (rows, columns) of the second-round results that carry no NCBI taxonID
hits2_df.query('ncbi_taxonID == ""').shape

(15229, 5)

In [28]:
# Get synonyms with hits from the second round of matching.
# Only hits that carry an NCBI taxonID are kept; the right join keeps one row per such hit.
round2_columns = [
    'dataset_scientificName',
    'dataset_taxonID',
    'acceptedNameUsageID',
    'acceptedName',
    'gbif_speciesName',
    'gbif_taxonID',
    'taxonomicStatus',
    'ncbi_speciesName',
    'ncbi_taxonID',
    'status',
]
hits2_with_ncbi = hits2_df[hits2_df['ncbi_taxonID'] != ""]
round2_matched = round1_nohits_synonyms.merge(
    hits2_with_ncbi,
    how='right',
    left_on="taxonID",
    right_on="gbif_taxonID",
)[round2_columns]

In [29]:
# (rows, columns) matched in the first round
round1_matched.shape

(4201, 10)

In [30]:
# (rows, columns) matched in the second round, via synonyms
round2_matched.shape

(604, 10)

In [31]:
# Merge the original checklist names and IDs with the matched NCBI names
checklist_keys = ['dataset_scientificName', 'dataset_taxonID', 'acceptedNameUsageID', 'acceptedName']
result_columns = checklist_keys + [
    'gbif_speciesName',
    'gbif_taxonID',
    'taxonomicStatus',
    'ncbi_speciesName',
    'ncbi_taxonID',
    'status',
]
matches_both_rounds = pd.concat([round1_matched, round2_matched])
dataset_merged_upd = dataset_merged_curr[checklist_keys].merge(
    matches_both_rounds,
    on=checklist_keys,
    how='left',
).fillna('')

# Separate into those with and without hits
# (after fillna, an empty gbif_speciesName marks a checklist name with no match)
dataset_merged_matched = dataset_merged_upd[dataset_merged_upd['gbif_speciesName'] != ""]
dataset_merged_unmatched = dataset_merged_upd[dataset_merged_upd['gbif_speciesName'] == ""]

# For those species without hits,
# fill in gbif_speciesName and taxonomicStatus with the current values
# corresponding to this taxonID in the GBIF backbone
backbone_current = gbif_backbone[['taxonID', 'scientificName', 'taxonomicStatus']].rename(
    columns={'taxonID': 'gbif_taxonID', 'scientificName': 'gbif_speciesName'}
)
unmatched_without_gbif = dataset_merged_unmatched.drop(
    columns=['gbif_speciesName', 'gbif_taxonID', 'taxonomicStatus']
)
dataset_merged_unmatched = backbone_current.merge(
    unmatched_without_gbif,
    left_on="gbif_taxonID",
    right_on="dataset_taxonID",
    how="right",
)[result_columns]

# Combine again into a single table
dataset_merged_upd = pd.concat(
    [dataset_merged_matched, dataset_merged_unmatched]
).sort_values('status')

Peculiarities of the NCBI Taxonomy dump files:

 * NCBI Taxonomy includes synonyms under the same taxonID. 
 * In NCBI Taxonomy `names.dmp` terminology, `scientific name` is equivalent to `dwc:acceptedNameUsage` *without* `dwc:scientificNameAuthorship` (e.g. `dwc:genus` + `dwc:specificEpithet`), whereas `authority` is equivalent to a `dwc:scientificName` that includes `dwc:scientificNameAuthorship`. This means for a given taxonID, there is only one `scientific name`, but there can be multiple `authority`s corresponding to the scientific Name + authorship of the accepted name, as well as the scientific name + authorship of the other synonyms. However, the `authority` name corresponding to the accepted `scientific name` is not indicated.
 * For some species, `authority`s corresponding to synonyms are present, but not the `authority` for the accepted name! Example: NCBI:3988 has `scientific name` "Ricinus communis", but the only `authority` is for the synonym "Ricinus sanguineus hort. ex Groenl.".

To account for such cases we get the scientific names both with and without authors to ensure that the accepted "canonical" name for each taxonID is included, with or without the authorship.

In [32]:
# Map each NCBI taxonID to its single 'scientific name' entry (name without authorship).
# The names dump is located through paths.json, like every other input file, so that
# this lookup always reads the same dump that the name matching was built from.
ncbi_canonical = {}
with open(filepaths["ncbi_names"], 'r') as fh:
    for line in fh:
        # Strip the row terminator even when the last line has no trailing newline
        spl = line.rstrip("\n").removesuffix("\t|").split("\t|\t")
        if len(spl) > 3 and spl[3] == 'scientific name':
            ncbi_canonical[spl[0]] = spl[1]
# Merge into table; taxonIDs without a scientific name (including "") get an empty string
dataset_merged_upd['ncbi_acceptedName'] = dataset_merged_upd['ncbi_taxonID'].apply(
    lambda x: ncbi_canonical.get(x, "")
)

In [33]:
# Test case: NCBI Taxonomy dump files do not include scientific name with author, only the canonical name
# Show the rows whose matched NCBI name is the author-less "Ricinus communis"
dataset_merged_upd.query('ncbi_speciesName == "Ricinus communis"')

Unnamed: 0,dataset_scientificName,dataset_taxonID,acceptedNameUsageID,acceptedName,gbif_speciesName,gbif_taxonID,taxonomicStatus,ncbi_speciesName,ncbi_taxonID,status,ncbi_acceptedName
3823,Ricinus communis L.,5380041,5380041,Ricinus communis L.,Ricinus communis L.,5380041,accepted,Ricinus communis,3988,noauthor,Ricinus communis


In [34]:
# Tally the combined table by match status; an empty status marks names with no NCBI match
dataset_merged_upd['status'].value_counts().to_frame()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
exact_match,3730
,1318
author_mismatch,512
noauthor,457
fuzzy,106


# Get linked identifiers from Wikidata

NB: Re-running this code will produce different results because Wikidata is continually updated. The batch edits illustrated below would already have been executed by the time this notebook is published.

In [35]:
def query_wikidata_by_taxonIDs(ids, p_number_in="P846", p_number_out="P685", chunksize=100, sleeplen=2, url="https://query.wikidata.org/sparql"):
    """
    Given a list of taxonIDs, e.g. GBIF, find linked taxonIDs of another type, e.g. NCBI, from Wikidata.
    The P-numbers for the query and return taxonIDs should be specified. This function formulates a
    simple SPARQL query and submits it in chunks to the Wikidata servers.

    Chunks whose HTTP request fails are reported with a printed warning and skipped.

    Parameters
    ----------
    ids : list
        List of taxonIDs
    p_number_in : str
        P number of the taxon identifier to query (default: P846 for GBIF Backbone)
    p_number_out : str
        P number of the linked taxon identifiers to return (default: P685 for NCBI Taxonomy)
    chunksize : int
        Number of records to search at once; clamped to the range 1-100 (default 100)
    sleeplen : int
        Seconds to pause between chunks (default 2)
    url : str
        Wikidata SPARQL query URL (default https://query.wikidata.org/sparql)

    Returns
    -------
    list
        List of dicts, one per result row, with keys 'query_taxonID' and 'wikidata_uri',
        plus 'linked_taxonID_wd' when the row has a value for `p_number_out`.
        An ID can appear in several rows. An empty `ids` returns an empty list without
        contacting the server.
    """
    chunksize = max(1, min(chunksize, 100)) # Limit chunk size to 1..100
    ids = list(ids)
    out = []
    # Step through the IDs in chunks; stepping the range (rather than counting
    # floor(n/chunksize)+1 chunks) avoids a trailing query with no IDs in it
    for start in range(0, len(ids), chunksize):
        if start > 0:
            sleep(sleeplen) # Pause between consecutive queries

        id_string = " ".join(
            '\"' + str(i) + '\"' for i in ids[start : start + chunksize]
        )

        # No LIMIT clause: the chunk size already bounds the query, and one ID can
        # legitimately return several rows, which a fixed LIMIT would silently cut off.
        # %-formatting is used because the literal braces of the SPARQL text would all
        # have to be doubled for str.format.
        query = """
        SELECT ?item ?itemLabel ?rprop ?rpropLabel ?taxonname ?taxonnameLabel ?QPROP WHERE {
          VALUES ?QPROP {%s}
          ?item wdt:%s ?QPROP.
          OPTIONAL {
            ?item wdt:%s ?rprop.
            ?item wdt:P225 ?taxonname.
          }
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
        }
        """ % (id_string, p_number_in, p_number_out)
        r1 = requests.get(
            url,
            params={'query' : query},
            headers={'Accept' : 'application/sparql-results+xml'}, # The parser below expects XML
        )

        if not r1.ok:
            print(
                "Warning: Wikidata query failed with HTTP status %s; skipping %d IDs starting at position %d"
                % (r1.status_code, len(ids[start : start + chunksize]), start)
            )
            continue

        # Parse XML
        rtree = etree.fromstring(
            r1.text.encode()
        ) # .encode otherwise ValueError "Unicode strings with encoding declaration are not supported"
        # Strip namespace prefix so that elements can be found by their plain names
        for e in rtree.iter():
            e.tag = etree.QName(e).localname

        # Translate results to dictionary
        for e in rtree.iterdescendants('result'):
            res_dict = {ee.get('name') : ee for ee in e.findall('binding')}
            res = {
                'query_taxonID' : res_dict['QPROP'].find('literal').text,
                'wikidata_uri' : res_dict['item'].find('uri').text,
            }
            if 'rprop' in res_dict:
                res['linked_taxonID_wd'] = res_dict['rprop'].find('literal').text
            out.append(res)
    return out

In [36]:
# Get all the GBIF taxonIDs and find corresponding Wikidata items and linked NCBI taxonIDs where available
distinct_gbif_ids = set(dataset_merged_upd['gbif_taxonID'])
gbif_taxonIDs = list(distinct_gbif_ids)

wikidata_gbif2ncbi_linked_IDs = query_wikidata_by_taxonIDs(
    gbif_taxonIDs,
    p_number_in="P846",
    p_number_out="P685",
)

In [37]:
# Index the Wikidata results by the GBIF taxonID that was queried:
# gbif2wd gives the Wikidata item URI, gbif2ncbi the NCBI taxonID linked in Wikidata.
# If an ID occurs in several result rows, the last row wins.
gbif2wd = {}
gbif2ncbi = {}
for wd_row in wikidata_gbif2ncbi_linked_IDs:
    if 'wikidata_uri' in wd_row:
        gbif2wd[wd_row['query_taxonID']] = wd_row['wikidata_uri']
    if 'linked_taxonID_wd' in wd_row:
        gbif2ncbi[wd_row['query_taxonID']] = wd_row['linked_taxonID_wd']

# Attach both lookups to the table; IDs without a result get an empty string
dataset_merged_upd['gbif_wikidata_uri'] = dataset_merged_upd['gbif_taxonID'].apply(
    lambda x: gbif2wd.get(x, '')
)
dataset_merged_upd['ncbi_taxonID_wd'] = dataset_merged_upd['gbif_taxonID'].apply(
    lambda x: gbif2ncbi.get(x, '')
)

# Remove duplicate lines
dataset_merged_upd = dataset_merged_upd.drop_duplicates()

In [38]:
# Rows where Wikidata links an NCBI taxonID and it equals the one found by name matching, tallied by status
dataset_merged_upd.query('ncbi_taxonID_wd != "" & ncbi_taxonID == ncbi_taxonID_wd').value_counts('status').to_frame()

Unnamed: 0_level_0,count
status,Unnamed: 1_level_1
exact_match,3130
author_mismatch,271
noauthor,211
fuzzy,5


In [39]:
# NCBI taxonID found but no Wikidata item - look up Wikidata with NCBI taxonID instead
needs_ncbi_lookup = (dataset_merged_upd['ncbi_taxonID'] != "") & (dataset_merged_upd['gbif_wikidata_uri'] == "")
ncbi_taxonIDs = list(set(dataset_merged_upd.loc[needs_ncbi_lookup, 'ncbi_taxonID']))

In [40]:
# Query Wikidata by NCBI taxonID (P685) and return the linked GBIF taxonID (P846)
wikidata_ncbi2gbif_linked_IDs = query_wikidata_by_taxonIDs(
    ncbi_taxonIDs,
    p_number_in="P685",
    p_number_out="P846",
)

In [41]:
# Index the Wikidata results by the NCBI taxonID that was queried:
# ncbi2wd gives the Wikidata item URI, ncbi2gbif the GBIF taxonID linked in Wikidata.
# If an ID occurs in several result rows, the last row wins.
ncbi2wd = {}
ncbi2gbif = {}
for wd_row in wikidata_ncbi2gbif_linked_IDs:
    if 'wikidata_uri' in wd_row:
        ncbi2wd[wd_row['query_taxonID']] = wd_row['wikidata_uri']
    if 'linked_taxonID_wd' in wd_row:
        ncbi2gbif[wd_row['query_taxonID']] = wd_row['linked_taxonID_wd']

# Attach both lookups to the table; IDs without a result get an empty string
dataset_merged_upd['ncbi_wikidata_uri'] = dataset_merged_upd['ncbi_taxonID'].apply(
    lambda x: ncbi2wd.get(x, '')
)
dataset_merged_upd['gbif_taxonID_wd'] = dataset_merged_upd['ncbi_taxonID'].apply(
    lambda x: ncbi2gbif.get(x, '')
)

# Remove duplicate lines
dataset_merged_upd = dataset_merged_upd.drop_duplicates()

## Classify different cases and flag cases for manual curation

In [42]:
# No NCBI taxonIDs found either by name matching or Wikidata lookup, despite accounting for synonyms
# Rows are grouped by (dataset_taxonID, dataset_scientificName). Each group gets a single
# 'curation' value: 'no_match_incl_synonyms' when neither ncbi_taxonID nor ncbi_taxonID_wd
# has any truthy value in the group, otherwise "".
# NOTE(review): this relies on missing IDs being empty strings (falsy) - confirm no NaN remains.
# pd.merge is called without `on`, so the per-group result is joined back on the column
# names the two frames share.
df = pd.merge(
    dataset_merged_upd,
    dataset_merged_upd.groupby(['dataset_taxonID','dataset_scientificName']).apply(
        lambda x: pd.Series(
            {
                'curation' : 'no_match_incl_synonyms' 
                if not (x['ncbi_taxonID'].any(axis=0) or x['ncbi_taxonID_wd'].any(axis=0))
                else ""
            } # brackets after 'if not' are necessary!
        )
    ).reset_index()
)

In [43]:
# Classify each row for curation. The conditions are computed up front (none of them
# reads the 'curation' column) and then applied in a fixed order; where several rules
# select the same row, the later rule overwrites the earlier one.
ncbi_ids_agree = df['ncbi_taxonID'] == df['ncbi_taxonID_wd']
ncbi_ids_differ = df['ncbi_taxonID'] != df['ncbi_taxonID_wd']
is_exact_match = df['status'] == "exact_match"
# Wikidata links a GBIF taxonID, and it is not the one we have
wd_has_other_gbif_id = (df['gbif_taxonID'] != df['gbif_taxonID_wd']) & (df['gbif_taxonID_wd'] != "")

curation_rules = [
    # Name matching and Wikidata agree on the taxonID, and the authorships also match
    (ncbi_ids_agree & is_exact_match, 'ok_auto'),
    # Name matching has exact match but Wikidata links a different NCBI taxonID
    # -- may be a deprecated/merged identifier
    (ncbi_ids_differ & is_exact_match & (df['ncbi_taxonID_wd'] != ""), 'wd_update_ncbi_taxonid'),
    # Name matching has exact match but Wikidata does not link an NCBI taxonID
    # -- Wikidata should be updated
    (
        (df['gbif_wikidata_uri'] != "")
        & (df['ncbi_taxonID_wd'] == "")
        & (df['ncbi_wikidata_uri'] == "")
        & is_exact_match,
        'wd_batch_add_ncbi_taxonid',
    ),
    # Name matching and Wikidata agree on the taxonID, but NCBI taxonomy does not give an author
    # -- have to manually verify authors
    (ncbi_ids_agree & (df['status'] == 'noauthor'), 'wd_verify_authorship'),
    # Name matching and Wikidata agree on the taxonID, but authorships do not match
    # -- have to manually verify authors
    (ncbi_ids_agree & (df['status'] == 'author_mismatch'), 'wd_verify_authorship'),
    # Name matching has exact match and GBIF taxon is an accepted name,
    # but Wikidata has a different GBIF taxonID
    # -- Wikidata should be updated
    (
        wd_has_other_gbif_id & is_exact_match & (df['taxonomicStatus'] == "accepted"),
        'wd_batch_update_gbif_taxonid',
    ),
    # Name matching has exact match, Wikidata has a different GBIF taxonID,
    # but GBIF taxon is not an accepted name
    # -- probably a synonym in GBIF, verify before linking identifiers
    (
        wd_has_other_gbif_id & is_exact_match & (df['taxonomicStatus'] != "accepted"),
        'gbif_verify_synonym',
    ),
]
for rule_mask, curation_label in curation_rules:
    df.loc[rule_mask, 'curation'] = curation_label

In [44]:
# Tally the curation classes within each match status
df.groupby('status')['curation'].value_counts().to_frame()

Unnamed: 0_level_0,Unnamed: 1_level_0,count
status,curation,Unnamed: 2_level_1
,no_match_incl_synonyms,1310
,,8
author_mismatch,wd_verify_authorship,271
author_mismatch,,217
exact_match,ok_auto,3130
exact_match,wd_batch_update_gbif_taxonid,245
exact_match,wd_batch_add_ncbi_taxonid,177
exact_match,gbif_verify_synonym,108
exact_match,,33
exact_match,wd_update_ncbi_taxonid,11


In [45]:
# Export to TSV files, one per curation category
no_curation_labels = ["ok_auto", "no_match_incl_synonyms"]
batch_curation_labels = ["wd_batch_update_gbif_taxonid", "wd_batch_add_ncbi_taxonid"]
export_sort_keys = ['dataset_taxonID', 'gbif_speciesName']

curation_exports = [
    # Cases needing no further curation
    ("results/no_curation_needed.tsv", df['curation'].isin(no_curation_labels)),
    # Cases where Wikidata can be batch-edited to resolve clear errors or outdated information
    ("results/batch_curation.tsv", df['curation'].isin(batch_curation_labels)),
    # Cases that require manual curation: everything not covered by the two groups above
    (
        "results/manual_curation_needed.tsv",
        ~df['curation'].isin(no_curation_labels + batch_curation_labels),
    ),
]
for export_path, selected_rows in curation_exports:
    df[selected_rows].sort_values(export_sort_keys).to_csv(
        export_path,
        sep="\t", quoting=None, index=False
    )

## Batch edits of Wikidata

The following cases represent exactly-matching names between the GBIF and NCBI taxonomies (including authorship fields), but where the information in Wikidata was missing, outdated, or incorrect. For these cases Wikidata can be automatically batch-updated.

 * `wd_batch_add_ncbi_taxonid` : Wikidata has no link to an NCBI taxonID - add the NCBI taxonID. In many cases the name is not currently accepted in the NCBI Taxonomy, where the accepted taxon and its synonyms share the same NCBI taxonID.
 * `wd_batch_update_gbif_taxonid` : Wikidata has a different GBIF taxonID, but the GBIF taxonID we have is the currently accepted one in the GBIF Backbone Taxonomy - update the GBIF taxonID. Spot checks of several such cases show that a wrong, homonymous taxon was linked, likely as the result of name matching without verification of authorship.


Wikidata identifiers for reference:
 * NCBI taxonID `P685`
 * GBIF taxonID `P846`
 * Stated in `P248`
 * Taxonomy database of the U.S. National Center for Biotechnology Information `Q13711410`
 * Retrieved `P813`

Use QuickStatements for batch editing https://quickstatements.toolforge.org/#/. The CSV files generated below can be imported via the web interface. Alternatively the commands can be copied and pasted into the browser. One can also interact with QuickStatements via its API.

Requires a Wikidata user account that is autoconfirmed (the account must be at least 4 days old and have made at least 50 edits).

In [46]:
# Write QuickStatements V2 (CSV) commands that replace an outdated GBIF taxonID on a
# Wikidata item with the one found here.
update_header = [
    "qid",
    "-P846", # Remove previous GBIF taxonID claim
    "#", # edit comment for the removal
    "P846", # GBIF taxonID
    "#", # edit comment for the addition
    "S248", # stated in
    "s813", # retrieved
]
gbif_updates = df.loc[
    df['curation'] == "wd_batch_update_gbif_taxonid",
    ['ncbi_wikidata_uri', 'gbif_taxonID_wd', 'gbif_taxonID'],
]
with open("results/wd_batch_update_gbif_taxonid.quickstatements.csv", "w") as fh:
    fh.write(",".join(update_header) + "\n")
    for item_uri, old_gbif_id, new_gbif_id in gbif_updates.itertuples(index=False, name=None):
        fields = [
            item_uri.split('/')[-1], # Q-number is the last path segment of the item URI
            '\"\"\"' + old_gbif_id + '\"\"\"', # Strings or external-ids should be triple-quoted
            'Remove identifier pointing to homonym with wrong authorship',
            '\"\"\"' + new_gbif_id + '\"\"\"', # Strings or external-ids should be triple-quoted
            'Add correct GBIF identifier',
            'Q1531570', # Global Biodiversity Information Facility
            # Timestamp of the GBIF Backbone Taxonomy dump we are using.
            # Dates/times must follow this format, /11 indicates precision to day
            '+2021-11-26T00:00:00Z/11',
        ]
        fh.write(",".join(fields) + "\n")

In [47]:
# Write QuickStatements V2 (CSV) commands that add the NCBI taxonID found by name
# matching to Wikidata items that have none.
add_header = [
    "qid",
    "P685", # NCBI taxonID
    "#", # edit comment
    "S248", # stated in
    "s813", # retrieved
]
ncbi_additions = df.loc[
    df['curation'] == "wd_batch_add_ncbi_taxonid",
    ['gbif_wikidata_uri', 'ncbi_taxonID'],
]
with open("results/wd_batch_add_ncbi_taxonid.quickstatements.csv", "w") as fh:
    fh.write(",".join(add_header) + "\n")
    for item_uri, ncbi_id in ncbi_additions.itertuples(index=False, name=None):
        fields = [
            item_uri.split('/')[-1], # Q-number is the last path segment of the item URI
            '\"\"\"' + ncbi_id + '\"\"\"', # Strings or external-ids should be triple-quoted
            'Add NCBI Taxonomy identifier',
            'Q13711410', # Taxonomy database of the U.S. National Center for Biotechnology Information
            # Timestamp of the NCBI Taxonomy dump we are using.
            # Dates/times must follow this format, /11 indicates precision to day
            '+2022-12-01T00:00:00Z/11',
        ]
        fh.write(",".join(fields) + "\n")