Objectives

 * Find deprecated GBIF taxonIDs in Wikidata and update them with the latest ones that are accepted in the GBIF Taxonomy, if available
   * Strategy: Get all records from Wikidata with GBIF taxonIDs, compare against current version of GBIF Backbone Taxonomy. If deleted, search for currently accepted GBIF taxonID
   * Getting all taxa with GBIF taxonIDs in a single query will time out. We can instead get a list of all genera (or families), and then progressively work through the child taxa of each genus
 * Find homonyms
   * 

In [2]:
import requests
import json

from collections import Counter, defaultdict
from lxml import etree
from time import sleep
from math import floor

In [4]:
# Wikidata SPARQL endpoint; the query is sent as the `query` URL parameter.
wd_url = "https://query.wikidata.org/sparql"
# Get genera with GBIF taxonIDs
# (P105 restricted to wd:Q34740 selects the genus-level items; P846 holds the
# GBIF taxonID, which is bound to ?gbif.)
query = """
SELECT DISTINCT ?item ?gbif WHERE {
  ?item wdt:P105 wd:Q34740;
        wdt:P846 ?gbif.
}
"""

# Single GET request; `req` is kept at notebook scope because later cells
# inspect it (req.ok, req.url) and parse its body as XML.
req = requests.get(wd_url, params = {'query' : query })

In [5]:
req.ok

True

In [7]:
req.url

'https://query.wikidata.org/sparql?query=%0ASELECT+DISTINCT+%3Fitem+%3Fgbif+WHERE+%7B%0A++%3Fitem+wdt%3AP105+wd%3AQ34740%3B%0A++++++++wdt%3AP846+%3Fgbif.%0A%7D%0A'

In [51]:
# Parse the SPARQL XML response.
# The raw bytes (req.content) are handed to lxml so that the parser honours the
# encoding declared in the XML prolog; passing the decoded str (req.text) raises
# ValueError "Unicode strings with encoding declaration are not supported", and
# re-encoding it depends on requests having guessed the charset correctly.
rtree = etree.fromstring(req.content)
# Strip namespace prefix so elements can be addressed by their local names.
# iter() replaces the deprecated getiterator().
for e in rtree.iter():
    e.tag = etree.QName(e).localname

# Map each GBIF taxonID to the list of Wikidata item URIs that carry it.
gbif2wd = defaultdict(list)
for e in rtree.iterdescendants('result'):
    # binding name -> text of each child element of that binding;
    # iterating the element directly replaces the deprecated getchildren().
    res_dict = {ee.get('name') : [i.text for i in ee] for ee in e.findall('binding')}
    for gbif in res_dict['gbif']:
        gbif2wd[gbif].extend(res_dict['item'])

In [58]:
# Errors: a GBIF ID attached to more than one Wikidata item - either a duplicate
# record in Wikidata or a homonym linked in error
duplicate_gbif_id = {
    gbif_id: wd_items
    for gbif_id, wd_items in gbif2wd.items()
    if len(wd_items) > 1
}

In [47]:
res_dict

{'item': ['http://www.wikidata.org/entity/Q46660'], 'gbif': ['4974046']}

Example searches:

Get all genus-level taxa with GBIF IDs (searching for species-level taxa will time out):
```
SELECT DISTINCT ?item ?gbif WHERE {
  ?item wdt:P105 wd:Q34740;
        wdt:P846 ?gbif.
}
```

Search by GBIF ID; get the taxon name property and taxon author qualifier, as well as the botanist author abbreviation:
```
SELECT DISTINCT ?item ?itemLabel ?taxonname ?taxonauthor ?taxonauthorLabel ?botanist WHERE {
  SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE]". }
  ?item wdt:P846 "8178883";
        wdt:P225 ?taxonname.
  OPTIONAL { ?item p:P225 ?tname . ?tname pq:P405 ?taxonauthor . ?taxonauthor wdt:P428 ?botanist }
}
```

Traverse the graph to find child taxa of Q158501:
```
PREFIX gas: <http://www.bigdata.com/rdf/gas#>

SELECT ?item ?itemLabel ?linkTo
WHERE
{
  SERVICE gas:service {
    gas:program gas:gasClass "com.bigdata.rdf.graph.analytics.SSSP" ;
                gas:in wd:Q158501;
                gas:traversalDirection "Reverse" ;
                gas:out ?item ;
                gas:out1 ?depth ;
                gas:maxIterations 3 ;
                gas:linkType wdt:P171 .
  }
  OPTIONAL { ?item wdt:P171 ?linkTo }
  SERVICE wikibase:label {bd:serviceParam wikibase:language "en" }
}
```

In [None]:
def query_wikidata_by_taxonIDs(ids, p_number_in="P846", p_number_out="P685", chunksize=100, sleeplen=2, url="https://query.wikidata.org/sparql"):
    """
    Given a list of taxonIDs, e.g. GBIF, find linked taxonIDs of another type, e.g. NCBI, from Wikidata.
    The P-numbers for the query and return taxonIDs should be specified. This function formulates a
    simple SPARQL query and submits it in chunks to the Wikidata servers.

    Parameters
    ----------
    ids : list
        List of taxonIDs
    p_number_in : str
        P number of the taxon identifier to query (default: P846 for GBIF Backbone)
    p_number_out : str
        P number of the linked taxon identifiers to return (default: P685 for NCBI Taxonomy)
    chunksize : int
        Number of records to search at once; values above 100 are reduced to 100 (default 100)
    sleeplen : int
        Seconds to pause after each request (default 2)
    url : str
        Wikidata SPARQL query URL (default https://query.wikidata.org/sparql)

    Returns
    -------
    list
        List of dicts, one per result row, with keys 'query_taxonID' and
        'wikidata_uri', plus 'linked_taxonID_wd' when the item has a value
        for `p_number_out`. An empty `ids` yields an empty list without
        contacting the server.

    Raises
    ------
    ValueError
        If `chunksize` is smaller than 1.

    Notes
    -----
    A chunk whose HTTP request fails is skipped with a `RuntimeWarning`
    instead of being dropped silently; the remaining chunks are still queried.
    """
    import warnings

    if chunksize < 1:
        raise ValueError("chunksize must be at least 1, got %r" % (chunksize,))
    if chunksize > 100:
        chunksize = 100 # Limit chunk size to 100

    ids = list(ids)
    out = []
    # Step through the ids in slices of `chunksize`. Stepping by start offset
    # (rather than counting floor(len/chunksize)+1 chunks) never produces an
    # empty trailing chunk, so no query with an empty VALUES clause is sent.
    for start in range(0, len(ids), chunksize):
        # Quote each id as a SPARQL string literal, escaping backslashes and
        # double quotes so an unusual id cannot break out of the literal.
        id_string = " ".join(
            '"%s"' % str(i).replace("\\", "\\\\").replace('"', '\\"')
            for i in ids[start:start + chunksize]
        )

        # No LIMIT clause: one id can match several items (duplicates,
        # homonyms) or several linked ids, so a chunk of 100 ids may
        # legitimately return more than 100 rows.
        query = """
        SELECT ?item ?itemLabel ?rprop ?rpropLabel ?taxonname ?taxonnameLabel ?QPROP WHERE {
          VALUES ?QPROP {%s}
          ?item wdt:%s ?QPROP.
          OPTIONAL {
            ?item wdt:%s ?rprop.
            ?item wdt:P225 ?taxonname.
          }
          SERVICE wikibase:label { bd:serviceParam wikibase:language "[AUTO_LANGUAGE],en". }
        }
        """ % (id_string, p_number_in, p_number_out) # %-formatting: str.format would trip over the literal braces
        r1 = requests.get(url, params={'query' : query})

        if r1.ok:
            # Parse XML from the raw bytes so lxml honours the encoding
            # declared in the document (a decoded str with an encoding
            # declaration is rejected by lxml with ValueError).
            rtree = etree.fromstring(r1.content)
            # Strip namespace prefix so elements can be found by local name
            for e in rtree.iter():
                e.tag = etree.QName(e).localname

            # Translate results to dictionary
            for e in rtree.iterdescendants('result'):
                res_dict = {ee.get('name') : ee for ee in e.findall('binding')}
                res = {
                    'query_taxonID' : res_dict['QPROP'].find('literal').text,
                    'wikidata_uri' : res_dict['item'].find('uri').text,
                }
                # ?rprop sits in an OPTIONAL block, so it may be unbound
                if 'rprop' in res_dict:
                    res['linked_taxonID_wd'] = res_dict['rprop'].find('literal').text
                out.append(res)
        else:
            warnings.warn(
                "Wikidata query for ids[%d:%d] failed with HTTP status %s; chunk skipped"
                % (start, start + chunksize, r1.status_code),
                RuntimeWarning,
            )

        # Pause after every request to stay polite to the query service
        sleep(sleeplen)
    return out