# Demo: Reading Linked Data
* For this exercise you will need to install SPARQLWrapper:
  * __`~/anaconda3/bin/conda install -c conda-forge sparqlwrapper`__

In [None]:
import pandas as pd
import json
from SPARQLWrapper import SPARQLWrapper, JSON

In [None]:
def get_sparql_dataframe(service, query):
    """
    Run a SPARQL query against an endpoint and return the result as a
    Pandas data frame.

    Args:
        service: URL of the SPARQL endpoint to query.
        query: The SPARQL query text to send.

    Returns:
        A DataFrame with one column per variable listed in the response's
        ``head.vars`` (in that order) and one row per result binding.
        A variable that is absent from a given binding yields ``None``
        in that row.
    """
    endpoint = SPARQLWrapper(service)
    endpoint.setQuery(query)
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().response

    # The raw response body is decoded here rather than by the wrapper.
    payload = json.load(response)
    variables = payload['head']['vars']

    # Each binding maps a variable name to a dict holding its 'value';
    # OPTIONAL variables may be missing, hence the chained .get() calls.
    rows = [
        [binding.get(var, {}).get('value') for var in variables]
        for binding in payload['results']['bindings']
    ]

    return pd.DataFrame(rows, columns=variables)

In [None]:
# URL of the Wikidata SPARQL endpoint that the query below is sent to.
wds = "https://query.wikidata.org/sparql"

In [None]:
# This is a SPARQL query to send to the SPARQL endpoint defined in the
# previous step. It mixes four vocabularies (bd, wikibase, wdt and
# schema) that each have their own definitions, but it is ultimately a
# selection from the Wikidata graph.
#
# We're looking for distinct rows of individuals who have an ORCID
# (https://orcid.org) and any English descriptions and labels we
# might also have about them. We're matching a pattern in the graph
# for any node that is connected to other nodes with these relationships.
# The description is OPTIONAL, so items without an English description
# are still returned. ?itemLabel is not matched by a triple pattern of
# our own; it is supplied by the wikibase:label SERVICE block.
#
# Note that the relationships themselves are often resolvable. To
# understand what wdt:P496 means, expand it into its full URL by
# applying the prefix for wdt and then issue an HTTP request to
# http://www.wikidata.org/prop/direct/P496
#
# Every prefix used in the query is declared explicitly (including
# schema:), so the query does not rely on prefixes the endpoint happens
# to predefine.
#
# For more information on SPARQL, please consult "Learning SPARQL (2nd
# Edition)" by Bob DuCharme.

rq = """
PREFIX bd: <http://www.bigdata.com/rdf#>
PREFIX wikibase: <http://wikiba.se/ontology#>
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX schema: <http://schema.org/>

select distinct
   ?item
   ?itemLabel
   ?orcid
   ?description
WHERE {
  ?item wdt:P496 ?orcid .
  OPTIONAL { ?item schema:description ?description filter (lang(?description) = "en") }
  SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en" .
  }
} Limit 100000
"""

In [None]:
# Send the query to the endpoint and collect the results in a DataFrame.
wikidf = get_sparql_dataframe(wds, rq)

In [None]:
# Let's inspect the first few rows of our DataFrame
wikidf.head()

In [None]:
# Some summary stats for the DataFrame's columns...
wikidf.describe()

In [None]:
# Top 10 most frequently occurring descriptions.
# NOTE: this column holds the free-text English description of each item,
# not a dedicated occupation field; the values tend to read like job titles.
wikidf['description'].value_counts()[:10]

In [None]:
# A Jupyter trick to store this DataFrame so we can use it in our exercise:
# the %store magic persists the variable, and another notebook can bring
# it back with `%store -r wikidf`.
%store wikidf