<a href="https://colab.research.google.com/github/otoperalias/miscellanea/blob/main/Wikidata_queries.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Link
https://itnext.io/extracting-data-from-wikidata-using-sparql-and-python-59e0037996f


In [None]:
import sys
import pandas as pd
from typing import List, Dict
from SPARQLWrapper import SPARQLWrapper, JSON

class WikiDataQueryResults:
    """
    Runs a SPARQL query against the Wikidata Query Service endpoint and exposes the result rows as a Pandas DataFrame.
    """
    def __init__(self, query: str):
        """
        Builds a SPARQLWrapper client for the Wikidata endpoint, attaches the query to it and requests JSON results.
        :param query: A SPARQL query string.
        """
        # The user agent carries the running interpreter's major.minor version.
        self.user_agent = "WDQS-example Python/%s.%s" % sys.version_info[:2]
        self.endpoint_url = "https://query.wikidata.org/sparql"
        self.sparql = SPARQLWrapper(self.endpoint_url, agent=self.user_agent)
        self.sparql.setQuery(query)
        self.sparql.setReturnFormat(JSON)

    def __transform2dicts(self, results: List[Dict]) -> List[Dict]:
        """
        Flattens SPARQL JSON bindings by keeping only the 'value' entry of every bound variable.
        :param results: A list of bindings, each mapping a variable name to a dictionary that has a 'value' key.
        :return: A list of dictionaries, one per binding, mapping each variable name directly to its value.
        """
        return [
            {variable: cell['value'] for variable, cell in binding.items()}
            for binding in results
        ]

    def _load(self) -> List[Dict]:
        """
        Runs the query through SPARQLWrapper and flattens the returned bindings.
        :return: A list of dictionaries, one per result row, mapping each bound variable name to its value.
        """
        response = self.sparql.queryAndConvert()
        bindings = response['results']['bindings']
        return self.__transform2dicts(bindings)

    def load_as_dataframe(self) -> pd.DataFrame:
        """
        Executes the SPARQL query and wraps the flattened rows in a Pandas DataFrame.
        :return: A Pandas DataFrame built from the list of row dictionaries.
        """
        rows = self._load()
        return pd.DataFrame.from_dict(rows)

In [None]:
# SPARQL query: humans born in Andalusia, with English labels for the person,
# occupation and birth location. It is stored in `query`, the variable the
# final cell passes to WikiDataQueryResults (whichever cell assigned `query`
# most recently wins).
# Andalusia is Wikidata item Q5783; the previously used Q207 is a different item.
# NOTE(review): `wdt:P19 wd:Q5783` only matches people whose recorded place of
# birth is the Andalusia item itself; including people recorded with a city or
# town would need a path such as `wdt:P19/wdt:P131* wd:Q5783` — confirm scope.
query = """
SELECT DISTINCT ?person ?personLabel ?occupationLabel ?dateOfBirth ?birthLocationLabel
WHERE {
  ?person wdt:P31 wd:Q5 ;  # Select instances of humans
          wdt:P19 wd:Q5783 ;  # Born in Andalusia (Spain)
          wdt:P106 ?occupation ;  # Occupation
          wdt:P569 ?dateOfBirth ;  # Date of birth
          rdfs:label ?personLabel .  # Person's label
  ?occupation rdfs:label ?occupationLabel .  # Occupation's label
  ?person wdt:P19 ?birthLocation .  # Birth location
  ?birthLocation rdfs:label ?birthLocationLabel .  # Birth location's label
  FILTER(LANG(?personLabel) = "en")  # English labels only
  FILTER(LANG(?occupationLabel) = "en")  # English occupation labels only
  FILTER(LANG(?birthLocationLabel) = "en")  # English birth location labels only
}
ORDER BY ?personLabel
"""

This query selects individuals who:

- Are instances of human (Q5).
- Were born in Andalusia (place of birth, P19; Andalusia's Wikidata item is Q5783).
- Have an occupation (P106).
- Have a date of birth (P569).

The results will include their name, occupation, date of birth, and birth location. Feel free to execute this query on the Wikidata SPARQL endpoint to explore the fascinating biographies of people from Andalusia! 🌟

1: Wikidata:SPARQL query service/queries/examples

In [None]:
# SPARQL query (Spanish variant): humans born in Andalusia, keeping only
# Spanish labels. It is stored in `query`, the variable the final cell passes
# to WikiDataQueryResults (whichever cell assigned `query` most recently wins).
# Andalusia is Wikidata item Q5783; the previously used Q207 is a different item.
# The occupation and birth-place label variables are now projected alongside
# the entity URIs; before, they were filtered by language but never returned.
# NOTE(review): `wdt:P19 wd:Q5783` only matches people whose recorded place of
# birth is the Andalusia item itself; including people recorded with a city or
# town would need a path such as `wdt:P19/wdt:P131* wd:Q5783` — confirm scope.
query = """
SELECT DISTINCT ?persona ?nombrePersona ?ocupacion ?ocupacionLabel ?fechaNacimiento ?lugarNacimiento ?lugarNacimientoLabel
WHERE {
  ?persona wdt:P31 wd:Q5 ;  # Select instances of humans
          wdt:P19 wd:Q5783 ;  # Born in Andalusia (Spain)
          wdt:P106 ?ocupacion ;  # Occupation
          wdt:P569 ?fechaNacimiento ;  # Date of birth
          rdfs:label ?nombrePersona .  # Person's name
  ?ocupacion rdfs:label ?ocupacionLabel .  # Occupation's label
  ?persona wdt:P19 ?lugarNacimiento .  # Birth location
  ?lugarNacimiento rdfs:label ?lugarNacimientoLabel .  # Birth location's label
  FILTER(LANG(?nombrePersona) = "es")  # Spanish labels only
  FILTER(LANG(?ocupacionLabel) = "es")  # Spanish occupation labels only
  FILTER(LANG(?lugarNacimientoLabel) = "es")  # Spanish birth location labels only
}
ORDER BY ?nombrePersona
"""

In [None]:
# Run the SPARQL query against the Wikidata endpoint and preview the result.
# NOTE: `query` must already hold a SPARQL query string when this cell runs.
data_extracter = WikiDataQueryResults(query)
# Executes the query and converts the result rows into a DataFrame.
df = data_extracter.load_as_dataframe()
# Show only the first rows as a quick sanity check.
print(df.head())