In [None]:
import warnings
from functools import lru_cache
from pprint import pprint

import pandas as pd
from SPARQLWrapper import SPARQLWrapper, JSON

# Default SPARQL endpoint; run_query() uses it unless another endpoint is passed.
endpoint_url = "https://dbpedia.org/sparql"
# Prefix declarations that run_query() prepends to every query it sends,
# so the queries in this notebook can use these prefixes without declaring them.
prefixes = """ 
PREFIX schema: <http://schema.org/> 

PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
PREFIX owl: <http://www.w3.org/2002/07/owl#> 
PREFIX dc: <http://purl.org/dc/elements/1.1/> 

PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>

PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dbc: <http://dbpedia.org/resource/Category:>
""" # found later in the process


# Define a function to query and return a DataFrame
def run_query(query, endpoint=endpoint_url):
    """Send a SPARQL query to ``endpoint`` and return the decoded result.

    The module-level ``prefixes`` block is prepended to ``query`` and the
    response is requested as JSON.

    Returns:
        The value stored under ``boolean`` when the response carries that key;
        otherwise a DataFrame with one column per variable listed in
        ``head.vars`` and one row per binding. Variables missing from a
        binding become ``None``.

    Raises:
        ValueError: if the response has neither a ``boolean`` key nor both
            ``head`` and ``results``.
    """
    client = SPARQLWrapper(endpoint)
    client.setQuery(prefixes + query)
    client.setReturnFormat(JSON)
    response = client.query().convert()

    # Boolean-style responses carry a single truth value.
    if 'boolean' in response:  # type: ignore
        return response['boolean']  # type: ignore

    # Anything that is not a head/results document is unsupported.
    if 'head' not in response or 'results' not in response:  # type: ignore
        raise ValueError("Unknown SPARQL response format")

    variables = response["head"]["vars"]  # type: ignore
    rows = [
        [binding.get(name, {}).get("value", None) for name in variables]
        for binding in response["results"]["bindings"]  # type: ignore
    ]
    return pd.DataFrame(rows, columns=variables)  # type: ignore


# https://dbpedia.org/page/Italy


In [None]:
# 1. Discover which predicates are attached to the Italy resource.
#    The query returns every distinct predicate ?p that occurs in a triple
#    with dbr:Italy as its subject.
all_query_predicates = '''
SELECT DISTINCT ?p
WHERE { 
    dbr:Italy ?p ?o .
}
'''

# run_query() prepends the shared prefix block, which is why dbr: needs no declaration here.
predicates_df = run_query(all_query_predicates)
predicates_df

Unnamed: 0,p
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1,http://www.w3.org/2000/01/rdf-schema#label
2,http://www.w3.org/2000/01/rdf-schema#comment
3,http://www.w3.org/2000/01/rdf-schema#seeAlso
4,http://xmlns.com/foaf/0.1/name
...,...
120,http://dbpedia.org/ontology/governmentType
121,http://dbpedia.org/ontology/language
122,http://dbpedia.org/ontology/timeZone
123,http://xmlns.com/foaf/0.1/isPrimaryTopicOf


In [None]:
# 2. Check whether specific predicates exist on dbr:Italy.
#    The VALUES clause restricts ?p to rdf:type and rdfs:label, so the result
#    lists only those of the two that dbr:Italy actually has.
query_predicates = '''
SELECT DISTINCT ?p
WHERE {
  dbr:Italy ?p ?o .
  VALUES ?p {
    <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
    <http://www.w3.org/2000/01/rdf-schema#label> 
  }
}
'''

# NOTE: this rebinds predicates_df, replacing the result of the earlier predicate listing.
predicates_df = run_query(query_predicates)
predicates_df

['p']


Unnamed: 0,p
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1,http://www.w3.org/2000/01/rdf-schema#label


In [None]:
# 3. Check which predicates on dbr:Italy come from selected namespaces.
#    The FILTER keeps a predicate when its IRI string starts with one of the
#    owl, dc (elements 1.1), dbo or dbr namespace IRIs.
query_predicates = """
SELECT DISTINCT ?p
WHERE {
  dbr:Italy ?p ?o .
  FILTER (
    STRSTARTS(STR(?p), "http://www.w3.org/2002/07/owl#") ||
    STRSTARTS(STR(?p), "http://purl.org/dc/elements/1.1/") ||
    STRSTARTS(STR(?p), "http://dbpedia.org/ontology/") ||
    STRSTARTS(STR(?p), "http://dbpedia.org/resource/")
  )
}
"""

# NOTE: query_predicates and predicates_df are rebound again here.
predicates_df = run_query(query_predicates)
predicates_df

Unnamed: 0,p
0,http://dbpedia.org/ontology/wikiPageID
1,http://dbpedia.org/ontology/wikiPageRevisionID
2,http://dbpedia.org/ontology/wikiPageWikiLink
3,http://dbpedia.org/ontology/wikiPageExternalLink
4,http://www.w3.org/2002/07/owl#sameAs
5,http://dbpedia.org/ontology/thumbnail
6,http://dbpedia.org/ontology/abstract
7,http://dbpedia.org/ontology/anthem
8,http://dbpedia.org/ontology/currency
9,http://dbpedia.org/ontology/topLevelDomain


In [None]:
# 4. Explore the regional data DBpedia currently holds for Italy.
#    Finding: the data is incomplete, so Wikidata needs to be used to reconcile it.
#
#    A resource is selected when it is in the category dbc:Regions_of_Italy,
#    is typed dbo:Region and has dbo:country dbr:Italy; its Italian-language
#    rdfs:label values are returned, ordered by label.
# NOTE(review): the recorded output shows several Italian labels per resource,
# some of which do not look like region names - confirm before relying on ?name.

regions_of_italy_query = """
SELECT DISTINCT ?region ?name WHERE {
  ?region dct:subject dbc:Regions_of_Italy .
  ?region rdf:type dbo:Region .
  ?region dbo:country dbr:Italy .
  ?region rdfs:label ?name .
  FILTER (lang(?name) = 'it')
}
ORDER BY ?name
"""


regions_of_italy = run_query(regions_of_italy_query)
regions_of_italy

['region', 'name']


Unnamed: 0,region,name
0,http://dbpedia.org/resource/Friuli_Venezia_Giulia,Aeroporto di Trieste-Ronchi dei Legionari
1,http://dbpedia.org/resource/Aosta_Valley,Aosta
2,http://dbpedia.org/resource/Basilicata,Basilicata
3,http://dbpedia.org/resource/Calabria,Calabria
4,http://dbpedia.org/resource/Campania,Campania
5,http://dbpedia.org/resource/Calabria,Consiglio regionale della Calabria
6,http://dbpedia.org/resource/Friuli_Venezia_Giulia,Consiglio regionale del Friuli-Venezia Giulia
7,http://dbpedia.org/resource/Emilia-Romagna,Emilia
8,http://dbpedia.org/resource/Emilia-Romagna,Emilia-Romagna
9,http://dbpedia.org/resource/Friuli_Venezia_Giulia,Friuli


In [None]:
# 5.
# ?? the labels in the dataset require conversion from Italian to English
# 

@lru_cache(maxsize=1)
def _load_italian_region_labels():
    """Fetch the English/Italian label pairs of the Italian regions from Wikidata.

    The result is memoised, so translating many region names costs one request
    to the Wikidata endpoint instead of one request per name. The returned
    DataFrame is shared between calls and must be treated as read-only.

    Returns:
        DataFrame with the columns ``region``, ``english_region_name`` and
        ``italian_region_name``.
    """
    # wd:Q16110 and wd:Q1710033 are the two Wikidata classes used to select regions.
    # The wd:/wdt:/wikibase:/bd: prefixes are not declared anywhere in this
    # notebook; the query relies on the Wikidata endpoint predefining them.
    basic_query = """
    SELECT ?region ?english_region_name ?italian_region_name
    WHERE {
      ?region wdt:P31 ?region_type.
      VALUES ?region_type { wd:Q16110 wd:Q1710033 }

      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "en" .
        ?region rdfs:label ?english_region_name .
      }
      SERVICE wikibase:label {
        bd:serviceParam wikibase:language "it" .
        ?region rdfs:label ?italian_region_name .
      }
    }
    ORDER BY ?english_region_name
    """
    return run_query(basic_query, endpoint="https://query.wikidata.org/sparql")


def convert_italian_to_english_label(italian_region_name) -> str:
    """Translate an Italian region name into its English Wikidata label.

    Matching ignores letter case and surrounding whitespace, so spellings such
    as "Trentino-Alto adige" still resolve to the "Trentino-Alto Adige" entry.

    Args:
        italian_region_name: Italian name of the region to look up.

    Returns:
        The English label of the first matching region, or the message
        ``"Region '<name>' not found"`` when no region matches. The message is
        returned rather than raised to stay compatible with existing callers.
    """
    italian_regions = _load_italian_region_labels()

    wanted = str(italian_region_name).strip().casefold()
    # Missing labels become NaN after casefold() and therefore never compare equal.
    known_labels = italian_regions['italian_region_name'].str.casefold()  # type: ignore
    match = italian_regions[known_labels == wanted]  # type: ignore

    if not match.empty:  # type: ignore
        return match.iloc[0]['english_region_name']  # type: ignore
    return f"Region '{italian_region_name}' not found"

In [None]:
# 6.
# ?? collecting the regions named in the conversation metadata and turning them into dbr: resources for the DBpedia queries
# 

from urllib.parse import quote

def get_region_pages(metadata_path="kiparla-data/metadata/KIPasti_conversations.xlsx"):
    """Build the DBpedia resource terms for the regions named in the metadata.

    Reads the ``region`` column of the metadata workbook, normalises the
    capitalisation of each Italian name, translates it to English with
    ``convert_italian_to_english_label`` and turns the result into a ``dbr:``
    prefixed name.

    Args:
        metadata_path: Excel workbook containing a ``region`` column. Defaults
            to the KIPasti conversation metadata.

    Returns:
        A single string of space-separated ``dbr:`` terms, ordered by the
        Italian region name, ready to be placed inside a SPARQL ``VALUES``
        block. Blank cells and regions without an English label are left out;
        the latter trigger a warning.
    """
    metadata_df = pd.read_excel(metadata_path, keep_default_na=False)

    italian_names = set()
    for raw_name in metadata_df["region"]:
        # Capitalise every hyphen-separated part: "emilia-romagna" -> "Emilia-Romagna".
        name = '-'.join(part.capitalize() for part in raw_name.strip().split("-"))
        # keep_default_na=False turns empty cells into "", which is not a region.
        if name:
            italian_names.add(name)

    region_page_resources = []
    for italian_name in sorted(italian_names):
        english_name = convert_italian_to_english_label(italian_name)

        # The converter reports a failed lookup with a message instead of an
        # exception; embedding that message would make the query malformed.
        if english_name.startswith("Region '") and english_name.endswith("' not found"):
            warnings.warn(f"Skipping region without an English label: {italian_name!r}")
            continue

        # DBpedia resource names use underscores where the label has spaces,
        # and a bare space would terminate the prefixed name inside VALUES.
        region_page_resources.append("dbr:" + english_name.replace(" ", "_"))

    return ' '.join(region_page_resources)


In [177]:
# Space-separated dbr: terms for the regions found in the conversation metadata.
region_page_resources_str = get_region_pages()

# Fetch the Italian-language dbo:abstract of each of those regions.
# The doubled braces are literal { } in the f-string; only
# {region_page_resources_str} is interpolated, into the VALUES list.
# NOTE(review): the recorded output shows more than one abstract for some
# regions - confirm whether a single abstract per region is needed downstream.
abstract_query = f"""
SELECT ?region ?abstract 
WHERE {{
  VALUES ?region {{
    {region_page_resources_str}
  }}
  ?region dbo:abstract ?abstract .
  FILTER (lang(?abstract) = "it")
}}
ORDER BY ?region
"""

region_abstracts = run_query(abstract_query)
region_abstracts

Unnamed: 0,region,abstract
0,http://dbpedia.org/resource/Abruzzo,L'Abruzzo (o gli Abruzzi) (AFI: /aˈbruʦʦo/) è ...
1,http://dbpedia.org/resource/Apulia,"La Puglia (AFI: /ˈpuʎʎa/; Apulia in latino, Ἰα..."
2,http://dbpedia.org/resource/Basilicata,La Basilicata (AFI: /baziliˈkata/) è una regio...
3,http://dbpedia.org/resource/Calabria,La Calabria (AFI: /kaˈlabrja/; Calabria in cal...
4,http://dbpedia.org/resource/Calabria,Il Consiglio regionale della Calabria è l'orga...
5,http://dbpedia.org/resource/Campania,La Campania (AFI: /kamˈpanja/) è una regione i...
6,http://dbpedia.org/resource/Emilia-Romagna,"L'Emilia (Emeja, Emélia, Emégglia o Emilia in ..."
7,http://dbpedia.org/resource/Emilia-Romagna,L'Emilia-Romagna (IPA: /eˈmilja roˈmaɲɲa/; Emi...
8,http://dbpedia.org/resource/Lazio,"Il Latium (Lătĭŭm, in italiano Lazio) fu una r..."
9,http://dbpedia.org/resource/Lazio,Il Lazio è una regione a statuto ordinario del...
