In [None]:
from pprint import pprint

from SPARQLWrapper import SPARQLWrapper, JSON
import pandas as pd

from pathlib import Path

METADATA_PATH = Path.cwd().parent / "kiparla-data" / "metadata" / "KIPasti_conversations.xlsx"

endpoint_url = "https://dbpedia.org/sparql"
wikidata_endpoint = "https://query.wikidata.org/sparql"

prefixes = """ 
PREFIX schema: <http://schema.org/> 

PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> 
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#> 
PREFIX owl: <http://www.w3.org/2002/07/owl#> 
PREFIX dc: <http://purl.org/dc/elements/1.1/> 

PREFIX dbo: <http://dbpedia.org/ontology/>
PREFIX dbr: <http://dbpedia.org/resource/>

PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dbc: <http://dbpedia.org/resource/Category:>
""" # found later in the process


# Define a function to query and return a DataFrame
def run_query(query, endpoint=endpoint_url):
    sparql = SPARQLWrapper(endpoint)
    sparql.setQuery(prefixes + query)
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    if "boolean" in results:  # type: ignore
        return results["boolean"]  # type: ignore

    elif "head" in results and "results" in results:  # type: ignore
        cols = results["head"]["vars"] # type: ignore
        data = []
        for result in results["results"]["bindings"]: # type: ignore
            row = [result.get(col, {}).get("value", None) for col in cols] # type: ignore
            data.append(row)

        return pd.DataFrame(data, columns=cols) # type: ignore
    
    else:
        raise ValueError("Unknown SPARQL response format")


# https://dbpedia.org/page/Italy


In [2]:
# 1.
# ?? finding out the predicates on the Italy page
# 
all_query_predicates = """
SELECT DISTINCT ?p
WHERE { 
    dbr:Italy ?p ?o .
}
"""

predicates_df = run_query(all_query_predicates)
predicates_df

Unnamed: 0,p
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1,http://www.w3.org/2000/01/rdf-schema#label
2,http://www.w3.org/2000/01/rdf-schema#comment
3,http://www.w3.org/2000/01/rdf-schema#seeAlso
4,http://xmlns.com/foaf/0.1/name
...,...
120,http://dbpedia.org/ontology/governmentType
121,http://dbpedia.org/ontology/language
122,http://dbpedia.org/ontology/timeZone
123,http://xmlns.com/foaf/0.1/isPrimaryTopicOf


In [3]:
# 2.
# ?? checking for the existence of specific predicates
# 
query_predicates = """
SELECT DISTINCT ?p
WHERE {
  dbr:Italy ?p ?o .
  VALUES ?p {
    <http://www.w3.org/1999/02/22-rdf-syntax-ns#type> 
    <http://www.w3.org/2000/01/rdf-schema#label> 
  }
}
"""

predicates_df = run_query(query_predicates)
predicates_df

Unnamed: 0,p
0,http://www.w3.org/1999/02/22-rdf-syntax-ns#type
1,http://www.w3.org/2000/01/rdf-schema#label


In [4]:
# 3.
# ?? checking for the existence of important predicates
# 
query_predicates = """
SELECT DISTINCT ?p
WHERE {
  dbr:Italy ?p ?o .
  FILTER (
    STRSTARTS(STR(?p), "http://www.w3.org/2002/07/owl#") ||
    STRSTARTS(STR(?p), "http://purl.org/dc/elements/1.1/") ||
    STRSTARTS(STR(?p), "http://dbpedia.org/ontology/") ||
    STRSTARTS(STR(?p), "http://dbpedia.org/resource/")
  )
}
"""

predicates_df = run_query(query_predicates)
predicates_df

Unnamed: 0,p
0,http://dbpedia.org/ontology/wikiPageID
1,http://dbpedia.org/ontology/wikiPageRevisionID
2,http://dbpedia.org/ontology/wikiPageWikiLink
3,http://dbpedia.org/ontology/wikiPageExternalLink
4,http://www.w3.org/2002/07/owl#sameAs
5,http://dbpedia.org/ontology/thumbnail
6,http://dbpedia.org/ontology/abstract
7,http://dbpedia.org/ontology/anthem
8,http://dbpedia.org/ontology/currency
9,http://dbpedia.org/ontology/topLevelDomain


In [5]:
# 4.
# ?? exploring the current regional data –> stuff is missing, Wikidata needs to be used to reconcile this
# 

regions_of_italy_query = """
SELECT DISTINCT ?region ?name WHERE {
  ?region dct:subject dbc:Regions_of_Italy .
  ?region rdf:type dbo:Region .
  ?region dbo:country dbr:Italy .
  ?region rdfs:label ?name .
  FILTER (lang(?name) = "it")
}
ORDER BY ?name
"""


regions_of_italy = run_query(regions_of_italy_query)
regions_of_italy

Unnamed: 0,region,name
0,http://dbpedia.org/resource/Friuli_Venezia_Giulia,Aeroporto di Trieste-Ronchi dei Legionari
1,http://dbpedia.org/resource/Aosta_Valley,Aosta
2,http://dbpedia.org/resource/Basilicata,Basilicata
3,http://dbpedia.org/resource/Calabria,Calabria
4,http://dbpedia.org/resource/Campania,Campania
5,http://dbpedia.org/resource/Calabria,Consiglio regionale della Calabria
6,http://dbpedia.org/resource/Friuli_Venezia_Giulia,Consiglio regionale del Friuli-Venezia Giulia
7,http://dbpedia.org/resource/Emilia-Romagna,Emilia
8,http://dbpedia.org/resource/Emilia-Romagna,Emilia-Romagna
9,http://dbpedia.org/resource/Friuli_Venezia_Giulia,Friuli


In [6]:
# 5.
# ?? the labels in the dataset require conversion from Italian to English
# 

def convert_italian_to_english_label(italian_region_name) -> str:
  basic_query = """
  SELECT ?region ?english_region_name ?italian_region_name 
  WHERE {
    ?region wdt:P31 ?region_type.
    VALUES ?region_type { wd:Q16110 wd:Q1710033 }       

    SERVICE wikibase:label {
      bd:serviceParam wikibase:language "en" .
      ?region rdfs:label ?english_region_name .
    }
    SERVICE wikibase:label {
      bd:serviceParam wikibase:language "it" .
      ?region rdfs:label ?italian_region_name .
    }
  }
  ORDER BY ?english_region_name
  """


  italian_regions = run_query(basic_query, endpoint=wikidata_endpoint)
  match = italian_regions[italian_regions["italian_region_name"] == italian_region_name] # type: ignore
  if not match.empty: # type: ignore
      return match.iloc[0]["english_region_name"] # type: ignore
  else:
      return f"Region '{italian_region_name}' not found" # type: ignore

In [7]:
# 6.
# ?? exploring the current regional data –> stuff is missing, Wikidata needs to be used to reconcile this
# 

from urllib.parse import quote

def get_regions(convert_to_english=True):
    metadata_df = pd.read_excel(METADATA_PATH, keep_default_na=False)
    # dbpedia_base_url = "https://dbpedia.org/page/"

    regions = set()
    # region_links = 

    for italian_region_name in metadata_df["region"]:
        italian_region_name = italian_region_name.strip().capitalize()
        if "-" in italian_region_name:
            italian_region_name = "-".join(region_name_part.capitalize() for region_name_part in italian_region_name.split("-"))
        regions.add(italian_region_name)

    regions = sorted(regions)
    if convert_to_english:
        regions = [convert_italian_to_english_label(region) for region in regions]
    return regions    

def get_region_pages():
    regions = get_regions()
    region_page_resources = ["dbr:" + region_name for region_name in regions]
    region_page_resources_str = " ".join(region_page_resources)
    
    return region_page_resources_str


In [None]:
# 
# ?? 7. preparing data for JSON push
#

valid_regions = get_regions(False)
region_page_resources_str = get_region_pages()

population_query = f"""
SELECT ?region_name ?url ?population 
WHERE {{
  VALUES ?url {{
    {region_page_resources_str}
  }}
  ?url dbo:populationTotal ?population .
  ?url rdfs:label ?region_name .
  FILTER(lang(?region_name) = "it")
}}
ORDER BY ?url
"""
print(population_query)

print(valid_regions)
region_data = run_query(population_query)
region_data = region_data[region_data["region_name"].isin(valid_regions)].reset_index(drop=True) # type: ignore
region_data


SELECT ?region_name ?url ?population 
WHERE {
  VALUES ?url {
    dbr:Abruzzo dbr:Basilicata dbr:Calabria dbr:Campania dbr:Emilia-Romagna dbr:Lazio dbr:Lombardy dbr:Marche dbr:Apulia dbr:Sardinia dbr:Tuscany dbr:Umbria dbr:Veneto
  }
  ?url dbo:populationTotal ?population .
  ?url rdfs:label ?region_name .
  FILTER(lang(?region_name) = "it")
}
ORDER BY ?url

['Abruzzo', 'Basilicata', 'Calabria', 'Campania', 'Emilia-Romagna', 'Lazio', 'Lombardia', 'Marche', 'Puglia', 'Sardegna', 'Toscana', 'Umbria', 'Veneto']


Unnamed: 0,region_name,url,population
0,Abruzzo,http://dbpedia.org/resource/Abruzzo,1305770
1,Puglia,http://dbpedia.org/resource/Apulia,4063888
2,Basilicata,http://dbpedia.org/resource/Basilicata,575902
3,Calabria,http://dbpedia.org/resource/Calabria,1877527
4,Campania,http://dbpedia.org/resource/Campania,5869029
5,Emilia-Romagna,http://dbpedia.org/resource/Emilia-Romagna,4446220
6,Lazio,http://dbpedia.org/resource/Lazio,5864321
7,Lombardia,http://dbpedia.org/resource/Lombardy,10103969
8,Marche,http://dbpedia.org/resource/Marche,1541692
9,Sardegna,http://dbpedia.org/resource/Sardinia,1628384


In [17]:
#
# now for some conversations querying...
#

conversation_themes_query = """
SELECT ?conv ?convLabel WHERE {
  ?conv rdfs:label "conversation"@en.
  SERVICE wikibase:label { bd:serviceParam wikibase:language "en". }
}
"""

conversation_themes = run_query(conversation_themes_query, wikidata_endpoint)
conversation_themes

Unnamed: 0,conv,convLabel
0,http://www.wikidata.org/entity/Q52943,conversation


In [None]:
#
# running the query, then removing irrelevant predicates
#

conversation_predicates_query = """
SELECT (SAMPLE(?itemLabel) AS ?exampleLabel) ?predicate
WHERE {
  ?item ?predicate wd:Q52943;
        rdfs:label ?itemLabel.
  FILTER(LANG(?itemLabel) = "en")
}
GROUP BY ?predicate
ORDER BY ?predicate
"""

# FILTER(?predicate IN (wdt:P101, wdt:P921, wdt:P136))

conversation_predicates = run_query(conversation_predicates_query, wikidata_endpoint)
predicate_list = conversation_predicates["predicate"].str.extract(r"/P(\d+)$")[0].apply(lambda x: f"wdt:P{x}") # type: ignore
conversation_predicates["predicate_id"] = predicate_list # type: ignore
conversation_predicates

Unnamed: 0,exampleLabel,predicate,predicate_id
0,Nikolaus P. Himmelmann,http://www.wikidata.org/prop/direct/P101,wdt:P101
1,A Conversation,http://www.wikidata.org/prop/direct/P138,wdt:P138
2,View of the Garden of the Villa Medici,http://www.wikidata.org/prop/direct/P180,wdt:P180
3,English conversation,http://www.wikidata.org/prop/direct/P31,wdt:P31
4,oral communication,http://www.wikidata.org/prop/direct/P1382,wdt:P1382
5,turn-taking,http://www.wikidata.org/prop/direct/P1542,wdt:P1542
6,emotion recognition in conversation,http://www.wikidata.org/prop/direct/P2283,wdt:P2283
7,interview,http://www.wikidata.org/prop/direct/P279,wdt:P279
8,Category:Conversation,http://www.wikidata.org/prop/direct/P301,wdt:P301
9,awkward silence,http://www.wikidata.org/prop/direct/P361,wdt:P361


In [11]:
predicate_string = " ".join(predicate_list)
predicate_string

'wdt:P101 wdt:P138 wdt:P180 wdt:P31 wdt:P1382 wdt:P1542 wdt:P2283 wdt:P279 wdt:P301 wdt:P361 wdt:P366 wdt:P460 wdt:P7937 wdt:P921 wdt:P941'

In [None]:
#
# ?? getting results, then filtering out undesired ones
#

predicate_ids_query = f"""
SELECT DISTINCT ?item ?predicate ?itemLabel 
WHERE {{
  ?item ?predicate wd:Q52943 .
  VALUES ?predicate {{ {predicate_string} }}
  ?item rdfs:label ?itemLabel.
  FILTER(LANG(?itemLabel) = "en")
  FILTER(?predicate NOT IN (wdt:P101, wdt:P301, wdt:P366, wdt:P7937, wdt:P921, wdt:P941))
  FILTER(?predicate NOT IN (wdt:P180, wdt:P31))
  FILTER(?predicate NOT IN (wdt:P138, wdt:P1542))
}}
ORDER BY ?itemLabel
"""

# ! wdt:P101 : remove
# & wdt:P138 : a conversation (singular)
# ?? wdt:P180 : descriptions???
# ?? wdt:P31 : literary (ish)
# & wdt:P1382 : oral communication, (singular)
# & wdt:P1542 : turn taking (singular)
# * wdt:P2283 : excellent
# * wdt:P279 : pretty, good, about conversation genres
# ! wdt:P301 : vague, means nothing
# * wdt:P361 : solid
# ! wdt:P366 : remove
# & wdt:P460 : vague at best, classification
# ! wdt:P7937 : remove
# ! wdt:P921 : a bit problematic
# ! wdt:P941 remove


conversation_types = run_query(predicate_ids_query, wikidata_endpoint)
conversation_types["predicate_id"] = conversation_types["predicate"].str.extract(r"/P(\d+)$")[0].apply(lambda x: f"wdt:P{x}") # type: ignore
conversation_types


Unnamed: 0,item,predicate,itemLabel,predicate_id
0,http://www.wikidata.org/entity/Q4680988,http://www.wikidata.org/prop/direct/P279,Adda,wdt:P279
1,http://www.wikidata.org/entity/Q7047953,http://www.wikidata.org/prop/direct/P279,Intake interview,wdt:P279
2,http://www.wikidata.org/entity/Q4830098,http://www.wikidata.org/prop/direct/P361,awkward silence,wdt:P361
3,http://www.wikidata.org/entity/Q2892080,http://www.wikidata.org/prop/direct/P279,chatter,wdt:P279
4,http://www.wikidata.org/entity/Q124391291,http://www.wikidata.org/prop/direct/P361,comfortable silence,wdt:P361
5,http://www.wikidata.org/entity/Q5154139,http://www.wikidata.org/prop/direct/P279,communication in small groups,wdt:P279
6,http://www.wikidata.org/entity/Q939654,http://www.wikidata.org/prop/direct/P279,conversation piece,wdt:P279
7,http://www.wikidata.org/entity/Q5176265,http://www.wikidata.org/prop/direct/P279,council circle,wdt:P279
8,http://www.wikidata.org/entity/Q131395,http://www.wikidata.org/prop/direct/P460,dialogue,wdt:P460
9,http://www.wikidata.org/entity/Q2286435,http://www.wikidata.org/prop/direct/P279,difficult conversation,wdt:P279


In [None]:
filtered_conversation_types = sorted(map(str.title, conversation_types["itemLabel"].dropna()))  # type: ignore
unwanted_conversation_types = frozenset((
    "Adda", "Council Circle", "Interview", "In-Depth Interview", "Intake Interview", "Job Interview", "Oral Communication"
    "Radio Voice Communication", "Religious Debates Over The Harry Potter Series",  "Sacra Conversazione",  "Whispering In Islam",  "Women Gossip"
)) # gathered from the previous query
filtered_conversation_types = [conversation_type for conversation_type in filtered_conversation_types if conversation_type not in unwanted_conversation_types]
filtered_conversation_types

['Awkward Silence',
 'Chatter',
 'Comfortable Silence',
 'Communication In Small Groups',
 'Conversation Piece',
 'Dialogue',
 'Difficult Conversation',
 'Discussion',
 'Doctor-Patient Conversation',
 'Emotion Recognition In Conversation',
 'Gossip',
 'Intervention',
 'Locker Room Talk',
 'Non-Convergent Discourse',
 'Oral Communication',
 'Phone Conversation',
 'Radio Voice Communication',
 'Relationship Effort']

In [None]:
#
# push the data to a JSON file for some data reconciliation
#

import json

data = {
    "conversation_types": filtered_conversation_types,
    "region_data": {
        row["region_name"]: {
            "url": row["url"],
            "population": row["population"]
        }
        for _, row in region_data.iterrows() # type: ignore
    }
}

output_path = Path.cwd().parent / "data" / "sparql_data.json"

with open(output_path, "w", encoding="utf-8") as f:
    json.dump(data, f, indent=4, ensure_ascii=False)

print(f"Saved JSON data to {output_path}")


Saved JSON data to /Users/niconapoli/Library/CloudStorage/OneDrive-TheUniversityofSydney(Students)/University/B Advanced Studies (Digital Music)/Year 3/Semester 1/95781/95781 Project/data/sparql_data.json
