In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON, RDFXML, XML
from rdflib import Graph
import pprint, time, sys

In [2]:
# Fetch every itsrdf:taIdentRef link (mention -> linked entity) from the local
# SPARQL endpoint. The CONSTRUCT result is requested as XML and converted;
# later cells iterate `results` as triples.
local_endpoint = "http://localhost:3030/w3c-email-q21/sparql"
mention_link_query = """
    PREFIX base: <http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/>
    PREFIX schema: <https://schema.org/>
    PREFIX email: <http://www.w3.org/2000/10/swap/pim/email#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX itsrdf: <https://www.w3.org/2005/11/its/rdf#>
    PREFIX olia: <http://purl.org/olia/olia.owl#>
    PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
    PREFIX nerd: <http://nerd.eurecom.fr/ontology#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    
    CONSTRUCT{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    WHERE{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    """

sparql = SPARQLWrapper(local_endpoint)
sparql.setQuery(mention_link_query)
sparql.setReturnFormat(XML)
response = sparql.query()
results = response.convert()

In [3]:
# pprint.pprint(results.serialize(format='turtle').decode())

In [4]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Namespace
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
# Dataset base IRI and the SWAP e-mail vocabulary.
BASE = Namespace("http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/")
EMAIL = Namespace("http://www.w3.org/2000/10/swap/pim/email#")

# Text-annotation vocabularies: NIF, ITS RDF, OLiA and NERD.
NIF = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
ITSRDF = Namespace("https://www.w3.org/2005/11/its/rdf#")
OLIA = Namespace("http://purl.org/olia/olia.owl#")
NERD = Namespace("http://nerd.eurecom.fr/ontology#")

# Wikidata entity IRIs and direct-property IRIs.
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")

In [5]:
# RDF graph that accumulates everything this notebook collects.
g = Graph()

# Prefix -> namespace pairs registered on the graph, in binding order.
prefix_bindings = [
    ('', BASE),
    ('schema', SDO),
    ('foaf', FOAF),
    ('owl', OWL),
    ('rdf', RDF),
    ('rdfs', RDFS),
    ('olia', OLIA),
    ('nerd', NERD),
    ('nif', NIF),
    ('itsrdf', ITSRDF),
    ('wd', WD),
    ('wdt', WDT),
]
for prefix, namespace in prefix_bindings:
    g.bind(prefix, namespace)

In [6]:
# Copy every triple returned by the local endpoint into the working graph.
g += results

In [7]:
# Distinct objects of the fetched triples, as plain strings.
# Sorted (rather than left in set order) so the list order -- and therefore the
# progress indices printed by the fetch loop -- is identical on every kernel
# run; set iteration order for strings varies with hash randomisation.
wikidataEntity = sorted(
    {str(obj) for _subj, _pred, obj in results.triples((None, None, None))}
)

In [8]:
# Number of distinct entity strings held in wikidataEntity.
len(wikidataEntity)

496

In [9]:
def get_results(endpoint_url, query):
    """Send ``query`` to the SPARQL endpoint at ``endpoint_url``.

    The request carries a ``WDQS-example Python/<major>.<minor>`` user agent
    built from the running interpreter's version, and asks for the XML return
    format.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text.

    Returns:
        Whatever ``SPARQLWrapper``'s ``query().convert()`` produces for the
        response.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    major, minor = sys.version_info[0], sys.version_info[1]
    user_agent = "WDQS-example Python/%s.%s" % (major, minor)

    client = SPARQLWrapper(endpoint_url, agent=user_agent)
    client.setQuery(query)
    client.setReturnFormat(XML)
    response = client.query()
    return response.convert()

In [10]:
endpoint_url = "https://query.wikidata.org/sparql"

start = time.time()

# One request per linked entity: fetch its wdt:P31 / wdt:P279 values, its
# English label, and the English labels of those classes, then merge the
# returned triples into g. Elapsed seconds are printed after each entity.
for i, entity in enumerate(wikidataEntity):
    query = 'CONSTRUCT{<'+ entity +'> wdt:P31 ?class1;\
                                wdt:P279 ?class2;\
                                rdfs:label ?label.\
                        ?class1 rdfs:label ?classlabel1.\
                        ?class2 rdfs:label ?classlabel2.\
                }WHERE{\
                    OPTIONAL{<'+ entity +'> wdt:P31 ?class1.\
                            ?class1 rdfs:label ?classlabel1.\
                            FILTER(lang(?classlabel1)=\'en\')}\
                    OPTIONAL{<'+ entity +'> wdt:P279 ?class2.\
                            ?class2 rdfs:label ?classlabel2.\
                            FILTER(lang(?classlabel2)=\'en\')}\
                    OPTIONAL{<'+ entity +'> rdfs:label ?label.\
                            FILTER(lang(?label)=\'en\')}\
                }'
    # Deliberately not named `results`: that name holds the local-endpoint
    # response, and overwriting it here would make the earlier cells that read
    # it behave differently when re-run after this loop.
    entity_graph = get_results(endpoint_url, query)
    g += entity_graph
    print('{}:{}'.format(i,time.time()-start))
print(time.time()-start)

0:1.4907872676849365
1:2.821275234222412
2:4.053391218185425
3:5.17685604095459
4:6.508671998977661
5:7.945224285125732
6:9.071608066558838
7:9.788869142532349
8:10.506209135055542
9:11.627917051315308
10:12.259522199630737
11:13.266674280166626
12:13.964064121246338
13:15.059642314910889
14:16.139586210250854
15:17.464845180511475
16:18.28861427307129
17:19.41105008125305
18:20.41993498802185
19:21.317974090576172
20:22.32571315765381
21:23.530778169631958
22:24.43016028404236
23:25.55823016166687
24:26.681270122528076
25:27.39827609062195
26:28.833224296569824
27:29.923676252365112
28:30.984275341033936
29:31.83669924736023
30:32.92683815956116
31:34.15941309928894
32:34.859806299209595
33:35.900888204574585
34:37.108774185180664
35:38.12670016288757
36:38.869773149490356
37:40.07732033729553
38:41.094632387161255
39:41.739259243011475
40:43.04229211807251
41:43.67944121360779
42:44.80876111984253
43:45.93271613121033
44:46.647956132888794
45:47.77683210372925
46:48.90658926963806
47

In [11]:
# Derive itsrdf:taClassRef for each mention from the classes of its linked
# entity. ITSRDF and WDT come from the namespace cell near the top.
#
# The two class patterns are combined with UNION. Writing them as two
# consecutive OPTIONALs sharing ?class would let the first one (wdt:P31) bind
# ?class, after which the wdt:P279 OPTIONAL could only match that same value,
# so P279 classes would be dropped for any entity that also has a P31.
class_ref_query = """
    CONSTRUCT {
        ?spot itsrdf:taClassRef ?class .
    }
    WHERE {
        ?spot itsrdf:taIdentRef ?entity .
        { ?entity wdt:P31 ?class . }
        UNION
        { ?entity wdt:P279 ?class . }
    }
"""
qres = g.query(class_ref_query, initNs={'itsrdf': ITSRDF, 'wdt': WDT})

In [12]:
# Merge the constructed taClassRef triples back into the working graph.
g += qres.graph

In [13]:
#print(g.serialize(format="turtle").decode("utf-8"))

In [14]:
# Write the graph to a file in Turtle format.
# NOTE(review): absolute, machine-specific path -- adjust before running elsewhere.
output_path = '/Users/taroaso/myprojects/OpenIE/trec/output/21/class_info.ttl'
g.serialize(destination=output_path, format='turtle')