In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON, RDFXML, XML
from rdflib import Graph
import pprint, time, sys

In [2]:
# CONSTRUCT query that copies every `?mention itsrdf:taIdentRef ?wikidataEntity`
# triple out of the local store unchanged. Only the `itsrdf:` prefix is used by
# the pattern; the remaining PREFIX lines are declared but not referenced.
mention_link_query = """
    PREFIX base: <http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/>
    PREFIX schema: <https://schema.org/>
    PREFIX email: <http://www.w3.org/2000/10/swap/pim/email#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX itsrdf: <https://www.w3.org/2005/11/its/rdf#>
    PREFIX olia: <http://purl.org/olia/olia.owl#>
    PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
    PREFIX nerd: <http://nerd.eurecom.fr/ontology#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    
    CONSTRUCT{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    WHERE{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    """

# Local SPARQL endpoint holding the annotated e-mail data.
sparql = SPARQLWrapper("http://localhost:3030/w3c-email-q12/sparql")
sparql.setReturnFormat(XML)
sparql.setQuery(mention_link_query)

# `results` is iterated as (s, p, o) triples by the following cells.
results = sparql.query().convert()

In [4]:
# pprint.pprint(results.serialize(format='turtle').decode())

In [5]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Namespace
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
# Namespace objects for the vocabularies used in this notebook. Each one is
# bound to a prefix on the output graph in the next cell.

# Bound to the empty prefix on the graph.
BASE = Namespace("http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/")
EMAIL = Namespace("http://www.w3.org/2000/10/swap/pim/email#")
OLIA = Namespace("http://purl.org/olia/olia.owl#")
NERD = Namespace("http://nerd.eurecom.fr/ontology#")
NIF = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
# NOTE(review): this uses the https:// form, which is the same IRI the local
# CONSTRUCT query declares for `itsrdf:`. Confirm it matches the IRIs actually
# stored in the data before changing it.
ITSRDF = Namespace("https://www.w3.org/2005/11/its/rdf#")
# Wikidata entity IRIs (wd:) and direct-property IRIs (wdt:, e.g. P31 / P279).
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")

In [6]:
# RDF graph that collects the mention links plus the class/label triples
# fetched later; prefixes are bound so serialized output uses short names.
g = Graph()
for prefix, namespace in (
    ('', BASE),
    ('schema', SDO),
    ('foaf', FOAF),
    ('owl', OWL),
    ('rdf', RDF),
    ('rdfs', RDFS),
    ('olia', OLIA),
    ('nerd', NERD),
    ('nif', NIF),
    ('itsrdf', ITSRDF),
    ('wd', WD),
    ('wdt', WDT),
):
    g.bind(prefix, namespace)

In [7]:
# Copy every triple returned by the local endpoint into the working graph.
g += results

In [8]:
# Distinct object values of the fetched triples, as strings. These are the
# identifiers that get looked up one by one in the remote-query cell.
# The set removes duplicates; sorting gives a stable order, because iteration
# order of a set of strings changes between kernel restarts (hash
# randomisation), which would make the per-entity progress indices printed
# later impossible to compare across runs.
wikidataEntity = sorted(
    {str(obj) for _, _, obj in results.triples((None, None, None))}
)

In [9]:
# Number of distinct entity identifiers collected; the remote-query cell
# issues one request per entry, so this is also the number of requests.
len(wikidataEntity)

706

In [10]:
def get_results(endpoint_url, query):
    """Send `query` to the SPARQL endpoint at `endpoint_url` and return the result.

    The request carries a ``WDQS-example Python/<major>.<minor>`` user agent
    built from the running interpreter's version and asks for the XML return
    format. The return value is whatever ``SPARQLWrapper``'s ``convert()``
    yields for the response; callers in this notebook iterate it as triples.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    major, minor = sys.version_info[0], sys.version_info[1]
    client = SPARQLWrapper(
        endpoint_url,
        agent="WDQS-example Python/{}.{}".format(major, minor),
    )
    client.setQuery(query)
    client.setReturnFormat(XML)
    response = client.query()
    return response.convert()

In [12]:
endpoint_url = "https://query.wikidata.org/sparql"

# Per-entity CONSTRUCT query. For one entity it retrieves:
#   * its instance-of (P31) classes and their English labels,
#   * its subclass-of (P279) classes and their English labels,
#   * its own English label.
# Each part is OPTIONAL so an entity missing one of them still returns the
# others. The prefixes are declared explicitly so the query does not depend
# on the endpoint predefining `wdt:` and `rdfs:`.
# Literal braces are doubled because the template is filled with str.format.
ENTITY_QUERY_TEMPLATE = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
CONSTRUCT {{
    <{entity}> wdt:P31 ?class1 ;
               wdt:P279 ?class2 ;
               rdfs:label ?label .
    ?class1 rdfs:label ?classlabel1 .
    ?class2 rdfs:label ?classlabel2 .
}}
WHERE {{
    OPTIONAL {{
        <{entity}> wdt:P31 ?class1 .
        ?class1 rdfs:label ?classlabel1 .
        FILTER(lang(?classlabel1) = 'en')
    }}
    OPTIONAL {{
        <{entity}> wdt:P279 ?class2 .
        ?class2 rdfs:label ?classlabel2 .
        FILTER(lang(?classlabel2) = 'en')
    }}
    OPTIONAL {{
        <{entity}> rdfs:label ?label .
        FILTER(lang(?label) = 'en')
    }}
}}
"""

start = time.time()

for i, entity in enumerate(wikidataEntity):
    query = ENTITY_QUERY_TEMPLATE.format(entity=entity)
    # Use a dedicated name here: rebinding `results` would overwrite the
    # local-endpoint response that earlier cells read, so re-running those
    # cells after this one would silently operate on the wrong graph.
    entity_graph = get_results(endpoint_url, query)
    for triple in entity_graph:
        g.add(triple)
    # Progress line: entity index and seconds elapsed since the loop began.
    print('{}:{}'.format(i, time.time() - start))
print(time.time() - start)

0:0.9292888641357422
1:2.137065887451172
2:3.4847588539123535
3:4.460753917694092
4:5.695916652679443
5:6.355265855789185
6:7.770864725112915
7:8.88962984085083
8:10.265784740447998
9:11.269567966461182
10:12.801193952560425
11:13.815919876098633
12:15.160384893417358
13:16.18428683280945
14:17.309123754501343
15:18.335888862609863
16:19.28774380683899
17:20.484899759292603
18:21.4580500125885
19:22.43710470199585
20:23.385684967041016
21:23.950403690338135
22:24.917569875717163
23:25.914663791656494
24:26.938042879104614
25:28.065939903259277
26:28.649755716323853
27:29.70195460319519
28:30.72703981399536
29:31.748941659927368
30:32.775150775909424
31:33.54800772666931
32:34.6176917552948
33:35.641647815704346
34:36.666759729385376
35:37.23846673965454
36:38.304194688797
37:39.02025866508484
38:40.045209646224976
39:41.06789684295654
40:42.09302568435669
41:43.115750789642334
42:44.141374826431274
43:45.16631865501404
44:46.19042468070984
45:46.80302166938782
46:47.82809591293335
47:4

In [13]:
# Same IRI as the ITSRDF namespace declared with the other namespaces;
# re-declared here so the prefix map below is visible in this cell.
ITSRDF = Namespace("https://www.w3.org/2005/11/its/rdf#")

# Derive `itsrdf:taClassRef` triples for each mention from the classes of the
# entity it is linked to.
# NOTE(review): both OPTIONAL clauses bind the same ?class variable. Under
# SPARQL left-join semantics the P279 clause can only contribute when no P31
# value was bound (or when the two values coincide), so an entity that has
# both P31 and P279 gets only its P31 classes. If the union of both was
# intended, this needs `{...} UNION {...}` instead. Confirm the intent.
class_ref_query = (
    'CONSTRUCT{?spot <https://www.w3.org/2005/11/its/rdf#taClassRef> ?class.}'
    'WHERE{?spot itsrdf:taIdentRef ?entity. '
    'OPTIONAL{?entity wdt:P31 ?class.} '
    'OPTIONAL{?entity wdt:P279 ?class.}}'
)
prefix_map = {'itsrdf': ITSRDF, 'wdt': WDT}
qres = g.query(class_ref_query, initNs=prefix_map)

In [14]:
# Merge the derived taClassRef triples back into the working graph.
g += qres.graph

In [15]:
#print(g.serialize(format="turtle").decode("utf-8"))

In [16]:
# Write the graph to a file in Turtle format.
# NOTE(review): the destination is an absolute path on one machine; consider
# moving it to a configurable output directory.
class_info_path = '/Users/taroaso/myprojects/OpenIE/trec/output/12/class_info.ttl'
g.serialize(format='turtle', destination=class_info_path)