In [107]:
from SPARQLWrapper import SPARQLWrapper, JSON, RDFXML, XML
from rdflib import Graph
import pprint, time, sys

In [108]:
# Pull every (mention, itsrdf:taIdentRef, entity) triple out of the local
# w3c-email SPARQL endpoint as an RDF graph.
local_endpoint_url = "http://localhost:3030/w3c-email/sparql"
mention_link_query = """
    PREFIX base: <http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/>
    PREFIX schema: <https://schema.org/>
    PREFIX email: <http://www.w3.org/2000/10/swap/pim/email#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX itsrdf: <https://www.w3.org/2005/11/its/rdf#>
    PREFIX olia: <http://purl.org/olia/olia.owl#>
    PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
    PREFIX nerd: <http://nerd.eurecom.fr/ontology#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    
    CONSTRUCT{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    WHERE{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    """

sparql = SPARQLWrapper(local_endpoint_url)
sparql.setQuery(mention_link_query)
sparql.setReturnFormat(XML)
# The notebook output below shows that convert() yields an rdflib graph here.
results = sparql.query().convert()

In [109]:
# Echo the converted query result so the notebook shows its repr.
results

<Graph identifier=Na6a6c34026c14d84b2abf0bccbe552fe (<class 'rdflib.graph.ConjunctiveGraph'>)>

In [110]:
#pprint.pprint(results.serialize(format='turtle').decode())

In [111]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Namespace
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
# Namespace of the w3c-email dataset itself.
BASE = Namespace("http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/")

# Wikidata: entity IRIs and "direct" property IRIs.
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")

# Vocabularies used by the annotations in the dataset.
ITSRDF = Namespace("https://www.w3.org/2005/11/its/rdf#")
NIF = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
NERD = Namespace("http://nerd.eurecom.fr/ontology#")
OLIA = Namespace("http://purl.org/olia/olia.owl#")
EMAIL = Namespace("http://www.w3.org/2000/10/swap/pim/email#")

In [112]:
# RDF graph that collects the triples gathered in the following cells.
g = Graph()

# Prefix -> namespace table; bound in this order so serialized output uses
# the short prefixes instead of full IRIs.
prefix_bindings = (
    ('', BASE),
    ('schema', SDO),
    ('foaf', FOAF),
    ('owl', OWL),
    ('rdf', RDF),
    ('rdfs', RDFS),
    ('olia', OLIA),
    ('nerd', NERD),
    ('nif', NIF),
    ('itsrdf', ITSRDF),
    ('wd', WD),
    ('wdt', WDT),
)
for prefix, ns in prefix_bindings:
    g.bind(prefix, ns)

In [113]:
# Copy every triple returned by the local endpoint into the working graph.
for mention_triple in results:
    g.add(mention_triple)

In [115]:
# Distinct object values (as plain strings) of all triples in `results`.
# Going through a set removes duplicates; the resulting list order is arbitrary.
wikidataEntity = list(
    {str(obj) for _subj, _pred, obj in results.triples((None, None, None))}
)

In [116]:
# Number of entries in the deduplicated wikidataEntity list.
len(wikidataEntity)

452

In [117]:
def get_results(endpoint_url, query):
    """Run `query` against a SPARQL endpoint and return the converted result.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text.

    Returns:
        Whatever SPARQLWrapper's ``convert()`` produces for the XML return
        format that is requested here.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    agent_string = "WDQS-example Python/%s.%s" % sys.version_info[:2]
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setQuery(query)
    client.setReturnFormat(XML)
    response = client.query()
    return response.convert()

In [118]:
endpoint_url = "https://query.wikidata.org/sparql"

# Per-entity CONSTRUCT query: instance-of (P31), subclass-of (P279) and the
# English rdfs:label of one entity.
#
# The language FILTER is placed INSIDE the label OPTIONAL. At group level a
# FILTER on an unbound ?label evaluates to an error and discards the whole
# solution, so an entity without an English label would lose its P31/P279
# triples as well; it also forced every label language to be joined before
# filtering.
#
# Prefixes are declared explicitly so the query does not depend on the
# endpoint predefining them.
ENTITY_QUERY_TEMPLATE = """
PREFIX wdt: <http://www.wikidata.org/prop/direct/>
PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
CONSTRUCT {
    <%(entity)s> wdt:P31 ?class1 ;
                 wdt:P279 ?class2 ;
                 rdfs:label ?label .
}
WHERE {
    OPTIONAL { <%(entity)s> wdt:P31 ?class1 . }
    OPTIONAL { <%(entity)s> wdt:P279 ?class2 . }
    OPTIONAL {
        <%(entity)s> rdfs:label ?label .
        FILTER(lang(?label) = 'en')
    }
}
"""

start = time.time()

for i, entity in enumerate(wikidataEntity):
    query = ENTITY_QUERY_TEMPLATE % {'entity': entity}
    # Use a dedicated name: rebinding the notebook-level `results` here would
    # clobber the local-endpoint graph that earlier cells read, so re-running
    # them would silently operate on the last Wikidata response instead.
    entity_graph = get_results(endpoint_url, query)
    for triple in entity_graph:
        g.add(triple)
    # Progress: entity index and seconds elapsed since the loop started.
    print('{}:{}'.format(i,time.time()-start))
print(time.time()-start)

0:1.0297889709472656
1:1.6219689846038818
2:2.567530870437622
3:3.122129201889038
4:3.682023763656616
5:4.673566818237305
6:5.223496913909912
7:6.192104816436768
8:7.157339096069336
9:8.197683811187744
10:9.252537965774536
11:9.825793027877808
12:10.867139101028442
13:11.83316707611084
14:12.764744997024536
15:13.75942587852478
16:14.734117031097412
17:15.708958864212036
18:16.264444828033447
19:17.262936115264893
20:18.304502964019775
21:19.273202896118164
22:20.25644302368164
23:21.233436107635498
24:22.16199016571045
25:23.144654035568237
26:24.106396913528442
27:24.65391516685486
28:25.7026948928833
29:26.684817790985107
30:27.684218168258667
31:28.663893938064575
32:29.239213943481445
33:29.837028980255127
34:30.839725017547607
35:31.855145931243896
36:32.831549882888794
37:33.55810594558716
38:34.544076919555664
39:35.49691581726074
40:36.06605005264282
41:37.05419683456421
42:38.01663684844971
43:38.94512891769409
44:39.571924924850464
45:40.20471000671387
46:41.16244292259216
4

In [137]:
ITSRDF = Namespace("https://www.w3.org/2005/11/its/rdf#")
# Derive itsrdf:taClassRef for each linked mention from the classes of the
# entity it refers to (wdt:P31 or wdt:P279 values already stored in g).
#
# The two sources are combined with UNION. With two sequential OPTIONALs that
# share ?class, once the P31 OPTIONAL binds ?class the P279 OPTIONAL can only
# match that same value, so P279 classes were dropped for every entity that
# also has a P31 statement. Mentions whose entity has neither property yield
# no triple, as before.
qres = g.query(
    """
    CONSTRUCT { ?spot itsrdf:taClassRef ?class . }
    WHERE {
        ?spot itsrdf:taIdentRef ?entity .
        { ?entity wdt:P31 ?class . }
        UNION
        { ?entity wdt:P279 ?class . }
    }
    """,
    initNs={'itsrdf': ITSRDF, 'wdt': WDT},
)

In [140]:
# Merge the constructed taClassRef triples into the working graph.
g += qres.graph

In [142]:
#print(g.serialize(format="turtle").decode("utf-8"))

In [143]:
# Write the graph to a file in Turtle format.
class_info_path = '/Users/taroaso/myprojects/OpenIE/trec/output/class_info.ttl'
g.serialize(format='turtle', destination=class_info_path)