In [1]:
from SPARQLWrapper import SPARQLWrapper, JSON, RDFXML, XML
from rdflib import Graph
import pprint, time, sys

In [2]:
# Fetch every (mention, itsrdf:taIdentRef, entity) triple from the SPARQL
# endpoint running on localhost. The CONSTRUCT template is identical to the
# WHERE pattern, so the result is simply the matching triples.
# Only the `itsrdf:` prefix is used by the query body; the other PREFIX
# declarations are unused here.
sparql = SPARQLWrapper("http://localhost:3030/w3c-email-q54/sparql")
sparql.setQuery("""
    PREFIX base: <http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/>
    PREFIX schema: <https://schema.org/>
    PREFIX email: <http://www.w3.org/2000/10/swap/pim/email#>
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX itsrdf: <https://www.w3.org/2005/11/its/rdf#>
    PREFIX olia: <http://purl.org/olia/olia.owl#>
    PREFIX nif: <http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#>
    PREFIX nerd: <http://nerd.eurecom.fr/ontology#>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>
    PREFIX owl: <http://www.w3.org/2002/07/owl#>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    
    CONSTRUCT{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    WHERE{
        ?mention itsrdf:taIdentRef ?wikidataEntity.
    }
    """)
# Ask for an XML serialisation. Later cells treat `results` as a graph:
# they iterate over it and call `.triples(...)` on it.
sparql.setReturnFormat(XML)
results = sparql.query().convert()

In [3]:
# pprint.pprint(results.serialize(format='turtle').decode())

In [4]:
from rdflib import Graph, Literal, RDF, URIRef, BNode, Namespace
from rdflib.namespace import CSVW, DC, DCAT, DCTERMS, DOAP, FOAF, ODRL2, ORG, OWL, \
                           PROF, PROV, RDF, RDFS, SDO, SH, SKOS, SOSA, SSN, TIME, \
                           VOID, XMLNS, XSD
# Namespace objects for the vocabularies used in this notebook. Each one is
# bound to a prefix on the output graph in the next cell.
BASE = Namespace("http://www.kde.cs.tsukuba.ac.jp/~aso/w3c-email/")  # bound to the empty prefix
EMAIL = Namespace("http://www.w3.org/2000/10/swap/pim/email#")  # defined here but not bound to the graph
OLIA = Namespace("http://purl.org/olia/olia.owl#")
NERD = Namespace("http://nerd.eurecom.fr/ontology#")
NIF = Namespace("http://persistence.uni-leipzig.org/nlp2rdf/ontologies/nif-core#")
# NOTE(review): this IRI uses the https:// scheme, matching the `itsrdf:`
# prefix in the SPARQL queries of this notebook — confirm it also matches
# the IRIs stored in the source data.
ITSRDF = Namespace("https://www.w3.org/2005/11/its/rdf#")
WD = Namespace("http://www.wikidata.org/entity/")  # Wikidata entity IRIs
WDT = Namespace("http://www.wikidata.org/prop/direct/")  # `wdt:` properties, e.g. wdt:P31 / wdt:P279 in the queries

In [5]:
# RDF graph that collects the mention links and the Wikidata class information
g = Graph()

# (prefix, namespace) pairs registered on the graph so that serialised output
# can use short prefixed names.
prefix_bindings = (
    ('', BASE),
    ('schema', SDO),
    ('foaf', FOAF),
    ('owl', OWL),
    ('rdf', RDF),
    ('rdfs', RDFS),
    ('olia', OLIA),
    ('nerd', NERD),
    ('nif', NIF),
    ('itsrdf', ITSRDF),
    ('wd', WD),
    ('wdt', WDT),
)
for prefix, namespace in prefix_bindings:
    g.bind(prefix, namespace)

In [6]:
# Copy every triple fetched from the local endpoint into the output graph.
g += results

In [7]:
# Distinct object values of the fetched triples, as plain strings.
wikidataEntity = list({
    str(obj) for _subj, _pred, obj in results.triples((None, None, None))
})

In [8]:
# Number of distinct object values collected in `wikidataEntity`.
len(wikidataEntity)

630

In [9]:
def get_results(endpoint_url, query):
    """Run ``query`` against ``endpoint_url`` and return the converted response.

    The request is sent with a ``WDQS-example Python/<major>.<minor>`` user
    agent and asks for the XML return format. The returned value is whatever
    ``SPARQLWrapper``'s ``convert()`` produces for that response.

    Args:
        endpoint_url: URL of the SPARQL endpoint to query.
        query: SPARQL query text.
    """
    # TODO adjust user agent; see https://w.wiki/CX6
    agent_string = "WDQS-example Python/%s.%s" % tuple(sys.version_info[:2])
    client = SPARQLWrapper(endpoint_url, agent=agent_string)
    client.setQuery(query)
    client.setReturnFormat(XML)
    response = client.query()
    return response.convert()

In [10]:
endpoint_url = "https://query.wikidata.org/sparql"

# Query template, built once instead of being re-concatenated per entity.
# For one entity IRI it fetches the instance-of (wdt:P31) and subclass-of
# (wdt:P279) classes that have an English label, those class labels, and the
# entity's own English label. The prefixes are declared explicitly so the
# query does not rely on the endpoint predefining them.
CLASS_QUERY_TEMPLATE = """
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>

    CONSTRUCT {
        <%(entity)s> wdt:P31 ?class1 ;
                     wdt:P279 ?class2 ;
                     rdfs:label ?label .
        ?class1 rdfs:label ?classlabel1 .
        ?class2 rdfs:label ?classlabel2 .
    }
    WHERE {
        OPTIONAL { <%(entity)s> wdt:P31 ?class1 .
                   ?class1 rdfs:label ?classlabel1 .
                   FILTER(lang(?classlabel1) = 'en') }
        OPTIONAL { <%(entity)s> wdt:P279 ?class2 .
                   ?class2 rdfs:label ?classlabel2 .
                   FILTER(lang(?classlabel2) = 'en') }
        OPTIONAL { <%(entity)s> rdfs:label ?label .
                   FILTER(lang(?label) = 'en') }
    }
"""

start = time.time()

for i, entity in enumerate(wikidataEntity):
    query = CLASS_QUERY_TEMPLATE % {'entity': entity}
    # Use a name other than `results`: that variable holds the graph fetched
    # from the local endpoint and is read by earlier cells, so rebinding it
    # here would make those cells give different output when re-run.
    entity_graph = get_results(endpoint_url, query)
    for triple in entity_graph:
        g.add(triple)
    # Progress line: entity index and seconds elapsed since the loop started.
    print('{}:{}'.format(i,time.time()-start))
print(time.time()-start)

0:1.5073440074920654
1:2.6725411415100098
2:3.610774040222168
3:4.693443298339844
4:5.95372200012207
5:7.182594060897827
6:7.898191928863525
7:9.02567720413208
8:10.157432079315186
9:11.281935930252075
10:12.306941986083984
11:13.429964065551758
12:14.558408975601196
13:15.608366966247559
14:16.64036202430725
15:17.688311100006104
16:18.868272066116333
17:19.88740110397339
18:20.60082697868347
19:21.653604984283447
20:22.750486135482788
21:23.393250226974487
22:24.418740034103394
23:25.437753200531006
24:26.651365995407104
25:27.666924238204956
26:28.269004106521606
27:29.407827138900757
28:30.538039207458496
29:31.556907176971436
30:32.59235215187073
31:33.75436305999756
32:34.80813217163086
33:35.862224102020264
34:36.67490530014038
35:37.31907105445862
36:38.37324500083923
37:39.03410315513611
38:39.750813007354736
39:40.76945424079895
40:41.80395293235779
41:42.921515226364136
42:43.64349699020386
43:44.31383299827576
44:45.34001708030701
45:46.41858696937561
46:47.593178033828735


In [11]:
# Derive itsrdf:taClassRef links: every mention that refers to an entity gets
# a class reference for each instance-of (wdt:P31) and each subclass-of
# (wdt:P279) value of that entity.
#
# The two properties are combined with UNION. Two consecutive OPTIONAL blocks
# sharing ?class would not do this: once the first OPTIONAL binds ?class, the
# second can only match that same value, so wdt:P279 classes would be lost
# for any entity that also has a wdt:P31 value.
class_ref_query = """
    CONSTRUCT { ?spot itsrdf:taClassRef ?class . }
    WHERE {
        ?spot itsrdf:taIdentRef ?entity .
        { ?entity wdt:P31 ?class . } UNION { ?entity wdt:P279 ?class . }
    }
"""
qres = g.query(class_ref_query, initNs={'itsrdf': ITSRDF, 'wdt': WDT})

In [12]:
# Merge the triples produced by the CONSTRUCT query into the output graph.
g += qres.graph

In [13]:
#print(g.serialize(format="turtle").decode("utf-8"))

In [14]:
# Write the graph to a file in Turtle format
output_path = '/Users/taroaso/myprojects/OpenIE/trec/output/54/class_info.ttl'
g.serialize(destination=output_path, format='turtle')