In [86]:
import rdflib
from rdflib import Graph, URIRef, Literal, BNode
from rdflib.namespace import FOAF, RDF, SDO, RDFS, OWL, DC

import os
from tqdm import tqdm

import sqlparse

In [7]:
global_graph = Graph()
# https://data.mendeley.com/datasets/zp23s23xpb/1
input_folder = "dump/soler/"
for filename in tqdm(os.listdir(input_folder)[:1000]):
    f = os.path.join(input_folder, filename)
    try:
        g1 = Graph().parse(f, format='xml')

        #check for literals
        for s, p, o in g1:
            if not rdflib.term._is_valid_uri(o):
                g1.set((s, p, Literal(o)))
             
        global_graph = global_graph + g1
        
    except:
        continue

global_graph.serialize(destination="datasets/soler.ttl", format="turtle")

  0%|          | 0/1000 [00:00<?, ?it/s]tel:- +33-4-92-38-79-83--  does not look like a valid URI, trying to serialize this will break.
http://xmlns.com/foaf/0.1/Nova Spivack's Ryze Business Networking Page does not look like a valid URI, trying to serialize this will break.
Amazon.com: Listmania! Buddhist and Bonpo Dzogchen Meditation does not look like a valid URI, trying to serialize this will break.
KDnuggets: Data Mining, Web Mining, and Knowledge Discovery Guide does not look like a valid URI, trying to serialize this will break.
http://xmlns.com/foaf/0.1/Philosophy of mind - Wikipedia does not look like a valid URI, trying to serialize this will break.
CiteSeer: The NEC Research Institute Scientific Literature Digital Library does not look like a valid URI, trying to serialize this will break.
Amazon.com: Listmania! The Semantic Web: XML, Internet, RDF, AI, Ontologies & KM does not look like a valid URI, trying to serialize this will break.
http://xmlns.com/foaf/0.1/      W3C Se

<Graph identifier=N550ab359e49642f19330e822d891a5b1 (<class 'rdflib.graph.Graph'>)>

In [6]:
# https://ebiquity.umbc.edu/resource/html/id/82/foafPub-dataset
g = Graph()
g.bind("foaf", FOAF)
g.bind("dc", DC)

with open("dump/foaf_pub/triple_person.sql") as file:
    for i, line in enumerate(tqdm(file)):
        try:
            parsed = sqlparse.parse(line)[0]
            values = str(parsed.tokens[-3])[9:-1].split(',')

            subject = values[5][1:-1]
            predicate = values[4][1:-1]
            object = values[3][1:-1]

            #check all uris
            if not (rdflib.term._is_valid_uri(subject) and rdflib.term._is_valid_uri(predicate)):
                continue

            subject = URIRef(subject)
            predicate = URIRef(predicate)

            # check if literal
            if not rdflib.term._is_valid_uri(object):
                object = Literal(object)
            else:
                object = URIRef(object)
            
            g.add((subject, predicate, object))
        except:
            continue

        #if i > 9000:
        #    break

g.serialize(destination="datasets/foaf_pub.ttl", format="turtle")

202271it [03:35, 937.10it/s] 


<Graph identifier=N6c93408c686b4d37bc7dbd32778c2edd (<class 'rdflib.graph.Graph'>)>

In [2]:
# FactBench
# https://github.com/DeFacto/FactBench
global_graph = Graph()
input_folder = "dump/FactBench/"
for folder in tqdm(os.listdir(input_folder)):
    folder = os.path.join(input_folder, folder)
    for filename in os.listdir(folder):
        f = os.path.join(folder, filename)
        try:
            g1 = Graph().parse(f, format='ttl')
                
            global_graph = global_graph + g1
            
        except:
            continue

global_graph.serialize(destination="datasets/factbench.ttl", format="turtle")

100%|██████████| 10/10 [08:07<00:00, 48.76s/it]


<Graph identifier=N69273402929b4a4db1c1336595cc3783 (<class 'rdflib.graph.Graph'>)>

In [1]:
from SPARQLWrapper import SPARQLWrapper, RDFXML
from rdflib import Graph

sparql = SPARQLWrapper("http://dbpedia.org/sparql") # use this endpoint for higher timeout

# https://gist.github.com/tomsaleeba/ff8e145b3efd1127e48baa6512df24e2
sparql.setQuery("""
    PREFIX dbpedia: <http://dbpedia.org/resource/>
    PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
    CONSTRUCT {
        ?actor ?p1 ?o1 .
        ?s2 ?p2 ?actor .
        ?movie ?p3 ?o2 .
        ?s3 ?p4 ?movie .
    }
    WHERE {
        ?actor ?p1 ?o1 {
            SELECT ?actor, ?movie
            WHERE {
                ?movie dbpedia-owl:starring ?actor .
            } 
            order by asc(UCASE(str(?actor)))
            LIMIT 10 # set graph size with this, to high results in partial result only, because of timeout
        } .
        ?s2 ?p2 ?actor FILTER (?p2 NOT IN (dbo:wikiPageWikiLink, dbo:wikiPageRedirects)) . 

        ?movie ?p3 ?o2 FILTER (?p3 NOT IN (dbo:wikiPageWikiLink, dbo:wikiPageRedirects)) . 
        ?s3 ?p4 ?movie FILTER (?p4 NOT IN (dbo:wikiPageWikiLink, dbo:wikiPageRedirects)) . 
    }
""")

sparql.setReturnFormat(RDFXML)
results = sparql.query().convert()
results.serialize(destination="datasets/dbpedia.ttl", format="turtle")

<Graph identifier=N0a76a50148ca4dbd90d1a3e84f6814da (<class 'rdflib.graph.ConjunctiveGraph'>)>

In [115]:
# https://gist.github.com/tomsaleeba/ff8e145b3efd1127e48baa6512df24e2

from SPARQLWrapper import SPARQLWrapper, RDFXML, JSON, TURTLE, XML
from rdflib import Graph

sparql = SPARQLWrapper("http://dbpedia.org/sparql") # use this endpoint for higher timeout

sparql.setQuery("""
    PREFIX dbpedia: <http://dbpedia.org/resource/>
    PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
    SELECT ?actor, ?movie
    WHERE {
        ?movie dbpedia-owl:starring ?actor .
    }
    #order by asc(UCASE(str(?actor)))
    LIMIT 100 # set graph size with this
""")

sparql.setReturnFormat(JSON)
actors_movies = sparql.queryAndConvert()
actors_movies

{'head': {'link': [], 'vars': ['actor', 'movie']},
 'results': {'distinct': False,
  'ordered': True,
  'bindings': [{'actor': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Helena_Bergström'},
    'movie': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/1939_(film)'}},
   {'actor': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Helene_Egelund'},
    'movie': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/1939_(film)'}},
   {'actor': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Per_Morberg'},
    'movie': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/1939_(film)'}},
   {'actor': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Rallapalli_(actor)'},
    'movie': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/1940_Lo_Oka_Gramam'}},
   {'actor': {'type': 'uri',
     'value': 'http://dbpedia.org/resource/Ramakrishna_(Kannada_actor)'},
    'movie': {'type': 'uri',
     'value': 'http://dbpedia.or

In [116]:
g = Graph()

for a_m in tqdm(actors_movies["results"]["bindings"]):
    actor = a_m['actor']['value'].rsplit('/', 1)[1]
    sparql.setQuery("""
        PREFIX dbpedia: <http://dbpedia.org/resource/>
        PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
        CONSTRUCT {
            dbpedia:""" + actor + """ ?p1 ?o1 .
            ?s2 ?p2 dbpedia:""" + actor + """ .
        }
        WHERE {
            dbpedia:""" + actor + """ ?p1 ?o1 .
            ?s2 ?p2 dbpedia:""" + actor + """ FILTER (?p2 NOT IN (dbo:wikiPageWikiLink, dbo:wikiPageRedirects, foaf:primaryTopic)) .
        }
    """)
    sparql.setReturnFormat(TURTLE)
    try:
        results = sparql.query().convert()
        g = g + Graph().parse(data=results, format="turtle")
        g.add((URIRef("http://dbpedia.org/resource/" + actor), RDF.type, URIRef("http://dbpedia.org/ontology/Actor")))
    except:
        continue

unique_movies = set([m['movie']['value'].rsplit('/', 1)[1].replace("(", "\(").replace(")", "\)") for m in actors_movies["results"]["bindings"]])
for movie in tqdm(unique_movies):
    sparql.setQuery("""
        PREFIX dbpedia: <http://dbpedia.org/resource/>
        PREFIX dbpedia-owl: <http://dbpedia.org/ontology/>
        CONSTRUCT {
            dbpedia:""" + movie + """ ?p3 ?o2 .
            #?s3 ?p4  dbpedia:""" + movie + """ .
        }
        WHERE {
            dbpedia:""" + movie + """ ?p3 ?o2 FILTER (?p3 NOT IN (dbo:wikiPageWikiLink, dbo:wikiPageRedirects)) .
            #?s3 ?p4  dbpedia:""" + movie + """ FILTER (?p4 NOT IN (dbo:wikiPageWikiLink, dbo:wikiPageRedirects)) .
        }
    """)

    sparql.setReturnFormat(TURTLE)
    try:
        results = sparql.query().convert()
        g = g + Graph().parse(data=results, format="turtle")
    except:
        continue

g.bind("rdfs", RDFS)
g.add((URIRef("http://dbpedia.org/ontology/starring"), RDFS.domain, URIRef("http://dbpedia.org/ontology/work")))
g.add((URIRef("http://dbpedia.org/ontology/starring"), RDFS.range, URIRef("http://dbpedia.org/ontology/Actor")))
g.serialize(destination="datasets/dbpedia.ttl", format="turtle")

100%|██████████| 100/100 [01:23<00:00,  1.20it/s]
100%|██████████| 30/30 [01:19<00:00,  2.64s/it]


<Graph identifier=Nf6058d6319584d7fa33c3568b95816e6 (<class 'rdflib.graph.Graph'>)>