# Task: Enrich your dataset with additional data from DBpedia and WikiData 


## Overview
Use the newly found DBpedia and Wikidata resources and collect additional information from DBpedia and Wikidata on the movies. It is sufficient to reuse the DBpedia and Wikidata vocabulary and use directly the DBpedia property values (including when the value is a URL/IRI), e.g., it is enough if you include the following information from DBpedia on the gross value and the producer:

<code>    
PREFIX dbo: http://dbpedia.org/ontology/
    
<https://firstname-lastname.org/resource/the_godfather>  dbo:gross    2.541E8^^xsd:double;
                                                       dbo:producer <http://dbpedia.org/page/Albert_S._Ruddy> .
<code>
    
> __Hint__: Using SPARQL’s OPTIONAL keyword might help.

## Task Details

1. Using SPARQL queries, information you __Must Get__ (if available in DBpedia and Wikidata for the movie):
    - Get the distinct genre(s) of a movie
    - Get the distinct actors 
    - Get the homepage of a movie
    - Get the number of received awards
    - Get the IMDB and the RottenTomatoes links
    - Get the box office value/gross value
    - Get the cost of a movie



## Submission 3:

Use RDFLib to load the data you have saved in Task 2 and add the additional information to the corresponding movies. As mentioned above, you can use the DBpedia and WikiData predicates. Save the enriched data set in the output folder with naming __movies_task_3.n3__.

 

<br>

## Your code

In [12]:
from rdflib import URIRef, Literal, Graph, Namespace
from rdflib.namespace import FOAF, RDF, RDFS, XSD, DC, OWL
import urllib
from datetime import datetime
from SPARQLWrapper import SPARQLWrapper, JSON, N3, RDFXML
import numpy as np
from time import sleep

In [17]:
EX = Namespace("https://ex1.org/")
DBO = Namespace("http://dbpedia.org/ontology/")
RSC = Namespace("http://philip-broehl.org/resource/")
WD = Namespace("http://www.wikidata.org/entity/")
WDT = Namespace("http://www.wikidata.org/prop/direct/")
SCH = Namespace("https://schema.org/")

In [None]:
# load graph from previous exercise
g = Graph()
g.load(source = "../output_data/movies_task_2.n3", format = 'n3')
g.bind('dbo', DBO)
g.bind("wd", WD)
g.bind("wdt", WDT)

# extract all sameAs links in Graph
sameAs_db = []
sameAs_wd = []
for s in list(g.subjects(predicate = RDF.type, object = SCH.Movie)):
    for sameAs in g.objects(subject = URIRef(s), predicate = OWL.sameAs):
        if "dbpedia" in sameAs:
            sameAs_db.append(sameAs)
        else:
            sameAs_wd.append(sameAs)

# from dbpedia, we can get the budget, gross and starring actors
query_db = """
    PREFIX rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    
    CONSTRUCT { 
        ?movie dbo:budget ?budget ;
            dbo:gross ?gross ;
            dbo:starring ?actor .
    }
    WHERE {
        ?movie rdf:type dbo:Film .
        OPTIONAL { ?movie dbo:budget ?budget }
        OPTIONAL { ?movie dbo:gross ?gross }
        OPTIONAL { ?movie dbo:starring ?actor }
        VALUES ?movie { """

# from wikidata, we can get the genre, official website, IMDb link, rottenTomatoes link and
# number of awards.
# wdt:P136 is genre, wdt:P856 is the official website, P345 is the IMDb link, P1258 is the
# rottenTomatoes link, and P166 is award received.
query_wd = """
    PREFIX wd: <http://www.wikidata.org/entity/>
    PREFIX wdt: <http://www.wikidata.org/prop/direct/>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX xsd: <http://www.w3.org/2001/XMLSchema#>

    CONSTRUCT {
        ?movie wdt:P136 ?genre ;
            wdt:P856 ?official_website ;
            wdt:P345 ?IMDb_link ;
            wdt:P1258 ?RottenTomatoes_link ;
            wdt:P166 ?awards .
    }
    WHERE {
        ?movie wdt:P31 wd:Q11424 .
        OPTIONAL { ?movie wdt:P136 ?genre }
        OPTIONAL { ?movie wdt:P856 ?official_website }
        OPTIONAL { ?movie wdt:P345 ?IMDb_link }
        OPTIONAL { ?movie wdt:P1258 ?RottenTomatoes_link }
        OPTIONAL { ?movie wdt:P166 ?awards }
        VALUES ?movie { """

results_db = []
results_wd = []
values = ""
i = 0
for uri in sameAs_db:
    i += 1
    print(f'\r{i}', end = '')
    values += "<" + str(uri) + ">"
    if i % 50 == 0 or i == len(sameAs_db):   
        values += "}\n}"
        
        sparql_db = SPARQLWrapper("https://dbpedia.org/sparql")
        sparql_db.setQuery(query_db + values)
        sparql_db.setReturnFormat(RDFXML)
        results_db.append(sparql_db.query().convert())
        values = ""

for uri in sameAs_wd:
    i += 1
    print(f'\r{i}', end = '')
    values += "<" + str(uri) + ">"
    if i % 50 == 0 or i == len(sameAs_wd):   
        values += "}\n}"
        sparql_wd = SPARQLWrapper("https://query.wikidata.org/sparql")
        sparql_wd.setQuery(query_wd + values)
        sparql_wd.setReturnFormat(RDFXML)
        results_wd.append(sparql_wd.query().convert())
        
        values = ""
        sleep(5)

450

In [None]:
# join graphs
graph_db = Graph()
graph_wd = Graph()

for graph in results_db:
    graph_db += graph
for graph in results_wd:
    graph_wd += graph
    
for s in list(g.subjects(predicate = RDF.type, object = SCH.Movie)):
    sameAs_db = None
    sameAs_wd = None
    award_count = 0
    for sameAs in g.objects(subject = URIRef(s), predicate = OWL.sameAs):
        if "dbpedia" in sameAs:
            sameAs_db = sameAs
        else:
            sameAs_wd = sameAs
    if sameAs_db != None:
        for (p, o) in list(graph_db.predicate_objects(sameAs_db)):
            g.add((URIRef(s), p, o))
    if sameAs_wd != None:
        for (p, o) in list(graph_wd.predicate_objects(sameAs_wd)):
            # form IMDb links and rottenTomatoes links from IDs (see formatter URL of properties)
            # P1065 is the archive URL datatype, which is compatible with IMDB / rottenTomatoes IDs
            if p == URIRef("http://www.wikidata.org/prop/direct/P345"):
                o = Literal("https://www.imdb.com/title/" + str(o), datatype = WDT.P1065)
            if p == URIRef("http://www.wikidata.org/prop/direct/P1258"):
                o = Literal("https://www.rottentomatoes.com/" + str(o), datatype = WDT.P1065)
            if p == URIRef("http://www.wikidata.org/prop/direct/P166"):
                award_count += 1
                continue
            g.add((URIRef(s), p, o))
        # property P166 (received awards) also supports quantities (Q309314)
        g.add((URIRef(s), WDT.P166, Literal(award_count, datatype = WD.Q309314)))
        
print(g.serialize(format="n3").decode("utf-8"))
g.serialize(destination='../output_data/movies_task_3.n3', format='n3')

In [None]:
number_movies_with_sameAs = len(list(set(g.subjects(object = SCH.Movie)) & set(g.subjects(predicate = OWL.sameAs))))
print(f'RottenTomatoes Links: {len(list(g.objects(predicate = WDT.P1258)))} / {number_movies_with_sameAs}')
print(f'IMDb Links: {len(list(g.objects(predicate = WDT.P345)))} / {number_movies_with_sameAs}')
print(f'Films with actors: {len(set(g.subjects(predicate = DBO.starring)))} / {number_movies_with_sameAs}')
print(f'Films with budget: {len(set(g.subjects(predicate = DBO.budget)))} / {number_movies_with_sameAs}')
print(f'Films with gross: {len(set(g.subjects(predicate = DBO.gross)))} / {number_movies_with_sameAs}')
print(f'Films with genre: {len(set(g.subjects(predicate = WDT.P136)))} / {number_movies_with_sameAs}')
print(f'Films with official website: {len(set(g.subjects(predicate = WDT.P856)))} / {number_movies_with_sameAs}')