In [13]:
import csv
import datetime
import itertools
import re
import time
import unicodedata

from rdflib import Graph, RDF, Namespace, Literal, URIRef
from rdflib.namespace import RDF, FOAF
from SPARQLWrapper import SPARQLWrapper, JSON

In [14]:
def serialize():
    """Print the module-level graph ``g`` serialized as Turtle."""
    # A parenthesised single argument is valid both as a Python 2 print
    # statement and as a Python 3 print() call; the bare print statement
    # used before is a SyntaxError on Python 3.
    print(g.serialize(format='turtle'))

def save(filename):
    """Write the module-level graph ``g`` to ``filename`` in Turtle format.

    The file is opened for writing, so an existing file is overwritten.
    """
    with open(filename, 'w') as turtle_file:
        g.serialize(destination=turtle_file, format='turtle')
        
def load(filename):
    """Read the Turtle file ``filename`` and add its triples to the graph ``g``."""
    with open(filename, 'r') as turtle_file:
        # Graph.load() was only a deprecated alias that forwarded to
        # Graph.parse() and is no longer available in newer rdflib releases;
        # calling parse() directly works on old and new versions alike.
        g.parse(turtle_file, format='turtle')

#### Collecting the ID and name of the first 50 movies of the linkedmdb database

In [15]:
def DownloadMovies(x):
    """Fetch up to ``x`` movies from the linkedmdb SPARQL endpoint.

    The query selects every resource that has both an ``mdb:id`` and a
    ``dc:title`` and caps the result with ``LIMIT x``. No ordering is
    requested, so which movies come back is up to the endpoint.

    Returns a dict that maps each movie resource URI to its title. When the
    same resource appears in several result rows, the last title wins.
    """
    endpoint = SPARQLWrapper("http://data.linkedmdb.org/sparql")
    # The mdb: prefix ends in "film", so mdb:id expands to ".../movie/filmid".
    endpoint.setQuery("""
        PREFIX mdb: <http://data.linkedmdb.org/resource/movie/film>
        PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
        PREFIX dc: <http://purl.org/dc/terms/>

        SELECT ?label ?resource WHERE {{
           ?resource mdb:id ?uri .
           ?resource dc:title ?label . 
        }} LIMIT {val}
        """.format(val=x))
    endpoint.setReturnFormat(JSON)
    response = endpoint.query().convert()

    # One entry per result row: resource URI -> title.
    return {
        row['resource']['value']: row['label']['value']
        for row in response["results"]["bindings"]
    }

In [16]:
def CleanedDict(Dict):
    """Return a copy of ``Dict`` whose keys and values can be used in URIs.

    Dict maps linkedmdb resource URIs to names (movie titles or names of
    actors, writers and directors), both given as text (unicode) strings.

    For every entry:
      * the value is reduced to ASCII (accents are stripped), spaces become
        underscores and the characters ``"``, ``.``, ``'`` and ``,`` are
        removed;
      * the key is reduced to ASCII, spaces become underscores and the first
        35 characters are cut off, which is the length of the
        ``http://data.linkedmdb.org/resource/`` prefix, leaving e.g.
        ``film/42``.

    Returns a new dict; ``Dict`` itself is not modified. Keys and values of
    the result are text strings on both Python 2 and Python 3.
    """
    # Number of leading characters dropped from every key (35).
    prefix_length = len('http://data.linkedmdb.org/resource/')
    # Characters that are deleted from the values.
    unwanted = ('"', '.', '\'', ',')

    def to_ascii(text):
        # NFKD splits an accented character into base letter + combining
        # mark; encoding with 'ignore' then drops everything non-ASCII.
        # Decoding again keeps the result a text string, so the str-based
        # replace() calls also work on Python 3 (where encode() yields bytes).
        ascii_text = unicodedata.normalize('NFKD', text).encode('ascii', 'ignore').decode('ascii')
        return ascii_text.replace(' ', '_')

    FixedDict = {}
    for ID, Name in Dict.items():
        value = to_ascii(Name)
        for character in unwanted:
            value = value.replace(character, '')
        FixedDict[to_ascii(ID)[prefix_length:]] = value
    return FixedDict

#### Appending the data to the graph and printing the serialized output.

In [17]:
# Namespaces used by the movie graph.
IMDB = Namespace('http://data.linkedmdb.org/resource/')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')

# Graph in which the movies (ID and title) are stored.
g = Graph()
g.bind('imdb', IMDB)
g.bind('foaf', FOAF)

In [18]:
#Adds the cleaned movie IDs and titles, i.e. the output of CleanedDict(DownloadMovies(x)), to the movie graph g.

def addingMoviesTurtle(functions):
    """Add every movie in ``functions`` to the module-level graph ``g``.

    functions: dict mapping a cleaned movie ID to a cleaned movie title
        (in this notebook: the result of CleanedDict(DownloadMovies(x))).

    For each entry two triples are added: the movie resource is typed as
    ``IMDB['MovieID']`` and linked through ``foaf:name`` to a resource built
    from the cleaned title.
    """
    # .items() works on Python 2 and 3; .iteritems() exists on Python 2 only.
    for movie_id, title in functions.items():
        movie = IMDB[movie_id]
        g.add((movie, RDF.type, IMDB['MovieID']))
        g.add((movie, FOAF.name, IMDB[title]))

#### We now have the ID and name of the first 50 movies. We are now going to gather the information about the actors, directors and writers of these movies.

In [19]:
def CombinedDicts(Dict1, Dict2, Dict3):
    """Merge three dicts into one new dict and return it.

    The inputs are applied in order, so when a key occurs in more than one
    dict the value from the later dict wins (Dict3 over Dict2 over Dict1).
    None of the input dicts is modified.
    """
    CombinedDict = {}
    # dict.update() replaces the three hand-written copy loops and, unlike
    # .iteritems(), is available on both Python 2 and Python 3.
    for source in (Dict1, Dict2, Dict3):
        CombinedDict.update(source)
    return CombinedDict

In [20]:
# Namespaces used by the people graph.
IMDB = Namespace('http://data.linkedmdb.org/resource/')
MOVIE = Namespace('http://data.linkedmdb.org/resource/movie/')
FILM = Namespace('http://data.linkedmdb.org/resource/film/')
EX = Namespace('http://example.com/kad2017/')
FOAF = Namespace('http://xmlns.com/foaf/0.1/')

# Graph in which we will eventually store all directors, actors and writers
# of the 50 films.
gPeople = Graph()
gPeople.bind('IMDB', IMDB)
gPeople.bind('film', FILM)
gPeople.bind('movie', MOVIE)
gPeople.bind('EX', EX)
gPeople.bind('foaf', FOAF)

In [21]:
def _add_people(people, role, film):
    """Add each person in ``people`` to gPeople with the given role in ``film``.

    people: dict of cleaned person ID -> cleaned person name.
    role: 'actor', 'writer' or 'director'; used as the MOVIE predicate.
    film: film ID the people are linked to through FILM[film].
    """
    for person_id, name in people.items():
        person = IMDB[person_id]
        gPeople.add((person, RDF.type, IMDB['Person']))
        gPeople.add((person, FOAF.name, IMDB[name]))
        gPeople.add((person, MOVIE[role], FILM[film]))
        # The age group comes from DBpedia and is best-effort: gettingAge()
        # returns None when no birth date is found, which makes
        # classifyAgegroup() raise TypeError; a malformed date string can
        # raise ValueError or IndexError. In all those cases the person is
        # simply stored without an age group.
        try:
            age_group = classifyAgegroup(gettingAge(name))
        except (TypeError, ValueError, IndexError):
            continue
        # classifyAgegroup() answers 'Something went wrong!' for ages it
        # cannot classify; that text is not a usable IRI fragment, so only
        # real 'ageGroup-N' labels are written to the graph.
        if not age_group.startswith('ageGroup-'):
            continue
        gPeople.add((person, EX['hasAge'], EX[age_group]))


##This function is called once for every film by the loop over all films.
def AddingTriples(film):
    '''Add the actors, writers and directors of one film to the graph gPeople.

    film: a linkedmdb film ID (the part after ".../resource/film/").

    A SPARQL query against linkedmdb fetches the actors, writers and
    directors of the film together with their names. For every person the
    graph receives a type triple, a foaf:name triple, a triple for the role
    in the film and, when DBpedia knows a birth date, an EX['hasAge'] triple.
    Finally every pair of people of the film is connected with
    IMDB['workedWith'] (one triple per pair).

    NOTE(review): the director, actor and writer patterns in the query are
    all mandatory, so a film that lacks any one of the three roles yields no
    rows and nothing is added for it. Confirm that this is intended.

    Returns None.
    '''
    sparql = SPARQLWrapper("http://data.linkedmdb.org/sparql")
    sparql.setQuery("""
    PREFIX mdb: <http://data.linkedmdb.org/resource/movie/>
    PREFIX mdb2: <http://data.linkedmdb.org/resource/film/>

    PREFIX rdfs: <http://www.w3.org/2000/01/rdf-schema#>
    PREFIX dc: <http://purl.org/dc/terms/>
    PREFIX foaf: <http://xmlns.com/foaf/0.1/>

    SELECT DISTINCT ?actor ?director ?actorName ?directorName ?writer ?writerName WHERE {{
    mdb2:{val} mdb:director ?director .
    ?director mdb:director_name ?directorName .
  
    mdb2:{val} mdb:actor ?actor .
    ?actor mdb:actor_name ?actorName .
  
    mdb2:{val} mdb:writer ?writer .
    ?writer mdb:writer_name ?writerName .
    }}
    """.format(val=film))
    sparql.setReturnFormat(JSON)
    results = sparql.query().convert()

    # Person URI -> person name, one dict per role. Each result row carries
    # one actor, one writer and one director, so the dicts de-duplicate the
    # people that repeat across rows.
    actor_dict = {}
    writer_dict = {}
    director_dict = {}
    for row in results["results"]["bindings"]:
        actor_dict[row['actor']['value']] = row['actorName']['value']
        writer_dict[row['writer']['value']] = row['writerName']['value']
        director_dict[row['director']['value']] = row['directorName']['value']

    # CleanedDict makes IDs and names URI compatible before they are stored.
    for role, people in (('actor', actor_dict),
                         ('writer', writer_dict),
                         ('director', director_dict)):
        _add_people(CleanedDict(people), role, film)

    # workedWith relation: one triple for every unordered pair of people of
    # this film. The keys are still full URIs here, so the IMDB namespace
    # prefix (35 characters) is cut off before re-building them with IMDB[].
    prefix_length = len(IMDB)
    AllPeople = CombinedDicts(actor_dict, writer_dict, director_dict)
    for first, second in itertools.combinations(AllPeople.keys(), 2):
        gPeople.add((IMDB[first[prefix_length:]],
                     IMDB['workedWith'],
                     IMDB[second[prefix_length:]]))
    return
        

#### DBpedia for age functions
The next two functions use a person's name in a SPARQL query to DBpedia to obtain that person's date of birth, and then derive the age group the person belongs to.

In [22]:
def gettingAge(name):
    '''Look up the birth date of ``name`` on DBpedia.

    ``name`` is inserted into the query as the DBpedia resource
    ``dbr:<name>`` and its ``dbo:birthDate`` is requested.

    Returns the first birth date found (the raw value from the result), or
    None when DBpedia returns no birth date. Names containing an apostrophe
    or an ampersand are not supported yet and also return None, without
    querying DBpedia (the ampersand case filters out the Italian writer duo
    age_&_Scarpelli).
    '''
    if '\'' in name or '&' in name:
        return

    sparql = SPARQLWrapper("http://dbpedia.org/sparql")
    sparql.setQuery("""
    PREFIX dbr: <http://dbpedia.org/resource/>
    PREFIX dbo: <http://dbpedia.org/ontology/>
    PREFIX dbp: <http://dbpedia.org/property/>

    SELECT ?age WHERE {{
      dbr:{val} dbo:birthDate ?age
    }}
    """.format(val=name))
    sparql.setReturnFormat(JSON)
    response = sparql.query().convert()

    # Collect every returned birth date; only the first one is used.
    birthdates = [row['age']['value'] for row in response["results"]["bindings"]]
    if not birthdates:
        return
    return birthdates[0]
            


In [23]:
def classifyAgegroup(date, today=None):
    """Return the age group of a person born on ``date``.

    date: birth date as a string starting with ``YYYY-MM-DD`` or ``YYYY-M-D``
        (anything after the day, e.g. a time zone, is ignored). A missing
        month or day is treated as 1.
    today: optional ``datetime.date`` the age is calculated for; defaults to
        the current local date.

    The age is the number of completed years: the difference between the
    years, minus one when the birthday has not yet taken place in the year
    of ``today``.

    The age groups are:
    ageGroup-1 = 0-24,
    ageGroup-2 = 25-40,
    ageGroup-3 = 41-55,
    ageGroup-4 = 56-70,
    ageGroup-5 = 71-85,
    ageGroup-6 = 86+

    Returns the group label as a string, or 'Something went wrong!' when the
    age would be negative (birth date in the future).

    Raises TypeError when ``date`` is not a string (e.g. None because no
    birth date was found) and ValueError when it does not start with a year.
    """
    # re.match() itself raises TypeError for non-string input such as None,
    # which callers rely on to skip people without a known birth date.
    match = re.match(r'\s*(-?\d{4})(?:-(\d{1,2}))?(?:-(\d{1,2}))?', date)
    if match is None:
        raise ValueError('Unrecognised birth date: %r' % (date,))
    birth_year = int(match.group(1))
    birth_month = int(match.group(2) or 1)
    birth_day = int(match.group(3) or 1)

    if today is None:
        today = datetime.date.today()

    # Completed years: take the year difference and remove one year while
    # this year's birthday (compared on full month AND day) is still ahead.
    age = today.year - birth_year
    if (today.month, today.day) < (birth_month, birth_day):
        age -= 1

    if age < 0:
        return 'Something went wrong!'
    # Inclusive upper bound of each group, in ascending order.
    for upper_bound, label in ((24, 'ageGroup-1'),
                               (40, 'ageGroup-2'),
                               (55, 'ageGroup-3'),
                               (70, 'ageGroup-4'),
                               (85, 'ageGroup-5')):
        if age <= upper_bound:
            return label
    return 'ageGroup-6'

In [24]:
def _Main(x):
    """Build the movie and people graphs for ``x`` movies and save them.

    Downloads ``x`` movies from linkedmdb, adds their IDs and titles to the
    graph ``g`` and then adds the actors, writers and directors of every
    movie to the graph ``gPeople``. The movies are saved as
    ``smallmovies.ttl`` and the people with their triples as
    ``smallpeople.ttl``, both in Turtle format in the current directory.

    Prints 'still going!' before the first movie and then every 50 movies as
    a sign of life. Returns None.
    """
    # Cleaned movie ID -> cleaned title for the x movies we want the data from.
    The_Movies = CleanedDict(DownloadMovies(x))

    # Add the movie IDs and titles to the graph g.
    addingMoviesTurtle(The_Movies)

    # Now get the writers, actors and directors of each movie.
    for index, movie in enumerate(The_Movies.keys()):
        if index % 50 == 0:
            # Parenthesised form works as Python 2 statement and Python 3 call.
            print('still going!')
        # The cleaned keys presumably look like 'film/<id>'; the first five
        # characters ('film/') are dropped because AddingTriples expects the
        # bare film ID.
        AddingTriples(movie[len('film/'):])

    # Write both graphs to .ttl files in the current folder.
    g.serialize(destination='smallmovies.ttl', format='turtle')
    gPeople.serialize(destination='smallpeople.ttl', format='turtle')
      
%time _Main(200)

still going!
still going!
still going!
still going!
Wall time: 2min 34s
