In [38]:
import json,sys,os,re,math
from datetime import time
import urllib.parse as url_parse
import dateutil.parser as date_parse

In [2]:
# Input / output locations (relative to the notebook's working directory).
imdb_path = './imdb.jl'
afi_path = './afi.jl'
task2_result_path ='./Ziheng_Gong_hw03_imdb_afi_el.json'
export_path = 'Ziheng_Gong_hw03_movie_triples.ttl'


def _load_jsonl_by_url(path):
    """Read a JSON-lines file and index its records by their 'url' field.

    Args:
        path: location of a file holding one JSON object per line.

    Returns:
        dict mapping each record's 'url' value to the decoded record.
        When two records share a url, the later one wins.

    Raises:
        KeyError: a record has no 'url' key.
        json.JSONDecodeError: a non-blank line is not valid JSON.
    """
    records = dict()
    # Explicit UTF-8 so decoding does not depend on the platform default.
    with open(path, 'r', encoding='utf-8') as f:
        # Stream line by line instead of materialising the file with readlines().
        for line in f:
            if not line.strip():
                # A blank line would make json.loads raise; skip it.
                continue
            record = json.loads(line)
            records[record['url']] = record
    return records


imdb_dict = _load_jsonl_by_url(imdb_path)
afi_dict = _load_jsonl_by_url(afi_path)

# Entity-linking result: later cells iterate over it and read the
# 'imdb_movie' / 'afi_movie' keys of each element.
with open(task2_result_path, 'r', encoding='utf-8') as f:
    task2_result_list = json.load(f)


In [3]:
def pattern_match(pattern,input):
    """Extract every match of `pattern` from `input` and return what is left.

    Args:
        pattern: a compiled regular expression or a pattern string
            (anything `re.finditer` accepts).
        input: the text to scan.

    Returns:
        A tuple ``(matches, remainder)``:
        - matches: list of the matched substrings, each stripped of
          surrounding whitespace, in order of appearance; ``None`` when
          nothing matched.
        - remainder: `input` with exactly the matched spans cut out.
    """
    company_list = []
    kept_pieces = []
    cursor = 0

    for match in re.finditer(pattern,input):
        company_list.append(match.group().strip())
        start, end = match.span()
        # Keep the text between the previous match and this one.  Cutting by
        # span (rather than str.replace on the matched text) guarantees that
        # only what the regex matched is removed, not look-alike substrings
        # elsewhere in the input.
        kept_pieces.append(input[cursor:start])
        cursor = end

    kept_pieces.append(input[cursor:])
    remainder = "".join(kept_pieces)

    if company_list:
        return company_list, remainder
    return None, remainder

def get_company(input_string):
    """Split a production-company field into individual company names.

    Names ending in "Inc." or "Ltd." (optionally written with a comma, as in
    "Foo, Inc.") are pulled out first via `pattern_match`, so the comma inside
    them is not mistaken for a list separator.  Whatever text remains is then
    split on commas.

    Args:
        input_string: the raw field text, or None when the field is absent.

    Returns:
        set of company-name strings (empty for None input).
    """
    if input_string is None:
        return set()

    # The trailing dot is escaped: an unescaped '.' matches any character, so
    # " Inc." would also fire on e.g. " Incr" or " Inc,".
    inc = re.compile(r'\b[- .&\w]+,? Inc\.')
    ltd = re.compile(r'\b[- .&\w\(\)]+,? Ltd\.')
    output = set()

    match,remainder = pattern_match(inc, input_string)
    if match:
        output.update(match)
    match,remainder = pattern_match(ltd, remainder)
    if match:
        output.update(match)

    for item in remainder.split(','):
        # Strip before testing: pieces after a ", " separator start with a
        # space, and re.match only looks at the very first character.
        name = item.strip()
        if re.match(r'\w', name):
            output.add(name)

    return output


In [4]:
# Collect the production companies of every pair that has an AFI match.
# company_set holds the distinct names; t_company keeps one entry per
# (movie, company) occurrence, so names repeat across movies.
company_set = set()
t_company = []
for pair in task2_result_list:
    afi_url = pair['afi_movie']
    if not afi_url:
        continue
    company_field = afi_dict[afi_url].get('production_company')
    # Parse the field once and feed both collections from the same result.
    companies = get_company(company_field)
    company_set.update(companies)
    t_company.extend(companies)


In [13]:
# NOTE: the (misspelled) name `comapany_list` is kept because a later cell
# iterates over it.
comapany_list = sorted(company_set)
company_base = 'http://dsci558.org/myfakenamespace/company#'
# Map each company name to a URI built from the percent-encoded name.
# The module is imported as `url_parse`; the bare name `urllib` is never
# bound in this notebook, so `urllib.parse.quote` fails on a fresh kernel.
company_dict = {
    company: company_base + url_parse.quote(company)
    for company in comapany_list
}

In [39]:
import rdflib
from rdflib import URIRef, Literal, Namespace,BNode
from rdflib.namespace import RDF,RDFS,XSD,FOAF

# Namespaces used to mint URIs for the graph.
SCHEMA = Namespace("https://schema.org/")
MYNS = Namespace('http://dsci558.org/myfakenamespace#')
XML = Namespace('http://www.w3.org/XML/1998/namespace')

g = rdflib.Graph()

# Register short prefixes so the Turtle serialization uses compact names
# instead of full URIs.
prefix_bindings = {
    'my_ns': MYNS,
    'rdf': RDF,
    'rdfs': RDFS,
    'foaf': FOAF,
    'schema': SCHEMA,
}
for prefix, namespace in prefix_bindings.items():
    g.bind(prefix, namespace)

# production company
# Class-level statement, asserted once: the local productionCompany class is
# a subclass of schema:Organization.  rdfs:subClassOf relates classes, so it
# must not be attached to the individual company nodes.
g.add((MYNS.productionCompany, RDFS.subClassOf, SCHEMA.Organization))

for company in comapany_list:
    # Indexing a Namespace already yields a URIRef; the URI is the namespace
    # plus the percent-encoded company name.
    company_node_uri = MYNS[url_parse.quote(company)]
    g.add((company_node_uri, RDF.type, MYNS.productionCompany))
    g.add((company_node_uri, SCHEMA.name, Literal(company,datatype=SCHEMA.text)))

# One Movie node per linked pair.  The node URI is the IMDb URL; AFI fields
# are used only when the pair has an AFI match.  Every property is always
# emitted: a missing value is represented by a fresh blank node.
for pair in task2_result_list:
    # afi -- an empty record makes every .get() below return None
    afi_uri = pair['afi_movie']
    afi_record = afi_dict[afi_uri] if afi_uri else {}
    title = afi_record.get('title')
    release_date = afi_record.get('release_date')
    a_genre = afi_record.get('genre')
    producer = afi_record.get('producer')
    author = afi_record.get('writer')
    cinematographer = afi_record.get('cinematographer')
    productionCompany = afi_record.get('production_company')

    # imdb
    imdb_uri = pair['imdb_movie']
    imdb_record = imdb_dict[imdb_uri]
    certificate = imdb_record.get('certificate')
    i_genre = imdb_record.get('genre')
    imdb_rating = imdb_record.get('rating')
    imdb_metascore = imdb_record.get('metascore')
    imdb_votes = imdb_record.get('votes')
    gross_income = imdb_record.get('gross')

    # Runtime: the first run of up to three digits is taken as minutes and
    # rendered as HH:MM:SS.  A runtime without any digits counts as missing
    # instead of raising IndexError.
    # NOTE(review): schema.org's Duration is documented as an ISO 8601
    # duration (e.g. "PT2H22M"); the HH:MM:SS form is kept here to preserve
    # the existing output -- confirm which one is wanted.
    duration = None
    runtime = imdb_record.get('runtime')
    minutes_match = re.search(r"\d{1,3}", runtime) if runtime else None
    if minutes_match:
        h, m = divmod(int(minutes_match.group()), 60)
        duration = time(h, m).isoformat()

    # Merge both genre fields with a separator so the last AFI genre and the
    # first IMDb genre do not run together.
    genre = ", ".join(part for part in (a_genre, i_genre) if part) or None

    # Reduce the parsed datetime to a date: a datetime serialises with a
    # "T00:00:00" time part, which is not a valid xsd:date lexical form.
    published = date_parse.parse(release_date).date() if release_date else None

    node_uri = URIRef(imdb_uri)
    g.add((node_uri, RDF.type, MYNS.Movie))

    # (predicate, value, literal datatype) for every literal-valued property.
    literal_fields = (
        (MYNS.title, title, SCHEMA.text),
        (SCHEMA.datePublished, published, XSD.date),
        (SCHEMA['contentRating'], certificate, SCHEMA.Rating),
        (SCHEMA.duration, duration, SCHEMA.Duration),
        (SCHEMA.genre, genre, SCHEMA.text),
        (MYNS.imdb_rating, imdb_rating, XSD.float),
        (MYNS.imdb_metascore, imdb_metascore, XSD.integer),
        (MYNS.imdb_votes, imdb_votes, XSD.integer),
        (MYNS.gross_income, gross_income, SCHEMA.MonetaryAmount),
        (SCHEMA.producer, producer, SCHEMA.Person),
        (SCHEMA.author, author, SCHEMA.Person),
        (SCHEMA.cinematographer, cinematographer, SCHEMA.Person),
    )
    for predicate, value, datatype in literal_fields:
        # Falsy values (None, '', 0) are treated as missing.
        obj = Literal(value, datatype=datatype) if value else BNode()
        g.add((node_uri, predicate, obj))

    # Link to the per-company nodes.  Those nodes are minted from the names
    # returned by get_company, so the raw field has to be split the same way;
    # quoting the whole unsplit field would point at a URI that has no node
    # whenever the field lists more than one company.
    companies = get_company(productionCompany) if productionCompany else set()
    if companies:
        for company in sorted(companies):
            g.add((node_uri, SCHEMA.productionCompany, MYNS[url_parse.quote(company)]))
    else:
        g.add((node_uri, SCHEMA.productionCompany, BNode()))


# NOTE(review): `export_path` is defined in the configuration cell but the
# graph is written to this hard-coded file name -- confirm which is intended.
g.serialize('3.2testoutput.ttl', format="turtle")