In [1]:

# Host of the target Wikibase instance (bare address: the scheme and port
# are added where the API URL is assembled).
wikibase_url = "164.90.222.155"


In [2]:

from IPython.display import clear_output
from wikibase_api import Wikibase
import pathlib, rdflib, os, subprocess
import pandas, pydash, datetime, requests
import json, uuid

# Source RDF export and the cleaned copy written next to it, both in the
# sibling '3_rdf' directory of the current working directory's parent.
in_path = pathlib.Path.cwd().resolve().parents[0] / '3_rdf' / 'sightandsound_rdf.rdf'
out_path = pathlib.Path.cwd().resolve().parents[0] / '3_rdf' / 'sightandsound_rdf_utf8.rdf'

# Read and convert BEFORE opening the output, so a failure while reading
# cannot leave a truncated output file behind.
# The encoding is given explicitly rather than taken from the platform's
# locale default, which would make the round trip below machine-dependent.
with open(in_path, encoding='utf-8') as data:
    # Re-encode the text as cp1252, then decode those bytes as UTF-8; characters
    # that do not survive either step are dropped (errors='ignore').
    # NOTE(review): this looks like a mojibake repair (UTF-8 text that was once
    # mis-decoded as cp1252) - confirm against the upstream export step.
    data_read = data.read().encode('cp1252', errors='ignore').decode('utf8', errors='ignore')

with open(out_path, 'w', encoding='utf-8') as data_write:
    data_write.write(data_read)

# Number of characters in the cleaned text.
print(len(data_read))


13169877


In [3]:

# Parse the cleaned RDF file as N-Triples into an rdflib graph and report
# how many triples it holds.

path = pathlib.Path.cwd().resolve().parents[0].joinpath('3_rdf', 'sightandsound_rdf_utf8.rdf')
g = rdflib.Graph()
g.parse(str(path), format="nt")
print(len(g))


35497


In [4]:

# notebook_path = pathlib.Path.cwd()
# print(notebook_path)
# wikibase_path = pathlib.Path.home() / 'wikibase-docker'
# if pathlib.Path.exists(wikibase_path) == False:
#     os.chdir(pathlib.Path.home())
#     subprocess.call(['git', 'clone', 'https://github.com/wmde/wikibase-docker.git'])
#     os.chdir(notebook_path)
#     print(pathlib.Path.cwd())
    

In [5]:

# # fire up the instance
# os.chdir(wikibase_path)
# print(pathlib.Path.cwd())
# subprocess.call(['docker-compose', 'up', '-d'])
# os.chdir(notebook_path)
# print(pathlib.Path.cwd())


In [6]:

# Create the bot account through the wiki's api.php endpoint.
# Two requests share one session: first fetch a 'createaccount' token,
# then post the account details together with that token.
# NOTE(review): the username/password are hard-coded here and the same
# literals are written to the bot config file - consider reading both from
# one place (e.g. environment variables).
S = requests.Session()
wikiurl = f"http://{wikibase_url}:8181"
endpoint = wikiurl + "/w/api.php"

# Step 1: request the account-creation token.
PARAMS_0 = {'action':"query",'meta':"tokens",'type':"createaccount",'format':"json"}
R = S.get(url=endpoint, params=PARAMS_0)
# Stop on an HTTP error status here instead of failing later with a JSON
# decode error or a KeyError on the missing token.
R.raise_for_status()
DATA = R.json()
TOKEN = DATA['query']['tokens']['createaccounttoken']

# Step 2: submit the new account.
PARAMS_1 = {'action': "createaccount",'createtoken': TOKEN,'username': 'JupiterJones',
    'password': 'ghosttoghost','retype': 'ghosttoghost','createreturnurl': wikiurl,'format': "json"}

R = S.post(endpoint, data=PARAMS_1)
R.raise_for_status()
DATA = R.json()

# The API's own verdict (e.g. the 'status' field) is only visible in this printout.
print(DATA)

{'createaccount': {'status': 'PASS', 'username': 'JupiterJones'}}


In [7]:

# bot config
# Serialise the API endpoint, the bot login and the edit summary to
# 'config.json' in the current working directory; `configpath` is what the
# later cells hand to Wikibase(config_path=...).

print(pathlib.Path.cwd())
config = {
    "apiUrl": f"http://{wikibase_url}:8181/w/api.php",
    "loginCredentials": {
        "botUsername": "JupiterJones",
        "botPassword": "ghosttoghost",
    },
    "summary": "data added via wikibase-api",
}
configpath = pathlib.Path.cwd().joinpath('config.json')
with open(configpath, 'w') as config_doc:
    config_doc.write(json.dumps(config))
    

/Users/paulduchesne/Git/sight-and-sound/4_parse


In [8]:

# ontology entities

# One row per triple in the graph, as columns s / p / o.
test_df = pandas.DataFrame(g, columns=['s','p','o'])

def test(row):
    """Classify a triple row by the kind of its object.

    Parameters
    ----------
    row : mapping-like (a DataFrame row)
        Must provide the triple's object under the key 'o'.

    Returns
    -------
    str
        'literal' when the object is an rdflib Literal, otherwise 'item'.
    """
    # isinstance avoids constructing a throwaway Literal on every call just
    # to obtain its type.
    return 'literal' if isinstance(row['o'], rdflib.term.Literal) else 'item'

# New column 'test' drives the literal-vs-item handling in later cells.
test_df['test'] = test_df.apply(test, axis=1)

# Every distinct object of an rdf:type triple is treated as an ontology class
# and gets its own Wikibase item.
rdf_type = rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
type_rows = test_df.loc[test_df.p.isin([rdf_type])]
ontology = list(type_rows['o'].unique())

wb = Wikibase(config_path=configpath)
ont_dict = {}  # class URI -> id of the item created for it

for class_uri in ontology:
    # The English label is the part of the URI after its first '#'.
    # NOTE(review): a URI without '#' raises IndexError here.
    class_label = str(class_uri).split('#')[1]
    payload = {"labels": {"en": {"language": "en", "value": class_label}}}
    response = wb.entity.add('item', content=payload)
    item_id = pydash.get(response, 'entity.id')
    # A random UUID is stored as the English description.
    # NOTE(review): presumably to keep label/description pairs unique - confirm.
    wb.description.set(item_id, uuid.uuid4(), "en")
    ont_dict[class_uri] = item_id

print(ont_dict)


{rdflib.term.URIRef('urn:absolute:testontology#Film'): 'Q1', rdflib.term.URIRef('urn:absolute:testontology#Person'): 'Q2', rdflib.term.URIRef('urn:absolute:testontology#Country'): 'Q3', rdflib.term.URIRef('urn:absolute:testontology#Gender'): 'Q4', rdflib.term.URIRef('urn:absolute:testontology#Profession'): 'Q5'}


In [9]:

# entities and labels
# Create one Wikibase item per rdfs:label triple, using the label text as the
# item's English label, and remember subject URI -> item id in `ent_dict`.

wb = Wikibase(config_path=configpath)

lab = rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')
entity = test_df.loc[test_df.p.isin([lab])]

ent_data = list(zip(list(entity.s), list(entity.o)))
ent_dict = {}

commencer = datetime.datetime.now()
total = len(ent_data)
for position, (subject, label) in enumerate(ent_data, start=1):

    # ETA = start time + (average time per processed row * total rows).
    elapsed = datetime.datetime.now() - commencer
    time_to_finish = (commencer + (elapsed / position) * total).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processing: {position} of {total}; eta {time_to_finish}.')
    # wait=True: the line is only cleared once the next output arrives.
    clear_output(wait=True)

    payload = {"labels": {"en": {"language": "en", "value": label}}}
    response = wb.entity.add('item', content=payload)
    item_id = pydash.get(response, 'entity.id')
    # A random UUID is stored as the English description.
    # NOTE(review): presumably to keep label/description pairs unique - confirm.
    wb.description.set(item_id, uuid.uuid4(), "en")
    ent_dict[subject] = item_id


processing: 4895 of 4895; eta 2020-07-27 13:08:24.


In [10]:

# properties
# Create one Wikibase property per distinct (predicate, object kind) pair,
# skipping rdfs:label, and remember predicate URI -> property id.

exempt = [rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')]
wb = Wikibase(config_path=configpath)
predicate_kinds = pydash.uniq(list(zip(list(test_df.p), list(test_df.test))))
prop = [pair for pair in predicate_kinds if pair[0] not in exempt]

prop_dict = {}

commencer = datetime.datetime.now()
for predicate, kind in prop:
    # Predicates whose objects are entities link to items; the rest hold strings.
    datatype = 'wikibase-item' if kind == 'item' else 'string'
    # The English label is the part of the URI after its first '#'.
    payload = {"labels": {"en": {"language": "en", "value": str(predicate).split('#')[1]}}, 'datatype': datatype}
    response = wb.entity.add('property', content=payload)
    property_id = pydash.get(response, 'entity.id')
    # A random UUID is stored as the English description.
    wb.description.set(property_id, uuid.uuid4(), "en")
    # NOTE(review): a predicate seen with both kinds appears twice in `prop`;
    # the later id overwrites the earlier one here.
    prop_dict[predicate] = property_id

print(prop_dict)
print(len(prop_dict))


{rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 'P1', rdflib.term.URIRef('urn:absolute:testontology#of_country'): 'P2', rdflib.term.URIRef('urn:absolute:testontology#voted_by'): 'P3', rdflib.term.URIRef('urn:absolute:testontology#has_profession'): 'P4', rdflib.term.URIRef('urn:absolute:testontology#directed_by'): 'P5', rdflib.term.URIRef('urn:absolute:testontology#has_gender'): 'P6', rdflib.term.URIRef('urn:absolute:testontology#of_year'): 'P7'}
7


In [11]:

# replace rdf values with wikibase codes
# Each column is recoded in a single pass with a dictionary lookup, instead of
# one full-column scan per dictionary entry. Values without a mapping
# (e.g. literals, the rdfs:label predicate) are left unchanged.

altered_state = test_df.copy()

# Objects may be ontology classes or entities. If a URI is in both dicts the
# ontology id wins, matching the original order of replacement (ontology
# first, then entities on whatever was still unreplaced).
object_codes = {**ent_dict, **ont_dict}

altered_state['s'] = altered_state['s'].map(lambda value: ent_dict.get(value, value))
altered_state['p'] = altered_state['p'].map(lambda value: prop_dict.get(value, value))
altered_state['o'] = altered_state['o'].map(lambda value: object_codes.get(value, value))

altered_state.head(20)


Unnamed: 0,s,p,o,test
0,Q2672,P1,Q1,item
1,Q3851,P2,Q2829,item
2,Q3657,P2,Q1891,item
3,Q896,P3,Q2503,item
4,Q4411,P2,Q4270,item
5,Q806,P3,Q302,item
6,Q3799,P3,Q4406,item
7,Q1136,P2,Q853,item
8,Q68,P3,Q2540,item
9,Q1679,P4,Q4681,item


In [12]:

# claims
# Add one claim per remaining triple: subject item, property, and either a
# plain value (literal rows) or a reference to another item.

wb = Wikibase(config_path=configpath)
exempt = [rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')]

# Drop the rdfs:label rows.
claims = altered_state.copy()
is_label = claims.p.isin(exempt)
claims = claims.loc[~is_label]

claim_data = list(zip(list(claims.s),list(claims.p),list(claims.o),list(claims.test)))
commencer = datetime.datetime.now()
total = len(claim_data)
for position, (subject, prop_id, target, kind) in enumerate(claim_data, start=1):

    # ETA = start time + (average time per processed row * total rows).
    elapsed = datetime.datetime.now() - commencer
    time_to_finish = (commencer + (elapsed / position) * total).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processing: {position} of {total}; eta {time_to_finish}.')
    clear_output(wait=True)

    # Literal rows pass the value through as-is; item rows wrap the target id
    # in an entity reference.
    claim_value = target if kind == 'literal' else {'entity-type':'item', 'id':target}
    wb.claim.add(subject, prop_id, claim_value)



processing: 30602 of 30602; eta 2020-07-27 22:18:22.
