In [1]:

from IPython.display import clear_output
from wikibase_api import Wikibase
import pathlib, rdflib, os, subprocess
import pandas, pydash, datetime, requests
import json, uuid

# Address of the Wikibase host; later cells build their API URLs from it.
wikibase_url = '138.197.181.117'

print(datetime.datetime.now())

# Input RDF from the previous pipeline stage, and the re-encoded copy written by this cell.
project_root = pathlib.Path.cwd().resolve().parents[0]
in_path = project_root / '4-rdf' / 'sightandsound-rdf.rdf'
out_path = project_root / '5-inject' / 'sightandsound-rdf-utf8.rdf'

# Read the source BEFORE opening the output for writing, so that a failed read
# cannot leave a truncated output file behind.
# NOTE(review): the input is read with the platform default encoding — confirm that is intended.
with open(in_path) as data:
    # Encode the text as cp1252 and decode the resulting bytes as UTF-8,
    # silently dropping anything that cannot be converted in either direction.
    data_read = data.read().encode('cp1252', errors='ignore').decode('utf8', errors='ignore')

# Write explicitly as UTF-8: the file is parsed as N-Triples below, and relying on the
# platform default encoding would produce a non-UTF-8 file on some systems.
with open(out_path, 'w', encoding='utf-8') as data_write:
    data_write.write(data_read)

print(len(data_read))

# Parse the file that was just written (same location as out_path).
path = out_path
g = rdflib.Graph().parse(str(path), format="nt")
print(len(g))


2020-09-30 22:54:03.148638
17604729
40492


In [2]:

# Create the bot account that the rest of the notebook logs in with.
# Request flow adapted from https://www.mediawiki.org/wiki/API:Account_creation
# NOTE(review): the account name and password are hard-coded in this cell —
# consider supplying them from outside the notebook.

S = requests.Session()
wikiurl = f"http://{wikibase_url}:8181"
endpoint = f"{wikiurl}/w/api.php"

# Step 1: ask the API for an account-creation token.
PARAMS_0 = dict(action="query", meta="tokens", type="createaccount", format="json")
R = S.get(endpoint, params=PARAMS_0)
DATA = R.json()
tokens = DATA['query']['tokens']
TOKEN = tokens['createaccounttoken']

# Step 2: submit the account-creation request with that token.
PARAMS_1 = {
    'action': "createaccount",
    'createtoken': TOKEN,
    'username': 'JupiterJones',
    'password': 'ghosttoghost',
    'retype': 'ghosttoghost',
    'createreturnurl': wikiurl,
    'format': "json",
}
R = S.post(url=endpoint, data=PARAMS_1)
DATA = R.json()

# Show the API's answer and the time at which the cell finished.
print(DATA)
print(datetime.datetime.now())


{'createaccount': {'status': 'PASS', 'username': 'JupiterJones'}}
2020-09-30 22:54:14.787765


In [3]:

# Write the wikibase-api configuration file next to the notebook.
# NOTE(review): the bot credentials end up in plain text in config.json.

configpath = pathlib.Path.cwd() / 'config.json'
config = {
    "apiUrl": f"http://{wikibase_url}:8181/w/api.php",
    "loginCredentials": {
        "botUsername": "JupiterJones",
        "botPassword": "ghosttoghost",
    },
    "summary": "data added via wikibase-api",
}

# Serialise first, then write the resulting JSON text in one go.
config_json = json.dumps(config)
with open(configpath, 'w') as config_doc:
    config_doc.write(config_json)
print(datetime.datetime.now())
    

2020-09-30 22:54:14.805203


In [4]:

# author ontology entities

# Load the parsed graph into a three-column frame: subject, predicate, object.
test_df = pandas.DataFrame(data=g, columns=['s', 'p', 'o'])

def test(row):
    if type(row['o']) == type(rdflib.term.Literal('h')):
        return('literal')
    else:
        return('item')
# Tag every triple with the kind of its object ('literal' or 'item').
test_df['test'] = test_df.apply(test, axis=1)

# Every distinct object of an rdf:type triple is treated as an ontology class.
rdf_type = rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type')
ontology = list(test_df.loc[test_df.p.isin([rdf_type])]['o'].unique())

wb = Wikibase(config_path=configpath)
# Maps each ontology class URI to the id of the Wikibase item created for it.
ont_dict = dict()

for o in ontology:
    # The label is the URI fragment. URIs without a '#' (slash namespaces) would
    # otherwise raise IndexError, so fall back to the last path segment.
    uri = str(o)
    label = uri.split('#')[1] if '#' in uri else uri.rstrip('/').rsplit('/', 1)[-1]

    content = {"labels": {"en": {"language": "en", "value": label}}}
    r = wb.entity.add('item', content=content)
    identif = (pydash.get(r, 'entity.id'))
    # A random UUID is stored as the English description — presumably to keep
    # label/description pairs unique; verify against the Wikibase setup.
    wb.description.set(identif, uuid.uuid4(), "en")
    ont_dict[o] = identif

print(datetime.datetime.now())
print(ont_dict)


2020-09-30 22:54:25.354224
{rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#Film'): 'Q1', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#Person'): 'Q2', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#Country'): 'Q3', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#Profession'): 'Q4', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#Gender'): 'Q5'}


In [5]:

# author entities and labels

wb = Wikibase(config_path=configpath)

# One Wikibase item is created per rdfs:label triple; the triple's subject is
# then mapped to the id of the item that was created for it.
lab = rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')
entity = test_df.loc[test_df.p.isin([lab])]

ent_data = list(zip(entity.s, entity.o))
ent_dict = dict()

commencer = datetime.datetime.now()
total = len(ent_data)
for n, (a, b) in enumerate(ent_data):
    # Extrapolate the finish time from the average time spent per row so far.
    per_row = (datetime.datetime.now() - commencer) / (n + 1)
    time_to_finish = (per_row * total + commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processing: {n+1} of {total}; eta {time_to_finish}.')
    # wait=True postpones clearing until the next output arrives.
    clear_output(wait=True)

    # Create the item with its English label, then give it a random UUID as description.
    r = wb.entity.add('item', content={"labels": {"en": {"language": "en", "value": b}}})
    identif = pydash.get(r, 'entity.id')
    wb.description.set(identif, uuid.uuid4(), "en")
    ent_dict[a] = identif

print(datetime.datetime.now())


2020-10-01 01:34:45.433958


In [6]:

# author properties

# rdfs:label triples are excluded: no property is created for that predicate.
exempt = [rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')]
wb = Wikibase(config_path=configpath)
# Distinct (predicate, object-kind) pairs found in the data.
prop = [x for x in pydash.uniq(list(zip(list(test_df.p), list(test_df.test)))) if x[0] not in exempt]

# Maps each predicate URI to the id of the Wikibase property created for it.
prop_dict = dict()

commencer = datetime.datetime.now()
for n, x in enumerate(prop):
    a, b = x[0], x[1]

    # The label is the URI fragment. Predicates without a '#' (slash namespaces)
    # would otherwise raise IndexError, so fall back to the last path segment.
    uri = str(a)
    label = uri.split('#')[1] if '#' in uri else uri.rstrip('/').rsplit('/', 1)[-1]

    # Predicates pointing at items become item-valued properties, the rest plain strings.
    datatype = 'wikibase-item' if b == 'item' else 'string'
    content = {"labels": {"en": {"language": "en", "value": label}}, 'datatype': datatype}

    r = wb.entity.add('property', content=content)
    identif = (pydash.get(r, 'entity.id'))
    # A random UUID is stored as the English description.
    wb.description.set(identif, uuid.uuid4(), "en")
    prop_dict[a] = identif

print(datetime.datetime.now())
print(prop_dict)
print(len(prop_dict))


2020-10-01 01:35:00.964483
{rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#of_year'): 'P1', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#directed_by'): 'P2', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#has_profession'): 'P3', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#imdb_identifier'): 'P4', rdflib.term.URIRef('http://www.w3.org/1999/02/22-rdf-syntax-ns#type'): 'P5', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#voted_by'): 'P6', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#wikidata_identifier'): 'P7', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#has_gender'): 'P8', rdflib.term.URIRef('urn:absolute:sight-and-sound-ontology#of_country'): 'P9'}
9


In [7]:

# replace rdf values with fresh wikibase codes

altered_state = test_df.copy()


def _recode(values, mapping):
    """Return `values` as an object Series with every key of `mapping` swapped for its code.

    Values that are not keys of `mapping` are passed through unchanged. One
    dictionary lookup per cell replaces a full-column scan per mapping entry.
    """
    return pandas.Series(
        [mapping.get(value, value) for value in values],
        index=values.index,
        dtype=object,
    )


# Subjects become item ids, predicates become property ids.
altered_state['s'] = _recode(altered_state['s'], ent_dict)
altered_state['p'] = _recode(altered_state['p'], prop_dict)
# Objects may be ontology classes or entities; when a URI is in both
# dictionaries the ontology code wins (ont_dict is merged last).
altered_state['o'] = _recode(altered_state['o'], {**ent_dict, **ont_dict})

print(datetime.datetime.now())
altered_state.head()


2020-10-01 01:36:34.391909


Unnamed: 0,s,p,o,test
0,Q6,http://www.w3.org/2000/01/rdf-schema#label,The Long Goodbye,literal
1,Q4255,P1,1980.0,literal
2,Q2712,P2,Q672,item
3,Q2076,P2,Q3509,item
4,Q3502,P3,Q4587,item


In [8]:

# author claims

wb = Wikibase(config_path=configpath)
exempt = [rdflib.term.URIRef('http://www.w3.org/2000/01/rdf-schema#label')]

# Every remaining triple (all predicates except rdfs:label) is written as a claim.
is_exempt = altered_state.p.isin(exempt)
claims = altered_state.loc[~is_exempt]

claim_data = list(zip(claims.s, claims.p, claims.o, claims.test))
commencer = datetime.datetime.now()
total = len(claim_data)
for n, (a, b, c, d) in enumerate(claim_data):
    # Extrapolate the finish time from the average time spent per claim so far.
    per_claim = (datetime.datetime.now() - commencer) / (n + 1)
    time_to_finish = (per_claim * total + commencer).strftime("%Y-%m-%d %H:%M:%S")
    print(f'processing: {n+1} of {total}; eta {time_to_finish}.')
    clear_output(wait=True)

    # Literal objects are sent as-is; item objects are wrapped as an entity reference.
    value = c if d == 'literal' else {'entity-type': 'item', 'id': c}
    wb.claim.add(a, b, value)

print(datetime.datetime.now())


2020-10-01 12:33:00.814112
