In [1]:
import sys
from os import listdir, path, mkdir
import time

import pandas as pd
import numpy as np
import networkx as nx


from remp.util import prepare_cache_folder, matching_quality, CacheDecoreator, suffix
from remp.matching import candidate_matching, initial_matching, pruned_matching, prior_probabilities
from remp.property_alignment import attribute_alignment
from remp.learning import RempMatcher
from remp import rdfutil

# Load Triples

In [2]:
# Directory used to save temporary (cached) data.
cache_base_dir = '/mnt/hdd/Cache'


class IIMB:
    """Container for the two IIMB knowledge bases that are to be matched.

    After construction the instance exposes:
        label          -- pair of label attribute names, one per knowledge base
        relations_1/2  -- DataFrames with columns ['s', 'r', 'o']
        attributes_1/2 -- DataFrames with columns ['s', 'a', 'v']
    """

    def __init__(self, task_id=1, data_dir='/mnt/hdd/Datasets/iimb'):
        """Load both OWL files of the IIMB dataset.

        Args:
            task_id: accepted for interface compatibility; it is not read
                when loading the files.
            data_dir: directory that contains ``iimb1.owl`` and ``iimb2.owl``.
                Defaults to the location that was previously hard-coded.
        """
        self.label = ('iimbtbox:name', 'iimbtbox:name')  # the label attributes
        self.relations_1, self.attributes_1 = self._load_triples(path.join(data_dir, 'iimb1.owl'))
        self.relations_2, self.attributes_2 = self._load_triples(path.join(data_dir, 'iimb2.owl'))

    @staticmethod
    def _load_triples(owl_path):
        """Read one OWL file and return its (relations, attributes) DataFrames."""
        # A fresh loader per file, so triples of the two knowledge bases never mix.
        loader = rdfutil.TripleLoader()
        loader.load(owl_path)
        relations = pd.DataFrame(loader.relations, columns=['s', 'r', 'o'])
        attributes = pd.DataFrame(loader.attributes, columns=['s', 'a', 'v'])
        return relations, attributes


dataset = IIMB()

# Construct ER Graph

In [3]:
from remp.ergraph import construct_er_graph

# Initial and candidate matchings between the two knowledge bases.
M_in = initial_matching(dataset)
M_c = candidate_matching(dataset)

# Attribute alignment is computed only when both knowledge bases use exactly
# the same set of attribute names; otherwise M_at stays None.
# NOTE(review): the intent is "alignment when required", yet alignment runs
# when the attribute sets are identical -- confirm the condition is not inverted.
attribute_names_1 = set(dataset.attributes_1['a'])
attribute_names_2 = set(dataset.attributes_2['a'])
M_at = (
    attribute_alignment(dataset.attributes_1, dataset.attributes_2, M_in)
    if attribute_names_1 == attribute_names_2
    else None
)

M_pruned = pruned_matching(dataset, M_c, M_at, k=4)

# Prior probabilities for the pruned matching, then the ER graph built from
# both relation tables, the initial matching, the pruned matching and the priors.
prior = prior_probabilities(dataset, M_pruned).reset_index()
ergraph = construct_er_graph(dataset.relations_1, dataset.relations_2, M_in, M_pruned, prior)

# Crowdsourcing

In [4]:
# Build the matcher from the dataset, the ER graph and the prior probabilities.
matcher = RempMatcher(dataset, ergraph, prior)
# Select several questions at once; judging by the recorded output, each
# question is a tab-separated pair of entity identifiers.
# NOTE(review): the recorded output repeats the same pair many times --
# confirm whether next_questions() is expected to return duplicates.
questions = matcher.next_questions()
print(questions)

['iimb:sealand\tiimb:item1395082569838752659', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288', 'iimb:singapore\tiimb:item1719555844618676288']


In [5]:
# Feed labels for the selected questions back into the model.
# Each key is a question (tab-separated entity pair); each value is the label
# passed for it -- presumably the probability that the pair is a match (TODO confirm).
crowd_labels = {
    'iimb:sealand\tiimb:item1395082569838752659': 0.95,
    'iimb:singapore\tiimb:item1719555844618676288': 0.1,
}
matcher.update_model(list(crowd_labels.keys()), list(crowd_labels.values()))