# Imports & global variables

In [1]:
from subprocess import run
from subprocess import check_output
import re
import numpy as np
import pandas as pd
import rdflib
import rdflib_hdt
import ast
from SPARQLWrapper import SPARQLWrapper, JSON

In [2]:
# global variables
path_to_datasets_dir = '/Users/paulosh/Desktop/M2/stage_material/datasets/'
path_to_sakey = '/Users/paulosh/Desktop/M2/stage_material/sakey/sakey.jar'
class_keys = {}

# Loading the data

In [3]:
# transforms a text file into a dataframe, input: path to the text file
def file_to_dataframe(path):
    ds = []
    with open(path,  'r', encoding='latin-1') as f:
        lines = f.readlines()    
        for line in lines:
            ds.append(np.array(line.split(".\n")[0].split("\t")))
    
    ds = pd.DataFrame(ds).dropna()
    for i, row in ds.iterrows():    # removing the newline characters
        ifor_val = ds[2][i]
        if ds[2][i][-1:] == '\n':
            ifor_val = ds[2][i][:-1]
        ds.at[i,2] = ifor_val
    
    return ds

In [4]:
path_to_datasets_dir = '/Users/paulosh/Desktop/M2/stage_material/datasets/'

univ = file_to_dataframe(path_to_datasets_dir + 'University/DB_University')
muse = file_to_dataframe(path_to_datasets_dir + 'Museum/DB_Museum')
book = file_to_dataframe(path_to_datasets_dir + 'Book/DB_Book')
moun = file_to_dataframe(path_to_datasets_dir + 'Mountain/DB_Mountain')
albu = file_to_dataframe(path_to_datasets_dir + 'Album/DB_Album')
scie = file_to_dataframe(path_to_datasets_dir + 'Scientist/DB_Scientist')
acto = file_to_dataframe(path_to_datasets_dir + 'Actor/DB_Actor')
film = file_to_dataframe(path_to_datasets_dir + 'Film/DB_Film')
city = file_to_dataframe(path_to_datasets_dir + 'City/DB_City')

# Parsing the keys

In [5]:
# arguments: path to the file we need to parse keys of, and nb of exceptions for SAkey
# nb of exceptions can be an integer, or a float representing the threshold
def compute_keys(dataset1, nb_exceptions):
    if type(nb_exceptions) == float:
        ds = file_to_dataframe(dataset1)
        nb_exceptions = int(nb_exceptions*len(list(set(ds[0].values))))
    print('number of exceptions = %s'%nb_exceptions)
    res = check_output(f'java -jar %s %s '%(path_to_sakey, dataset1) + str(nb_exceptions) , shell=True)
    res = res.decode().split(": ")[1][:-2]
    res = res.split("\n%s-almost keys:" %(nb_exceptions - 1))
    almost_keys = res[1]              
    almost_keys_array = []
    for i in almost_keys.split("], ["):
        splited = re.split("[\[,\] ]", i)
        almost_keys_array.append([])
        for j in splited:
            if j != '' and j != 'prop':
                almost_keys_array[-1].append(j)
    
    if [] in almost_keys_array: almost_keys_array.remove([])            
    
    l = len(almost_keys_array)
    if l > 0:           
        class_keys[dataset1.split('DB_')[1]] = almost_keys_array
        print("%i keys found" %l)
    else: print("no keys found")
        
        
def compute_all_keys(k): # input k = the number of exceptions for SAKey
    compute_keys(path_to_datasets_dir + 'University/DB_University', k)
    compute_keys(path_to_datasets_dir + 'Museum/DB_Museum', k)
    compute_keys(path_to_datasets_dir + 'Book/DB_Book', k)
    compute_keys(path_to_datasets_dir + 'Mountain/DB_Mountain', k)
    compute_keys(path_to_datasets_dir + 'Album/DB_Album', k)
    compute_keys(path_to_datasets_dir + 'Scientist/DB_Scientist', k)
    compute_keys(path_to_datasets_dir + 'Actor/DB_Actor', k)
    compute_keys(path_to_datasets_dir + 'Film/DB_Film', k)
    compute_keys(path_to_datasets_dir + 'City/DB_City', k)

In [6]:
compute_all_keys(0.005)

number of exceptions = 51
12 keys found
number of exceptions = 9
10 keys found
number of exceptions = 149
14 keys found
number of exceptions = 82
4 keys found
number of exceptions = 425
4 keys found
number of exceptions = 92
11 keys found
number of exceptions = 29
41 keys found
number of exceptions = 410
24 keys found
number of exceptions = 96
73 keys found


In [7]:
class_keys

{'University': [['playsfor-inv', 'hasmotto'],
  ['playsfor-inv', 'wascreatedonyear'],
  ['skos:preflabel', 'graduatedfrom-inv'],
  ['created'],
  ['playsfor-inv', 'graduatedfrom-inv'],
  ['wascreatedonyear', 'hasmotto'],
  ['wascreatedonyear', 'graduatedfrom-inv'],
  ['skos:preflabel', 'wascreatedonyear'],
  ['owns'],
  ['wascreatedondate'],
  ['hasmotto', 'graduatedfrom-inv'],
  ['skos:preflabel', 'playsfor-inv']],
 'Museum': [['haslongitude', 'wascreatedonyear'],
  ['haslatitude', 'skos:preflabel'],
  ['skos:preflabel', 'wascreatedondate'],
  ['haslatitude', 'wascreatedonyear'],
  ['created-inv'],
  ['skos:preflabel', 'wascreatedonyear'],
  ['haslongitude', 'wascreatedondate'],
  ['islocatedin', 'wascreatedondate'],
  ['haslongitude', 'skos:preflabel'],
  ['haslatitude', 'wascreatedondate']],
 'Book': [['islocatedin', 'skos:preflabel', 'created-inv'],
  ['skos:preflabel', 'wascreatedondate'],
  ['hasisbn', 'created-inv', 'wascreatedonyear'],
  ['skos:preflabel', 'wascreatedonyear'],


# Pre-processing & function definitions

In [8]:
# some useful functions for treating data-type/object properties

def get_values_from_property(dataset, prop):
    searchspace = dataset[dataset[1] == prop]
    return list(set(searchspace[2].values))


def obj_litt(dataset, prop):
    obj = 0; litt = 0
    vals = get_values_from_property(dataset, prop)
    for val in vals:
        if val[0] == '"' or val.isnumeric(): litt+=1
        else: obj+=1
    return [obj, litt]


def extract_obj_properties(dataset):
    objects = []; litts = []
    props = list(set(dataset[1].values))
    for prop in props:
        if obj_litt(dataset, prop)[0] != 0: objects.append(prop)
        else: litts.append(prop)
    return [objects, litts]


def compute_support(dataset, prop):
    covered = 0; not_covered = 0
    subjects = list(set(dataset[0].values))
    for sub in subjects:
        ss = dataset[dataset[0] == sub]
        props = list(set(ss[1].values))
        if prop in props: covered+=1
        else: not_covered+=1
    return round(covered/(covered+not_covered), 3)


def compute_discr(dataset, prop):
    ss = dataset[dataset[1] == prop]
    return len(list(set(ss[2].values)))/len(ss[2].values)

In [10]:
# loading the full dbpedia dataset
dataset = pd.read_pickle('/Users/paulosh/Downloads/dataset_db')

In [11]:
# function that gets the range of a property (i.e. what classes that property targets)
def get_range(prop, classe):
    triples = dataset[dataset[1] == prop] # we select all triples containing this property 
    triples = triples[triples[0].isin(class_instances[classe])] # among those, the one who have a subject from our class
    vals = triples[2].values  # we extract the values of these triples     

    ins = dataset[dataset[0].isin(vals)] # we select all the triples that have these IRIs as subjects
    ins = ins[ins[1] == 'type'] # then, these subjects who have a type
    return list(set(ins[2].values))


class_instances = {}

for classe in class_keys:
    print(classe)
    dataset_searchspace = dataset[dataset[1] == 'type']
    
    dataset_searchspace = dataset_searchspace[dataset_searchspace[2] == classe]
    instances = list(set(dataset_searchspace[0].tolist()))  
    class_instances[classe] = instances

University
Museum
Book
Mountain
Album
Scientist
Actor
Film
City


In [12]:
# pre-processing, ranking data-type properties per class
best_litt_props = {}

def preprocess_props(data, target_classes, k, m, n):
    for selected_class in target_classes:
        
        scores = {}
        dataset = data[data[2] == selected_class]
        vals = list(set(dataset.head(k)[0]))
        sub_dataset = data[data[0].isin(vals)]
        obj, litt = extract_obj_properties(sub_dataset)
        for prop in litt:
            #print(prop)
            scores[prop] = m*compute_support(sub_dataset, prop) + n*compute_discr(sub_dataset, prop)*(1/(m+n))
        scores = dict(sorted(scores.items(), key=lambda item: item[1]))
        max_props = []
        for i in range(1, len(litt)+1):
            max_props.append(list(scores.keys())[-i])
        best_litt_props[selected_class] = max_props

In [13]:
%%time
preprocess_props(dataset, class_keys, 1000, 1, 2)

CPU times: user 1min 4s, sys: 471 ms, total: 1min 4s
Wall time: 1min 5s


In [14]:
best_litt_props

{'University': ['skos:preflabel',
  'hasmotto',
  'wascreatedondate',
  'wascreatedonyear'],
 'Museum': ['skos:preflabel',
  'haslongitude',
  'haslatitude',
  'wascreatedondate',
  'wascreatedonyear'],
 'Book': ['skos:preflabel',
  'hasisbn',
  'haspages',
  'wascreatedondate',
  'wascreatedonyear'],
 'Mountain': ['skos:preflabel', 'haslongitude', 'haslatitude'],
 'Album': ['skos:preflabel', 'wascreatedondate', 'wascreatedonyear'],
 'Scientist': ['skos:preflabel'],
 'Actor': ['skos:preflabel',
  'wasbornondate',
  'wasbornonyear',
  'diedondate',
  'diedonyear'],
 'Film': ['skos:preflabel', 'wascreatedondate', 'wascreatedonyear'],
 'City': ['haslongitude',
  'haslatitude',
  'skos:preflabel',
  'hasnumberofpeople',
  'haspopulationdensity',
  'wascreatedondate',
  'wascreatedonyear',
  'hasmotto']}

# Key Expansion

In [15]:
exp_keys = {}

def expand_keys(target_classes, k):
    for selected_class in target_classes:
        print('\nexpanding the keys of class: ' + selected_class)
        current_ds = eval(str(selected_class[:4]).lower())
        obj = extract_obj_properties(current_ds)[0]
        litt = extract_obj_properties(current_ds)[1]
        newkeys = []
        for key in class_keys[selected_class]:
            flag_newkey = True
            newkey = []
            for prop in key: # data-type properties
                if prop in litt:
                    newkey.append(prop)
            for prop in key: # object properties
                if prop in obj and get_range(prop, selected_class) and get_range(prop, selected_class) != [selected_class]:
                    for pointed_type in get_range(prop, selected_class):
                        newkey2 = newkey.copy()
                        newkey2.append(prop + ': ' + str(pointed_type) + ' -> ' + str(best_litt_props[pointed_type][:k]))
                        newkeys.append(newkey2)
        exp_keys[selected_class] = newkeys

In [16]:
expand_keys(class_keys, 3)


expanding the keys of class: University

expanding the keys of class: Museum

expanding the keys of class: Book

expanding the keys of class: Mountain

expanding the keys of class: Album

expanding the keys of class: Scientist

expanding the keys of class: Actor

expanding the keys of class: Film

expanding the keys of class: City


In [17]:
for key in exp_keys['University']:
    print(key)

['skos:preflabel', "graduatedfrom-inv: Scientist -> ['skos:preflabel']"]
['skos:preflabel', "graduatedfrom-inv: Actor -> ['skos:preflabel', 'wasbornondate', 'wasbornonyear']"]
["created: Book -> ['skos:preflabel', 'hasisbn', 'haspages']"]
["graduatedfrom-inv: Scientist -> ['skos:preflabel']"]
["graduatedfrom-inv: Actor -> ['skos:preflabel', 'wasbornondate', 'wasbornonyear']"]
['wascreatedonyear', "graduatedfrom-inv: Scientist -> ['skos:preflabel']"]
['wascreatedonyear', "graduatedfrom-inv: Actor -> ['skos:preflabel', 'wasbornondate', 'wasbornonyear']"]
['hasmotto', "graduatedfrom-inv: Scientist -> ['skos:preflabel']"]
['hasmotto', "graduatedfrom-inv: Actor -> ['skos:preflabel', 'wasbornondate', 'wasbornonyear']"]


In [18]:
import json

#saving the expanded keys
json.dump(exp_keys, open("my_exp_keys_threshold_005.json", 'w' ))