In [1]:
# imports
import ast
import os
import re
from subprocess import check_output
from subprocess import run

import numpy as np
import pandas as pd
import rdflib
import rdflib_hdt

In [12]:
# global variables
# Both locations can be overridden through environment variables so the notebook
# can run on another machine; the defaults are the original hard-coded paths.
path_to_datasets_dir = os.path.join(
    os.environ.get('SAKEY_DATASETS_DIR', '/Users/paulosh/Desktop/M2/stage_material/datasets/'),
    '',  # guarantees the trailing separator that later string concatenations rely on
)
path_to_sakey = os.environ.get('SAKEY_JAR', '/Users/paulosh/Desktop/M2/stage_material/sakey/sakey.jar')
# class name (the text after 'DB_' in a dataset path) -> list of keys,
# each key being a list of property names; filled by compute_keys
class_keys = {}

## Loading the data

In [2]:
def file_to_dataframe(path):
    r"""Transform a tab-separated text file into a DataFrame.

    Each line is cut at its ".\n" terminator (when present) and split on tabs;
    the fields become the integer-labelled columns 0, 1, 2, ...  Rows with
    fewer fields than the widest row are padded with missing values by pandas
    and then dropped. A newline left at the end of the third field (lines that
    do not end with ".\n") is removed.

    Parameters
    ----------
    path : str
        Path to the text file; it is decoded as latin-1.

    Returns
    -------
    pandas.DataFrame
        One row per complete line. Dropped lines leave gaps in the index,
        which keeps the original line positions.
    """
    rows = []
    with open(path, 'r', encoding='latin-1') as f:
        for line in f:  # stream the file instead of materialising readlines()
            fields = line.split(".\n")[0].split("\t")
            # Only column 2 is cleaned, exactly as before; doing it while
            # parsing avoids a second row-by-row pass over the DataFrame and
            # no longer fails when the file has fewer than three columns.
            if len(fields) > 2 and fields[2][-1:] == '\n':
                fields[2] = fields[2][:-1]
            rows.append(fields)

    return pd.DataFrame(rows).dropna()

In [3]:
# One DataFrame per class, each read from <datasets dir>/<Class>/DB_<Class>.
univ = file_to_dataframe(f'{path_to_datasets_dir}University/DB_University')
muse = file_to_dataframe(f'{path_to_datasets_dir}Museum/DB_Museum')
book = file_to_dataframe(f'{path_to_datasets_dir}Book/DB_Book')
moun = file_to_dataframe(f'{path_to_datasets_dir}Mountain/DB_Mountain')
albu = file_to_dataframe(f'{path_to_datasets_dir}Album/DB_Album')
scie = file_to_dataframe(f'{path_to_datasets_dir}Scientist/DB_Scientist')
acto = file_to_dataframe(f'{path_to_datasets_dir}Actor/DB_Actor')
film = file_to_dataframe(f'{path_to_datasets_dir}Film/DB_Film')
city = file_to_dataframe(f'{path_to_datasets_dir}City/DB_City')

## Parsing the keys

In [13]:
def _parse_almost_keys(sakey_output, nb_exceptions):
    """Extract the (nb_exceptions - 1)-almost keys from SAKey's decoded output."""
    # Keep what follows the first ": " and drop the two trailing characters,
    # then take the text after the "<nb_exceptions - 1>-almost keys:" header.
    payload = sakey_output.split(": ")[1][:-2]
    almost_keys = payload.split("\n%s-almost keys:" % (nb_exceptions - 1))[1]

    keys = []
    for chunk in almost_keys.split("], ["):
        tokens = re.split(r"[\[,\] ]", chunk)
        key = [tok for tok in tokens if tok != '' and tok != 'prop']
        if key:  # every empty key is discarded, not just the first one
            keys.append(key)
    return keys


def compute_keys(dataset1, nb_exceptions):
    """Run SAKey on a dataset file and record the almost keys it reports.

    Parameters
    ----------
    dataset1 : str
        Path to the file whose keys are computed. The text after 'DB_' in
        this path is used as the class name in ``class_keys``.
    nb_exceptions : int
        Value passed to SAKey as its last argument; the section parsed from
        the output is the one headed "<nb_exceptions - 1>-almost keys:".

    Side effects
    ------------
    Stores the parsed keys in the global ``class_keys`` when at least one key
    is found and prints how many were found. When none is found,
    ``class_keys`` is left untouched, so an entry written by an earlier call
    for the same class remains in place.

    Raises
    ------
    subprocess.CalledProcessError
        If the java process exits with a non-zero status.
    IndexError
        If the output does not have the expected layout.
    """
    # An argument list (no shell) hands the paths to java verbatim, so spaces
    # or shell metacharacters in them can neither split nor alter the command.
    raw = check_output(['java', '-jar', path_to_sakey, dataset1, str(nb_exceptions)])
    keys = _parse_almost_keys(raw.decode(), nb_exceptions)

    if keys:
        class_keys[dataset1.split('DB_')[1]] = keys
        print("%i keys found" % len(keys))
    else:
        print("no keys found")

In [24]:
def compute_keys_threshold(dataset1, threshold):
    """Compute keys with a number of exceptions proportional to the dataset size.

    Parameters
    ----------
    dataset1 : str
        Path to the dataset file; it is loaded with ``file_to_dataframe`` to
        count the distinct values of column 0 (the instances).
    threshold : float
        Real number between 0 and 1; the number of exceptions is
        ``int(threshold * number_of_instances)``.

    The key discovery itself is delegated to ``compute_keys``, which runs
    SAKey, updates ``class_keys`` and prints how many keys were found.
    """
    ds = file_to_dataframe(dataset1)
    nb_exceptions = int(threshold * ds[0].nunique(dropna=False))
    print('number of exceptions = %s'%nb_exceptions)
    # NOTE(review): compute_keys looks for the "(nb_exceptions - 1)-almost keys"
    # section, so a threshold that rounds down to 0 asks for "-1-almost keys";
    # confirm whether a minimum of 1 should be enforced here.
    compute_keys(dataset1, nb_exceptions)

## Measuring their coverage

In [15]:
def key_coverage(dataset, classe):
    """Count the instances of a dataset that are covered by at least one key.

    An instance (a distinct value of column 0) is covered by a key when every
    property of that key appears among the instance's properties (the values
    of column 1 on the instance's rows).

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame whose column 0 holds instances and column 1 holds properties.
    classe : str
        Entry of the global ``class_keys`` whose keys are checked.

    Returns
    -------
    tuple of int
        ``(covered, not_covered)``. The percentage of covered instances is
        also printed; an empty dataset reports 0.0% instead of dividing by
        zero.

    Raises
    ------
    KeyError
        If ``class_keys`` has no entry for ``classe``.
    """
    keys = class_keys[classe]

    # Group the properties by instance in a single pass instead of filtering
    # the whole frame once per instance.
    props_by_instance = {}
    for ins, prop in zip(dataset[0], dataset[1]):
        props_by_instance.setdefault(ins, set()).add(prop)

    covered = sum(
        1
        for props in props_by_instance.values()
        if any(all(prop in props for prop in key) for key in keys)
    )
    not_covered = len(props_by_instance) - covered

    total = covered + not_covered
    ratio = round((covered / total) * 100, 4) if total else 0.0
    print(str(ratio) + '% of instances covered')
    return covered, not_covered

In [29]:
def keys_and_coverage_measure(dataset1, nb_exceptions):
    """Compute the keys of a dataset, then print how many instances they cover.

    Parameters
    ----------
    dataset1 : str
        Path to the dataset file; the text after 'DB_' is printed as the
        class name and used to look the keys up.
    nb_exceptions : float or int
        A float (Python or NumPy) is treated as a threshold and handed to
        ``compute_keys_threshold``; any other value is handed to
        ``compute_keys`` as a number of exceptions.
    """
    name = str(dataset1).split('DB_')[1]
    print('\n' + name)
    dataset = file_to_dataframe(dataset1)

    # isinstance also recognises NumPy floats (e.g. values taken from
    # np.linspace), which an exact type comparison sent to the integer branch.
    if isinstance(nb_exceptions, (float, np.floating)):
        print('threshold = %s'%nb_exceptions)
        compute_keys_threshold(dataset1, nb_exceptions)
    else:
        print('number of exceptions = %s'%(nb_exceptions-1))
        compute_keys(dataset1, nb_exceptions)

    key_coverage(dataset, name)

## Actual tests

In [31]:
k = 0.05 # a float is used as a threshold, an int as a number of exceptions

# Key discovery followed by the coverage measure, one class after the other.
for _cls in ('University', 'Museum', 'Book', 'Mountain', 'Album',
             'Scientist', 'Actor', 'Film', 'City'):
    keys_and_coverage_measure(path_to_datasets_dir + '%s/DB_%s' % (_cls, _cls), k)


University
threshold = 0.05
number of exceptions = 517
8 keys found
84.42% of instances covered

Museum
threshold = 0.05
number of exceptions = 91
5 keys found
99.3976% of instances covered

Mountain
threshold = 0.05
number of exceptions = 820
3 keys found
53.0821% of instances covered

Album
threshold = 0.05
number of exceptions = 4253
3 keys found
66.9609% of instances covered
