In [1]:
import sys
sys.path.append('../')
from setting import config_read

In [2]:
import rdflib
import pandas as pd
from owlready2 import *





In [3]:
# Read the project configuration located one directory up.
config = config_read('../')

# The same OWL file is loaded twice: once through owlready2 (entity lookups)
# and once through rdflib (raw triple access via SPARQL).
data_path = config['owl']['path']
onto = get_ontology(data_path).load()

g = rdflib.Graph()
g.parse(data_path)

# SPARQL query returning every distinct (subject, predicate, object) triple.
knows_query = """
SELECT DISTINCT ?x ?y ?z
WHERE {
    ?x ?y ?z.
}"""

# Keep each triple as a list of three plain strings.
qres = g.query(knows_query)
triple_list = [
    [str(term) for term in (row[0], row[1], row[2])]
    for row in qres
]
len(triple_list)

3824

In [4]:
# Memo of already-resolved terms, so each distinct string hits the ontology
# index only once even though it may occur in many triples.
_resolved_terms = {}


def _resolve_term(term):
    """Return the ontology entity found for ``term``, or ``term`` itself.

    ``term`` is looked up with ``onto.search_one(iri=term)``; when the lookup
    yields ``None`` the original string is kept unchanged.

    NOTE(review): owlready2 documents ``*`` as a wildcard in search values, so
    a literal containing ``*`` might match an unrelated entity -- confirm.
    """
    if term not in _resolved_terms:
        entity = onto.search_one(iri=term)
        _resolved_terms[term] = term if entity is None else entity
    return _resolved_terms[term]


# Replace every subject / predicate / object string by its ontology entity
# where one exists; unresolved terms stay as plain strings.
triple_text_list = [
    [_resolve_term(s), _resolve_term(p), _resolve_term(o)]
    for s, p, o in triple_list
]

In [5]:
# Tabulate the resolved triples as subject (S) / predicate (P) / object (O).
triple_df = pd.DataFrame(triple_text_list, columns=['S','P','O'])

# Report how many rows repeat an earlier (S, P, O) combination.
print('겹치는 트리플 개수 : ', triple_df.duplicated(subset=['S','P','O']).sum())

# Drop the repeated rows and renumber the index from zero.
triple_df = triple_df.drop_duplicates(subset=['S','P','O']).reset_index(drop=True)
print('중복 제거 후 트리플 개수 : ', triple_df.shape[0])

겹치는 트리플 개수 :  157
중복 제거 후 트리플 개수 :  3667


In [6]:
# All object properties followed by all data properties of the ontology.
p_list = [*onto.object_properties(), *onto.data_properties()]

# Keep only the triples whose predicate is one of those properties.
is_known_property = triple_df['P'].isin(p_list)
drop_prop_df = triple_df.loc[is_known_property].reset_index(drop=True)

In [7]:
# Map each data property to a range label of the form '<range>_<property>'.
# Only the first declared range is considered; properties without any
# declared range are left out of the mapping.
p_dict = {}
for p in onto.data_properties():
    if not p.range:
        continue

    first_range = p.range[0]
    if first_range is None:
        # Literal case (identity check: None is a singleton).
        range_name = 'Literal'
    elif hasattr(first_range, '__name__'):
        # Any other named type, e.g. a Python class.
        range_name = first_range.__name__
    else:
        # Range objects without a __name__ fall back to their string form.
        range_name = str(first_range)

    p_dict[p] = range_name + '_' + str(p)

In [8]:
# Determine the range side of every property triple, row-aligned with
# drop_prop_df (the list is later attached to it as a column).
data_props = set(onto.data_properties())

range_tbox = []
for p, o in zip(drop_prop_df['P'], drop_prop_df['O']):
    if p in p_dict:
        # Datatype property with a declared range: use its prepared label.
        range_tbox.append(p_dict[p])
    elif hasattr(o, 'is_a'):
        # Object resolved to an ontology entity: use its asserted types.
        range_tbox.append(o.is_a)
    elif p in data_props:
        # Datatype property without a declared range: it is absent from
        # p_dict and its value is a plain string, which has no `is_a`.
        # Label it like the literal case instead of raising AttributeError.
        range_tbox.append('Literal_' + str(p))
    else:
        # Object property whose object was not resolved to an entity.
        # NOTE(review): falls back to owl:Thing as the most general class --
        # confirm this is the desired treatment for unresolved objects.
        range_tbox.append([Thing])

In [9]:
# Lift instance-level triples to class level:
#   domain = asserted types of the subject, range = values prepared in range_tbox.
# A triple whose subject or object has several types yields one row per
# (domain, range) combination after the two explode steps.
tbox_df = (
    drop_prop_df
    .assign(
        domain=drop_prop_df['S'].apply(lambda subject: subject.is_a),
        range=range_tbox,
    )
    .explode('domain')
    .explode('range')
)

# Discard rows whose domain is owl:Thing, then turn every cell into a string.
has_specific_domain = tbox_df['domain'] != Thing
tbox_df = tbox_df[has_specific_domain].astype(str).reset_index(drop=True)

In [10]:
# Triple count per (domain, property, range): how often this unit path occurs.
dpr_df = tbox_df.groupby(['domain', 'P', 'range'], as_index=False)['S'].count()

# Triple count per (domain, range): all triples linking the two classes.
dr_df = tbox_df.groupby(['domain', 'range'], as_index=False)['S'].count()

# Attach both counts to every row and derive the weight
#   W = 1 - count(domain, P, range) / count(domain, range)
weight_df = (
    tbox_df
    .merge(dpr_df, on=['domain', 'P', 'range'], how='inner', suffixes=('', '_dpr'))
    .merge(dr_df, on=['domain', 'range'], how='inner', suffixes=('', '_dr'))
)
weight_df = weight_df.assign(W=1 - (weight_df['S_dpr'] / weight_df['S_dr']))

In [11]:
def subclass(tree, paths=None):
    """Record every direct and indirect subclass of a class as a zero-weight path.

    Each descendant ``node`` of the head class is stored as the tuple
    ``(node, 'rdfs.subClassOf', head, 0.0)``, in depth-first order.

    A class is recorded at most once per call, so hierarchies containing
    cycles terminate and classes reachable through several parents do not
    produce repeated tuples.

    Args:
        tree: Either an object exposing ``subclasses()`` -- it becomes the head
            class and is also stored in the global ``head`` -- or a list of
            classes to walk below the head set by an earlier call.
        paths: Optional list receiving the tuples. When omitted, the
            module-level ``subclass_paths`` list is used.

    Returns:
        None. Results are appended to the target list.
    """
    global head
    if type(tree) is not list:
        head = tree
        tree = list(head.subclasses())

    # Bind the head locally so the traversal does not depend on the global
    # being left untouched while it runs.
    root = head
    # Seeding with the head keeps it from being reported as its own subclass
    # when the hierarchy loops back to it.
    seen = {root}

    def _walk(children):
        # Depth-first walk: emit a node, then descend into its subclasses.
        for node in children:
            if node in seen:
                continue
            seen.add(node)
            # Resolve the default target lazily, only when something is stored.
            target = subclass_paths if paths is None else paths
            target.append((node, 'rdfs.subClassOf', root, 0.0))
            _walk(list(node.subclasses()))

    _walk(tree)

In [12]:
# Accumulator filled by subclass(); must exist before the calls below.
subclass_paths = []

# Walk the hierarchy below every ontology class, and below owl:Thing as well.
classes = [*onto.classes(), Thing]
for cls in classes:
    subclass(cls)

# One row per collected (subclass, 'rdfs.subClassOf', ancestor, weight) tuple.
subclass_df = pd.DataFrame(
    subclass_paths,
    columns=['domain', 'P', 'range', 'W'],
)

In [13]:
# Stack the weighted property paths on top of the subclass paths, then
# remove identical rows and renumber the index.
subclass_df = (
    pd.concat([weight_df.loc[:, ['domain', 'P', 'range', 'W']], subclass_df])
    .drop_duplicates()
    .reset_index(drop=True)
)

In [14]:
def change_prefix(s, owl_path=None):
    """Rewrite a dotted entity name into ``prefix:name`` form.

    The value is converted with ``str`` first. If it contains the ontology
    file path minus its last three characters, that part is replaced by
    ``'skmo:'``. Otherwise the last ``.`` is turned into ``:``
    (``'mo.Track'`` -> ``'mo:Track'``). Values without any ``.`` are returned
    unchanged.

    Args:
        s: Value to convert; anything accepted by ``str``.
        owl_path: Path of the ontology file. Defaults to
            ``config['owl']['path']``.

    Returns:
        The converted string.
    """
    s = str(s)
    if owl_path is None:
        owl_path = config['owl']['path']

    # Drops the last three characters of the path -- presumably the 'owl'
    # extension, leaving a prefix that ends with '.'; confirm against config.
    local_prefix = owl_path[:-3]

    # An empty prefix is "contained" in every string and would make replace()
    # insert 'skmo:' between all characters, so it is explicitly excluded.
    if local_prefix and local_prefix in s:
        return s.replace(local_prefix, 'skmo:')

    namespace, dot, local_name = s.rpartition('.')
    if not dot:
        # No separator: nothing to convert (avoids producing 'name:name').
        return s
    return namespace + ':' + local_name

In [15]:
# Convert domain, property and range of every path to prefixed notation;
# the weight is carried over untouched.
final_list = [
    (change_prefix(domain), change_prefix(prop), change_prefix(rng), weight)
    for domain, prop, rng, weight in subclass_df.itertuples(index=False, name=None)
]

final_df = pd.DataFrame(final_list, columns=['domain', 'P', 'range', 'W'])

In [16]:
# Show the assembled unit-path table (domain, P, range, W) as the cell output.
final_df

Unnamed: 0,domain,P,range,W
0,skmo:Chart,dc:title,Literal_dc:title,0.000000
1,mo:Track,skmo:hasGenre,skmo:Genre,0.000000
2,mo:MusicGroup,foaf:name,Literal_foaf:name,0.000000
3,mo:Track,skmo:isArrangedBy,skmo:MusicArtist,0.846774
4,mo:Track,skmo:duetPartners,skmo:MusicArtist,0.983871
...,...,...,...,...
150,j.0:Unit,rdfs:subClassOf,owl:Thing,0.000000
151,j.0:TimeUnit,rdfs:subClassOf,owl:Thing,0.000000
152,j.0:Day,rdfs:subClassOf,owl:Thing,0.000000
153,j.0:Month,rdfs:subClassOf,owl:Thing,0.000000


In [17]:
# Persist the unit-path table one directory up, without the index column.
final_df.to_csv(path_or_buf='../unit_path.csv', index=False)