In [1]:
# Put the parent directory at the front of the module search path.
# Presumably this is what lets the project-local `wlkernel` package (imported
# in the next cell) resolve when the notebook runs from a subfolder — verify.
import sys
sys.path.insert(0, '../')

In [2]:
from collections import Counter, OrderedDict
import warnings

import rdflib
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn import svm
from sklearn.model_selection import cross_validate

import wlkernel

In [3]:
# Silence every warning category for the rest of the session.
# NOTE(review): this blanket filter also hides warnings raised by the libraries
# used below (e.g. deprecation notices); consider narrowing it to the specific
# category/module once the noisy source is known.
warnings.simplefilter(action='ignore')

In [4]:
# Load the N3 dump into an in-memory rdflib graph.
rdf_graph = rdflib.Graph()
rdf_graph.parse('../data/aifbfixed_complete.n3', format='n3')

In [5]:
# Tally the object of every triple whose predicate contains the substring
# 'affiliation', then list the objects from most to least frequent.
affiliation_counts = Counter(
    str(obj)
    for _subj, pred, obj in rdf_graph
    if 'affiliation' in str(pred)
)
affiliation_most_common = affiliation_counts.most_common()

print("Most common classes with predicate equal to 'affiliation':")
pprint(affiliation_most_common)

Most common classes with predicate equal to 'affiliation':
[('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance',
  73),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance',
  60),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance',
  28),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance',
  16),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id5instance',
  1)]


In [6]:
# Map the subject of each 'affiliation' triple to the triple's object (the
# class label), skipping objects that contain 'id5instance'.
# If a subject appears in several such triples, the one iterated last wins.
instances_class_map = dict(
    (str(subj), str(obj))
    for subj, pred, obj in rdf_graph
    if 'affiliation' in str(pred) and 'id5instance' not in str(obj)
)

# Parallel lists: instances[i] is labelled y[i].
instances = list(instances_class_map)
y = [instances_class_map[instance] for instance in instances]

In [7]:
# Stringified copy of the graph without any triple whose predicate contains
# one of the listed substrings (the predicate used for the labels is among them).
triples = [
    (str(subj), str(pred), str(obj))
    for subj, pred, obj in rdf_graph
    if not any(
        term in str(pred)
        for term in ('affiliation', 'employs', 'member', 'head')
    )
]
print('number of triples:', len(triples))

number of triples: 28699


In [44]:
# Build a WLRDF graph from the filtered triples with max_depth=1.
# NOTE(review): the grid-search cell further down assigns `wlrdf_graph` again
# before reading it, so this instance only feeds the two cells that follow.
wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=1)

In [45]:
# Replace every label in every label map of the graph with one constant value.
# NOTE(review): presumably a sanity check (identical labels everywhere) left
# over from experimentation — confirm whether this cell should be kept.
for idx in range(len(wlrdf_graph.labels)):
    label_map = wlrdf_graph.labels[idx]
    for key in label_map.keys():
        label_map[key] = 'banana'

In [48]:
# Kernel matrix for the graph above with iterations=0, passed through the
# project's normalization helper.
# NOTE(review): the grid-search cell below reassigns `kernel_matrix` before
# reading it, so this value is not used by the later cells shown here.
kernel_matrix = wlkernel.kernel_matrix_normalization(
    wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=0)
)

### Weisfeiler-Lehman RDF

In [11]:
# Grid search for the WLRDF kernel: for every (max_depth, iterations) pair,
# build the normalized kernel matrix and keep the SVM `C` with the highest mean
# cross-validated accuracy (cv=10).
# NOTE(review): the stored scores come from the same folds used to pick C, so
# they are selection-biased estimates.
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values = [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

# (depth, iterations) -> [best mean accuracy, mean macro-F1 at that C, that C]
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # The graph is rebuilt for every `it` although it only takes `d`;
        # presumably the kernel computation relabels it in place — confirm
        # before hoisting this out of the inner loop.
        wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)
        kernel_matrix = wlkernel.kernel_matrix_normalization(
            wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)
        )

        # All-zero placeholder; a C only replaces it on a strictly higher mean
        # accuracy, so ties keep the C that was tried first.
        best = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(
                C=c,
                kernel='precomputed',
                class_weight='balanced',
                random_state=RANDOM_STATE,
            )
            scores = cross_validate(
                classifier,
                kernel_matrix,
                y,
                cv=10,
                scoring=('accuracy', 'f1_macro'),
            )

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()
            if acc_mean > best[0]:
                best = [acc_mean, f1_mean, c]

        results[(d, it)] = best

In [12]:
fn = 'wlrdf_affiliation_results_with_normalization'

# One row per (depth, iterations) key of `results`; the three list entries of
# each value become the accuracy / f1 / C columns.
df_res = pd.DataFrame(
    list(results.values()),
    index=list(results.keys()),
    columns=['accuracy', 'f1', 'C'],
)
df_res = df_res.set_index(
    pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations'])
)

# Write the table to CSV, read it back, and export the re-read copy as HTML.
csv_path = f'../results/{fn}.csv'
df_res.to_csv(csv_path)
df_res_test = pd.read_csv(csv_path, index_col=['depth', 'iterations'])
df_res_test.to_html(f'../results/{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.881955,0.795756,100.0
1,2,0.881955,0.795756,100.0
1,4,0.881955,0.795756,100.0
1,6,0.881955,0.795756,100.0
2,0,0.892114,0.826007,100.0
2,2,0.880057,0.812488,100.0
2,4,0.874501,0.803701,100.0
2,6,0.874501,0.800821,100.0
3,0,0.879579,0.812187,100.0
3,2,0.913751,0.867388,100.0


### Weisfeiler-Lehman

In [13]:
# Grid search for the plain Weisfeiler-Lehman kernel: for every
# (max_depth, iterations) pair, build one WL graph per instance, compute the
# normalized kernel matrix and keep the SVM `C` with the highest mean
# cross-validated accuracy (cv=10).
# NOTE(review): the stored scores come from the same folds used to pick C, so
# they are selection-biased estimates.
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values = [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

# (depth, iterations) -> [best mean accuracy, mean macro-F1 at that C, that C]
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # The graphs are rebuilt for every `it` although they only take `d`;
        # presumably the kernel computation relabels them in place — confirm
        # before hoisting this out of the inner loop.
        wl_graphs = [
            wlkernel.WLGraph(triples, instance, max_depth=d)
            for instance in instances
        ]
        kernel_matrix = wlkernel.kernel_matrix_normalization(
            wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)
        )

        # All-zero placeholder; a C only replaces it on a strictly higher mean
        # accuracy, so ties keep the C that was tried first.
        best = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(
                C=c,
                kernel='precomputed',
                class_weight='balanced',
                random_state=RANDOM_STATE,
            )
            scores = cross_validate(
                classifier,
                kernel_matrix,
                y,
                cv=10,
                scoring=('accuracy', 'f1_macro'),
            )

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()
            if acc_mean > best[0]:
                best = [acc_mean, f1_mean, c]

        results[(d, it)] = best

In [14]:
fn = 'wl_affiliation_results_with_normalization'

# One row per (depth, iterations) key of `results`; the three list entries of
# each value become the accuracy / f1 / C columns.
df_res = pd.DataFrame(
    list(results.values()),
    index=list(results.keys()),
    columns=['accuracy', 'f1', 'C'],
)
df_res = df_res.set_index(
    pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations'])
)

# Write the table to CSV, read it back, and export the re-read copy as HTML.
csv_path = f'../results/{fn}.csv'
df_res.to_csv(csv_path)
df_res_test = pd.read_csv(csv_path, index_col=['depth', 'iterations'])
df_res_test.to_html(f'../results/{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.881955,0.795756,100.0
1,2,0.868761,0.788673,100.0
1,4,0.868761,0.788673,100.0
1,6,0.868761,0.788673,100.0
2,0,0.886851,0.819787,100.0
2,2,0.858127,0.781563,100.0
2,4,0.770446,0.604246,100.0
2,6,0.752758,0.579145,100.0
3,0,0.884843,0.818408,100.0
3,2,0.8908,0.824622,100.0
