In [1]:
import sys
sys.path.insert(0, '../')

In [2]:
from collections import Counter, OrderedDict
import warnings

import rdflib
import numpy as np
from pprint import pprint
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_validate

import wlkernel

In [3]:
warnings.simplefilter('ignore')

In [4]:
rdf_graph = rdflib.Graph().parse('../data/aifbfixed_complete.n3', format='n3')

In [5]:
affiliation_most_common = Counter(
    str(o) 
    for s, p, o in rdf_graph
    if 'affiliation' in str(p)
).most_common()
print("Most common classes with predicate equal to 'affiliation':")
pprint(affiliation_most_common)

Most common classes with predicate equal to 'affiliation':
[('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance',
  73),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id3instance',
  60),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance',
  28),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id4instance',
  16),
 ('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id5instance',
  1)]


In [6]:
instances_class_map = {
    str(s): str(o) for s, p, o in rdf_graph 
    if 'affiliation' in str(p)
    and 'id5instance' not in str(o)
}
instances = list(instances_class_map.keys())
y = list(instances_class_map.values())

In [7]:
triples = list(
    (str(s), str(p), str(o)) for s, p, o in rdf_graph
    if 'affiliation' not in str(p)
    and 'employs' not in str(p)
    and 'member' not in str(p)
    and 'head' not in str(p)
)
print('number of triples:', len(triples))

number of triples: 28699


In [8]:
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values =  [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)
        kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)
#         kernel_matrix = wlkernel.kernel_normalization(kernel_matrix)
        
        results[(d, it)] = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(C=1.0, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)
            scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))
            
            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()
            
            if acc_mean > results[(d, it)][0]:
                results[(d, it)] = [acc_mean, f1_mean, c]

In [9]:
import pandas as pd

fn = 'affiliation_results'

df_res = pd.DataFrame(index=list(results.keys()))
df_res['accuracy'] = [t[0] for t in results.values()]
df_res['f1'] = [t[1] for t in results.values()]
df_res['C'] = [t[2] for t in results.values()]
df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))
df_res.to_csv(f'./results/{fn}.csv')
df_res_test = pd.read_csv(f'./results/{fn}.csv', index_col=['depth', 'iterations'])
df_res_test.to_html(f'./results/{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.620494,0.524999,0.001
1,2,0.620494,0.524999,0.001
1,4,0.620494,0.524999,0.001
1,6,0.620494,0.524999,0.001
2,0,0.637896,0.534047,0.001
2,2,0.790336,0.692237,0.001
2,4,0.824725,0.715229,0.001
2,6,0.83028,0.720146,0.001
3,0,0.726671,0.676423,0.001
3,2,0.892884,0.858493,0.001
