In [8]:
import sys

# Put the parent directory at the front of the module search path
# (presumably so the local `wlkernel` package imported below resolves -- confirm).
# The membership guard keeps the cell idempotent: re-running it no longer
# stacks duplicate '../' entries onto sys.path.
if '../' not in sys.path:
    sys.path.insert(0, '../')

In [9]:
from typing import Union
from collections import Counter, OrderedDict
import warnings

import rdflib
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn import svm
from sklearn.model_selection import cross_validate

import wlkernel

In [10]:
warnings.simplefilter('ignore')

In [11]:
# Parse the N3 file into a fresh rdflib graph.
rdf_graph = rdflib.Graph().parse(
    source='../data/aifbfixed_complete.n3',
    format='n3',
)

In [12]:
# Flatten the graph into plain (subject, predicate, object) string tuples.
triples = [(str(s), str(p), str(o)) for (s, p, o) in rdf_graph]

In [13]:
# Map each subject of an 'affiliation' triple to that triple's object, which is
# used as its class label. Objects containing 'id5instance' are left out.
# If a subject occurs in several matching triples, the last one seen wins.
instances_class_map = {
    s: o
    for s, p, o in triples
    if ('affiliation' in p) and not ('id5instance' in o)
}

# Parallel lists: instances[i] is labelled y[i] (dict order is shared).
instances = [*instances_class_map]
y = [*instances_class_map.values()]

In [14]:
# Discard every triple whose predicate mentions one of these substrings
# (presumably relations that would leak the target label -- confirm).
triples = [
    (subj, pred, obj)
    for subj, pred, obj in triples
    if not any(
        marker in pred
        for marker in ('affiliation', 'employs', 'member', 'head')
    )
]

### Weisfeiler-Lehman RDF

In [40]:
def bananize(g: Union[wlkernel.WLRDFGraph, wlkernel.WLGraph]):
    """Overwrite every label held in ``g.labels`` with the constant 'banana'.

    Each entry ``g.labels[i]`` is a mapping; all of its values are replaced
    in place while its keys are kept. The same graph object is returned so
    the call can be used inline.
    """
    for level in range(len(g.labels)):
        label_map = g.labels[level]
        # Snapshot the keys; only values are reassigned, never added or removed.
        for key in list(label_map.keys()):
            label_map[key] = 'banana'
    return g

In [None]:
# Grid search for the Weisfeiler-Lehman RDF kernel with all labels erased.
# For every (depth, iterations) pair, keep the SVM C value that gives the
# highest mean 10-fold cross-validated accuracy.
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values = [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

# (depth, iterations) -> [best mean accuracy, its mean macro-F1, its C]
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # The graph is rebuilt for every (d, it) pair, as in the original cell.
        wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)
        bananize(wlrdf_graph)
        kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)
        kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)

        # Baseline entry; it stays [0, 0, 0] if no C reaches an accuracy above 0.
        results[(d, it)] = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)
            scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()

            # Strict comparison: on ties the earlier (smaller) C is kept.
            if acc_mean > results[(d, it)][0]:
                results[(d, it)] = [acc_mean, f1_mean, c]

In [None]:
fn = 'wlrdf_no_labels'

# One row per (depth, iterations) pair, in the order the grid search stored them.
grid_index = pd.MultiIndex.from_tuples(
    list(results.keys()),
    names=['depth', 'iterations'],
)
df_res = pd.DataFrame(
    {
        column: [entry[position] for entry in results.values()]
        for position, column in enumerate(('accuracy', 'f1', 'C'))
    },
    index=grid_index,
)

# Write the table to CSV, read it back, and render the reloaded copy as HTML.
csv_path = f'../results/{fn}.csv'
df_res.to_csv(csv_path)
df_res_test = pd.read_csv(csv_path, index_col=['depth', 'iterations'])
df_res_test.to_html(f'../results/{fn}.html')
df_res_test

### Weisfeiler-Lehman

In [None]:
# Grid search for the plain Weisfeiler-Lehman kernel with all labels erased.
# For every (depth, iterations) pair, keep the SVM C value that gives the
# highest mean 10-fold cross-validated accuracy.
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values = [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

# (depth, iterations) -> [best mean accuracy, its mean macro-F1, its C]
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # One label-erased graph per instance, rebuilt for every (d, it) pair.
        wl_graphs = [
            bananize(wlkernel.WLGraph(triples, instance, max_depth=d))
            for instance in instances
        ]
        kernel_matrix = wlkernel.kernel_matrix_normalization(
            wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)
        )

        key = (d, it)
        results[key] = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(
                C=c,
                kernel='precomputed',
                class_weight='balanced',
                random_state=RANDOM_STATE,
            )
            scores = cross_validate(
                classifier,
                kernel_matrix,
                y,
                cv=10,
                scoring=('accuracy', 'f1_macro'),
            )

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()

            # Strict comparison: on ties the earlier (smaller) C is kept.
            if results[key][0] < acc_mean:
                results[key] = [acc_mean, f1_mean, c]

In [None]:
fn = 'wl_no_labels'

# One row per (depth, iterations) pair, in the order the grid search stored them.
grid_index = pd.MultiIndex.from_tuples(
    list(results.keys()),
    names=['depth', 'iterations'],
)
df_res = pd.DataFrame(
    {
        column: [entry[position] for entry in results.values()]
        for position, column in enumerate(('accuracy', 'f1', 'C'))
    },
    index=grid_index,
)

# Write the table to CSV, read it back, and render the reloaded copy as HTML.
csv_path = f'../results/{fn}.csv'
df_res.to_csv(csv_path)
df_res_test = pd.read_csv(csv_path, index_col=['depth', 'iterations'])
df_res_test.to_html(f'../results/{fn}.html')
df_res_test