In [1]:
import sys
sys.path.insert(0, '../')

In [2]:
from typing import Union
from collections import Counter, OrderedDict
import warnings

import rdflib
import numpy as np
import pandas as pd
from pprint import pprint
from sklearn import svm
from sklearn.model_selection import cross_validate

import wlkernel

In [3]:
warnings.simplefilter('ignore')

In [4]:
rdf_graph = rdflib.Graph().parse('../data/aifbfixed_complete.n3', format='n3')

In [5]:
triples = [
    (str(subj), str(pred), str(obj))
     for subj, pred, obj in rdf_graph
]

In [6]:
instances_class_map = {
    subj: obj
    for subj, pred, obj in triples
    if 'affiliation' in pred
    and 'id5instance' not in obj
}
instances = list(instances_class_map.keys())
y = list(instances_class_map.values())

In [7]:
triples = [
    (subj, pred, obj)
    for subj, pred, obj in triples
    if 'affiliation' not in pred
    and 'employs' not in pred
    and 'member' not in pred
    and 'head' not in pred
]

### Weisfeiler-Lehman RDF

In [1]:
def bananize(g: Union[wlkernel.WLRDFGraph, wlkernel.WLGraph]) -> Union[wlkernel.WLRDFGraph, wlkernel.WLGraph]:
    'All the label in the WLRDFGraph are replaced with the same label'
    for i in range(len(g.labels)):
        for k in g.labels[i].keys():
            g.labels[i][k] = 'banana'
    return g

NameError: name 'Union' is not defined

In [9]:
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values =  [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)
        bananize(wlrdf_graph)
        kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)
        kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)
        
        results[(d, it)] = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)
            scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))
            
            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()
            
            if acc_mean > results[(d, it)][0]:
                results[(d, it)] = [acc_mean, f1_mean, c]

In [10]:
fn = 'wlrdf_no_labels'

df_res = pd.DataFrame(index=list(results.keys()))
df_res['accuracy'] = [t[0] for t in results.values()]
df_res['f1'] = [t[1] for t in results.values()]
df_res['C'] = [t[2] for t in results.values()]
df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))
df_res.to_csv(f'../results/{fn}.csv')
df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])
df_res_test.to_html(f'../results/{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.524847,0.305547,100.0
1,2,0.647536,0.566394,100.0
1,4,0.67078,0.59106,100.0
1,6,0.67703,0.594329,100.0
2,0,0.565936,0.340732,10.0
2,2,0.681422,0.622212,100.0
2,4,0.740048,0.66396,100.0
2,6,0.762597,0.688069,100.0
3,0,0.407394,0.29332,1.0
3,2,0.898914,0.861681,100.0


### Weisfeiler-Lehman

In [11]:
RANDOM_STATE = 42

depth_values = [1, 2, 3]
iteration_values =  [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        wl_graphs = [bananize(
            wlkernel.WLGraph(triples, instance, max_depth=d)
        ) for instance in instances]
        kernel_matrix = wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)
        kernel_matrix = wlkernel.kernel_matrix_normalization(kernel_matrix)
        
        results[(d, it)] = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)
            scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))
            
            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()
            
            if acc_mean > results[(d, it)][0]:
                results[(d, it)] = [acc_mean, f1_mean, c]

In [12]:
fn = 'wl_no_labels'

df_res = pd.DataFrame(index=list(results.keys()))
df_res['accuracy'] = [t[0] for t in results.values()]
df_res['f1'] = [t[1] for t in results.values()]
df_res['C'] = [t[2] for t in results.values()]
df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))
df_res.to_csv(f'../results/{fn}.csv')
df_res_test = pd.read_csv(f'../results/{fn}.csv', index_col=['depth', 'iterations'])
df_res_test.to_html(f'../results/{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.322153,0.194477,100.0
1,2,0.530111,0.348672,10.0
1,4,0.530111,0.347049,10.0
1,6,0.530111,0.347049,10.0
2,0,0.564547,0.355253,10.0
2,2,0.503724,0.343148,1.0
2,4,0.481437,0.392366,100.0
2,6,0.502999,0.383461,1.0
3,0,0.491697,0.343404,100.0
3,2,0.641333,0.527556,100.0
