In [5]:
from collections import Counter

import rdflib
import numpy as np
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split, cross_val_score

import wlkernel

In [6]:
# Parse the N-Triples dump into an rdflib graph; the following cells all
# iterate over `rdf_graph`.
rdf_graph = rdflib.Graph().parse(
    source='Lexicon_NamedRockUnit.nt',
    format='nt',
)

In [9]:
# Tally the objects of every triple whose predicate mentions 'hasLithogenesis'
# and display the two most frequent ones together with their counts.
Counter(
    str(obj)
    for _subj, pred, obj in rdf_graph
    if 'hasLithogenesis' in str(pred)
).most_common(2)

[('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/FLUV', 93),
 ('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/GLACI', 53)]

In [10]:
# The two most frequent 'hasLithogenesis' objects are used as the target
# classes; only the URIs are kept, the counts are dropped.
classes = set(
    label
    for label, _count in Counter(
        str(obj)
        for _subj, pred, obj in rdf_graph
        if 'hasLithogenesis' in str(pred)
    ).most_common(2)
)
classes

{'http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/FLUV',
 'http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/GLACI'}

In [12]:
# Map every instance (triple subject) to its class label (triple object).
# Only 'hasLithogenesis' triples are considered: matching on the object alone
# would also pick up the subject of any other triple that happens to point at
# one of the class URIs, adding a bogus instance or overwriting a real label.
instances_class_map = {
    str(s): str(o)
    for s, p, o in rdf_graph
    if 'hasLithogenesis' in str(p) and str(o) in classes
}
len(instances_class_map)

146

In [14]:
# Instance URIs in the iteration order of the label map. This list is what the
# WLRDFGraph and kernel-matrix cells receive.
instances = [*instances_class_map]
# Sanity check: no instance appears twice.
assert len(set(instances)) == len(instances)
len(instances)

146

In [16]:
# Plain-string copy of the graph with every 'hasLithogenesis' triple removed,
# i.e. without the triples the class labels were read from.
triples = [
    tuple(map(str, triple))
    for triple in rdf_graph
    if 'hasLithogenesis' not in str(triple[1])
]
len(triples)

313901

In [37]:
# Build the wlkernel graph structure from the label-free triples and the
# instance URIs.
# NOTE(review): max_depth=1 presumably bounds the neighbourhood extracted
# around each instance -- confirm against the wlkernel implementation.
wlrdf_graph = wlkernel.WLRDFGraph(
    triples,
    instances,
    max_depth=1,
)
# Number of instance nodes the graph ended up with.
len(wlrdf_graph.instance_nodes)

146

In [38]:
# Pairwise kernel values between the instances, later fed to the SVM as a
# precomputed kernel.
# NOTE(review): iterations=0 presumably means no Weisfeiler-Lehman relabelling
# rounds are applied -- confirm this baseline setting is intended.
kernel_matrix = wlkernel.wlrdf_kernel_matrix(
    wlrdf_graph,
    instances,
    iterations=0,
)
len(kernel_matrix)

146

In [39]:
# Inputs and labels are both taken in the order of `instances`, the same list
# that was handed to wlrdf_kernel_matrix, so each label is looked up for its
# instance explicitly instead of relying on two separate dict iterations
# happening to agree with that list.
# NOTE(review): this presumes row i of kernel_matrix belongs to instances[i]
# -- confirm against wlkernel.
X = list(instances)
y = [instances_class_map[instance] for instance in X]
# One label per kernel-matrix row; a mismatch would make the fit meaningless.
assert len(y) == len(kernel_matrix)
len(y)

146

In [40]:
# Support vector classifier that takes the kernel matrix itself as input
# instead of computing a kernel from feature vectors.
classifier = svm.SVC(
    kernel='precomputed',
)
# Fit on all instances; the fitted estimator is displayed as the cell output.
classifier.fit(X=kernel_matrix, y=y)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
  decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
  kernel='precomputed', max_iter=-1, probability=False, random_state=None,
  shrinking=True, tol=0.001, verbose=False)

In [42]:
# 10-fold cross-validation of the classifier on the precomputed kernel matrix
# (converted to an ndarray); the mean of the per-fold scores is displayed.
scores = cross_val_score(
    estimator=classifier,
    X=np.array(kernel_matrix),
    y=y,
    cv=10,
)
scores.mean()

0.7