In [1]:
import sys
sys.path.insert(0, '../') 

In [2]:
import rdflib
from wlkernel import *
import random
from itertools import chain
from pathlib import Path
from pprint import pprint
from collections import Counter

In [3]:
rdf_graph = rdflib.Graph().parse('../tests/resources/aifbfixed_complete.n3', format='n3')

In [4]:
# there is only one instance that belong to the class id5instance, then we remove it
#Counter([t[1] for t in all_instances]).most_common()  # command to verify it, use it after all_instances definition
rdf_graph.remove( (
    rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Personen/viewPersonOWL/id1959instance'),
    rdflib.term.URIRef('http://swrc.ontoware.org/ontology#affiliation'),
    rdflib.term.URIRef('http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id5instance')
))

In [5]:
affiliation_uri = rdflib.URIRef('http://swrc.ontoware.org/ontology#affiliation')
all_instances = [ (t[0], t[2]) for t in rdf_graph.triples( (None, affiliation_uri, None) ) ]
assert len(all_instances) == len(set(all_instances))

In [6]:
n = 5
random.seed(42)
instances = random.sample(all_instances, n)
instances_id = [x[0] for x in instances]
instances_class = [x[1] for x in instances]
# print('Amount of the values of the classes:\n\t')
# pprint(Counter(instances_class).most_common())

In [7]:
employs_uri = rdflib.term.URIRef('http://swrc.ontoware.org/ontology#employs')
head_uri = rdflib.term.URIRef('http://swrc.ontoware.org/ontology#head')
member_uri = rdflib.term.URIRef('http://swrc.ontoware.org/ontology#member')
predicates_to_remove = [employs_uri, head_uri, member_uri, affiliation_uri]
for predicate in predicates_to_remove:
    rdf_graph.remove( (None, predicate, None) )

In [8]:
instances_id = [str(x) for x in instances_id]
instances_class = [str(x) for x in instances_class]

In [None]:
######## number of cores: 12 | max_depth: 2 | iterations: 1
#### number of instances: 10
# SEQ : 1min 6s ± 501 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# PAR : 28.5s ± 161 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# SPEEDUP: 2.3571
#####

In [9]:
%timeit kernel_matrix = compute_kernel_matrix(rdf_graph, instances_id, 2, 1)

1min 16s ± 898 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%timeit kernel_matrix_par = compute_kernel_matrix_par(rdf_graph, instances_id, 2, 1)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   11.8s remaining:   27.6s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   12.4s remaining:   12.4s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   21.2s remaining:    9.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.1s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   12.7s remaining:   29.5s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   13.7s remaining:   13.7s
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:   18.9s remaining:    8.1s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.3s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 12 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of  10 | elapsed:   11.6s remaining:   27.1s
[Parallel(n_jobs=-1)]: Done   5 out of  10 | elapsed:   13.5

26 s ± 476 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:   26.6s finished


In [40]:
kernel_matrix == kernel_matrix_par

NameError: name 'kernel_matrix' is not defined

In [42]:
X = kernel_matrix_par
y = instances_class

In [43]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=42)

In [44]:
from sklearn.ensemble import RandomForestClassifier

In [45]:
tree_clf = RandomForestClassifier(n_estimators=75)
tree_clf.fit(X_train, y_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=75,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [46]:
y_pred_train = tree_clf.predict(X_train)
y_pred = tree_clf.predict(X_test)

In [47]:
def accuracy(y_true, y_pred): return sum([x == y for x, y in zip(y_true, y_pred)]) / len(y_true)

In [48]:
print('Train accuracy: ', accuracy(y_train, y_pred_train))
print('Test accuracy: ', accuracy(y_test, y_pred))

Train accuracy:  1.0
Test accuracy:  0.8


In [49]:
y_pred

array(['http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance',
       'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance',
       'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id1instance',
       'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance',
       'http://www.aifb.uni-karlsruhe.de/Forschungsgruppen/viewForschungsgruppeOWL/id2instance'],
      dtype='<U86')