In [64]:
import sys
# Put the parent directory first on the module search path so that modules
# located there are found before any installed ones (presumably this is what
# makes the local `wlkernel` import work -- confirm against the repo layout).
sys.path.insert(0, '../')

In [65]:
from collections import Counter

import rdflib
import numpy as np
from pprint import pprint
from sklearn import svm
from sklearn.metrics import accuracy_score
from sklearn.metrics import f1_score
from sklearn.model_selection import train_test_split, cross_val_score

import wlkernel

In [66]:
# Load the N-Triples dump of the named rock unit lexicon into an in-memory graph.
rdf_graph = rdflib.Graph()
rdf_graph = rdf_graph.parse('../data/Lexicon_NamedRockUnit.nt', format='nt')

In [67]:
# Tally the objects of every triple whose predicate mentions 'hasLithogenesis'
# and keep the two most frequent ones: they become the target classes.
lithogenesis_counts = Counter(
    str(obj)
    for _subj, pred, obj in rdf_graph
    if 'hasLithogenesis' in str(pred)
)
hasLithogenesis_most_common = lithogenesis_counts.most_common(2)
print("Most common classes with predicate equal to 'hasLithogenesis':")
pprint(hasLithogenesis_most_common)
classes = set(label for label, _count in hasLithogenesis_most_common)

Most common classes with predicate equal to 'hasLithogenesis':
[('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/FLUV', 93),
 ('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/GLACI', 53)]


In [68]:
# Map each subject to its class label: any triple whose object is one of the
# selected classes contributes a (subject -> class) entry.
instances_class_map = dict(
    (str(subj), str(obj))
    for subj, _pred, obj in rdf_graph
    if str(obj) in classes
)
assert len(instances_class_map) == 146

# The instance order is the dict's insertion order; `y` is aligned with it.
instances = list(instances_class_map)
assert len(set(instances)) == len(instances)
y = np.array([instances_class_map[instance] for instance in instances])

In [69]:
# Stringify the graph, leaving out every triple whose predicate mentions
# 'hasLithogenesis' so the labels are not part of the kernel's input.
triples = [
    (str(subj), str(pred), str(obj))
    for subj, pred, obj in rdf_graph
    if 'hasLithogenesis' not in str(pred)
]
len(triples)

313901

In [70]:
# Hyper-parameter grids explored by the search below.
depth_values = list(range(1, 4))         # max_depth given to WLRDFGraph
iteration_values = list(range(0, 7, 2))  # iterations given to wlrdf_kernel_matrix
C_values = [
    0.001,
    0.01,
    0.1,
    1.0,
    10.0,
    100.0,
]

In [None]:
from collections import OrderedDict
from sklearn.model_selection import cross_validate
RANDOM_STATE = 42

# Grid search over (depth, iterations, C).
# `results` maps (depth, iterations) -> [best mean accuracy, mean macro-F1 at
# that setting, C that produced it], where "best" is chosen on mean accuracy
# over a 10-fold cross-validation.
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # The graph is rebuilt for every (depth, iterations) pair on purpose:
        # it is not visible here whether wlrdf_kernel_matrix leaves the graph
        # untouched, so it is not hoisted out of the inner loop.
        wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)
        kernel_matrix = wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)
        kernel_matrix = wlkernel.kernel_normalization(kernel_matrix)

        # [accuracy, f1, C]; zeros act as the "nothing found yet" baseline.
        results[(d, it)] = [0, 0, 0]
        for c in C_values:
            # Use the candidate `c` (previously C was fixed at 1.0, which made
            # every pass of this loop evaluate the exact same model).
            classifier = svm.SVC(C=c, kernel='precomputed', class_weight='balanced', random_state=RANDOM_STATE)
            scores = cross_validate(classifier, kernel_matrix, y, cv=10, scoring=('accuracy', 'f1_macro'))

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()

            # Strict '>' keeps the first (smallest) C when accuracies tie.
            if acc_mean > results[(d, it)][0]:
                results[(d, it)] = [acc_mean, f1_mean, c]

In [None]:
import pandas as pd

# Base name (without extension) shared by the CSV and HTML exports.
fn = 'lithogenesis_results_with_normalization'

# One row per (depth, iterations) pair; each value in `results` is
# [accuracy, f1, C].
df_res = pd.DataFrame(index=list(results.keys()))
df_res['accuracy'] = [t[0] for t in results.values()]
df_res['f1'] = [t[1] for t in results.values()]
df_res['C'] = [t[2] for t in results.values()]
# Name the index levels so the CSV can be read back with index_col below.
# (Fixed: this previously called set_index on an undefined `df`.)
df_res = df_res.set_index(pd.MultiIndex.from_tuples(df_res.index, names=['depth', 'iterations']))
df_res.to_csv(f'./{fn}.csv')

# Read the CSV back and render that round-tripped frame to HTML.
# (Fixed: this previously called to_html on an undefined `df_test`.)
df_res_test = pd.read_csv(f'./{fn}.csv', index_col=['depth', 'iterations'])
df_res_test.to_html(f'{fn}.html')
df_res_test