In [1]:
import sys
# Put the parent directory first on the module search path; presumably this is
# what lets the project-local `wlkernel` module imported below resolve when the
# notebook is run from its own folder — TODO confirm the repository layout.
sys.path.insert(0, '../')

In [2]:
from collections import Counter, OrderedDict
from pathlib import Path
from pprint import pprint
import warnings

import numpy as np
import pandas as pd
import rdflib
from sklearn import svm
from sklearn.model_selection import cross_validate

import wlkernel

In [3]:
# Install a process-wide filter that silences every warning from here on,
# presumably to keep the cell outputs uncluttered.
# NOTE(review): this blanket filter also hides any warning raised by the
# libraries used in the cells below — consider narrowing it to the specific
# categories that are known to be noise.
warnings.simplefilter('ignore')

In [4]:
# Create an empty rdflib graph, then load the N-Triples file into it.
rdf_graph = rdflib.Graph()
rdf_graph.parse('../data/Lexicon_NamedRockUnit.nt', format='nt')

In [5]:
# Tally how often each object URI appears in a triple whose predicate mentions
# 'hasLithogenesis'.
lithogenesis_counts = Counter()
for _, predicate, obj in rdf_graph:
    if 'hasLithogenesis' in str(predicate):
        lithogenesis_counts[str(obj)] += 1

# Keep the two most frequent objects; they are the target classes of the
# binary classification task set up in the following cells.
hasLithogenesis_most_common = lithogenesis_counts.most_common(2)
print("Most common classes with predicate equal to 'hasLithogenesis':")
pprint(hasLithogenesis_most_common)

# Just the class URIs, without their counts.
classes = {class_uri for class_uri, _count in hasLithogenesis_most_common}

Most common classes with predicate equal to 'hasLithogenesis':
[('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/FLUV', 93),
 ('http://data.bgs.ac.uk/id/Lexicon/LithogeneticType/GLACI', 53)]


In [6]:
# Map each instance (subject URI) to its class label (object URI).
#
# Only triples whose predicate mentions 'hasLithogenesis' are considered, which
# is the same filter used to pick `classes` and to strip the label statements
# from `triples`. Filtering on the object alone would also pick up subjects
# that reference a class URI through some other predicate, and those statements
# would then stay in the graph handed to the kernels.
instances_class_map = {
    str(s): str(o)
    for s, p, o in rdf_graph
    if 'hasLithogenesis' in str(p) and str(o) in classes
}

# Every counted label triple must contribute its own instance. A smaller map
# means some subject carries both classes and was collapsed by the dict, so its
# label would be ambiguous.
expected_instance_count = sum(count for _, count in hasLithogenesis_most_common)
assert len(instances_class_map) == expected_instance_count, (
    f'expected {expected_instance_count} uniquely labelled instances, '
    f'found {len(instances_class_map)}'
)

# Dict keys are unique by construction, so `instances` has no duplicates.
# `instances` and `y` share the dict's iteration order: y[i] labels instances[i].
instances = list(instances_class_map.keys())
y = np.array(list(instances_class_map.values()))

In [7]:
# All statements of the graph as plain-string (subject, predicate, object)
# tuples, leaving out those whose predicate mentions 'hasLithogenesis' (the
# predicate used above to choose the target classes).
triples = [
    (str(subj), str(pred), str(obj))
    for subj, pred, obj in rdf_graph
    if 'hasLithogenesis' not in str(pred)
]
print('number of tripes: ', len(triples))

number of tripes:  313901


In [9]:
RANDOM_STATE = 42

# Grid explored below: graph depth and number of kernel iterations select the
# kernel matrix, C is the SVM regularisation parameter tried on each matrix.
depth_values = [1, 2, 3]
iteration_values = [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

# (depth, iterations) -> [best mean accuracy, mean macro-F1 at that C, that C]
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # NOTE(review): the graph is rebuilt for every (d, it) pair although
        # its arguments only depend on d; presumably the kernel computation
        # modifies it in place — confirm before hoisting it out of this loop.
        wlrdf_graph = wlkernel.WLRDFGraph(triples, instances, max_depth=d)
        kernel_matrix = wlkernel.kernel_matrix_normalization(
            wlkernel.wlrdf_kernel_matrix(wlrdf_graph, instances, iterations=it)
        )

        # Best [accuracy, f1, C] seen so far for this kernel matrix; stays at
        # zeros if no C reaches a mean accuracy above 0.
        best = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(
                C=c,
                kernel='precomputed',
                class_weight='balanced',
                random_state=RANDOM_STATE,
            )
            scores = cross_validate(
                classifier,
                kernel_matrix,
                y,
                cv=10,
                scoring=('accuracy', 'f1_macro'),
            )

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()

            # Strict comparison: on a tie the earlier (smaller) C is kept.
            if acc_mean > best[0]:
                best = [acc_mean, f1_mean, c]

        results[(d, it)] = best

In [11]:
fn = 'wlrdf_lithogenesis_results_with_normalization'

# Make sure the output folder exists before writing: the CSV/HTML writers do
# not create missing parent directories.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# One row per (depth, iterations) configuration. Each value of `results` is
# [accuracy, f1, C], so the frame and its two-level index are built in one step.
df_res = pd.DataFrame(
    list(results.values()),
    columns=['accuracy', 'f1', 'C'],
    index=pd.MultiIndex.from_tuples(list(results.keys()), names=['depth', 'iterations']),
)
df_res.to_csv(results_dir / f'{fn}.csv')

# Read the CSV back so that the HTML export and the displayed table show
# exactly what was persisted.
df_res_test = pd.read_csv(results_dir / f'{fn}.csv', index_col=['depth', 'iterations'])
df_res_test.to_html(results_dir / f'{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.795536,0.763739,10.0
1,2,0.795536,0.763739,10.0
1,4,0.795536,0.763739,10.0
1,6,0.795536,0.763739,10.0
2,0,0.90625,0.891229,100.0
2,2,0.892857,0.874092,1.0
2,4,0.892857,0.874092,1.0
2,6,0.885714,0.866606,1.0
3,0,0.891071,0.875862,100.0
3,2,0.891964,0.873422,1.0


In [13]:
RANDOM_STATE = 42

# Grid explored below: graph depth and number of kernel iterations select the
# kernel matrix, C is the SVM regularisation parameter tried on each matrix.
depth_values = [1, 2, 3]
iteration_values = [0, 2, 4, 6]
C_values = [0.001, 0.01, 0.1, 1., 10., 100.]

# (depth, iterations) -> [best mean accuracy, mean macro-F1 at that C, that C]
results = OrderedDict()

for d in depth_values:
    for it in iteration_values:
        # One graph per instance, built with the current depth.
        # NOTE(review): the graphs are rebuilt for every (d, it) pair although
        # their arguments only depend on d; presumably the kernel computation
        # modifies them in place — confirm before hoisting this out of the loop.
        wl_graphs = [
            wlkernel.WLGraph(triples, instance, max_depth=d)
            for instance in instances
        ]
        kernel_matrix = wlkernel.kernel_matrix_normalization(
            wlkernel.wl_kernel_matrix(wl_graphs, iterations=it)
        )

        # Best [accuracy, f1, C] seen so far for this kernel matrix; stays at
        # zeros if no C reaches a mean accuracy above 0.
        best = [0, 0, 0]
        for c in C_values:
            classifier = svm.SVC(
                C=c,
                kernel='precomputed',
                class_weight='balanced',
                random_state=RANDOM_STATE,
            )
            scores = cross_validate(
                classifier,
                kernel_matrix,
                y,
                cv=10,
                scoring=('accuracy', 'f1_macro'),
            )

            acc_mean = scores['test_accuracy'].mean()
            f1_mean = scores['test_f1_macro'].mean()

            # Strict comparison: on a tie the earlier (smaller) C is kept.
            if acc_mean > best[0]:
                best = [acc_mean, f1_mean, c]

        results[(d, it)] = best

In [14]:
fn = 'wl_lithogenesis_results_with_normalization'

# Make sure the output folder exists before writing: the CSV/HTML writers do
# not create missing parent directories.
results_dir = Path('../results')
results_dir.mkdir(parents=True, exist_ok=True)

# One row per (depth, iterations) configuration. Each value of `results` is
# [accuracy, f1, C], so the frame and its two-level index are built in one step.
df_res = pd.DataFrame(
    list(results.values()),
    columns=['accuracy', 'f1', 'C'],
    index=pd.MultiIndex.from_tuples(list(results.keys()), names=['depth', 'iterations']),
)
df_res.to_csv(results_dir / f'{fn}.csv')

# Read the CSV back so that the HTML export and the displayed table show
# exactly what was persisted.
df_res_test = pd.read_csv(results_dir / f'{fn}.csv', index_col=['depth', 'iterations'])
df_res_test.to_html(results_dir / f'{fn}.html')
df_res_test

Unnamed: 0_level_0,Unnamed: 1_level_0,accuracy,f1,C
depth,iterations,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,0,0.802679,0.774383,10.0
1,2,0.796429,0.768842,10.0
1,4,0.796429,0.768842,10.0
1,6,0.796429,0.768842,10.0
2,0,0.891964,0.877311,100.0
2,2,0.892857,0.874092,1.0
2,4,0.873214,0.854485,1.0
2,6,0.865179,0.841353,1.0
3,0,0.883929,0.871406,100.0
3,2,0.913393,0.898291,1.0
