In [28]:
import pandas as pd
import re

from tqdm import tqdm
from datasets import load_from_disk
from rdflib.namespace import XSD, RDFS, RDF, OWL
from rdflib import Graph, Namespace, Literal

In [2]:
# Turtle @prefix header that `test_1` prepends to every gold `triples` string
# before it is parsed.
# NOTE(review): presumably the stored gold triples carry no prefix header of
# their own -- confirm against the dataset.
prefix = '''@prefix xsd: <http://www.w3.org/2001/XMLSchema#> .
@prefix rdfs: <http://www.w3.org/2000/01/rdf-schema#> .
@prefix rdf: <http://www.w3.org/1999/02/22-rdf-syntax-ns#> .
@prefix dbo: <http://dbpedia.org/ontology/> .
@prefix dct: <http://purl.org/dc/terms/> .
@prefix owl: <http://www.w3.org/2002/07/owl#> .
@prefix wd: <https://www.wikidata.org/wiki/> .
@prefix lexid-s: <https://w3id.org/lex-id/schema/> .
@prefix lexid: <https://w3id.org/lex-id/data/> .

'''

In [4]:
def is_valid_ttl(triple_str):
    """Report whether ``triple_str`` can be parsed as Turtle.

    A fresh graph is created, the project's namespace prefixes are bound on
    it, and the text is handed to the Turtle parser.

    Args:
        triple_str: Turtle document to check.

    Returns:
        bool: True when parsing succeeds, False when the parser raises any
        ``Exception``.
    """
    g = Graph()

    # Same prefixes, same order as before.
    # NOTE(review): whether pre-bound prefixes affect Turtle parsing is not
    # visible from this file -- confirm before relying on it.
    bindings = (
        ("xsd", XSD),
        ("rdfs", RDFS),
        ("rdf", RDF),
        ("dbo", Namespace("http://dbpedia.org/ontology/")),
        ("dct", Namespace("http://purl.org/dc/terms/")),
        ("owl", OWL),
        ("wd", Namespace("https://www.wikidata.org/wiki/")),
        ("lexid-s", Namespace("https://w3id.org/lex-id/schema/")),
        ("lexid", Namespace("https://w3id.org/lex-id/data/")),
    )
    for label, namespace in bindings:
        g.bind(label, namespace)

    try:
        g.parse(data=triple_str, format="turtle")
    except Exception:
        # Narrowed from a bare ``except`` so that KeyboardInterrupt/SystemExit
        # still stop a long validation run instead of being reported as
        # "invalid Turtle".
        return False
    return True

In [60]:
def evaluate_ttl(ttl_pred, ttl_gold):
    """Score predicted Turtle against gold Turtle by exact triple match.

    Both documents are parsed into graphs; a predicted triple is a true
    positive when an equal triple exists in the gold graph.

    Args:
        ttl_pred: Predicted Turtle document.
        ttl_gold: Gold (reference) Turtle document.

    Returns:
        tuple: ``(tp, fp, fn, p, r, f1)``. ``p``, ``r`` and ``f1`` are rounded
        to 3 decimals and are ``0`` when their denominator is zero.

    Raises:
        Whatever the Turtle parser raises when either document is invalid.
    """
    g_pred = Graph()
    g_pred.parse(data=ttl_pred, format="turtle")

    g_gold = Graph()
    g_gold.parse(data=ttl_gold, format="turtle")

    pred_triples = set(g_pred)
    gold_triples = set(g_gold)

    tp = len(pred_triples & gold_triples)
    fp = len(pred_triples) - tp
    fn = len(gold_triples) - tp

    p = round(tp / (tp + fp), 3) if tp + fp else 0
    r = round(tp / (tp + fn), 3) if tp + fn else 0

    # F1 is derived from the raw counts, not from the already-rounded p and r:
    # rounding first can shift the third decimal (tp=1, fp=11, fn=0 used to
    # give 0.153 instead of 0.154).
    f1 = round(2 * tp / (2 * tp + fp + fn), 3) if tp else 0

    return tp, fp, fn, p, r, f1

In [59]:
def literals_case_insensitive_equal(lit1, lit2):
    """Return True when the string forms of both values match ignoring case."""
    lowered_first, lowered_second = (str(value).lower() for value in (lit1, lit2))
    return lowered_first == lowered_second

def triple_case_insensitive_equal(triple1, triple2):
    """Compare two triples, ignoring letter case only in literal objects.

    Subject and predicate must be equal. Objects match when both are
    ``Literal`` and equal ignoring case, or when neither is a ``Literal`` and
    they are equal; a literal never matches a non-literal.
    """
    if not (triple1[0] == triple2[0]):
        return False
    if not (triple1[1] == triple2[1]):
        return False

    first_obj, second_obj = triple1[2], triple2[2]
    first_is_literal = isinstance(first_obj, Literal)
    second_is_literal = isinstance(second_obj, Literal)

    if first_is_literal != second_is_literal:
        return False
    if first_is_literal:
        return literals_case_insensitive_equal(first_obj, second_obj)
    return first_obj == second_obj

def evaluate_ttl_case_insensitive(ttl_pred, ttl_gold):
    """Score predicted Turtle against gold Turtle, ignoring case in literals.

    Each triple is reduced to a comparison key: subject and predicate are kept
    as they are, a ``Literal`` object is replaced by its lower-cased string
    form, and any other object is kept as it is. Literal and non-literal
    objects never share a key. Counts are taken over the two key sets, so
    triples that differ only by literal case count as one triple.

    Args:
        ttl_pred: Predicted Turtle document.
        ttl_gold: Gold (reference) Turtle document.

    Returns:
        tuple: ``(tp, fp, fn, p, r, f1)``. ``p``, ``r`` and ``f1`` are rounded
        to 3 decimals and are ``0`` when their denominator is zero.

    Raises:
        Whatever the Turtle parser raises when either document is invalid.
    """
    g_pred = Graph()
    g_pred.parse(data=ttl_pred, format="turtle")

    g_gold = Graph()
    g_gold.parse(data=ttl_gold, format="turtle")

    def comparison_key(triple):
        subject, predicate, obj = triple
        if isinstance(obj, Literal):
            return (subject, predicate, True, str(obj).lower())
        return (subject, predicate, False, obj)

    pred_keys = {comparison_key(triple) for triple in g_pred}
    gold_keys = {comparison_key(triple) for triple in g_gold}

    # Set intersection gives a one-to-one match. The previous "any gold triple
    # matches" loop let several predictions hit the same gold triple, which
    # could drive fn below zero and recall above 1.
    tp = len(pred_keys & gold_keys)
    fp = len(pred_keys) - tp
    fn = len(gold_keys) - tp

    p = round(tp / (tp + fp), 3) if tp + fp else 0
    r = round(tp / (tp + fn), 3) if tp + fn else 0

    # F1 from raw counts; rounding p and r first can shift the third decimal.
    f1 = round(2 * tp / (2 * tp + fp + fn), 3) if tp else 0

    return tp, fp, fn, p, r, f1

In [62]:
def test_1(name, df, dataset, shot, results, part):
    print('tes')
    
    cnt = 0
    tp_s = 0
    fp_s = 0
    fn_s = 0
    tp_i = 0
    fp_i = 0
    fn_i = 0
    
    for i in range(len(df)):
        reg = df.at[i, 'regulatory']
        ttl_pred = df.at[i, f'ttl_{shot}']
        ttl_test = prefix + dataset[i]['triples']
        
        tp, fp, fn, p, r, f1 = evaluate_ttl(ttl_pred, ttl_test)
        tp_s += tp
        fp_s += fp
        fn_s += fn
        
        results.at[i, 'regulatory'] = reg
        results.at[i, f'{name}_s_p_{shot}'] = p
        results.at[i, f'{name}_s_r_{shot}'] = r
        results.at[i, f'{name}_s_f1_{shot}'] = f1
        
        print(cnt, reg)
        print(f'Case Sensitive   : P: {p} || R: {r} || F1: {f1}')     
        
        tp, fp, fn, p, r, f1 = evaluate_ttl_case_insensitive(ttl_pred, ttl_test)
        tp_i += tp
        fp_i += fp
        fn_i += fn
        
        results.at[i, 'regulatory'] = reg
        results.at[i, f'{name}_i_p_{shot}'] = p
        results.at[i, f'{name}_i_r_{shot}'] = r
        results.at[i, f'{name}_i_f1_{shot}'] = f1
        
        print(f'Case Insensitive : P: {p} || R: {r} || F1: {f1}') 
        
        cnt += 1
        
        if cnt % 100 == 0:
            results.to_csv(f'results/{part}_metrics_code.csv')
            
    results.to_csv(f'results/{part}_metrics_code.csv')
        
    p_s = round(tp_s / (tp_s + fp_s), 3)
    r_s = round(tp_s / (tp_s + fn_s), 3)
    f1_s = round((2 * p_s * r_s)/ (p_s + r_s), 3)
    
    p_i = round(tp_i / (tp_i + fp_i), 3)
    r_i = round(tp_i / (tp_i + fn_i), 3)
    f1_i = round((2 * p_i * r_i)/ (p_i + r_i), 3)
    
    print()
    print('####################################################')
    print()
    print(f'{name} {shot} Shot')

    print()
    print('Case Sensitive')
    print(f'Precision: {p_s}')    
    print(f'Recall: {r_s}')              
    print(f'F1 Score: {f1_s}') 
    
    print()
    print('Case Insensitive')
    print(f'Precision: {p_i}')    
    print(f'Recall: {r_i}')              
    print(f'F1 Score: {f1_i}') 

# Closing

In [61]:
# Empty per-row metrics table for the closing section. Column names follow
# '<model>_<mode>_<metric>_<shot>': all case-sensitive ('s') columns first,
# then all case-insensitive ('i') columns; inside each group models ccg, ccl,
# ccp, then shots 1 and 2, then p, r, f1. The names are generated instead of
# hand-typed, and the dead `results = pd.DataFrame()` assignment is gone.
results = pd.DataFrame(
    columns=['regulatory'] + [
        f'{model}_{mode}_{metric}_{shot}'
        for mode in ('s', 'i')
        for model in ('ccg', 'ccl', 'ccp')
        for shot in ('1', '2')
        for metric in ('p', 'r', 'f1')
    ]
)

In [63]:
# Gold data for the closing section, loaded with `datasets.load_from_disk`;
# `test_1` reads `test_closing[i]['triples']` from it.
test_closing_name = "dataset-surface-info/new-closing/new-closing-1"
test_closing = load_from_disk(test_closing_name)

In [64]:
# Closing-section prediction tables, one CSV per model. `test_1` reads the
# 'regulatory' and 'ttl_<shot>' columns of each.
# NOTE(review): the '_post' suffix presumably marks post-processed output -- confirm.
cc_gemma = pd.read_csv('results/new/CC_gemma_post.csv')
cc_llama = pd.read_csv('results/new/CC_llama_post.csv')
cc_phi = pd.read_csv('results/new/CC_phi_post.csv')

In [None]:
# Closing section, cc_gemma predictions, shot '1'.
test_1(name='ccg', df=cc_gemma, dataset=test_closing, shot='1', results=results, part='cc')

In [None]:
# Closing section, cc_gemma predictions, shot '2'.
test_1(name='ccg', df=cc_gemma, dataset=test_closing, shot='2', results=results, part='cc')

In [None]:
# Closing section, cc_llama predictions, shot '1'.
test_1(name='ccl', df=cc_llama, dataset=test_closing, shot='1', results=results, part='cc')

In [None]:
# Closing section, cc_llama predictions, shot '2'.
test_1(name='ccl', df=cc_llama, dataset=test_closing, shot='2', results=results, part='cc')

In [None]:
# Closing section, cc_phi predictions, shot '1'.
test_1(name='ccp', df=cc_phi, dataset=test_closing, shot='1', results=results, part='cc')

In [None]:
# Closing section, cc_phi predictions, shot '2'.
test_1(name='ccp', df=cc_phi, dataset=test_closing, shot='2', results=results, part='cc')

# Opening

In [72]:
# Empty per-row metrics table for the opening section. Column names follow
# '<model>_<mode>_<metric>_<shot>': all case-sensitive ('s') columns first,
# then all case-insensitive ('i') columns; inside each group models ocg, ocl,
# ocp, then shots 1 and 2, then p, r, f1. The names are generated instead of
# hand-typed, and the dead `results = pd.DataFrame()` assignment is gone.
results = pd.DataFrame(
    columns=['regulatory'] + [
        f'{model}_{mode}_{metric}_{shot}'
        for mode in ('s', 'i')
        for model in ('ocg', 'ocl', 'ocp')
        for shot in ('1', '2')
        for metric in ('p', 'r', 'f1')
    ]
)

In [73]:
# Gold data for the opening section, loaded with `datasets.load_from_disk`;
# `test_1` reads `test_opening[i]['triples']` from it.
test_opening_name = "dataset-surface-info/new-opening/new-opening-1"
test_opening = load_from_disk(test_opening_name)

In [104]:
# Opening-section prediction tables, one CSV per model. `test_1` reads the
# 'regulatory' and 'ttl_<shot>' columns of each.
# NOTE(review): the '_post' suffix presumably marks post-processed output -- confirm.
oc_gemma = pd.read_csv('results/new/OC_gemma_post.csv')
oc_llama = pd.read_csv('results/new/OC_llama_post.csv')
oc_phi = pd.read_csv('results/new/OC_phi_post.csv')

In [None]:
# Opening section, oc_gemma predictions, shot '1'.
test_1(name='ocg', df=oc_gemma, dataset=test_opening, shot='1', results=results, part='oc')

In [None]:
# Opening section, oc_gemma predictions, shot '2'.
test_1(name='ocg', df=oc_gemma, dataset=test_opening, shot='2', results=results, part='oc')

In [None]:
# Opening section, oc_llama predictions, shot '1'.
test_1(name='ocl', df=oc_llama, dataset=test_opening, shot='1', results=results, part='oc')

In [None]:
# Opening section, oc_llama predictions, shot '2'.
test_1(name='ocl', df=oc_llama, dataset=test_opening, shot='2', results=results, part='oc')

In [None]:
# Opening section, oc_phi predictions, shot '1'.
test_1(name='ocp', df=oc_phi, dataset=test_opening, shot='1', results=results, part='oc')

In [None]:
# Opening section, oc_phi predictions, shot '2'.
test_1(name='ocp', df=oc_phi, dataset=test_opening, shot='2', results=results, part='oc')

# Body

In [92]:
# Empty per-row metrics table for the body section. Column names follow
# '<model>_<mode>_<metric>_<shot>': all case-sensitive ('s') columns first,
# then all case-insensitive ('i') columns; inside each group models bcg, bcl,
# bcp, then shots 1 and 2, then p, r, f1. The names are generated instead of
# hand-typed, and the dead `results = pd.DataFrame()` assignment is gone.
results = pd.DataFrame(
    columns=['regulatory'] + [
        f'{model}_{mode}_{metric}_{shot}'
        for mode in ('s', 'i')
        for model in ('bcg', 'bcl', 'bcp')
        for shot in ('1', '2')
        for metric in ('p', 'r', 'f1')
    ]
)

In [93]:
# Gold data for the body section, loaded with `datasets.load_from_disk`;
# `test_1` reads `test_body[i]['triples']` from it.
test_body_name = "dataset-surface-info/body-struktur/body-struktur-1"
test_body = load_from_disk(test_body_name)

In [99]:
# Body-section prediction tables, one CSV per model. `test_1` reads the
# 'regulatory' and 'ttl_<shot>' columns of each.
# NOTE(review): the '_post' suffix presumably marks post-processed output -- confirm.
bc_gemma = pd.read_csv('results/new/BC_gemma_post.csv')
bc_llama = pd.read_csv('results/new/BC_llama_post.csv')
bc_phi = pd.read_csv('results/new/BC_phi_post.csv')

In [None]:
# Body section, bc_gemma predictions, shot '1'.
test_1(name='bcg', df=bc_gemma, dataset=test_body, shot='1', results=results, part='bc')

In [None]:
# Body section, bc_gemma predictions, shot '2'.
test_1(name='bcg', df=bc_gemma, dataset=test_body, shot='2', results=results, part='bc')

In [None]:
# Body section, bc_llama predictions, shot '1'.
test_1(name='bcl', df=bc_llama, dataset=test_body, shot='1', results=results, part='bc')

In [None]:
# Body section, bc_llama predictions, shot '2'.
test_1(name='bcl', df=bc_llama, dataset=test_body, shot='2', results=results, part='bc')

In [None]:
# Body section, bc_phi predictions, shot '1'.
test_1(name='bcp', df=bc_phi, dataset=test_body, shot='1', results=results, part='bc')

In [None]:
# Body section, bc_phi predictions, shot '2'.
test_1(name='bcp', df=bc_phi, dataset=test_body, shot='2', results=results, part='bc')