In [17]:
from rdflib import Graph
from collections import defaultdict
import re
import string
import json
from smart_dataset.evaluation.dbpedia.evaluate import evaluate, load_ground_truth, load_system_output, load_type_hierarchy
from log_reg_categorizer import LogRegCategorizer
import elasticsearch as es

RESOURCE_URL = 'http://dbpedia.org/resource/'

In [2]:
## READ IN dbpedia short_abstracts_en.ttl
short_abstracts = defaultdict(str)
g = Graph()

g.parse('./data/short_abstracts_en.ttl', format='n3')
for subj, pred, obj in g:
    assert(str(pred) == 'http://www.w3.org/2000/01/rdf-schema#comment') # assuming all predicates in file are same
    short_abstracts[str(subj)] = str(obj)

short_abstracts = dict(short_abstracts)

In [3]:
## READ IN smart_dataset competition dbpedia types
type_hier = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 761 types loaded (max depth: 7)


In [4]:
## READ IN dbpedia instance_types_en.ttl
type_dict = defaultdict(lambda: defaultdict(str))
g = Graph()

g.parse('./data/instance_types_en.ttl', format='n3')
for subj, pred, obj in g:
    assert subj.startswith(RESOURCE_URL) # Assuming entire file contains resource type predicates
    assert str(pred) == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

    # convert object into e.g. 'dbo:Person', 'dbo:Place', 'owl:Thing' format or skip
    if obj.startswith('http://dbpedia.org/ontology/'):
        smart_type = 'dbo:' + obj.split('http://dbpedia.org/ontology/')[1]
    elif str(obj) == 'http://www.w3.org/2002/07/owl#Thing':
        smart_type = 'owl:Thing'
    else:
        continue

    # split double underscore from remaining string, check for abstract of most specific (rightmost)
    # sub resource type, and continue moving left through the terms until one is found
    resource_name = re.sub(r'_+\d+', '', subj.split(RESOURCE_URL)[1])
    resource_name_split = resource_name.split('__')
    i = len(resource_name_split)
    while i >= 1:
        if RESOURCE_URL + resource_name_split[i-1] in short_abstracts.keys():
            resource_name = resource_name_split[i-1]
            break
        i -= 1

    # write to dictionary if type is contained in the ontology
    if smart_type in type_hier[0].keys():
        type_dict[smart_type][resource_name.replace('_', ' ')] = short_abstracts.get(RESOURCE_URL + resource_name, '')
        
type_dict = dict(type_dict)

In [5]:
' '.join([' '.join([key, value]) for key, value in type_dict['dbo:Ginkgo'].items()])

'Chiropteris Chiropteris is an extinct genus that existed from Permian to Triassic. Ginkgoales Ginkgoales is a plant order containing only one extant species: Ginkgo biloba, the ginkgo tree. It is monotypic (the only taxon) within the class Ginkgoopsida, which itself is monotypic within the division Ginkgophyta. The order includes five families, of which only Ginkgoaceae remains extant. Ginkgoites Ginkgoites refers to extinct plants belonging to Ginkgoaceae. Fossils of these plants have been found around the globe during the Triassic, Jurassic and Cretaceous. The name was created as a form genus in 1919 by Albert Seward who stated: "I ... propose to employ the name Ginkgoites for leaves that it is believed belong either to plants generically identical with Ginkgo or to very closely allied types". Schmeissneria Schmeissneria is a possible early plant recorded from the Lower Jurassic of Europe and the Middle Jurassic of China, traditionally included in the Ginkgophyta. Baieroxylon Baiero

In [6]:
INDEX_SETTINGS = {    
    'settings' : {
        'index' : {
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        },
        'analysis': {
            'analyzer': {
                'my_english_analyzer': {
                    'type': "custom",
                    'tokenizer': "standard",
                    'stopwords': "_english_",
                    'filter': [
                        "lowercase",
                        "english_stop",
                        "filter_english_minimal"
                    ]                
                }
            },
            'filter' : {
                'filter_english_minimal' : {
                    'type': "stemmer",
                    'name': "minimal_english"
                },
                'english_stop': {
                    'type': "stop",
                    'stopwords': "_english_"
                }
            },
        }
    },
    'mappings': {
        'properties': {
            'catch_all': {
                'type': "text",
                'term_vector': "with_positions",
                'analyzer': "my_english_analyzer"
            },

        }
    }
}


In [7]:
es = es.Elasticsearch()
INDEX_NAME = 'dbpediatypes'

In [8]:
if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)



{'acknowledged': True, 'shards_acknowledged': True, 'index': 'dbpediatypes'}

In [9]:
for type_id, entity_dict in type_dict.items():
    es.index(index=INDEX_NAME, id=type_id, body={'catch_all': ' '.join([' '.join([key, value]) for key, value in type_dict[type_id].items()])})

In [10]:
res = es.search(index=INDEX_NAME, q='What is the capital of Norway?', size=5)

In [11]:
for hit in res["hits"]["hits"]:
    print("Doc ID: %3r  Score: %5.2f" % (hit["_id"], hit["_score"]))

Doc ID: 'dbo:Country'  Score:  2.47
Doc ID: 'dbo:MilitaryConflict'  Score:  2.46
Doc ID: 'dbo:AdministrativeRegion'  Score:  2.46
Doc ID: 'dbo:Building'  Score:  2.45
Doc ID: 'dbo:Town'  Score:  2.45


In [12]:
# load test data
test_questions = json.load(open('./data/smarttask_dbpedia_test_questions.json'))

In [13]:
# import category classification module
lrc = LogRegCategorizer('./data/smarttask_dbpedia_train.json')

In [14]:
baseline_output = list()

for question in test_questions:
    q_id = question['id']
    q_text = question['question']
    q_cat = lrc.predict([q_text])[0]
    if q_cat == 'boolean':
        q_type = ['boolean']
    elif q_cat == 'literal':
        q_type = lrc.predict_literal_type([q_text]).tolist()
    elif q_cat == 'resource':
        res = es.search(index=INDEX_NAME, q=q_text.translate(str.maketrans('', '', string.punctuation)), size=5)
        q_type = [hit["_id"] for hit in res["hits"]["hits"]]
    else:
        q_type = None
    
    baseline_output.append({
        'id': q_id,
        'question': q_text,
        'category': q_cat,
        'type' : q_type
    })



In [19]:
with open('./data/baseline_TC_results_test.json', 'w') as outfile:
    json.dump(baseline_output, outfile)

In [21]:

so = load_system_output('./data/baseline_TC_results_test.json')
gt = load_ground_truth('./data/smarttask_dbpedia_test.json', type_hier[0].keys())
evaluate(so, gt, type_hier[0], 7)

Loading system predictions from ./data/baseline_type_cent_results_test.json... 
   4369 predictions loaded
Loading ground truth from ./data/smarttask_dbpedia_test.json... 
   4369 questions loaded


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.498
  NDCG@10: 0.477
