In [None]:
from rdflib import Graph
from collections import defaultdict
import re

RESOURCE_URL = 'http://dbpedia.org/resource/'

In [None]:
## READ IN dbpedia short_abstracts_en.ttl
short_abstracts = defaultdict(str)
g = Graph()

g.parse('./data/short_abstracts_en.ttl', format='n3')
for subj, pred, obj in g:
    assert(str(pred) == 'http://www.w3.org/2000/01/rdf-schema#comment') # assuming all predicates in file are same
    short_abstracts[str(subj)] = str(obj)

short_abstracts = dict(short_abstracts)

In [14]:
## READ IN smart_dataset competition dbpedia types
from smart_dataset.evaluation.dbpedia.evaluate import load_type_hierarchy
type_hier = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')


Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 761 types loaded (max depth: 7)


In [None]:
## READ IN dbpedia instance_types_en.ttl

type_dict = defaultdict(lambda: defaultdict(str))
g = Graph()

g.parse('./data/instance_types_en.ttl', format='n3')
for subj, pred, obj in g:
    assert subj.startswith(RESOURCE_URL)
    assert str(pred) == 'http://www.w3.org/1999/02/22-rdf-syntax-ns#type'

    # convert object into e.g. 'dbo:Person', 'dbo:Place', 'owl:Thing' format or skip
    if obj.startswith('http://dbpedia.org/ontology/'):
        smart_type = 'dbo:' + obj.split('http://dbpedia.org/ontology/')[1]
    elif str(obj) == 'http://www.w3.org/2002/07/owl#Thing':
        smart_type = 'owl:Thing'
    else:
        continue

    resource_name = re.sub(r'_+\d+', '', subj.split(RESOURCE_URL)[1])
    resource_name_split = resource_name.split('__')
    i = len(resource_name_split)
    while i >= 1:
        if RESOURCE_URL + resource_name_split[i-1] in short_abstracts.keys():
            resource_name = resource_name_split[i-1]
            break
        i -= 1

    if smart_type in type_hier[0].keys():
        type_dict[smart_type][resource_name.replace('_', ' ')] = short_abstracts.get(RESOURCE_URL + resource_name, '')
        
type_dict = dict(type_dict)

In [None]:
' '.join([' '.join([key, value]) for key, value in type_dict['dbo:Ginkgo'].items()])


In [None]:
INDEX_SETTINGS = {    
    'settings' : {
        'index' : {
            "number_of_shards" : 1,
            "number_of_replicas" : 1
        },
        'analysis': {
            'analyzer': {
                'my_english_analyzer': {
                    'type': "custom",
                    'tokenizer': "standard",
                    'stopwords': "_english_",
                    'filter': [
                        "lowercase",
                        "english_stop",
                        "filter_english_minimal"
                    ]                
                }
            },
            'filter' : {
                'filter_english_minimal' : {
                    'type': "stemmer",
                    'name': "minimal_english"
                },
                'english_stop': {
                    'type': "stop",
                    'stopwords': "_english_"
                }
            },
        }
    },
    'mappings': {
        'properties': {
            'catch_all': {
                'type': "text",
                'term_vector': "with_positions",
                'analyzer': "my_english_analyzer"
            },

        }
    }
}


In [8]:
import elasticsearch as es
es = es.Elasticsearch()
INDEX_NAME = 'dbpediatypes'

In [None]:


if es.indices.exists(index=INDEX_NAME):
    es.indices.delete(index=INDEX_NAME)

es.indices.create(index=INDEX_NAME, body=INDEX_SETTINGS)

In [None]:
for type_id, entity_dict in type_dict.items():
    es.index(index=INDEX_NAME, id=type_id, body={'catch_all': ' '.join([' '.join([key, value]) for key, value in type_dict[type_id].items()])})

In [None]:
res = es.search(index=INDEX_NAME, q='What is the capital of Norway?', size=5)

In [None]:
for hit in res["hits"]["hits"]:
    print("Doc ID: %3r  Score: %5.2f" % (hit["_id"], hit["_score"]))

In [1]:
# load test data
import json
test_questions = json.load(open('./smart_dataset/datasets/DBpedia/smarttask_dbpedia_test_questions.json'))

In [2]:
from log_reg_categorizer import LogRegCategorizer
lrc = LogRegCategorizer()

In [11]:
import string
baseline_output = list()

for question in test_questions:
    q_id = question['id']
    q_text = question['question']
    q_cat = lrc.predict([q_text])[0]
    if q_cat == 'boolean':
        q_type = ['boolean']
    elif q_cat == 'literal':
        q_type = lrc.predict_literal_type([q_text]).tolist()
    elif q_cat == 'resource':
        res = es.search(index=INDEX_NAME, q=q_text.translate(str.maketrans('', '', string.punctuation)), size=5)
        q_type = [hit["_id"] for hit in res["hits"]["hits"]]
    else:
        q_type = None
    
    baseline_output.append({
        'id': q_id,
        'question': q_text,
        'category': q_cat,
        'type' : q_type
    })

[{'id': 'dbpedia_16015',
  'question': 'How many ingredients are in the grain} ?',
  'category': 'literal',
  'type': ['number']},
 {'id': 'dbpedia_3885',
  'question': 'Is the case fatality rate of Fournier gangrene fewer than 9.0?',
  'category': 'boolean',
  'type': ['boolean']},
 {'id': 'dbpedia_12907',
  'question': 'Does the shelf life of spinach equal 8?',
  'category': 'boolean',
  'type': ['boolean']},
 {'id': 'dbpedia_7955',
  'question': 'What sound does a pig make in the French language?',
  'category': 'resource',
  'type': ['dbo:Song', 'dbo:Book', 'dbo:Food', 'dbo:Album', 'dbo:Island']},
 {'id': 'dbpedia_2376',
  'question': 'When was Fergie completed his record label in Interscope records?',
  'category': 'literal',
  'type': ['date']},
 {'id': 'dbpedia_4197',
  'question': 'Which are the coordinates of easternmost point of Estonia?',
  'category': 'literal',
  'type': ['string']},
 {'id': 'dbpedia_22599',
  'question': 'Where did the war take place where one of the comm

In [12]:
with open('baseline_type_cent_results.json', 'w') as outfile:
    json.dump(baseline_output, outfile)

In [21]:
from smart_dataset.evaluation.dbpedia.evaluate import evaluate, load_ground_truth
gt = load_ground_truth('./smart_dataset/datasets/DBpedia/smarttask_dbpedia_test.json', type_hier)
evaluate(baseline_output, gt, type_hier, 3)

Loading ground truth from ./smart_dataset/datasets/DBpedia/smarttask_dbpedia_test.json... 
   4369 questions loaded


TypeError: list indices must be integers or slices, not str