In [1]:
from smart_dataset.evaluation.dbpedia.evaluate import evaluate, load_ground_truth, load_system_output, load_type_hierarchy
import json
import pandas as pd
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.exceptions import NotFittedError
from scipy.sparse import hstack

In [2]:
# Import the type-centric baseline results
type_hier = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 761 types loaded (max depth: 7)


In [3]:
# From ground truth:
# ("What is the name of the opera based on Twelfth Night ?", "dbo:Opera", 1)
# ("What is the name of the opera based on Twelfth Night ?", "dbo:MusicalWork", 1)
# ("What is the name of the opera based on Twelfth Night ?", "dbo:Work", 1)
# From type centric baseline results
# {"id": "dbpedia_14427", "question": "What is the name of the opera based on Twelfth Night ?", "category": "resource", "type": ["dbo:Album", "dbo:Sound", "dbo:Song", "dbo:Book", "dbo:TelevisionEpisode"]}
# ("What is the name of the opera based on Twelfth Night ?", "dbo:Album", 0)
# ("What is the name of the opera based on Twelfth Night ?", "dbo:Song", 0)
# ("What is the name of the opera based on Twelfth Night ?", "dbo:Sound", 0) <- 0 as long as not on same branch

# make one BOW sparse matrix for questions, and one for types, concatenate them horizontally

In [4]:
# load the SMART task training data
train_filename = './data/smarttask_dbpedia_train.json'
df = pd.read_json(train_filename).dropna(subset=['question', 'type'])
df

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."
...,...,...,...,...
17566,dbpedia_7462,Is the flexural strain at break of the acrylon...,boolean,[boolean]
17567,dbpedia_17610,Where did Hilary Putnam receive their Ph.D.?,resource,"[dbo:University, dbo:EducationalInstitution, d..."
17568,dbpedia_505,Who replaced Charles Evans Hughes as the Chief...,resource,"[dbo:Person, dbo:Agent]"
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N..."


In [5]:
# using the DBpedia ontology provided by the SMART task return a list containing the types 
# AND their parents that are in the input type list
def expand_with_parents(type_hierarchy: dict, dbo_types: list) -> list:
    # take a list of dbpedia types and expand it to include the types' parents
    type_set = set(dbo_types)
    for typ in dbo_types:
        if typ not in type_hierarchy.keys():
            type_set.add(typ)
            continue

        typ2=typ
        while type_hierarchy[typ2]['parent'] != 'owl:Thing':
            typ2 =  type_hierarchy[typ2]['parent']
            type_set.add(typ2)
    return list(type_set)

In [6]:
# build a training set using the ground truth from the train set as binary relevant results
df_train = df[df['category'] == 'resource'].copy()
df_train['type'] = df_train['type'].apply(lambda x: expand_with_parents(type_hier[0], x))
df_train = df_train.explode('type').dropna(subset=['type'])
df_train['relevance'] = 1
df_train

Unnamed: 0,id,question,category,type,relevance
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:Work,1
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:MusicalWork,1
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:Opera,1
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,dbo:Agent,1
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,dbo:EducationalInstitution,1
...,...,...,...,...,...
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:River,1
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:NaturalPlace,1
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:Location,1
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:BodyOfWater,1


In [7]:
# Get negative training records from baseline system output types (ones that don't also appear in ground truth)
system_outputs = {'TC': load_system_output('./data/baseline_TC_results_train.json'), 'EC': load_system_output('./data/baseline_EC_results_train.json')}
neg_lst_tc = list()
neg_lst_ec = list()
for system, so in system_outputs.items():
    for key, values in so.items():
        if values['category'] == 'resource' and len(values['type']) > 0:
            system_types = values['type']
            system_types = expand_with_parents(type_hier[0], system_types)
            gt_types = df[df['id'] == key]['type'].tolist()[0]
            question = df[df['id'] == key]['question'].tolist()[0]
            neg_types = list(set(system_types).difference(set(gt_types)))
            for neg_type in neg_types:
                if system == 'TC':
                    neg_lst_tc.append({'id': key, 'question': question, 'category': 'resource', 'type':neg_type, 'relevance':0})
                elif system == 'EC':
                    neg_lst_ec.append({'id': key, 'question': question, 'category': 'resource', 'type':neg_type, 'relevance':0})
df_train_tc = pd.concat([df_train, pd.DataFrame(neg_lst_tc)])
df_train_ec = pd.concat([df_train, pd.DataFrame(neg_lst_ec)])  

Loading system predictions from ./data/baseline_TC_results_train.json... 
   17297 predictions loaded
Loading system predictions from ./data/baseline_EC_results_train.json... 
   17254 predictions loaded


In [8]:
# obtain sparse matrices for the BOW representations of the questions
count_vec_tc = CountVectorizer()
x_train_tc = count_vec_tc.fit_transform(df_train_tc['question'])
count_vec_ec = CountVectorizer()
x_train_ec = count_vec_ec.fit_transform(df_train_ec['question'])
print(x_train_tc.shape, x_train_ec.shape)


(113897, 13020) (82700, 13020)


In [9]:
# obtain sparse matrices for the types of the training records
count_vec_types_tc = CountVectorizer()
count_vec_types_ec = CountVectorizer()
x_train_types_tc = count_vec_types_tc.fit_transform(df_train_tc['type'])
x_train_types_ec = count_vec_types_ec.fit_transform(df_train_ec['type'])
print(x_train_types_tc.shape, x_train_types_ec.shape)

(113897, 466) (82700, 473)


In [10]:
# combine to a single query-type training vector
x_train_ec = hstack([x_train_ec, x_train_types_ec])
x_train_tc = hstack([x_train_tc, x_train_types_tc])
print(x_train_tc.shape, x_train_ec.shape)

(113897, 13486) (82700, 13493)


In [11]:
# instantiate logistic regression relevance classifiers
clf_ec = LogisticRegression(random_state=0, solver='saga', max_iter=1000).fit(x_train_ec, df_train_ec['relevance'].to_list())
clf_tc = LogisticRegression(random_state=0, solver='saga', max_iter=1000).fit(x_train_tc, df_train_tc['relevance'].to_list())

In [12]:
# instantiate random forest relevance classifiers
rfm_ec = RandomForestClassifier(n_jobs=10).fit(x_train_ec, df_train_ec['relevance'].to_list())
rfm_tc = RandomForestClassifier(n_jobs=10).fit(x_train_tc, df_train_tc['relevance'].to_list())

In [13]:
# instantiate naive bayes relevance classifiers
mnb_ec = MultinomialNB().fit(x_train_ec, df_train_ec['relevance'].to_list())
mnb_tc = MultinomialNB().fit(x_train_tc, df_train_tc['relevance'].to_list())

In [14]:
# go through the baseline results one by one, expand types to include the parents, add any types above threshold to replacement types
system_outputs = {'TC': load_system_output('./data/baseline_TC_results_test.json'), 'EC': load_system_output('./data/baseline_EC_results_test.json')}
test_questions = pd.read_json(open('./data/smarttask_dbpedia_test_questions.json'))
for system, so in system_outputs.items():
    ltr_so_lr = list() # logistic regression
    ltr_so_rf = list() # random forest
    ltr_so_nb = list() # naive bayes
    for key, values in so.items():
        question = test_questions[test_questions['id'] == key]['question'].tolist()[0]
        new_types_lr = set()
        new_types_rf = set()
        new_types_nb = set()
        if values['category'] == 'resource':
            types = values['type']
            for typ in expand_with_parents(type_hier[0], types):
                if system == 'TC':
                    query_doc_vec = hstack([count_vec_tc.transform([question]), count_vec_types_tc.transform([typ])])
                    proba = clf_tc.predict_proba(query_doc_vec)[0][1]
                    if proba > 0.01:
                        new_types_lr.add(typ)
                    rf_result = rfm_tc.predict_proba(query_doc_vec)[0][1]
                    if rf_result > 0.2:
                        new_types_rf.add(typ)
                    nb_result = mnb_tc.predict_proba(query_doc_vec)[0][1]
                    if nb_result > 0.2:
                        new_types_nb.add(typ)
                elif system == 'EC':
                    try:
                        query_doc_vec = hstack([count_vec_ec.transform([question]), count_vec_types_ec.transform([typ])])
                    except NotFittedError:
                        continue
                    proba = clf_ec.predict_proba(query_doc_vec)[0][1]
                    if proba > 0.01:
                        new_types_lr.add(typ)
                    rf_result = rfm_ec.predict_proba(query_doc_vec)[0][1]
                    if rf_result > 0.2:
                        new_types_rf.add(typ)
                    nb_result = mnb_ec.predict_proba(query_doc_vec)[0][1]
                    if nb_result > 0.2:
                        new_types_nb.add(typ)
        ltr_so_lr.append({'id': key, 'question': question, 'category': values['category'], 'type': list(new_types_lr) if len(new_types_lr) > 0 else values['type'] })
        ltr_so_rf.append({'id': key, 'question': question, 'category': values['category'], 'type': list(new_types_rf) if len(new_types_rf) > 0 else values['type'] })
        ltr_so_nb.append({'id': key, 'question': question, 'category': values['category'], 'type': list(new_types_nb) if len(new_types_rf) > 0 else values['type'] })

    with open('./data/baseline_' + system + '_results_test_ltr_lr.json', 'w') as outfile:
        json.dump(ltr_so_lr, outfile)
    with open('./data/baseline_' + system + '_results_test_ltr_rf.json', 'w') as outfile:
        json.dump(ltr_so_rf, outfile)
    with open('./data/baseline_' + system + '_results_test_ltr_nb.json', 'w') as outfile:
        json.dump(ltr_so_nb, outfile)


Loading system predictions from ./data/baseline_TC_results_test.json... 
   4369 predictions loaded
Loading system predictions from ./data/baseline_EC_results_test.json... 
   4369 predictions loaded


In [18]:
# LOGISTIC REGRESSION RESULTS
gt = load_ground_truth('./data/smarttask_dbpedia_test.json', type_hier[0].keys())
so = load_system_output('./data/baseline_TC_results_test_ltr_lr.json')
print('TYPE CENTRIC')
evaluate(so, gt, type_hier[0], 7)

so = load_system_output('./data/baseline_EC_results_test_ltr_lr.json')
print('ENTITY CENTRIC')
evaluate(so, gt, type_hier[0], 7)

Loading ground truth from ./data/smarttask_dbpedia_test.json... 
   4369 questions loaded
Loading system predictions from ./data/baseline_TC_results_test_ltr_lr.json... 
   4369 predictions loaded
TYPE CENTRIC


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.526
  NDCG@10: 0.539
Loading system predictions from ./data/baseline_EC_results_test_ltr_lr.json... 
   4369 predictions loaded
ENTITY CENTRIC


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.550
  NDCG@10: 0.546


In [19]:
# RANDOM FOREST RESULTS
gt = load_ground_truth('./data/smarttask_dbpedia_test.json', type_hier[0].keys())
so = load_system_output('./data/baseline_TC_results_test_ltr_rf.json')
print('TYPE CENTRIC')
evaluate(so, gt, type_hier[0], 7)

so = load_system_output('./data/baseline_EC_results_test_ltr_rf.json')
print('ENTITY CENTRIC')
evaluate(so, gt, type_hier[0], 7)

Loading ground truth from ./data/smarttask_dbpedia_test.json... 
   4369 questions loaded
Loading system predictions from ./data/baseline_TC_results_test_ltr_rf.json... 
   4369 predictions loaded
TYPE CENTRIC


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.576
  NDCG@10: 0.552
Loading system predictions from ./data/baseline_EC_results_test_ltr_rf.json... 
   4369 predictions loaded
ENTITY CENTRIC


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.571
  NDCG@10: 0.552


In [20]:
# NAIVE BAYES RESULTS
gt = load_ground_truth('./data/smarttask_dbpedia_test.json', type_hier[0].keys())
so = load_system_output('./data/baseline_TC_results_test_ltr_nb.json')
print('TYPE CENTRIC')
evaluate(so, gt, type_hier[0], 7)

so = load_system_output('./data/baseline_EC_results_test_ltr_nb.json')
print('ENTITY CENTRIC')
evaluate(so, gt, type_hier[0], 7)

Loading ground truth from ./data/smarttask_dbpedia_test.json... 
   4369 questions loaded
Loading system predictions from ./data/baseline_TC_results_test_ltr_nb.json... 
   4369 predictions loaded
TYPE CENTRIC


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.516
  NDCG@10: 0.500
Loading system predictions from ./data/baseline_EC_results_test_ltr_nb.json... 
   4369 predictions loaded
ENTITY CENTRIC


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.537
  NDCG@10: 0.521
