In [17]:
from smart_dataset.evaluation.dbpedia.evaluate import evaluate, load_ground_truth, load_system_output, load_type_hierarchy
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
import json
import pandas as pd

In [18]:
type_hier = load_type_hierarchy('./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv')
gt = load_ground_truth('./data/smarttask_dbpedia_test.json', type_hier[0].keys())
so = load_system_output('./data/baseline_entity_cent_results_train.json')

Loading type hierarchy from ./smart_dataset/evaluation/dbpedia/dbpedia_types.tsv... 761 types loaded (max depth: 7)
Loading ground truth from ./data/smarttask_dbpedia_test.json... 
   4369 questions loaded
Loading system predictions from ./data/baseline_entity_cent_results_train.json... 
   17254 predictions loaded


In [19]:
train_filename = './data/smarttask_dbpedia_train.json'
with open(train_filename) as json_file:
    train_qs = json.load(json_file)

In [20]:
df = pd.read_json(train_filename).dropna(subset=['question', 'type'])
df

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."
...,...,...,...,...
17566,dbpedia_7462,Is the flexural strain at break of the acrylon...,boolean,[boolean]
17567,dbpedia_17610,Where did Hilary Putnam receive their Ph.D.?,resource,"[dbo:University, dbo:EducationalInstitution, d..."
17568,dbpedia_505,Who replaced Charles Evans Hughes as the Chief...,resource,"[dbo:Person, dbo:Agent]"
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N..."


In [22]:
def expand_with_parents(type_hierarchy: dict, dbo_types: list) -> list:
    # take a list of dbpedia types and expand it to include the types' parents
    type_set = set(dbo_types)
    for typ in dbo_types:
        if typ not in type_hierarchy.keys():
            type_set.add(typ)
            continue

        typ2=typ
        while type_hierarchy[typ2]['parent'] != 'owl:Thing':
            typ2 =  type_hierarchy[typ2]['parent']
            type_set.add(typ2)
    return list(type_set)

In [23]:
df_train = df[df['category'] == 'resource']
df_train['type'] = df_train['type'].apply(lambda x: expand_with_parents(type_hier[0], x))
df_train = df_train.explode('type').dropna(subset=['type'])
df_train['relevance'] = 1
df_train

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


Unnamed: 0,id,question,category,type,relevance
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:MusicalWork,1
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:Work,1
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,dbo:Opera,1
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,dbo:Organisation,1
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,dbo:EducationalInstitution,1
...,...,...,...,...,...
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:Place,1
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:NaturalPlace,1
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:River,1
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,dbo:Location,1


In [24]:
# Get negative training records from baseline system output types (ones that don't also appear in ground truth)
neg_lst = list()

for key, values in so.items():
    if values['category'] == 'resource' and len(values['type']) > 0:
        system_types = values['type']
        system_types = expand_with_parents(type_hier[0], system_types)
        gt_types = df[df['id'] == key]['type'].tolist()[0]
        question = df[df['id'] == key]['question'].tolist()[0]
        neg_types = list(set(system_types).difference(set(gt_types)))
        for neg_type in neg_types:
            neg_lst.append({'id': key, 'question': question, 'category': 'resource', 'type':neg_type, 'relevance':0})
df_train = pd.concat([df_train, pd.DataFrame(neg_lst)])

In [25]:
count_vectorizer = CountVectorizer()
x_train = count_vectorizer.fit_transform(df_train['question'])
x_train

<82700x13020 sparse matrix of type '<class 'numpy.int64'>'
	with 806237 stored elements in Compressed Sparse Row format>

In [26]:
count_vectorizer_types = CountVectorizer()
x_train_types = count_vectorizer_types.fit_transform(df_train['type'])
x_train

<82700x13020 sparse matrix of type '<class 'numpy.int64'>'
	with 806237 stored elements in Compressed Sparse Row format>

In [27]:
from scipy.sparse import  hstack
x_train = hstack([x_train, x_train_types])
x_train

<82700x13493 sparse matrix of type '<class 'numpy.int64'>'
	with 971715 stored elements in COOrdinate format>

In [28]:
clf = LogisticRegression(random_state=0, solver='saga', max_iter=1000).fit(x_train, df_train['relevance'].to_list())
df

Unnamed: 0,id,question,category,type
0,dbpedia_1177,Was Jacqueline Kennedy Onassis a follower of M...,boolean,[boolean]
1,dbpedia_14427,What is the name of the opera based on Twelfth...,resource,"[dbo:Opera, dbo:MusicalWork, dbo:Work]"
2,dbpedia_16615,When did Lena Horne receive the Grammy Award f...,literal,[date]
3,dbpedia_23480,Do Prince Harry and Prince William have the sa...,boolean,[boolean]
4,dbpedia_3681,What is the subsidiary company working for Leo...,resource,"[dbo:EducationalInstitution, dbo:Organisation,..."
...,...,...,...,...
17566,dbpedia_7462,Is the flexural strain at break of the acrylon...,boolean,[boolean]
17567,dbpedia_17610,Where did Hilary Putnam receive their Ph.D.?,resource,"[dbo:University, dbo:EducationalInstitution, d..."
17568,dbpedia_505,Who replaced Charles Evans Hughes as the Chief...,resource,"[dbo:Person, dbo:Agent]"
17569,dbpedia_18989,Name the river with source as Columbia Lake an...,resource,"[dbo:River, dbo:Stream, dbo:BodyOfWater, dbo:N..."


In [29]:
# go through the baseline results one by one, expand types to include the parents, add any types above threshold to replacement types
ltr_so = list()
so = load_system_output('./data/baseline_entity_cent_results.json')
test_questions = pd.read_json(open('./data/smarttask_dbpedia_test_questions.json'))

for key, values in so.items():
    question = test_questions[test_questions['id'] == key]['question'].tolist()[0]
    new_types = set()
    if values['category'] == 'resource':
        types = values['type']

        for typ in expand_with_parents(type_hier[0], types):
            proba = clf.predict_proba(hstack([count_vectorizer.transform([question]), count_vectorizer_types.transform([typ])]))[0][1]
            if proba > 0.01:
                new_types.add(typ)
    ltr_so.append({'id': key, 'question': question, 'category': values['category'], 'type': list(new_types) if len(new_types) > 0 else values['type'] })

with open('./data/baseline_entity_cent_results_test_ltr.json', 'w') as outfile:
    json.dump(ltr_so, outfile)

Loading system predictions from ./data/baseline_entity_cent_results.json... 
   4369 predictions loaded


In [30]:
from smart_dataset.evaluation.dbpedia.evaluate import evaluate, load_ground_truth, load_system_output
so = load_system_output('./data/baseline_entity_cent_results_test_ltr.json')
gt = load_ground_truth('./data/smarttask_dbpedia_test.json', type_hier[0].keys())
evaluate(so, gt, type_hier[0], 7)

Loading system predictions from ./data/baseline_entity_cent_results_test_ltr.json... 
   4369 predictions loaded
Loading ground truth from ./data/smarttask_dbpedia_test.json... 
   4369 questions loaded


Evaluation results:
-------------------
Category prediction (based on 4369 questions)
  Accuracy: 0.939
Type ranking (based on 4369 questions)
  NDCG@5:  0.553
  NDCG@10: 0.548
