In [1]:
import numpy as np
from dataLoader import DataLoader

In [2]:
# Path to the BioASQ training set JSON (relative to the working directory).
input_path = 'input/BioASQ-trainingDataset6b.json'
data = DataLoader(input_path)
# Load the NER entities; the message this call prints shows them being read
# from nerCache/BioASQ-trainingDataset6b.json.
data.load_ner_entities()

Loading ner entities from file: nerCache/BioASQ-trainingDataset6b.json


In [3]:
# Execute retrieval_model.py so its top-level names become available in this
# notebook (IPython automagic form of `%run`).
# NOTE(review): presumably this is what defines get_ranked_sentences, which
# later cells call without importing it -- confirm.
run retrieval_model.py

In [4]:
# Collect both retrieval scores per snippet sentence for the item at index 0.
q = data[0]
scores = {}
# First pass: sentence -> BM25 score.
for sentence, score in get_ranked_sentences(q.question, q.snippet_sentences, 'BM25'):
    scores[sentence] = score
# Second pass: replace each entry with [BM25 score, Indri score]. This reads
# scores[sentence], so every sentence returned for 'Indri' must also have been
# returned for 'BM25', otherwise a KeyError is raised.
for sentence, score in get_ranked_sentences(q.question, q.snippet_sentences, 'Indri'):
    scores[sentence] = [scores[sentence], score]

In [5]:
# (number of distinct snippet sentences, total number of snippet sentences);
# a gap between the two means the snippets contain duplicate sentences.
len(np.unique(q.snippet_sentences).tolist()), len(q.snippet_sentences)

(15, 20)

In [6]:
# Number of entries the BM25 ranking returns for this question, for comparison
# with the distinct/total sentence counts computed in the previous cell.
len(get_ranked_sentences(q.question, q.snippet_sentences, 'BM25'))

15

In [7]:
# Inspect the question's ranked sentences; per the output, each entry is a dict
# with 'BM25', 'Indri' and 'text' keys.
q.ranked_sentences()

[{'BM25': 2.3188813786746474,
  'Indri': 0.41422164245653387,
  'text': 'Hirschsprung disease (HSCR) is a multifactorial, non-mendelian disorder in which rare high-penetrance coding sequence mutations in the receptor tyrosine kinase RET contribute to risk in combination with mutations at other genes'},
 {'BM25': 1.9989343890926352,
  'Indri': 0.41446496181644588,
  'text': "Chromosomal and related Mendelian syndromes associated with Hirschsprung's disease"},
 {'BM25': 1.9633290741310481,
  'Indri': 0.41442363788439585,
  'text': "The majority of the identified genes are related to Mendelian syndromic forms of Hirschsprung's disease"},
 {'BM25': 1.8805000486638965,
  'Indri': 0.41432047225542812,
  'text': "In this study, we review the identification of genes and loci involved in the non-syndromic common form and syndromic Mendelian forms of Hirschsprung's disease"},
 {'BM25': 1.6718072220029261,
  'Indri': 0.41425099990895259,
  'text': 'On the basis of a skewed sex-ratio (M/F = 4/1) a

In [8]:
import constants as C

In [12]:
# Same inspection for the first question of type C.FACTOID_TYPE.
data.get_questions_of_type(C.FACTOID_TYPE)[0].ranked_sentences()

[{'BM25': 0.34542751083360856,
  'Indri': 0.19294133515825904,
  'text': 'Acrokeratosis paraneoplastica of Bazex'},
 {'BM25': 0.34431997105739548,
  'Indri': 0.19293169932806986,
  'text': 'Bazex syndrome: acrokeratosis paraneoplastica'},
 {'BM25': 0.34431997105739548,
  'Indri': 0.19293169932806986,
  'text': 'Acrokeratosis paraneoplastica: Bazex syndrome'},
 {'BM25': 0.34431997105739548,
  'Indri': 0.19293169932806986,
  'text': "Acrokeratosis paraneoplastica (Bazex' syndrome)"},
 {'BM25': 0.34212613889472798,
  'Indri': 0.1929124392168288,
  'text': 'We diagnosed a minor form of acrokeratosis paraneoplastica Bazex'},
 {'BM25': 0.34103971061298166,
  'Indri': 0.19290281493116371,
  'text': "Acrokeratosis paraneoplastica (Bazex's syndrome): association with liposarcoma"},
 {'BM25': 0.33996018504925696,
  'Indri': 0.19289319448906314,
  'text': 'Acrokeratosis paraneoplastica of Bazex: report of a case in a young black woman'},
 {'BM25': 0.33996018504925696,
  'Indri': 0.192893194489063

In [16]:
def overlap_score(entity1, entity2):
    """Return the fraction of entity1's distinct words that also occur in entity2.

    Both strings are split on whitespace and compared as sets of words, so
    repeated words count once and matching is exact (case-sensitive). The score
    is normalised by the number of distinct words in ``entity1`` only, which
    makes it asymmetric: ``overlap_score(a, b)`` need not equal
    ``overlap_score(b, a)``.

    Args:
        entity1: String whose words are looked up in ``entity2``.
        entity2: String providing the reference set of words.

    Returns:
        A float in [0.0, 1.0]; 0.0 when ``entity1`` contains no words (empty
        or whitespace-only).
    """
    # split() without a separator collapses runs of whitespace and drops
    # leading/trailing whitespace. split(' ') would emit '' tokens for
    # repeated spaces, and those would be counted as a (shared) word.
    words1 = set(entity1.split())
    if not words1:
        return 0.0
    words2 = set(entity2.split())
    # float() keeps this a true division under Python 2 as well.
    return float(len(words1 & words2)) / float(len(words1))

def gold_candidate_rank(candidates, gold_answers):
    """Rank candidate entities by their best word overlap with the gold answers.

    Each candidate's score is the maximum ``overlap_score`` between its entity
    text and any gold answer. Distinct entity texts are sorted by
    ``(score, text)`` in descending order and numbered from 0 (best).

    Args:
        candidates: Sequence of mappings; the entity text is read from each
            one via the ``C.ENTITY`` key.
        gold_answers: Iterable of reference answer strings. May be empty, in
            which case every candidate scores 0.0.

    Returns:
        A list of integer ranks, one per input candidate and in input order.
        Candidates sharing the same entity text share the same rank.
    """
    entities = [candidate[C.ENTITY] for candidate in candidates]

    # Score every distinct entity text exactly once. Ranking the raw candidate
    # list would give duplicated texts several slots in the ordering, pushing
    # ranks upward and leaving gaps (the best entity might not get rank 0).
    best_overlap = {}
    for entity in entities:
        if entity in best_overlap:
            continue
        overlaps = [overlap_score(entity, answer) for answer in gold_answers]
        # Guard the empty case: taking a max over no gold answers would raise.
        best_overlap[entity] = max(overlaps) if overlaps else 0.0

    # Descending by score; ties are broken by entity text (also descending).
    ordered = sorted(
        ((score, entity) for entity, score in best_overlap.items()),
        reverse=True,
    )
    ranks = {entity: rank for rank, (_, entity) in enumerate(ordered)}
    return [ranks[entity] for entity in entities]

def get_features(question, ranked_sentences):
    """Build the feature rows and rank labels for one question's candidates.

    Args:
        question: Object exposing ``question``, ``snippet_ner_entities`` and
            ``exact_answer_ref``.
        ranked_sentences: Ranked sentences passed through unchanged to
            ``factoid_letor_features.all_features``.

    Returns:
        A ``(X, y)`` tuple: ``X`` is a list with one feature row per candidate
        entity, ``y`` is the list of ranks from ``gold_candidate_rank``, in the
        same candidate order.
    """
    entities = question.snippet_ner_entities

    feature_rows = []
    for entity in entities:
        row = factoid_letor_features.all_features(
            question.question, ranked_sentences, entity
        )
        feature_rows.append(row)
    # Round-trip through a NumPy array so the rows come back as plain lists.
    feature_matrix = np.array(feature_rows)

    labels = gold_candidate_rank(entities, question.exact_answer_ref)
    return feature_matrix.tolist(), labels

In [2]:
from dataLoader import DataLoader
import constants as C
import factoid_letor_features
import numpy as np
import json

In [3]:
# Reload the BioASQ training set and its NER entities, then keep only the
# questions of type C.FACTOID_TYPE.
file_name = 'input/BioASQ-trainingDataset6b.json'
data = DataLoader(file_name)
data.load_ner_entities()
questions = data.get_questions_of_type(C.FACTOID_TYPE)

Loading ner entities from file: nerCache/BioASQ-trainingDataset6b.json


In [4]:
# Inspect one candidate entity (index 1) of the first factoid question; per
# the output it is a dict with 'entity', 'source' and 'type' keys.
q = questions[0]
entity = q.snippet_ner_entities[1]
entity

{u'entity': u'neoplasia', u'source': u'PubTator', u'type': u'Disease'}

In [5]:
# Build the feature rows (X) and rank labels (y) for this question's candidates.
# NOTE: get_features is defined in a separate cell of this notebook (the one
# with execution count 16); that cell must be run in the current kernel first.
ranked_sentences = q.ranked_sentences()
X, y = get_features(q, ranked_sentences)

In [22]:
# Sanity check of overlap_score with a repeated word in the first argument.
# NOTE(review): with overlap_score as defined in this notebook (one of the two
# distinct words 'Gaeb'/'is' matches) this evaluates to 0.5; the recorded 1.0
# presumably came from a different version of the function -- re-run to confirm.
overlap_score('Gaeb is is', 'is')

1.0