## Preprocess the BioASQ dataset into the format specified in

https://github.com/dmis-lab/bioasq-biobert

In [None]:
!pip install beautifulsoup4

In [2]:
!unzip BioASQ-training11b.zip

Archive:  BioASQ-training11b.zip
   creating: BioASQ-training11b/
  inflating: BioASQ-training11b/README  
  inflating: BioASQ-training11b/training11b.json  


In [3]:
import json

# Load the raw BioASQ training file. The context manager closes the handle
# deterministically, and the explicit encoding avoids relying on the platform
# default when the text contains non-ASCII characters.
with open("BioASQ-training11b/training11b.json", encoding="utf-8") as f:
  x = json.load(f)

In [40]:
# Keep only the questions whose type is 'list', then show the first one to
# inspect the record layout.
list_questions = list(filter(lambda question: question['type'] == 'list', x['questions']))
list_questions[0]

{'body': 'List signaling molecules (ligands) that interact with the receptor EGFR?',
 'documents': ['http://www.ncbi.nlm.nih.gov/pubmed/23959273',
  'http://www.ncbi.nlm.nih.gov/pubmed/21514161',
  'http://www.ncbi.nlm.nih.gov/pubmed/23212918',
  'http://www.ncbi.nlm.nih.gov/pubmed/23888072',
  'http://www.ncbi.nlm.nih.gov/pubmed/23821377',
  'http://www.ncbi.nlm.nih.gov/pubmed/23099994',
  'http://www.ncbi.nlm.nih.gov/pubmed/22260327',
  'http://www.ncbi.nlm.nih.gov/pubmed/24204699',
  'http://www.ncbi.nlm.nih.gov/pubmed/24323361',
  'http://www.ncbi.nlm.nih.gov/pubmed/23089711',
  'http://www.ncbi.nlm.nih.gov/pubmed/23399900',
  'http://www.ncbi.nlm.nih.gov/pubmed/23382875',
  'http://www.ncbi.nlm.nih.gov/pubmed/23729230',
  'http://www.ncbi.nlm.nih.gov/pubmed/23787814',
  'http://www.ncbi.nlm.nih.gov/pubmed/24124521',
  'http://www.ncbi.nlm.nih.gov/pubmed/22247333'],
 'triples': [{'p': 'http://purl.uniprot.org/core/encodedBy',
   's': 'http://purl.uniprot.org/uniprot/Q9QX70',
   'o'

In [None]:
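# Disabled helper: fetches a PubMed page and extracts its abstract. It is not
# used by the preprocessing functions in this notebook.
# import requests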
# from bs4 import BeautifulSoup

# def get_abstract(url):
#   resp = requests.get(url).text
#   site = BeautifulSoup(resp, 'html.parser')
#   abstract = site.find(id="eng-abstract").p.string
#   return abstract.strip()

# get_abstract("http://www.ncbi.nlm.nih.gov/pubmed/24323361")

'Pituitary adenylate cyclase-activating polypeptide (PACAP), a neuropeptide with trophic and cytoprotective effects, has been shown to affect cell survival, proliferation, and also differentiation of various cell types. The high PACAP level in the milk and its changes during lactation suggest a possible effect of PACAP on the differentiation of mammary epithelial cells. Mammary cell differentiation is regulated by hormones, growth factors, cytokines/chemokines, and angiogenic proteins. In this study, differentiation was hormonally induced by lactogenic hormones in confluent cultures of HC11 mouse mammary epithelial cells. We investigated the effect of PACAP on mammary cell differentiation as well as release of cytokines, chemokines, and growth factors. Differentiation was assessed by expression analysis of the milk protein β-casein. Differentiation significantly decreased the secretion of interferon gammainduced protein (IP)-10, "regulated upon activation normal T cell expressed and pr

In [35]:
def preprocess_list(datum):
  """Convert one BioASQ 'list' question into SQuAD-style paragraph entries.

  For every (snippet, answer) pair in which the lower-cased answer occurs in
  the lower-cased snippet text, one paragraph with a single QA entry is
  produced. Only the first occurrence of an answer inside a snippet is
  recorded, and only the first synonym of each answer is searched for.

  Args:
    datum: question dict with keys 'id', 'body', 'snippets' (a list of dicts
      that each have a 'text' key) and 'exact_answer' (a list whose items are
      either a list of synonyms or a bare string).

  Returns:
    A list of {'qas': [entry], 'context': snippet_text} dicts. Entry ids have
    the form '<qid>_NNNN', numbered from 0001 in snippet-major order. The list
    is empty when no answer is found in any snippet.
  """
  processed = []
  qid = datum['id']
  snippets = datum['snippets']
  if not snippets:
    return processed

  # Normalise the answers once instead of once per snippet.
  answers = []
  for entry in datum['exact_answer']:
    if isinstance(entry, str):
      # A bare string is the answer itself; indexing it would keep only its
      # first character.
      first = entry
    else:
      # Normally a list of synonyms: use the first one, tolerate an empty list.
      first = entry[0] if entry else ''
    # str.find('') succeeds at index 0, which would fabricate an answer span,
    # so empty answers are dropped.
    if first:
      answers.append(first.lower())

  for snippet in snippets:
    context = snippet['text'].lower()
    for answer in answers:
      ind = context.find(answer)
      if ind > -1:
        # The running id is always one more than the number of entries so far.
        new_entry = {'id': f'{qid}_{len(processed) + 1:04}', 'question': datum['body'], 'answers': [{'text': answer, 'answer_start': ind}]}
        processed.append({'qas': [new_entry], 'context': context})
  return processed

def preprocess_factoid(datum):
  """Convert one BioASQ 'factoid' question into SQuAD-style paragraph entries.

  The first exact answer is lower-cased and searched for in each lower-cased
  snippet text; every snippet that contains it yields one paragraph with a
  single QA entry pointing at the first occurrence.

  Args:
    datum: question dict with keys 'id', 'body', 'snippets' (a list of dicts
      that each have a 'text' key) and 'exact_answer'. The answer may be a
      bare string, a list of strings, or a nested list; the first string
      reached by repeatedly taking element 0 is used.

  Returns:
    A list of {'qas': [entry], 'context': snippet_text} dicts. Entry ids have
    the form '<qid>_NNNN', numbered from 0001. The list is empty when there
    are no snippets, no usable answer, or no snippet contains the answer.
  """
  processed = []
  qid = datum['id']
  snippets = datum['snippets']
  if not snippets:
    return processed

  # The answer does not depend on the snippet, so resolve it once up front.
  answer = datum['exact_answer']
  # Unwrap list nesting down to the first string. A bare string is left intact
  # instead of being indexed down to its first character.
  while isinstance(answer, (list, tuple)):
    if not answer:
      return processed
    answer = answer[0]
  answer = answer.lower()
  # str.find('') succeeds at index 0, which would fabricate an answer span.
  if not answer:
    return processed

  for snippet in snippets:
    context = snippet['text'].lower()
    ind = context.find(answer)
    if ind > -1:
      # The running id is always one more than the number of entries so far.
      new_entry = {'id': f'{qid}_{len(processed) + 1:04}', 'question': datum['body'], 'answers': [{'text': answer, 'answer_start': ind}]}
      processed.append({'qas': [new_entry], 'context': context})
  return processed

def preprocess_dataset(dataset):
  """Convert a sequence of BioASQ questions into SQuAD-style paragraphs.

  Questions of type 'list' and 'factoid' are passed to their respective
  converters and the results are concatenated in input order; questions of
  any other type are ignored.

  Args:
    dataset: iterable of question dicts, each with a 'type' key.

  Returns:
    A flat list containing the paragraph dicts from every handled question.
  """
  paragraphs = []
  for question in dataset:
    kind = question['type']
    if kind == 'list':
      entries = preprocess_list(question)
    elif kind == 'factoid':
      entries = preprocess_factoid(question)
    else:
      # Other question types produce no entries.
      continue
    paragraphs.extend(entries)
  return paragraphs

In [43]:
# Convert every question, then preview the first 27 paragraph entries.
processed = preprocess_dataset(x['questions'])
processed[:27]

[{'qas': [{'id': '55046d5ff8aee20f27000007_0001',
    'question': 'List signaling molecules (ligands) that interact with the receptor EGFR?',
    'answers': [{'text': 'epidermal growth factor', 'answer_start': 4}]}],
  'context': 'the epidermal growth factor receptor (egfr) ligands, such as epidermal growth factor (egf) and amphiregulin (areg)'},
 {'qas': [{'id': '55046d5ff8aee20f27000007_0002',
    'question': 'List signaling molecules (ligands) that interact with the receptor EGFR?',
    'answers': [{'text': 'amphiregulin', 'answer_start': 95}]}],
  'context': 'the epidermal growth factor receptor (egfr) ligands, such as epidermal growth factor (egf) and amphiregulin (areg)'},
 {'qas': [{'id': '55046d5ff8aee20f27000007_0003',
    'question': 'List signaling molecules (ligands) that interact with the receptor EGFR?',
    'answers': [{'text': 'epidermal growth factor', 'answer_start': 14}]}],
  'context': ' egfr ligands epidermal growth factor (egf), amphiregulin (areg) and transformin

In [39]:
# Write the converted paragraphs wrapped in the SQuAD-style top-level layout.
# The context manager makes sure the file is flushed and closed once the dump
# finishes instead of leaving that to garbage collection.
with open('preprocessed-training11b.json', 'w', encoding='utf-8') as f:
  json.dump({'data': [{'paragraphs': processed, 'title': 'BioASQ11b'}], 'version': 'BioASQ11b'}, f)