### This notebook shows how to embed text with BioBERT embeddings, including the necessary data preprocessing.

In [2]:
import json

In [3]:
def load_json(filename):
    """
    Read a JSON file and return the parsed Python object.

    The file is opened explicitly as UTF-8 (the standard JSON encoding) so that
    non-ASCII characters are decoded the same way regardless of the platform's
    locale default.

    :param filename: path of the JSON file to read.
    :return: whatever ``json.load`` produces for the file's content.
    """
    with open(filename, encoding="utf-8") as in_f:
        return json.load(in_f)

In [4]:
# Path of the JSON file to load: directory part, then file name.
filename = (
    "/data/medg/misc/phuongpm/"
    "train1.0.json"
)

In [5]:
# Parse the whole JSON file into memory.
dataset = load_json(filename)

In [10]:
# Inspect the top-level keys of the loaded JSON.
dataset.keys()

dict_keys(['data', 'version'])

In [9]:
# Take the first entry under 'data' as a worked example.
datum = dataset['data'][0]

In [11]:
# Inspect the keys of the example entry.
datum.keys()

dict_keys(['document', 'source'])

In [12]:
# The 'document' field of the example entry.
datum_document = datum['document']

In [13]:
# Inspect the keys of the document.
datum_document.keys()

dict_keys(['qas', 'title', 'context'])

In [14]:
# The 'context' field of the document.
datum_context = datum_document['context']

In [15]:
# Display the raw context string.
datum_context

'Summary\n\nThis report describes a term newborn with BEG__isolated distortion in the left parietal bone__END without any BEG__other visible congenital anomaly__END , due to BEG__amniotic band disruption__END .\nA BEG__skull x-ray__END , BEG__ultrasound scan__END and BEG__subsequent MRI scan of the brain__END did not show any BEG__apparent distortion__END apart from BEG__depression__END and BEG__concavity in the left parietal bone__END .\nThe purpose of this case report is to raise awareness of this possible , mild outcome of this BEG__little - known entity__END , which may BEG__mimic caput succedaneum__END ( moulding of the presenting part in the birth canal during natural delivery ) , and to provide a historical and embryological background .\n\nBackground\n\nBEG__Amniotic band disruption syndrome__END is a rare entity which occurs in 1 in 1200 to 1 in 15 000 live births .\n1 It may cause a BEG__myriad of deformities of fetal body parts__END from BEG__mild defects in limbs to severe 

In [16]:
# The 'qas' field of the document.
datum_qas = datum_document['qas']

In [17]:
# Display the 'qas' records.
datum_qas

[{'answers': [{'text': 'Isolated calvarial deformity mimicking caput succedenum',
    'origin': 'dataset',
    'sem_type': 'problem',
    'cui': 'C2825501'},
   {'text': 'Calvarial',
    'origin': 'UMLS',
    'sem_type': 'problem',
    'cui': 'C2825501'}],
  'id': 'bcr.12.2009.2549.1',
  'query': '▶ @placeholder from BEG__amniotic band disruption__END is a possibility .'},
 {'answers': [{'text': 'amniotic band disruption',
    'origin': 'dataset',
    'sem_type': 'problem',
    'cui': 'C1527388'},
   {'text': 'Amniotic bands',
    'origin': 'UMLS',
    'sem_type': 'problem',
    'cui': 'C1527388'},
   {'text': 'Amniotic Band',
    'origin': 'UMLS',
    'sem_type': 'problem',
    'cui': 'C1527388'},
   {'text': 'AMNIOTIC BAND',
    'origin': 'UMLS',
    'sem_type': 'problem',
    'cui': 'C1527388'},
   {'text': 'Bands, Amniotic',
    'origin': 'UMLS',
    'sem_type': 'problem',
    'cui': 'C1527388'},
   {'text': 'amniotic band',
    'origin': 'UMLS',
    'sem_type': 'problem',
    'cui

In [18]:
# The 'title' field of the document.
datum_title = datum_document['title']

In [19]:
# Display the raw title string.
datum_title

'BEG__Isolated cranial distortion mimicking caput succedenum__END from BEG__amniotic band disruption__END without any BEG__neurological abnormality__END'

In [20]:
def to_entities(text, ent_marker="@entity"):
    """
    Text includes entities marked as BEG__w1 w2 w3__END. Transform to a single entity @entityw1_w2_w3.

    The text is split on whitespace and re-joined with single spaces, so line
    breaks and repeated blanks in the input do not survive. The markers are
    removed by slicing off their fixed length, which keeps underscores that
    belong to the entity itself (BEG__x_ray__END -> @entityx_ray).

    :param text: string with BEG__ ... __END markup.
    :param ent_marker: prefix placed in front of every collapsed entity.
    :return: the transformed text as one space-separated string.
    :raises AssertionError: if a multi-word entity is opened (BEG__w) while
        another one is still open.

    Notes on malformed markup:
    - a word ending in __END outside any entity just loses the marker;
    - a one-word entity found inside an open entity is emitted on its own and
      "Inconsistent markup." is printed;
    - words of an entity that is never closed are not emitted.
    """
    beg, end = "BEG__", "__END"
    word_list = []
    inside = False
    # str.split() with no argument already strips whitespace from every token.
    for w in text.split():
        if w.startswith(beg) and w.endswith(end):
            # One-word entity: cut both markers off by length. Splitting on "_"
            # here would truncate words that contain an underscore themselves.
            word_list.append(ent_marker + w[len(beg):-len(end)])
            if inside:  # something went wrong, leave as is
                print("Inconsistent markup.")
        elif w.startswith(beg):
            assert not inside
            inside = True
            concept = [w[len(beg):]]
        elif w.endswith(end):
            if not inside:
                # Stray closing marker: keep the word, drop the marker.
                word_list.append(w[:-len(end)])
            else:
                concept.append(w[:-len(end)])
                word_list.append(ent_marker + "_".join(concept))
                inside = False
        else:
            if inside:
                concept.append(w)
            else:
                word_list.append(w)

    return " ".join(word_list)

In [21]:
# Try the entity conversion on the title.
to_entities(datum_title)

'@entityIsolated_cranial_distortion_mimicking_caput_succedenum from @entityamniotic_band_disruption without any @entityneurological_abnormality'

In [22]:
# Try the entity conversion on the full context.
to_entities(datum_context)

'Summary This report describes a term newborn with @entityisolated_distortion_in_the_left_parietal_bone without any @entityother_visible_congenital_anomaly , due to @entityamniotic_band_disruption . A @entityskull_x-ray , @entityultrasound_scan and @entitysubsequent_MRI_scan_of_the_brain did not show any @entityapparent_distortion apart from @entitydepression and @entityconcavity_in_the_left_parietal_bone . The purpose of this case report is to raise awareness of this possible , mild outcome of this @entitylittle_-_known_entity , which may @entitymimic_caput_succedaneum ( moulding of the presenting part in the birth canal during natural delivery ) , and to provide a historical and embryological background . Background @entityAmniotic_band_disruption_syndrome is a rare entity which occurs in 1 in 1200 to 1 in 15 000 live births . 1 It may cause a @entitymyriad_of_deformities_of_fetal_body_parts from @entitymild_defects_in_limbs_to_severe_craniofacial_defects incompatible with life . The

In [24]:
# Collect every lower-cased entity token found in the title followed by the context.
cand = [
    token
    for token in to_entities(" ".join((datum_title, datum_context))).lower().split()
    if token.startswith('@entity')
]

In [25]:
# Display the collected entity tokens.
cand

['@entityisolated_cranial_distortion_mimicking_caput_succedenum',
 '@entityamniotic_band_disruption',
 '@entityneurological_abnormality',
 '@entityisolated_distortion_in_the_left_parietal_bone',
 '@entityother_visible_congenital_anomaly',
 '@entityamniotic_band_disruption',
 '@entityskull_x-ray',
 '@entityultrasound_scan',
 '@entitysubsequent_mri_scan_of_the_brain',
 '@entityapparent_distortion',
 '@entitydepression',
 '@entityconcavity_in_the_left_parietal_bone',
 '@entitylittle_-_known_entity',
 '@entitymimic_caput_succedaneum',
 '@entityamniotic_band_disruption_syndrome',
 '@entitymyriad_of_deformities_of_fetal_body_parts',
 '@entitymild_defects_in_limbs_to_severe_craniofacial_defects',
 '@entityspectrum_of_defects',
 '@entitydisruption',
 '@entitydeformation',
 '@entitymalformation',
 '@entityamniotic_bands_at_different_stages_of_organogenesis',
 '@entitysyndrome',
 '@entityamniotic_deformities',
 '@entityadhesions',
 '@entityamniotic_band_disruption_complex',
 '@entitytropin',
 '@

In [28]:
# Query text of the first question, prefixed with a single space.
qa_txt_option = " " + datum_qas[0]['query']

In [29]:
# Display the resulting string.
qa_txt_option

' ▶ @placeholder from BEG__amniotic band disruption__END is a possibility .'