In [None]:
!pip install spacy
!pip install scispacy
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bionlp13cg_md-0.5.4.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz

Collecting scispacy
  Using cached scispacy-0.5.4-py3-none-any.whl (45 kB)
Collecting scipy<1.11 (from scispacy)
  Using cached scipy-1.10.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (34.4 MB)
Collecting conllu (from scispacy)
  Using cached conllu-4.5.3-py2.py3-none-any.whl (16 kB)
Collecting nmslib>=1.7.3.6 (from scispacy)
  Using cached nmslib-2.1.1.tar.gz (188 kB)
  Preparing metadata (setup.py) ... done
Collecting pysbd (from scispacy)
  Using cached pysbd-0.3.4-py3-none-any.whl (71 kB)
Collecting pybind11<2.6.2 (from nmslib>=1.7.3.6->scispacy)
  Using cached pybind11-2.6.1-py2.py3-none-any.whl (188 kB)
Building wheels for collected packages: nmslib
  Building wheel for nmslib (setup.py) ... done
  Created wheel for nmslib: filename=nmslib-2.1.1-cp310-cp310-linux_x86_64.whl size=13578644 sha256=5b29351436007d1365b9147c87e0ac694e60f41c4b410ac27487e0f03ce2f76f
  Stored in directory: /root/.cache/pip/wheels/21/1a/5d/4cc754a5b1a88405cad184b76f8

In [None]:
import pandas as pd
import spacy
import scispacy
from spacy import displacy
import en_ner_bc5cdr_md
import en_ner_bionlp13cg_md
from scispacy.linking import EntityLinker
from collections import Counter, OrderedDict
from pprint import pprint

In [None]:
# Load the question file, reduce every entry of the 'questions' column to its
# 'body' text, and join all bodies into one space-separated string.
df = pd.read_json('training11b.json')
df['questions'] = [record['body'] for record in df['questions']]
questions = ' '.join(df['questions'].tolist())

In [None]:
def display_entity(model, document):
  """Run an NER model over ``document``, render its entities and collect them.

  Args:
    model: Object exposing ``load()``, which must return a callable pipeline.
    document: Text to analyse.

  Returns:
    A 2-tuple ``(render_result, entities)``. ``render_result`` is whatever
    ``displacy.render(..., jupyter=True, style='ent')`` returns, and
    ``entities`` is the set of unique ``(text, label)`` pairs from ``doc.ents``.
  """
  pipeline = model.load()
  parsed = pipeline(document)
  # NOTE(review): with jupyter=True displaCy presumably draws inline, so the
  # first tuple element may be None -- confirm before relying on it.
  rendered = displacy.render(parsed, jupyter=True, style='ent')
  unique_entities = {(span.text, span.label_) for span in parsed.ents}
  return rendered, unique_entities

In [None]:
# Run the BC5CDR model over all questions; display_entity returns
# (render result, set of (entity text, label) pairs).
bc5_entity = display_entity(en_ner_bc5cdr_md, questions)
# A set has no stable iteration order across kernel restarts, so sort the
# pairs to make the row order (and anything saved from it) reproducible.
bc5_entity_df = pd.DataFrame(sorted(bc5_entity[1]), columns=['Entity','label'])
bc5_entity_df['NER model'] = 'bc5cdr'

In [None]:
# Run the BioNLP13CG model over all questions; display_entity returns
# (render result, set of (entity text, label) pairs).
bio13_entity = display_entity(en_ner_bionlp13cg_md, questions)
# A set has no stable iteration order across kernel restarts, so sort the
# pairs to make the row order (and anything saved from it) reproducible.
bio13_entity_df = pd.DataFrame(sorted(bio13_entity[1]), columns=['Entity','label'])
bio13_entity_df['NER model'] = 'bionlp13cg'

In [None]:
# Preview the first rows of the bionlp13cg entity table.
bio13_entity_df.head()

Unnamed: 0,Entity,label,NER model
0,Facioscapulohumeral muscular,CANCER,bionlp13cg
1,miR-155,GENE_OR_GENE_PRODUCT,bionlp13cg
2,RAGE,GENE_OR_GENE_PRODUCT,bionlp13cg
3,CD55,GENE_OR_GENE_PRODUCT,bionlp13cg
4,pontine glioma,CANCER,bionlp13cg


In [None]:
# Preview the first rows of the bc5cdr entity table.
bc5_entity_df.head()

Unnamed: 0,Entity,label,NER model
0,Lebers syndrome,DISEASE,bc5cdr
1,tacrine,CHEMICAL,bc5cdr
2,dystopia canthorum,DISEASE,bc5cdr
3,Chernobyl,DISEASE,bc5cdr
4,Jamaican vomiting sickness,DISEASE,bc5cdr


In [None]:
# Stack the entity tables from both NER models into one frame.
# ignore_index=True renumbers the rows 0..n-1; without it each source frame
# keeps its own index, so row labels repeat across the two models and the
# repeated labels end up in the CSV's index column.
entities_df = pd.concat([bc5_entity_df, bio13_entity_df], ignore_index=True)
entities_df.to_csv('NER-entities.csv')
entities_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4279 entries, 0 to 2598
Data columns (total 3 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   Entity     4279 non-null   object
 1   label      4279 non-null   object
 2   NER model  4279 non-null   object
dtypes: object(3)
memory usage: 133.7+ KB


In [None]:
# Preview the first rows of the combined entity table.
entities_df.head()

Unnamed: 0,Entity,label,NER model
0,Lebers syndrome,DISEASE,bc5cdr
1,tacrine,CHEMICAL,bc5cdr
2,dystopia canthorum,DISEASE,bc5cdr
3,Chernobyl,DISEASE,bc5cdr
4,Jamaican vomiting sickness,DISEASE,bc5cdr


In [None]:
class NER:
  """Named-entity extraction with two spaCy pipelines.

  The first pipeline (``model1``) takes precedence: when both pipelines tag
  the same entity text, the label from the first one is kept.
  """

  def __init__(self, model1='en_ner_bc5cdr_md',model2='en_ner_bionlp13cg_md'):
    """Load both pipelines by name via ``spacy.load``.

    Args:
      model1: Name passed to ``spacy.load`` for the primary pipeline.
      model2: Name passed to ``spacy.load`` for the secondary pipeline.
    """
    self.model_1 = spacy.load(model1)
    self.model_2 = spacy.load(model2)

  @staticmethod
  def _extract_entities(pipeline, document):
    """Return ``{entity text: label}`` for the entities ``pipeline`` finds."""
    # If the same text is tagged more than once, the label of the last
    # occurrence wins (plain dict assignment semantics).
    return {ent.text: ent.label_ for ent in pipeline(document).ents}

  def get_entities_model_bc5(self,document):
    """Return ``{entity text: label}`` found by the first pipeline."""
    return self._extract_entities(self.model_1, document)

  def get_entities_model_bio13(self,document):
    """Return ``{entity text: label}`` found by the second pipeline."""
    return self._extract_entities(self.model_2, document)

  def get_all_entities(self,document):
    """Merge the entities of both pipelines into one mapping.

    Args:
      document: Text to analyse.

    Returns:
      Dict mapping entity text to label. Entries from the first pipeline come
      first; entities only the second pipeline found are appended afterwards.
    """
    all_entities = dict(self.get_entities_model_bc5(document))
    for ent_text, label in self.get_entities_model_bio13(document).items():
      # Keep the first pipeline's label whenever both pipelines agree on the text.
      all_entities.setdefault(ent_text, label)
    return all_entities

In [None]:
# Build the combined NER helper; its constructor loads both named models with spacy.load.
ner_model = NER(model1='en_ner_bc5cdr_md', model2='en_ner_bionlp13cg_md')

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


In [None]:
# Extract entities from a short sample phrase using both models.
all_entities = ner_model.get_all_entities('pontine glioma')

In [None]:
# Show the merged entity text -> label mapping for the sample phrase.
all_entities

{'pontine glioma': 'DISEASE'}

In [None]:
# Peek at the first 500 characters of the concatenated question text.
questions[:500]

'Is Hirschsprung disease a mendelian or a multifactorial disorder? List signaling molecules (ligands) that interact with the receptor EGFR? Is the protein Papilin secreted? Are long non coding RNAs spliced? Is RANKL secreted from the cells? Does metformin interfere thyroxine absorption? Which miRNAs could be used as potential biomarkers for epithelial ovarian cancer? Which acetylcholinesterase inhibitors are used for treatment of myasthenia gravis? Has Denosumab (Prolia) been approved by FDA? Lis'