# Scispacy normalizer in English

In this notebook, [Scispacy](https://allenai.github.io/scispacy/) Normalizer will be applied to the original and generated documents for En-En reformatting.

Tutorials: 

1. [NER + UMLS](https://oyewusiwuraola.medium.com/how-to-use-scispacy-for-biomedical-named-entity-recognition-abbreviation-resolution-and-link-umls-87d3f7c08db2)

2. [UMLS, MESH, RxNorm, GO, HPO](https://oyewusiwuraola.medium.com/how-to-use-scispacy-entity-linkers-for-biomedical-named-entities-7cf13b29ef67)

In [9]:
# !pip install scispacy
# !pip install swifter
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz;      #scispacy medium model
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz    #biomedical NER model trained on BC5CDR corpus
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bionlp13cg_md-0.2.5.tar.gz  #biomedical NER model trained on BIONLP13CG corpus
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_craft_md-0.2.5.tar.gz    #biomedical NER model trained on CRAFT corpus
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_jnlpba_md-0.2.5.tar.gz     #biomedical NER model trained on JNLPBA corpus

Collecting https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz
  Downloading https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_ner_bc5cdr_md-0.5.4.tar.gz (119.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m119.8/119.8 MB[0m [31m20.1 MB/s[0m eta [36m0:00:00[0m00:01[0m00:01[0m
[?25h  Installing build dependencies ... [?25ldone
[?25h  Getting requirements to build wheel ... [?25ldone
[?25h  Preparing metadata (pyproject.toml) ... [?25ldone
Building wheels for collected packages: en_ner_bc5cdr_md
  Building wheel for en_ner_bc5cdr_md (pyproject.toml) ... [?25ldone
[?25h  Created wheel for en_ner_bc5cdr_md: filename=en_ner_bc5cdr_md-0.5.4-py3-none-any.whl size=119787677 sha256=712ce0e1d32bd50d2616f09bdcbb81746c6e2123491bf29dd415cf8e99d1a527
  Stored in directory: /gpfs/home/bsc/bsc830651/.cache/pip/wheels/5e/28/69/338ab4a7f1ebd51895058e69c44704521d29a53f28db8ca19f
Successfully built en_ner_b

In [2]:
import spacy
import scispacy
import swifter
import pandas as pd
from spacy import displacy
# import en_core_sci_sm
import en_ner_bc5cdr_md
# import en_ner_jnlpba_md
# import en_ner_craft_md
# import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import OrderedDict,Counter
from pprint import pprint
from tqdm import tqdm
import os
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm
Your CPU supports instructions that this binary was not compiled to use: SSE3 SSE4.1 SSE4.2 AVX AVX2
For maximum performance, you can install NMSLIB from sources 
pip install --no-binary :all: nmslib


# Load Data

In [3]:
MAIN_PATH_ORIG_TXT =  "data/1_original/txt"

In [4]:
def extract_txt(path, filename):
    total_path = os.path.join(path, filename)
    return filename, open(total_path, "r").read()

def files_to_df(path, extensions=["txt"]):
    files = [x for x in os.listdir(path) if x.split(".")[-1] in extensions]
    data = [extract_txt(path, f) for f in files]
    return pd.DataFrame(data, columns=["filename", "text"])

df_orig = files_to_df(MAIN_PATH_ORIG_TXT)

df_orig.head()

Unnamed: 0,filename,text
0,33175723_1.txt,"Patient: Male, 63-year-old\n\n\nFinal Diagnosi..."
1,36305455.txt,"In November 1990, a 25‐year‐old male patient w..."
2,31668014.txt,A 67‐year‐old man treated with hemodialysis wa...
3,32153696.txt,An 80-year-old man who worked as a constructio...
4,32470561.txt,A 46-years-old female was admitted to our depa...


In [5]:
def display_entities(model,document):
    """ A function that returns a tuple of displacy image of named or unnamed word entities and
        a set of unique entities recognized based on scispacy model in use
        Args: 
            model: A pretrained model from spaCy or ScispaCy
            document: text data to be analysed"""
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    return  displacy_image, entity_and_label

display_entities(en_core_sci_sm, df_orig.text[0])

(None,
 {('AL amyloidosis', 'ENTITY'),
  ('Amyloid light-chain amyloidosis\n', 'ENTITY'),
  ('Aphasia\n', 'ENTITY'),
  ('Biopsy\n', 'ENTITY'),
  ('Bone marrow biopsy', 'ENTITY'),
  ('CD138', 'ENTITY'),
  ('Chemotherapy', 'ENTITY'),
  ('Clinical Procedure', 'ENTITY'),
  ('Congo', 'ENTITY'),
  ('Diagnosis', 'ENTITY'),
  ('ECG', 'ENTITY'),
  ('IgG', 'ENTITY'),
  ('Male', 'ENTITY'),
  ('Monoclonal protein', 'ENTITY'),
  ('Patient', 'ENTITY'),
  ('Spontaneous', 'ENTITY'),
  ('Symptoms', 'ENTITY'),
  ('admitted', 'ENTITY'),
  ('amyloid infiltrates', 'ENTITY'),
  ('amyloidosis', 'ENTITY'),
  ('atypical', 'ENTITY'),
  ('cardiac amyloidosis\n• cardiomyopathy\n', 'ENTITY'),
  ('cardioembolic cerebral event', 'ENTITY'),
  ('cerebral damage', 'ENTITY'),
  ('chronically elevated', 'ENTITY'),
  ('clinical presentation', 'ENTITY'),
  ('decreased glomerular filtration rate', 'ENTITY'),
  ('detected', 'ENTITY'),
  ('diagnosed', 'ENTITY'),
  ('diagnosis', 'ENTITY'),
  ('died', 'ENTITY'),
  ('echo contra

In [7]:
from spacy.language import Language
import en_core_sci_sm
from scispacy.linking import EntityLinker

nlp = en_core_sci_sm.load()

if "umls_linker" not in nlp.pipe_names:
    # Register the EntityLinker component
    @Language.factory("umls_linker")
    def create_umls_linker(nlp, name):
        return EntityLinker(k=10, max_entities_per_mention=2, name="umls")

    nlp.add_pipe("umls_linker")

def entity_linker(document, nlp, name):
    """ A function that accepts and document and returns the entity link details"""
    linker = EntityLinker(k = 10,max_entities_per_mention = 2, name=name)  #parameters are tunable,so it can be set to return more than 2 entity matches
    #nlp = en_core_sci_sm.load()
    #nlp.add_pipe(linker)
    doc = nlp(document)
    # linker = dict(nlp.components)[name]
    try:
        entity = doc.ents[0]
    except IndexError:
        entity = 'Nan'
    entity_details = []
    entity_details.append(entity)
    try:
        for linker_ent in entity._.kb_ents:
            Concept_Id, Score = linker_ent
            entity_details.append('Entity_Matching_Score :{}'.format(Score))
            entity_details.append(linker.kb.cui_to_entity[linker_ent[0]])
    except AttributeError:
        pass
    return entity_details

# entity_linker(df_orig.text[0], nlp, name="umls")

https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations
https://scikit-learn.org/stable/model_persistence.html#security-maintainability-limitations


In [None]:
from spacy.language import Language
import en_core_sci_sm
from scispacy.linking import EntityLinker

nlp = en_core_sci_sm.load()

In [17]:
dict(nlp.components)["umls_linker"]

<scispacy.linking.EntityLinker at 0x7f9108aa5790>

In [12]:
dir(nlp)

['Defaults',
 '__annotations__',
 '__call__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__le__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_components',
 '_config',
 '_disabled',
 '_ensure_doc',
 '_ensure_doc_with_context',
 '_factory_meta',
 '_get_pipe_index',
 '_has_gpu_model',
 '_link_components',
 '_meta',
 '_multiprocessing_pipe',
 '_optimizer',
 '_path',
 '_pipe_configs',
 '_pipe_meta',
 '_resolve_component_status',
 'add_pipe',
 'analyze_pipes',
 'batch_size',
 'begin_training',
 'component',
 'component_names',
 'components',
 'config',
 'create_optimizer',
 'create_pipe',
 'create_pipe_from_source',
 'default_config',
 'default_error_handler',
 'disable_pipe',
 'disable_pipes',
 'disabled',
 'enable_pip