# Scispacy normalizer in English

In this notebook, [Scispacy](https://allenai.github.io/scispacy/) Normalizer will be applied to the original and generated documents for En-En reformatting.

Tutorials: 

1. [NER + UMLS](https://oyewusiwuraola.medium.com/how-to-use-scispacy-for-biomedical-named-entity-recognition-abbreviation-resolution-and-link-umls-87d3f7c08db2)

2. [UMLS, MESH, RxNorm, GO, HPO](https://oyewusiwuraola.medium.com/how-to-use-scispacy-entity-linkers-for-biomedical-named-entities-7cf13b29ef67)

In [6]:
!pip install scispacy
!pip install swifter
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.4/en_core_sci_sm-0.5.4.tar.gz;      #scispacy medium model
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bc5cdr_md-0.2.5.tar.gz    #biomedical NER model trained on BC5CDR corpus
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_bionlp13cg_md-0.2.5.tar.gz  #biomedical NER model trained on BIONLP13CG corpus
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_craft_md-0.2.5.tar.gz    #biomedical NER model trained on CRAFT corpus
# !pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.2.5/en_ner_jnlpba_md-0.2.5.tar.gz     #biomedical NER model trained on JNLPBA corpus

Collecting spacy<3.8.0,>=3.7.0
  Using cached spacy-3.7.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (6.6 MB)
Collecting catalogue<2.1.0,>=2.0.6
  Using cached catalogue-2.0.10-py3-none-any.whl (17 kB)
Collecting thinc<8.3.0,>=8.2.2
  Using cached thinc-8.2.5-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (922 kB)
Collecting srsly<3.0.0,>=2.4.3
  Using cached srsly-2.4.8-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (493 kB)
Installing collected packages: catalogue, srsly, thinc, spacy
  Attempting uninstall: catalogue
    Found existing installation: catalogue 1.0.2
    Uninstalling catalogue-1.0.2:
      Successfully uninstalled catalogue-1.0.2
  Attempting uninstall: srsly
    Found existing installation: srsly 1.0.7
    Uninstalling srsly-1.0.7:
      Successfully uninstalled srsly-1.0.7
  Attempting uninstall: thinc
    Found existing installation: thinc 7.4.6
    Uninstalling thinc-7.4.6:
      Successfully uninstalled thinc-7.4.6
  Attemp

In [2]:
import spacy
import scispacy
import swifter
import pandas as pd
from spacy import displacy
import en_core_sci_sm
# import en_ner_bc5cdr_md
# import en_ner_jnlpba_md
# import en_ner_craft_md
# import en_ner_bionlp13cg_md
from scispacy.abbreviation import AbbreviationDetector
from scispacy.linking import EntityLinker
from collections import OrderedDict,Counter
from pprint import pprint
from tqdm import tqdm
import os
tqdm.pandas()

  from .autonotebook import tqdm as notebook_tqdm


# Load Data

In [3]:
MAIN_PATH_ORIG_TXT =  "data/1_original/txt"

In [4]:
def extract_txt(path, filename):
    total_path = os.path.join(path, filename)
    return filename, open(total_path, "r").read()

def files_to_df(path, extensions=["txt"]):
    files = [x for x in os.listdir(path) if x.split(".")[-1] in extensions]
    data = [extract_txt(path, f) for f in files]
    return pd.DataFrame(data, columns=["filename", "text"])

df_orig = files_to_df(MAIN_PATH_ORIG_TXT)

df_orig.head()

Unnamed: 0,filename,text
0,29997384.txt,"A 25-year-old nulliparous white woman, 36 week..."
1,34152792.txt,A 68-year-old man with history of hypertension...
2,30946343.txt,A 50-year-old man was admitted to our hospital...
3,29434126.txt,A 35-year-old Japanese man was emergently admi...
4,30478956.txt,Patient was a 51‐year‐old woman who had been d...


In [5]:
def display_entities(model,document):
    """ A function that returns a tuple of displacy image of named or unnamed word entities and
        a set of unique entities recognized based on scispacy model in use
        Args: 
            model: A pretrained model from spaCy or ScispaCy
            document: text data to be analysed"""
    nlp = model.load()
    doc = nlp(document)
    displacy_image = displacy.render(doc, jupyter=True,style='ent')
    entity_and_label = set([(X.text, X.label_) for X in doc.ents])
    return  displacy_image, entity_and_label

display_entities(en_core_sci_sm, df_orig.text[0])

(None,
 {('California', 'ENTITY'),
  ('Cardiovascular', 'ENTITY'),
  ('EF', 'ENTITY'),
  ('HF', 'ENTITY'),
  ('Illumina', 'ENTITY'),
  ('LV', 'ENTITY'),
  ('LV longitudinal systolic function', 'ENTITY'),
  ('LVEF', 'ENTITY'),
  ('Leu23499fs/c.70497_40498insT', 'ENTITY'),
  ('NGS', 'ENTITY'),
  ('NM_001267550', 'ENTITY'),
  ('Next-generation sequencing', 'ENTITY'),
  ('RVEF', 'ENTITY'),
  ('San Diego', 'ENTITY'),
  ('TTN gene', 'ENTITY'),
  ('TrueSight One (TSO', 'ENTITY'),
  ('United States) sequencing panel', 'ENTITY'),
  ('VT', 'ENTITY'),
  ('accident', 'ENTITY'),
  ('admission', 'ENTITY'),
  ('admitted to', 'ENTITY'),
  ('age', 'ENTITY'),
  ('asymptomatic', 'ENTITY'),
  ('bromocriptine', 'ENTITY'),
  ('cardiac surgery', 'ENTITY'),
  ('cesarean section', 'ENTITY'),
  ('clinical', 'ENTITY'),
  ('depressed level', 'ENTITY'),
  ('died', 'ENTITY'),
  ('dilated', 'ENTITY'),
  ('discharged', 'ENTITY'),
  ('discontinuation', 'ENTITY'),
  ('echocardiogram', 'ENTITY'),
  ('echocardiography', 

In [7]:
from spacy.language import Language
import en_core_sci_sm
from scispacy.linking import EntityLinker

# Register the EntityLinker component
@Language.factory("umls_linker")
def create_umls_linker(nlp, name):
    return EntityLinker(k=10, max_entities_per_mention=2, name="umls")

nlp = en_core_sci_sm.load()
nlp.add_pipe("umls_linker")

def umls_entity_linker(document):
    """ A function that accepts and document and returns the entity link details"""
    #linker = EntityLinker(k = 10,max_entities_per_mention = 2, name=linker_name)  #parameters are tunable,so it can be set to return more than 2 entity matches
    #nlp = en_core_sci_sm.load()
    #nlp.add_pipe(linker)
    doc = nlp(document)
    try:
        entity = doc.ents[0]
    except IndexError:
        entity = 'Nan'
    entity_details = []
    entity_details.append(entity)
    try:
        for linker_ent in entity._.kb_ents:
            Concept_Id, Score = linker_ent
            entity_details.append('Entity_Matching_Score :{}'.format(Score))
            entity_details.append(linker.kb.cui_to_entity[linker_ent[0]])
    except AttributeError:
        pass
    return entity_details

umls_entity_linker(df_orig.text[0])

: 