In [None]:
import numpy as np
import pandas as pd
import matplotlib as plt

### Data Preparation

In [None]:
# Google Drive share link for the dataset CSV that is loaded into `df` below.
DS_URL = "https://drive.google.com/file/d/1VMu8eLNLfk1SQUJ8HldFYYiEX_WYWKW7/view?usp=drive_link"

In [None]:
# The second-to-last path segment of the share link is the Drive file ID
# (".../d/<ID>/view?..."); it is appended to the `uc?id=` endpoint, which
# presumably serves the raw file rather than the viewer page.
df = pd.read_csv('https://drive.google.com/uc?id=' + DS_URL.rsplit('/', 2)[-2])

In [None]:
# Column dtypes and non-null counts: shows which columns contain missing values.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         4999 non-null   int64 
 1   description        4999 non-null   object
 2   medical_specialty  4999 non-null   object
 3   sample_name        4999 non-null   object
 4   transcription      4966 non-null   object
 5   keywords           3931 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [None]:
# Preview the first five rows to see what each column looks like.
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


In [None]:
# Count how often each keyword occurs across the whole dataset.
# Imported locally so this cell runs on its own.
from collections import Counter

# Each `keywords` cell is a comma-separated string. Rows without keywords are
# dropped first so that NaN is not counted as the literal keyword "nan".
# Every fragment is stripped *before* counting, so " asthma" and "asthma"
# accumulate under the same key instead of resetting each other.
keyword_freq = Counter(
    word.strip()
    for keywords in df["keywords"].dropna()
    for word in keywords.split(",")
    if word.strip()  # ignore empty fragments left by stray or trailing commas
)

# most_common(10) returns the ten (keyword, count) pairs with the highest
# counts, in descending order.
print("10 most common keywords are: ")
for word, count in keyword_freq.most_common(10):
    print(word)

10 most common keywords are: 
allergy / immunology
allergic rhinitis
allergies
asthma
nasal sprays
rhinitis
nasal
erythematous
allegra
sprays


In [None]:
# Install spaCy and scispaCy into the notebook environment.
# NOTE(review): neither package is version-pinned, while the model archives
# installed in the next cell are v0.5.1 releases - consider pinning matching
# versions (and using `%pip` so the install targets the kernel's environment).
!pip install -U spacy
!pip install scispacy

In [None]:
# Install the scispaCy v0.5.1 model packages imported further down:
# two core models (en_core_sci_sm, en_core_sci_md) and the NER-specific
# en_ner_bc5cdr_md model.
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_sm-0.5.1.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_core_sci_md-0.5.1.tar.gz
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz

In [None]:
# NOTE(review): nothing visible in this notebook imports a `render` package;
# the rendering below goes through `spacy.displacy.render`. Confirm whether
# this install is actually needed.
!pip install render



In [None]:
import scispacy
import spacy
#Core models
import en_core_sci_sm
import en_core_sci_md
#NER specific models
import en_ner_bc5cdr_md

In [None]:
# Pick one transcription (row label 10, column "transcription") to try the scispaCy NER models on.
# `.loc` selects by index label, not by position.
text = df.loc[10, "transcription"]

In [None]:
# Load the en_core_sci_sm pipeline and run it over the sample transcription.
nlp_sm = en_core_sci_sm.load()
doc = nlp_sm(text)
# Display the extracted entities highlighted inline in the text (style='ent').
# NOTE(review): with jupyter=True displacy is expected to display the markup
# itself and return None, so `displacy_image` likely holds nothing useful - confirm.
displacy_image = spacy.displacy.render(doc, jupyter=True,style='ent')

In [None]:
# Same text through the en_core_sci_md pipeline, for comparison with the
# en_core_sci_sm result; `doc` is overwritten here.
nlp_md = en_core_sci_md.load()
doc = nlp_md(text)
# Display the extracted entities highlighted inline in the text (style='ent').
# NOTE(review): with jupyter=True displacy is expected to display the markup
# itself and return None, so `displacy_image` likely holds nothing useful - confirm.
displacy_image = spacy.displacy.render(doc, jupyter=True,style='ent')

In [None]:
# Same text through the NER-specific en_ner_bc5cdr_md pipeline. `doc` is
# overwritten again, so the entity listing in the next cell reflects this model.
nlp_bc = en_ner_bc5cdr_md.load()
doc = nlp_bc(text)
# Display the extracted entities highlighted inline in the text (style='ent').
# NOTE(review): with jupyter=True displacy is expected to display the markup
# itself and return None, so `displacy_image` likely holds nothing useful - confirm.
displacy_image = spacy.displacy.render(doc, jupyter=True,style='ent')

In [None]:
# Plain-text listing of the entities in the current `doc`: a header row, then
# one row per entity with its text, character offsets and label.
print("TEXT", "START", "END", "ENTITY TYPE")
for entity in doc.ents:
    print(entity.text, entity.start_char, entity.end_char, entity.label_)

TEXT START END ENTITY TYPE
Morbid obesity 26 40 DISEASE
Morbid obesity 70 84 DISEASE
weight loss 400 411 DISEASE
Marcaine 1256 1264 CHEMICAL


In [None]:
# Remove rows that have no transcription. This mutates `df` in place, so every
# later cell sees the filtered frame.
df.dropna(subset=['transcription'], inplace=True)
# Draw 100 distinct rows (no replacement); the fixed random_state makes the
# sample repeatable across runs.
df_subset = df.sample(n=100, replace=False, random_state=42)
df_subset.info()
# Last expression of the cell, so the preview is rendered as the cell output.
df_subset.head()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100 entries, 3162 to 3581
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         100 non-null    int64 
 1   description        100 non-null    object
 2   medical_specialty  100 non-null    object
 3   sample_name        100 non-null    object
 4   transcription      100 non-null    object
 5   keywords           78 non-null     object
dtypes: int64(1), object(5)
memory usage: 5.5+ KB


Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
3162,3162,Markedly elevated PT INR despite stopping Cou...,Hematology - Oncology,Hematology Consult - 1,"HISTORY OF PRESENT ILLNESS:, The patient is w...",
1981,1981,Intercostal block from fourth to tenth interc...,Pain Management,Intercostal block - 1,"PREPROCEDURE DIAGNOSIS:, Chest pain secondary...","pain management, xylocaine, marcaine, intercos..."
1361,1361,The patient is a 65-year-old female who under...,SOAP / Chart / Progress Notes,Lobectomy - Followup,"HISTORY OF PRESENT ILLNESS: , The patient is a...","soap / chart / progress notes, non-small cell ..."
3008,3008,Construction of right upper arm hemodialysis ...,Nephrology,Hemodialysis Fistula Construction,"PREOPERATIVE DIAGNOSIS: , End-stage renal dise...","nephrology, end-stage renal disease, av dialys..."
4943,4943,Bronchoscopy with brush biopsies. Persistent...,Cardiovascular / Pulmonary,Bronchoscopy - 8,"PREOPERATIVE DIAGNOSIS: , Persistent pneumonia...","cardiovascular / pulmonary, persistent pneumon..."


In [None]:
from spacy.matcher import Matcher

# Token pattern for "<drug> <number> <unit>": a token the NER model labelled
# CHEMICAL, followed by a number-like token, followed by any ASCII token
# (the unit, e.g. "mg", "mL" or "%").
pattern = [{'ENT_TYPE':'CHEMICAL'}, {'LIKE_NUM': True}, {'IS_ASCII': True}]
matcher = Matcher(nlp_bc.vocab)
matcher.add("DRUG_DOSE", [pattern])

for transcription in df_subset['transcription']:
    doc = nlp_bc(transcription)
    matches = matcher(doc)
    if not matches:
        # Only transcriptions that mention a drug dose are reported.
        continue

    # First list every drug-dose match found in this transcription.
    # `start` / `end` are token indices into `doc`.
    for match_id, start, end in matches:
        string_id = nlp_bc.vocab.strings[match_id]  # get string representation
        span = doc[start:end]  # the matched drug + dose span
        print(span.text, start, end, string_id)

    # Then list the document's disease and drug entities once. Keeping this
    # loop outside the match loop avoids repeating the full entity list for
    # every dose match in the same transcription.
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)

Xylocaine 20 mL 129 132 DRUG_DOSE
Chest pain 26 36 DISEASE
Chest pain 122 132 DISEASE
intercostal block 318 335 DISEASE
chest pain 388 398 DISEASE
Xylocaine 730 739 CHEMICAL
Marcaine 750 758 CHEMICAL
contusion 987 996 DISEASE
respiratory distress 1076 1096 DISEASE
pain 1150 1154 DISEASE
Marcaine 0.25% 133 136 DRUG_DOSE
Chest pain 26 36 DISEASE
Chest pain 122 132 DISEASE
intercostal block 318 335 DISEASE
chest pain 388 398 DISEASE
Xylocaine 730 739 CHEMICAL
Marcaine 750 758 CHEMICAL
contusion 987 996 DISEASE
respiratory distress 1076 1096 DISEASE
pain 1150 1154 DISEASE
Aspirin 81 mg 204 207 DRUG_DOSE
non-small cell lung cancer 114 140 DISEASE
barium 322 328 CHEMICAL
hiatal hernia 373 386 DISEASE
odynophagia 647 658 DISEASE
tenderness 829 839 DISEASE
DVT 918 921 DISEASE
weight loss 952 963 DISEASE
anorexia 965 973 DISEASE
fevers 975 981 DISEASE
chills 983 989 DISEASE
headaches 991 1000 DISEASE
aches 1006 1011 DISEASE
pains 1015 1020 DISEASE
cough 1022 1027 DISEASE
hemoptysis 1029 1039 DI