In [None]:
!pip install pyspellchecker -q

In [None]:
import pandas as pd
import re
import nltk
import spacy
import string
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from tabulate import tabulate
from nltk.corpus import wordnet as wn
from spellchecker import SpellChecker
from nltk.stem import PorterStemmer, WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

# Load the Dataset

In [None]:
df = pd.read_csv('/content/drive/MyDrive/Datasets/mtsamples.csv')

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23-year-old white female presents with comp...,Allergy / Immunology,Allergic Rhinitis,"SUBJECTIVE:, This 23-year-old white female pr...","allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 2,"PAST MEDICAL HISTORY:, He has difficulty climb...","bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass.,Bariatrics,Laparoscopic Gastric Bypass Consult - 1,"HISTORY OF PRESENT ILLNESS: , I have seen ABC ...","bariatrics, laparoscopic gastric bypass, heart..."
3,3,2-D M-Mode. Doppler.,Cardiovascular / Pulmonary,2-D Echocardiogram - 1,"2-D M-MODE: , ,1. Left atrial enlargement wit...","cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2-D Echocardiogram,Cardiovascular / Pulmonary,2-D Echocardiogram - 2,1. The left ventricular cavity size and wall ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


# Data Preparation

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4999 entries, 0 to 4998
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         4999 non-null   int64 
 1   description        4999 non-null   object
 2   medical_specialty  4999 non-null   object
 3   sample_name        4999 non-null   object
 4   transcription      4966 non-null   object
 5   keywords           3931 non-null   object
dtypes: int64(1), object(5)
memory usage: 234.5+ KB


In [None]:
df = df.dropna(subset=['transcription'])

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 4966 entries, 0 to 4998
Data columns (total 6 columns):
 #   Column             Non-Null Count  Dtype 
---  ------             --------------  ----- 
 0   Unnamed: 0         4966 non-null   int64 
 1   description        4966 non-null   object
 2   medical_specialty  4966 non-null   object
 3   sample_name        4966 non-null   object
 4   transcription      4966 non-null   object
 5   keywords           3898 non-null   object
dtypes: int64(1), object(5)
memory usage: 271.6+ KB


# Data Cleaning

In [None]:
def remove_special_characters(text):
    remove_special = re.sub(r'[^a-zA-Z0-9\s]', '', text)
    return ''.join(remove_special)

In [None]:
for col in df:
  if col!= 'Unnamed: 0' and col!= 'keywords':
    df[col] = df[col].apply(remove_special_characters)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,A 23yearold white female presents with compla...,Allergy Immunology,Allergic Rhinitis,SUBJECTIVE This 23yearold white female presen...,"allergy / immunology, allergic rhinitis, aller..."
1,1,Consult for laparoscopic gastric bypass,Bariatrics,Laparoscopic Gastric Bypass Consult 2,PAST MEDICAL HISTORY He has difficulty climbin...,"bariatrics, laparoscopic gastric bypass, weigh..."
2,2,Consult for laparoscopic gastric bypass,Bariatrics,Laparoscopic Gastric Bypass Consult 1,HISTORY OF PRESENT ILLNESS I have seen ABC to...,"bariatrics, laparoscopic gastric bypass, heart..."
3,3,2D MMode Doppler,Cardiovascular Pulmonary,2D Echocardiogram 1,2D MMODE 1 Left atrial enlargement with left...,"cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2D Echocardiogram,Cardiovascular Pulmonary,2D Echocardiogram 2,1 The left ventricular cavity size and wall t...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


# Text Preprocessing

In [None]:
spell = SpellChecker()
stemmer = PorterStemmer()
lemmatizer = WordNetLemmatizer()

def text_preprocessing(text):
    # Tokenization
    words = word_tokenize(text)

    # Lowercasing and removing punctuation
    words = [re.sub(r'[^\w\s]', '', word.lower()) for word in words]

    # Stop word removal
    stop_words = set(stopwords.words('english'))
    words = [word for word in words if word not in stop_words]

    # Stemming
    words = [lemmatizer.lemmatize(word) for word in words]

    return ' '.join(words)

In [None]:
for col in df:
  if col!= 'Unnamed: 0' and col!= 'keywords':
    df[col] = df[col].apply(text_preprocessing)

In [None]:
df.head()

Unnamed: 0.1,Unnamed: 0,description,medical_specialty,sample_name,transcription,keywords
0,0,23yearold white female present complaint allergy,allergy immunology,allergic rhinitis,subjective 23yearold white female present comp...,"allergy / immunology, allergic rhinitis, aller..."
1,1,consult laparoscopic gastric bypass,bariatrics,laparoscopic gastric bypass consult 2,past medical history difficulty climbing stair...,"bariatrics, laparoscopic gastric bypass, weigh..."
2,2,consult laparoscopic gastric bypass,bariatrics,laparoscopic gastric bypass consult 1,history present illness seen abc today pleasan...,"bariatrics, laparoscopic gastric bypass, heart..."
3,3,2d mmode doppler,cardiovascular pulmonary,2d echocardiogram 1,2d mmode 1 left atrial enlargement left atrial...,"cardiovascular / pulmonary, 2-d m-mode, dopple..."
4,4,2d echocardiogram,cardiovascular pulmonary,2d echocardiogram 2,1 left ventricular cavity size wall thickness ...,"cardiovascular / pulmonary, 2-d, doppler, echo..."


# NER

In [None]:
!pip install -U spacy
!pip install scispacy

In [None]:
!pip install https://s3-us-west-2.amazonaws.com/ai2-s2-scispacy/releases/v0.5.1/en_ner_bc5cdr_md-0.5.1.tar.gz -q

In [None]:
!pip install render -q

In [None]:
import scispacy
import spacy
import en_ner_bc5cdr_md
from spacy import displacy
import pandas as pd

In [None]:
nlp_bc = en_ner_bc5cdr_md.load()
doc = nlp_bc(df['transcription'][2])
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [None]:
doc = nlp_bc(df['transcription'][4])
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [None]:
doc = nlp_bc(df['transcription'][26])
displacy_image = displacy.render(doc, jupyter=True,style='ent')

In [None]:
examples = []
for i in range(5):
  doc = nlp_bc(df['transcription'][i])
  for ent in doc.ents:
    examples.append((ent.text,[(ent.start_char, ent.end_char, ent.label_)]))

In [None]:
examples

[('allergy', [(52, 59, 'DISEASE')]),
 ('allergy', [(65, 72, 'DISEASE')]),
 ('asthma', [(301, 307, 'DISEASE')]),
 ('throat', [(504, 510, 'DISEASE')]),
 ('loratadine', [(730, 740, 'CHEMICAL')]),
 ('muscle joint pain', [(226, 243, 'DISEASE')]),
 ('knee pain', [(254, 263, 'DISEASE')]),
 ('pain', [(269, 273, 'DISEASE')]),
 ('ankle pain swelling gastroesophageal reflux diseasepast',
  [(279, 334, 'DISEASE')]),
 ('smoke', [(493, 498, 'CHEMICAL')]),
 ('heart disease', [(536, 549, 'DISEASE')]),
 ('grandmother stroke grandmother diabetes denies obesity hypertension',
  [(562, 629, 'DISEASE')]),
 ('allergic', [(677, 685, 'DISEASE')]),
 ('chest pain', [(1170, 1180, 'DISEASE')]),
 ('coronary artery disease congestive heart failure arrhythmia',
  [(1194, 1253, 'DISEASE')]),
 ('atrial fibrillation', [(1254, 1273, 'DISEASE')]),
 ('cholesterol', [(1289, 1300, 'CHEMICAL')]),
 ('venous insufficiency thrombophlebitis asthma shortness breath copd emphysema sleep apnea diabetes leg foot swelling osteoarthri

In [None]:
from spacy.scorer import Scorer
from spacy.tokens import Doc
from spacy.training.example import Example

def evaluate(ner_model, examples):
    scorer = Scorer()
    example = []
    for input_, annot in examples:
        pred = ner_model(input_)
        temp = Example.from_dict(pred, dict.fromkeys(annot))
        example.append(temp)
    scores = scorer.score(example)
    return scores

results = evaluate(nlp_bc, examples)
print(results)

{'token_acc': 1.0, 'token_p': 1.0, 'token_r': 1.0, 'token_f': 1.0, 'sents_p': 1.0, 'sents_r': 1.0, 'sents_f': 1.0, 'tag_acc': None, 'pos_acc': None, 'morph_acc': None, 'morph_micro_p': None, 'morph_micro_r': None, 'morph_micro_f': None, 'morph_per_feat': None, 'dep_uas': None, 'dep_las': None, 'dep_las_per_type': None, 'ents_p': None, 'ents_r': None, 'ents_f': None, 'ents_per_type': None, 'cats_score': 0.0, 'cats_score_desc': 'macro F', 'cats_micro_p': 0.0, 'cats_micro_r': 0.0, 'cats_micro_f': 0.0, 'cats_macro_p': 0.0, 'cats_macro_r': 0.0, 'cats_macro_f': 0.0, 'cats_macro_auc': 0.0, 'cats_f_per_type': {}, 'cats_auc_per_type': {}}
