<a href="https://colab.research.google.com/github/mithunareddy/NLP/blob/main/lab_assignment_02.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Healthcare NLP Preprocessing Notebook

1.Aim Apply preprocessing to sensitive medical or healthcare‑related text using NLTK and spaCy.

2.Setup

In [16]:
# Install required libraries
!pip install nltk spacy

# Download NLTK resources
import nltk
nltk.download('punkt')
nltk.download('wordnet')

# Download spaCy model
!python -m spacy download en_core_web_sm




[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Collecting en-core-web-sm==3.8.0
  Downloading https://github.com/explosion/spacy-models/releases/download/en_core_web_sm-3.8.0/en_core_web_sm-3.8.0-py3-none-any.whl (12.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m12.8/12.8 MB[0m [31m88.9 MB/s[0m eta [36m0:00:00[0m
[?25h[38;5;2m✔ Download and installation successful[0m
You can now load the package via spacy.load('en_core_web_sm')
[38;5;3m⚠ Restart to reload dependencies[0m
If you are in a Jupyter or Colab notebook, you may need to restart Python in
order to load all the package's dependencies. You can do this by selecting the
'Restart kernel' or 'Restart runtime' option.


Load Medical text Corpus

In [17]:
medical_text = """
Hypertension is a chronic medical condition in which the blood pressure in the arteries is persistently elevated.
Diabetes mellitus is a group of metabolic diseases characterized by high blood sugar levels over a prolonged period.
Patients with asthma often experience difficulty in breathing due to airway inflammation.
"""
print(medical_text)



Hypertension is a chronic medical condition in which the blood pressure in the arteries is persistently elevated.
Diabetes mellitus is a group of metabolic diseases characterized by high blood sugar levels over a prolonged period.
Patients with asthma often experience difficulty in breathing due to airway inflammation.



Using NLTK

In [18]:
import nltk
nltk.download('punkt_tab')

from nltk.tokenize import sent_tokenize, word_tokenize

sentences_nltk = sent_tokenize(medical_text)
words_nltk = word_tokenize(medical_text)

print("Sentences (NLTK):", sentences_nltk)
print("Words (NLTK):", words_nltk)

Sentences (NLTK): ['\nHypertension is a chronic medical condition in which the blood pressure in the arteries is persistently elevated.', 'Diabetes mellitus is a group of metabolic diseases characterized by high blood sugar levels over a prolonged period.', 'Patients with asthma often experience difficulty in breathing due to airway inflammation.']
Words (NLTK): ['Hypertension', 'is', 'a', 'chronic', 'medical', 'condition', 'in', 'which', 'the', 'blood', 'pressure', 'in', 'the', 'arteries', 'is', 'persistently', 'elevated', '.', 'Diabetes', 'mellitus', 'is', 'a', 'group', 'of', 'metabolic', 'diseases', 'characterized', 'by', 'high', 'blood', 'sugar', 'levels', 'over', 'a', 'prolonged', 'period', '.', 'Patients', 'with', 'asthma', 'often', 'experience', 'difficulty', 'in', 'breathing', 'due', 'to', 'airway', 'inflammation', '.']


[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


Using Spacy

In [19]:
import spacy
nlp = spacy.load("en_core_web_sm")

doc = nlp(medical_text)

sentences_spacy = [sent.text for sent in doc.sents]
words_spacy = [token.text for token in doc]

print("Sentences (spaCy):", sentences_spacy)
print("Words (spaCy):", words_spacy)


Sentences (spaCy): ['\nHypertension is a chronic medical condition in which the blood pressure in the arteries is persistently elevated.\n', 'Diabetes mellitus is a group of metabolic diseases characterized by high blood sugar levels over a prolonged period.\n', 'Patients with asthma often experience difficulty in breathing due to airway inflammation.\n']
Words (spaCy): ['\n', 'Hypertension', 'is', 'a', 'chronic', 'medical', 'condition', 'in', 'which', 'the', 'blood', 'pressure', 'in', 'the', 'arteries', 'is', 'persistently', 'elevated', '.', '\n', 'Diabetes', 'mellitus', 'is', 'a', 'group', 'of', 'metabolic', 'diseases', 'characterized', 'by', 'high', 'blood', 'sugar', 'levels', 'over', 'a', 'prolonged', 'period', '.', '\n', 'Patients', 'with', 'asthma', 'often', 'experience', 'difficulty', 'in', 'breathing', 'due', 'to', 'airway', 'inflammation', '.', '\n']


In [20]:
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()

stems = [stemmer.stem(word) for word in words_nltk]
print("Stems (NLTK):", stems)


Stems (NLTK): ['hypertens', 'is', 'a', 'chronic', 'medic', 'condit', 'in', 'which', 'the', 'blood', 'pressur', 'in', 'the', 'arteri', 'is', 'persist', 'elev', '.', 'diabet', 'mellitu', 'is', 'a', 'group', 'of', 'metabol', 'diseas', 'character', 'by', 'high', 'blood', 'sugar', 'level', 'over', 'a', 'prolong', 'period', '.', 'patient', 'with', 'asthma', 'often', 'experi', 'difficulti', 'in', 'breath', 'due', 'to', 'airway', 'inflamm', '.']


In [21]:
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

lemmas_nltk = [lemmatizer.lemmatize(word) for word in words_nltk]
print("Lemmas (NLTK):", lemmas_nltk)


Lemmas (NLTK): ['Hypertension', 'is', 'a', 'chronic', 'medical', 'condition', 'in', 'which', 'the', 'blood', 'pressure', 'in', 'the', 'artery', 'is', 'persistently', 'elevated', '.', 'Diabetes', 'mellitus', 'is', 'a', 'group', 'of', 'metabolic', 'disease', 'characterized', 'by', 'high', 'blood', 'sugar', 'level', 'over', 'a', 'prolonged', 'period', '.', 'Patients', 'with', 'asthma', 'often', 'experience', 'difficulty', 'in', 'breathing', 'due', 'to', 'airway', 'inflammation', '.']


In [22]:
lemmas_spacy = [token.lemma_ for token in doc]
print("Lemmas (spaCy):", lemmas_spacy)


Lemmas (spaCy): ['\n', 'Hypertension', 'be', 'a', 'chronic', 'medical', 'condition', 'in', 'which', 'the', 'blood', 'pressure', 'in', 'the', 'artery', 'be', 'persistently', 'elevated', '.', '\n', 'Diabetes', 'mellitus', 'be', 'a', 'group', 'of', 'metabolic', 'disease', 'characterize', 'by', 'high', 'blood', 'sugar', 'level', 'over', 'a', 'prolonged', 'period', '.', '\n', 'patient', 'with', 'asthma', 'often', 'experience', 'difficulty', 'in', 'breathe', 'due', 'to', 'airway', 'inflammation', '.', '\n']


Compare Output

In [23]:
import pandas as pd

comparison = pd.DataFrame({
    "Word": words_nltk,
    "Stem (NLTK)": stems,
    "Lemma (NLTK)": lemmas_nltk,
    "Lemma (spaCy)": lemmas_spacy[:len(words_nltk)]
})

comparison.head(20)


Unnamed: 0,Word,Stem (NLTK),Lemma (NLTK),Lemma (spaCy)
0,Hypertension,hypertens,Hypertension,\n
1,is,is,is,Hypertension
2,a,a,a,be
3,chronic,chronic,chronic,a
4,medical,medic,medical,chronic
5,condition,condit,condition,medical
6,in,in,in,condition
7,which,which,which,in
8,the,the,the,which
9,blood,blood,blood,the
