# Cell 1: The first cell installs the necessary packages: nltk (natural language processing toolkit) and scikit-learn (a machine learning library in Python).

In [None]:
!pip install nltk
!pip install scikit-learn

Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/
Looking in indexes: https://pypi.org/simple, https://us-python.pkg.dev/colab-wheels/public/simple/


# Cell 2: The second cell reads a json file (medical_ner.json), which contains some medical text examples, and tokenize them into words, and then the words are grouped into sentences. For each sentence, Part-of-Speech (POS) tagging is applied using the pos_tag function in the nltk package, and the tagged sentences are stored in a list named sentences. At the end of the cell, some statistics about the data are printed.

In [None]:
import nltk

nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')
nltk.download('stopwords')
nltk.download('wordnet')

import json
from nltk import sent_tokenize, word_tokenize, pos_tag, FreqDist
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

sentences = []
num_sentences = 0
num_documents = 0

with open('medical_ner.json', 'r') as f:
    data = json.load(f)

for example in data['examples']:
    if 'content' in example:
        num_documents += 1
        text = example['content']
        tokens = word_tokenize(text)
        sentences_in_doc = sent_tokenize(text)
        num_sentences += len(sentences_in_doc)

        for sentence in sentences_in_doc:
            tagged = pos_tag(word_tokenize(sentence))
            sentences.append(tagged)

print("Number of documents with text key:", num_documents)
print("Number of sentences:", num_sentences)
print("Number of sentences in sentences list:", len(sentences))

# Feature extraction: word frequencies
word_freq = FreqDist(word.lower() for sentence in sentences for word, _ in sentence)
print("Word frequencies:", word_freq)

# Preprocessing: remove stop words and lemmatize words
stop_words = set(stopwords.words('english'))
lemmatizer = WordNetLemmatizer()

preprocessed_sentences = []
for sentence in sentences:
    preprocessed_sentence = [(lemmatizer.lemmatize(word.lower()), tag) for word, tag in sentence if word.lower() not in stop_words]
    preprocessed_sentences.append(preprocessed_sentence)

print("Preprocessed sentences:", preprocessed_sentences)

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /root/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Number of documents with text key: 31
Number of sentences: 195
Number of sentences in sentences list: 195
Word frequencies: <FreqDist with 1158 samples and 4028 outcomes>
Preprocessed sentences: [[('bismuth', 'JJ'), ('compound', 'NNS'), ('(', '('), ('pepto-bismol', 'NNP'), (')', ')'), ('decreased', 'VBD'), ('number', 'NN'), ('bowel', 'NN'), ('movement', 'NNS'), ('traveler', 'NNS'), ("'", 'POS'), ('diarrhea', 'NN'), (',', ','), ('decrease', 'VB'), ('length', 'NN'), ('illness', 'NN'), ('.', '.')], [('[', 'RB'), ('91', 'CD'), (']', 'JJ'), ('anti-motility', 'JJ'), ('agent', 'NNS'), ('like', 'IN'), ('loperamide', 'NN'), ('also', 'RB'), ('effective', 'JJ'), ('reducing', 'VBG'), ('number', 'NN'), ('stool', 'NNS'), ('duration', 'NN'), ('disease', 'NN'), ('.', '.')], [('[', 'RB'), ('8', 'CD'), (']', 'JJ'), ('agent', 'NNS'), ('used', 'VBN'), ('bloody', 'JJ'), ('diarrhea', 'NN'), ('present', 'JJ'), ('.', '.')], [('[', 'RB'), ('92', 'CD'), (']', 'NNP'), ('diosmectite', 'NNP'), (',', ','), ('natura

# Cell 3: In this cell, some further preprocessing is applied to the sentences. Each word in each sentence is again POS tagged and then replaced with a tuple of the form (word, pos_tag). The modified sentences are stored in tagged_sentences. Some statistics about the data are printed at the end of this cell.

In [None]:
num_documents = 0
num_sentences = 0
sentences_str = []
tagged_sentences = []

for doc in preprocessed_sentences:
    for sentence in doc:
        num_sentences += 1
        tagged = pos_tag(word_tokenize(sentence[0]))
        sentences_str.append(sentence[0])
        if num_sentences == 1:
            num_documents += 1
        tagged_sentences.append(tagged)

sentences = tagged_sentences
print("Number of documents with text key:", num_documents)
print("Number of sentences:", num_sentences)
print("Number of sentences in sentences list:", len(sentences))

Number of documents with text key: 1
Number of sentences: 2816
Number of sentences in sentences list: 2816


# Cell 4: The fourth cell defines a list of patterns and then applies these patterns to the tokenized sentences in sentences_str to detect entities. The patterns are regex patterns for entities like drugs, diseases, symptoms, etc. The matched words in each sentence are replaced by a tuple (word, label), where the label is the type of the entity detected. The labeled sentences are stored in tagged_sentences.

In [None]:
import re

patterns = [
    {'class': 'drug', 'pattern': r'(aspirin|ibuprofen|acetaminophen)'},
    {'class': 'disease', 'pattern': r'(diabetes|cancer|heart disease)'},
    {'class': 'drug', 'pattern': r'(metformin|insulin|atorvastatin|lisinopril|simvastatin|losartan|furosemide|amlodipine|hydrochlorothiazide)'},
    {'class': 'disease', 'pattern': r'(alzheimer|arthritis|asthma|bronchitis|cirrhosis|copd|depression|edema|gastritis|glaucoma|gout|hepatitis|hernia|hiv|hypertension|hyperthyroidism|hypothyroidism|meningitis|migraine|osteoporosis|parkinson|pneumonia|psoriasis|seizure|stroke|ulcer)'},
    {'class': 'symptom', 'pattern': r'(fever|cough|nausea|vomiting|fatigue|diarrhea|headache|pain|swelling|itching|redness|soreness)'},
    {'class': 'test', 'pattern': r'(blood test|urine test|x-ray|ct scan|mri|ultrasound|biopsy|endoscopy)'}
]


tagged_sentences = []

for sentence in sentences_str:
    tagged = []
    for word in word_tokenize(sentence):
        for pattern in patterns:
            if re.match(pattern['pattern'], word, re.IGNORECASE):
                tagged.append((word, pattern['class']))
                break
        else:
            tagged.append((word, 'O'))
    tagged_sentences.append(tagged)

print(tagged_sentences)

[[('bismuth', 'O')], [('compound', 'O')], [('(', 'O')], [('pepto-bismol', 'O')], [(')', 'O')], [('decreased', 'O')], [('number', 'O')], [('bowel', 'O')], [('movement', 'O')], [('traveler', 'O')], [("'", 'O')], [('diarrhea', 'symptom')], [(',', 'O')], [('decrease', 'O')], [('length', 'O')], [('illness', 'O')], [('.', 'O')], [('[', 'O')], [('91', 'O')], [(']', 'O')], [('anti-motility', 'O')], [('agent', 'O')], [('like', 'O')], [('loperamide', 'O')], [('also', 'O')], [('effective', 'O')], [('reducing', 'O')], [('number', 'O')], [('stool', 'O')], [('duration', 'O')], [('disease', 'O')], [('.', 'O')], [('[', 'O')], [('8', 'O')], [(']', 'O')], [('agent', 'O')], [('used', 'O')], [('bloody', 'O')], [('diarrhea', 'symptom')], [('present', 'O')], [('.', 'O')], [('[', 'O')], [('92', 'O')], [(']', 'O')], [('diosmectite', 'O')], [(',', 'O')], [('natural', 'O')], [('aluminomagnesium', 'O')], [('silicate', 'O')], [('clay', 'O')], [(',', 'O')], [('effective', 'O')], [('alleviating', 'O')], [('symptom'

# Cell 5: In this cell, the tagged sentences are transformed into feature vectors using a dictionary vectorizer from the scikit-learn package. Then the transformed data is split into training and testing sets. Each token in each sentence is then represented by a feature vector that includes the word itself, its POS tag, and the surrounding words' features. The feature vectors and their labels are used to train a linear support vector classifier.

In [None]:
from sklearn.feature_extraction import DictVectorizer
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC
from sklearn.model_selection import train_test_split

def word2features(sent, i):
    word = sent[i][0]
    postag = sent[i][1]
    features = {
        'word': word,
        'postag': postag,
        'bias': 1.0,
    }
    if i > 0:
        prev_word = sent[i-1][0]
        prev_postag = sent[i-1][1]
        features.update({
            'prev_word': prev_word,
            'prev_postag': prev_postag,
        })
    else:
        features['BOS'] = True

    if i < len(sent)-1:
        next_word = sent[i+1][0]
        next_postag = sent[i+1][1]
        features.update({
            'next_word': next_word,
            'next_postag': next_postag,
        })
    else:
        features['EOS'] = True

    return features

# Cell 6/7: This contains the final cells of the script. Starting from cell 6, it defines some helper functions to convert sentences into feature vectors and tokens, respectively. Then, it uses these functions to create training and testing feature vectors and token labels. After that, the feature vectors are transformed using a dictionary vectorizer. The classifier from the previous cell is trained on the transformed data, and its performance is evaluated on the test set. Finally, the script prints out some evaluation metrics.

In [None]:
def sent2features(sent):
    return [word2features(sent, i) for i in range(len(sent))]

def sent2labels(sent):
    return [label if label else 'O' for token, label in sent]

def sent2tokens(sent):
    return [token for token, postag, label in sent]

tagged_sentences_train, tagged_sentences_test = train_test_split(tagged_sentences, test_size=0.2, random_state=42)

X_train = [sent2features(s) for s in tagged_sentences_train]
y_train = [sent2labels(s) for s in tagged_sentences_train]

X_test = [sent2features(s) for s in tagged_sentences_test]
y_test = [sent2labels(s) for s in tagged_sentences_test]

vectorizer = DictVectorizer(sparse=False)
X_train = vectorizer.fit_transform([item for sublist in X_train for item in sublist])

classifier = LinearSVC()
classifier.fit(X_train, [label for sent in y_train for label in sent])

In [None]:

X_test = vectorizer.transform([item for sublist in X_test for item in sublist])
y_pred = classifier.predict(X_test)
print(y_pred)
from sklearn.metrics import classification_report

print(classification_report([label for sent in y_test for label in sent], y_pred))

['O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'disease' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'symptom' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'disease' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'disease' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'symptom' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O'
 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 'O' 