In [1]:
import spacy
import pandas as pd
import numpy as np
import re

In [2]:
# Load the dataset
df = pd.read_csv("view_2078.csv")
len(df)

7585

In [3]:
list(df.columns)

['id',
 'Date of Encounter',
 'Type of encounter',
 'Purpose of encounter',
 'Age Range at Consultation',
 'Is the patient wearing a face mask during the consulation?',
 'Encounter ID 2',
 'Retire Encounter Record?',
 'Reason for Retiring Record',
 'Is Archived?',
 'Date Archived',
 'First Created',
 'Last Updated',
 'Complaint/s',
 'History of Illness',
 'Image for Complaint/Hx',
 'Neurologic',
 'Other Neurologic',
 'Pulmonary',
 'Other Pulmonary',
 'Cardiovascular',
 'Other Cardiovascular',
 'Skin/Gland',
 'Other Skin/Gland',
 'Gastrointestinal',
 'Other Gastrointestinal',
 'Musculoskeletal',
 'Other Musculoskeletal',
 'Psychiatric',
 'Other Psychiatric',
 'Temperature (C)',
 'Blood Pressure',
 'Pulse Rate (bpm)',
 'Oxygenation (%)',
 'Respiratory Rate',
 'Height (cm)',
 'Weight (kg)',
 'BMI',
 'Eye response',
 'Verbal response',
 'Motor response',
 'GCS',
 'Skin Status',
 'Skin Findings',
 'HEENT Status',
 'HEENT Findings',
 'Cardiovascular System Status',
 'Cardiovascular System Fi

In [4]:
# Filter the dataset to only include date of Encounter, Complaint/s, History of Illness
df = df[['Date of Encounter', 'Complaint/s', 'History of Illness', 'Preliminary Diagnosis/Final Diagnosis', 'Temperature (C)']]
df.head()

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
0,2023-09-15T00:00:00+08:00,pain left eyebrow,Patient complained of pain at the left eyebrow...,,36.5
1,2023-07-22T00:00:00+08:00,cough,"3 days PTC patient had cough, nasal catarrh an...",,36.3
2,2023-07-22T00:00:00+08:00,for general check up,Patient came in for general check up.,,36.4
3,2023-07-22T00:00:00+08:00,cough,1 week PTC patient had cough and nasal catarrh.,,36.0
4,2022-06-30T00:00:00+08:00,Non-healing wound\nBody weakness,1 month ago patient has doing farming when he ...,,


[WHO defines ILI](https://www.who.int/teams/global-influenza-programme/surveillance-and-monitoring/case-definitions-for-ili-and-sari)  as "acute respiratory infection with: measured fever of ≥38 ⁰C and cough with onset within the last 10 days." 

The objectives of this analysis are as follows:
1. Filter cases according to the following rules:
- Mention of 'URTI', 'Upper Respiratory Tract Infection', 'CAP', 'Community Acquired Pneumonia', 'PCAP', 'Pediatric Community Acquired Pneumonia', 'fever', 'cough', or 'colds' in the chief complaint, history of patient illness, and diagnosis fields.
- Body temperature >= 37.8 deg. C. 

In [5]:
df[df["Preliminary Diagnosis/Final Diagnosis"].notna()]

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
5,2022-06-30T00:00:00+08:00,Infected wound,The patient had an insect bite while farming h...,Non healing wound,36.2
9,2023-09-19T00:00:00+08:00,Nasal catarrh,,Common colds,37.0
10,2023-10-25T00:00:00+08:00,Epigastric pain,"3 days PTC - epigastric pain, always feeling h...",Gastritis,36.7
12,2023-07-27T00:00:00+08:00,cough,6 days PTC patient started to have productive ...,PCAP,37.3
14,2023-10-07T00:00:00+08:00,Abdominal pain,1 day PTC - abdominal pain associated with LBM...,Acute Gastroenteritis,36.0
...,...,...,...,...,...
7580,2023-07-04T00:00:00+08:00,cough and colds,,URTI,
7581,2023-05-06T00:00:00+08:00,vomiting,patient vomits ocassionally and needs to be ad...,G6PD\nNon-ulcerative dyspepsia,35.0
7582,2022-07-26T00:00:00+08:00,whitish discharge on neck,1 year history of whitish discharge from a pim...,Inclusion cyst,36.5
7583,2022-07-12T00:00:00+08:00,poor weight gain,"Patient was born full term, at local lying in ...",Wasted at 5th percentile,


# Method of Training the Spacy Model

To train the spacy model, we will use the following steps:
- Perform regex matching to perform labeling of the entities. The regex matching will be done on the 'Complaint/s', 'History of Illness', and 'Preliminary Diagnosis/Final Diagnosis' fields.
- Use the labeled data to train the spacy model.


In [6]:
# Load the spacy model
nlp = spacy.load("en_core_web_sm")

In [7]:
df_sample = df.sample(n=5000)


In [8]:
from sklearn.model_selection import train_test_split
train, test = train_test_split(df_sample, test_size=0.2)
train

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
4818,2023-04-25T00:00:00+08:00,“Anemic Ako”,3 days PTC patient had cough with whitish phle...,,36.2
6810,2023-05-04T00:00:00+08:00,Dizziness,1 year PTC patient had dizziness. She is a kno...,,36.4
4679,2022-09-01T00:00:00+08:00,Cough and Nasal Catarrh,Patient was noted by mother to have on and off...,,36.5
2654,2022-08-02T00:00:00+08:00,dizziness,patient complained of dizziness with no other ...,BPPV\nHCVD,36.4
726,2023-08-23T00:00:00+08:00,Cough\nRunny nose,2 days PTC - cough and runny nose,URTI,36.4
...,...,...,...,...,...
312,2024-04-03T00:00:00+08:00,eye pain and pruritic eyelid,"Months pTC, patient started to have pruritic l...",T/C Eyelid dermatitis\nNUD,35.9
3236,2023-03-22T00:00:00+08:00,loose stools,2 days PTC patient had several episodes of loo...,,
167,2023-07-28T00:00:00+08:00,flank pain radiating to left leg,"1 month PTC, patient experienced right flank p...",Nephrolithiasis vs MSSPS,37.2
4116,2023-04-21T00:00:00+08:00,cough,3 days cough\nno fever or colds\nNo associated...,Bronchial Asthma,36.4


In [67]:
# Concatenate the column complaint
train['Complaint/s'] = train['Complaint/s'].fillna('')
train['History of Illness'] = train['History of Illness'].fillna('')
train['Preliminary Diagnosis/Final Diagnosis'] = train['Preliminary Diagnosis/Final Diagnosis'].fillna('')
train['text'] = train['Complaint/s'] + ' ' + train['History of Illness'] + ' ' + train['Preliminary Diagnosis/Final Diagnosis']

In [68]:
train['text'] = train['text'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [69]:
train['text']

4818    “Anemic Ako” 3 days PTC patient had cough with...
6810    Dizziness 1 year PTC patient had dizziness. Sh...
4679    Cough and Nasal Catarrh Patient was noted by m...
2654    dizziness patient complained of dizziness with...
726     Cough Runny nose 2 days PTC - cough and runny ...
                              ...                        
312     eye pain and pruritic eyelid Months pTC, patie...
3236    loose stools 2 days PTC patient had several ep...
167     flank pain radiating to left leg 1 month PTC, ...
4116    cough 3 days cough no fever or colds No associ...
5612    Dizziness 3 days PTC - dizziness on and off, a...
Name: text, Length: 4000, dtype: object

In [70]:
# join the text in df_sample
text = ' '.join(train['text'])
len(text)

437016

In [71]:
# remove (-) and (+) from text
text = re.sub(r'\(+', '', text)
text = re.sub(r'\)+', '', text)
text = re.sub(r'\-+', '', text)
text = re.sub(r'\s+', ' ', text)
text



In [72]:
def get_matches(text, pattern):
    if pattern == "CAP":
        # Define a pattern to match whole words; assume "CAP" is a placeholder for a word.
        pattern = r'\bCAP\b'
    else:
        # Ensure that pattern matches whole words
        pattern = r'\b' + pattern + r'\w*'
    # Perform the match with case insensitivity
    matches = re.finditer(pattern, text, re.IGNORECASE)
    return [(match.start(), match.end(), match.group()) for match in matches]



In [73]:
def annotate_entities(text, pattern, label):
    matches = get_matches(text, pattern)
    entities = []
    labeled_texts = []
    for start, end, matched_text in matches:
        entities.append((start, end, label))
        labeled_texts.append((start, end, matched_text))
    return entities, labeled_texts

In [74]:
# Annotate the entities
symptoms = ['fever', 'cough', 'colds']
diseases = ['URTI', 'Upper Respiratory Tract Infection', 'CAP', 'Community Acquired Pneumonia', 'PCAP']
ILI = symptoms + diseases

all_entities = []
all_labeled_texts = []

for disease in ILI:
    entities, labeled_texts = annotate_entities(text, disease, 'ILI')
    all_entities.extend(entities)
    all_labeled_texts.extend(labeled_texts)

list_label = [label for (_, _, label) in all_labeled_texts]
np.unique(list_label)

array(['CAP', 'COLDS', 'COUGH', 'Cap', 'Colds',
       'Community Acquired Pneumonia', 'Community acquired pneumonia',
       'Cough', 'FEVER', 'Fever', 'PCAP', 'PCAPA', 'PCAPB', 'PCAPLR',
       'PCAPLow', 'PCAPrecovered', 'Pcap', 'PcapB',
       'UPPER RESPIRATORY TRACT INFECTION', 'URTI', 'URTIResolved',
       'URTIResolving', 'URTIresolvig', 'URTIresolving', 'URTi',
       'Upper Respiratory Tract Infection',
       'Upper Respiratory Tract Infections',
       'Upper Respiratory tract infection',
       'Upper respiratory tract infection', 'UrTI', 'Urti', 'Urticaria',
       'Urtiresolved', 'Urtiresolving', 'cap', 'colds', 'coldst',
       'community acquired pneumonia', 'cough', 'coughed', 'coughing',
       'coughs', 'coughyellowish', 'fever', 'feverish', 'pcap', 'uRTI',
       'upper respiratory tract infection', 'urti', 'urticarial'],
      dtype='<U34')

In [75]:
len(all_entities)

5345

In [76]:
text[47860:47865]

'yspep'

In [77]:
train_data = [(text, {'entities': all_entities})]
train_data

  {'entities': [(1099, 1104, 'ILI'),
    (1421, 1426, 'ILI'),
    (1897, 1902, 'ILI'),
    (2665, 2670, 'ILI'),
    (2774, 2779, 'ILI'),
    (3146, 3151, 'ILI'),
    (3852, 3857, 'ILI'),
    (3881, 3886, 'ILI'),
    (5595, 5600, 'ILI'),
    (7000, 7005, 'ILI'),
    (7041, 7046, 'ILI'),
    (8437, 8442, 'ILI'),
    (9701, 9706, 'ILI'),
    (9767, 9772, 'ILI'),
    (10747, 10752, 'ILI'),
    (11538, 11543, 'ILI'),
    (11631, 11636, 'ILI'),
    (11651, 11656, 'ILI'),
    (11900, 11905, 'ILI'),
    (11934, 11939, 'ILI'),
    (11972, 11977, 'ILI'),
    (12463, 12468, 'ILI'),
    (14089, 14094, 'ILI'),
    (14371, 14376, 'ILI'),
    (14651, 14656, 'ILI'),
    (14830, 14835, 'ILI'),
    (15722, 15727, 'ILI'),
    (16053, 16058, 'ILI'),
    (16635, 16640, 'ILI'),
    (16897, 16902, 'ILI'),
    (17937, 17942, 'ILI'),
    (18020, 18025, 'ILI'),
    (18489, 18494, 'ILI'),
    (18547, 18552, 'ILI'),
    (19570, 19575, 'ILI'),
    (19869, 19874, 'ILI'),
    (20904, 20909, 'ILI'),
    (21148, 21153

In [78]:
ner=nlp.get_pipe("ner")
ner

<spacy.pipeline.ner.EntityRecognizer at 0x7518229f7ca0>

In [79]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        print(ent)

(1099, 1104, 'ILI')
(1421, 1426, 'ILI')
(1897, 1902, 'ILI')
(2665, 2670, 'ILI')
(2774, 2779, 'ILI')
(3146, 3151, 'ILI')
(3852, 3857, 'ILI')
(3881, 3886, 'ILI')
(5595, 5600, 'ILI')
(7000, 7005, 'ILI')
(7041, 7046, 'ILI')
(8437, 8442, 'ILI')
(9701, 9706, 'ILI')
(9767, 9772, 'ILI')
(10747, 10752, 'ILI')
(11538, 11543, 'ILI')
(11631, 11636, 'ILI')
(11651, 11656, 'ILI')
(11900, 11905, 'ILI')
(11934, 11939, 'ILI')
(11972, 11977, 'ILI')
(12463, 12468, 'ILI')
(14089, 14094, 'ILI')
(14371, 14376, 'ILI')
(14651, 14656, 'ILI')
(14830, 14835, 'ILI')
(15722, 15727, 'ILI')
(16053, 16058, 'ILI')
(16635, 16640, 'ILI')
(16897, 16902, 'ILI')
(17937, 17942, 'ILI')
(18020, 18025, 'ILI')
(18489, 18494, 'ILI')
(18547, 18552, 'ILI')
(19570, 19575, 'ILI')
(19869, 19874, 'ILI')
(20904, 20909, 'ILI')
(21148, 21153, 'ILI')
(21760, 21765, 'ILI')
(22588, 22593, 'ILI')
(22973, 22978, 'ILI')
(23335, 23340, 'ILI')
(23586, 23591, 'ILI')
(23614, 23619, 'ILI')
(24063, 24068, 'ILI')
(24174, 24179, 'ILI')
(24204, 24209, '

In [80]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [81]:
ner.labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'ILI',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'TIME',
 'WORK_OF_ART')

In [82]:
import spacy
from spacy.training import Example
import random
from spacy.util import minibatch, compounding


# Example function to segment text into smaller parts
def segment_text(text, max_length=2000):
    segments = []
    start = 0
    while start < len(text):
        end = start + max_length
        if end < len(text):
            end = text.rfind(' ', start, end) + 1
        segments.append(text[start:end])
        start = end
    return segments

def remove_overlaps(entities):
    # Sort entities by start position
    entities = sorted(entities, key=lambda x: x[0])
    non_overlapping = []
    last_end = -1
    for start, end, label in entities:
        if start >= last_end:
            non_overlapping.append((start, end, label))
            last_end = end
    return non_overlapping

# Training loop
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Disable other pipes and train
with nlp.disable_pipes(*unaffected_pipes):
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(100):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            segments = segment_text(text)
            offset = 0
            for segment in segments:
                doc = nlp.make_doc(segment)
                # Adjust annotations for the segment
                segment_entities = [(start - offset, end - offset, label) for start, end, label in annotations['entities']
                                    if start >= offset and end <= offset + len(segment)]
                segment_entities = remove_overlaps(segment_entities)  # Remove overlaps
                example = Example.from_dict(doc, {"entities": segment_entities})
                nlp.update([example], drop=0.5, losses=losses)
                offset += len(segment)
        print("Losses", losses)

# Save the trained model
nlp.to_disk("trained_spacy_model")



Losses {'ner': 1696.6607219940765}
Losses {'ner': 129.27609235519375}
Losses {'ner': 67.47323062350283}
Losses {'ner': 23.416257169668867}
Losses {'ner': 17.928344313311342}
Losses {'ner': 25.66383112380595}
Losses {'ner': 9.583606482041942}
Losses {'ner': 16.130582865193023}
Losses {'ner': 18.971126403326416}
Losses {'ner': 16.627713889021507}
Losses {'ner': 7.936800267364742}
Losses {'ner': 5.314817428259049}
Losses {'ner': 10.715930597704347}
Losses {'ner': 6.480560843432951}
Losses {'ner': 5.5138660855171775}
Losses {'ner': 9.035729245795453}
Losses {'ner': 5.940415808433791}
Losses {'ner': 10.148625788469356}
Losses {'ner': 7.998605150737335}
Losses {'ner': 12.690762805598043}
Losses {'ner': 4.73784183492444}
Losses {'ner': 1.9839529171633392}
Losses {'ner': 6.949432551539489}
Losses {'ner': 9.24962414187177}
Losses {'ner': 1.543477924604397}
Losses {'ner': 3.388067588678152}
Losses {'ner': 0.0030954800633411794}
Losses {'ner': 5.647079424654182}
Losses {'ner': 1.9701578585996968}

In [85]:
# add testing data
df_test = test
df_test['Complaint/s'] = df_test['Complaint/s'].fillna('')
df_test['History of Illness'] = df_test['History of Illness'].fillna('')
df_test['Preliminary Diagnosis/Final Diagnosis'] = df_test['Preliminary Diagnosis/Final Diagnosis'].fillna('')
df_test['text'] = df_test['Complaint/s'] + ' ' + df_test['History of Illness'] + ' ' + df_test['Preliminary Diagnosis/Final Diagnosis']
text = ' '.join(df_test['text'])

In [86]:
doc = nlp(text)
print(doc.ents)


(cough, cough, fever, fever, Urti, fever, cough, fever, cough, colds, Cough, colds, cough, colds, fever, upper respiratory tract infection, fever, fever, fever, cough, URTI, cough, URTI, cough, cough, Cough, cough, cough, fever, Cough, Cough, Cough, Cap, URTI, fever, cough, URTI, fever, fever, cough, URTI, fever, fever, cough, fever, cough, cough, colds, URTI, Cough, Cap, cough, colds, fever, URTI, cough, cough, PCAP, URTI, fever, fever, cough, fever, URTI, Cough, cough, fever, URTI, cough, fever, CAP, Cough, cough, fever, URTI, cough, cough, colds, fever, URTI, Cough, Fever, Cough, cough, CAP, fever, fever, fever, cough, cough, CAP, cough, Cough, cough, CAP, Cough, Pcap, fever, fever, cough, cough, urti, Cough, Cap, cough, cough, cough, cough, cough, urti, fever, URTI, cough, PCAP, URTI, Cough, cough, cough, cough, cough, cough, fever, cough, URTI, Fever, fever, cough, cap, Cough, Cap, cough, cough, Cough, Fever, fever, cough, PCAP, Cough, cough, fever, CAP, cough, colds, cough, fever

In [87]:
doc = nlp("The patient has Fever and Cough. The patient has URTI. The patient is suffering from BPPV. Upper back pain")
print([(ent.text, ent.label_) for ent in doc.ents])

[('Fever', 'ILI'), ('Cough', 'ILI'), ('URTI', 'ILI')]


In [88]:
keywords = ['fever', 'cough', 'colds', 'URTI', 'Upper Respiratory Tract Infection', 'CAP', 'Community Acquired Pneumonia', 'PCAP', 'Pediatric Community Acquired Pneumonia']
df_new = pd.DataFrame(columns=df.columns)

df_new

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C),text


In [89]:
for rows in df.itertuples():
    text = str(rows[2]) + ' . ' + str(rows[3]) + ' . ' + str(rows[4])
    doc = nlp(text)
    entities = [ent.text.lower() for ent in doc.ents]  # Convert entity texts to lowercase
    # if entities is not empty
    

[]
['cough', 'cough', 'fever']
[]
['cough', 'cough']
[]
[]
[]
[]
[]
['colds']
[]
[]
['cough', 'cough', 'fever', 'cough', 'pcap']
[]
[]
[]
[]
['coughing', 'cough']
['cough', 'fever']
['cough', 'cough', 'cap']
[]
[]
['fever', 'fever']
['cough', 'cough', 'fever', 'colds', 'urti']
[]
[]
['cough', 'colds', 'cap']
[]
[]
['cough', 'cough', 'fever', 'pcap']
[]
[]
['fever']
['cough', 'colds', 'cough', 'colds', 'fever', 'cough', 'pcap']
[]
['fever']
[]
[]
['cough', 'cough', 'cap']
[]
[]
[]
['urti']
[]
['cough', 'colds', 'cough', 'urti']
[]
[]
['cough', 'colds', 'cough', 'colds', 'fever', 'cough', 'pcap']
[]
[]
[]
[]
[]
[]
[]
[]
[]
['cough']
[]
[]
[]
[]
['cough', 'cough', 'cap']
[]
['cough', 'colds', 'fever', 'urti']
[]
[]
[]
['cough']
['cough', 'cough', 'fever']
['cough', 'fever']
[]
[]
['urti']
[]
[]
[]
['cough', 'cough', 'fever', 'cough', 'urti']
['cough', 'colds', 'fever', 'urti']
['cough', 'cough', 'fever', 'community acquired pneumonia']
['cough', 'cough', 'urti']
['urti']
['cough', 'cough'

KeyboardInterrupt: 

In [376]:
df_new

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
0,2023-07-22T00:00:00+08:00,cough,"3 days PTC patient had cough, nasal catarrh an...",,36.3
1,2023-07-22T00:00:00+08:00,cough,1 week PTC patient had cough and nasal catarrh.,,36.0
2,2023-09-19T00:00:00+08:00,Nasal catarrh,,Common colds,37.0
3,2023-07-27T00:00:00+08:00,cough,6 days PTC patient started to have productive ...,PCAP,37.3
4,2022-10-10T00:00:00+08:00,"""gina atake sang kulba""","~ 20 yrs PTC , pc witnessed a fight as she was...",Anxiety Reaction Secondary to A General Medica...,35.7
...,...,...,...,...,...
3583,2023-05-13T00:00:00+08:00,Cough,,,
3584,2023-09-12T00:00:00+08:00,Fever,,Urti,38.0
3585,2023-10-04T00:00:00+08:00,Cough,,CAP,
3586,2023-07-04T00:00:00+08:00,cough and colds,,URTI,


In [90]:
columns = [
    'Complaint/s', 
    'History of Illness', 
    'Preliminary Diagnosis/Final Diagnosis'
]
col = df_new.columns
df_new = pd.DataFrame(columns=col)
counter = 0
for col in columns:
    for i in range(len(df)):
        text = str(df[col][i])
        doc = nlp(text)
        entities = [ent.text.lower() for ent in doc.ents]
        if len(entities) > 0:
            df_new = pd.concat([df_new, pd.DataFrame([df.loc[i]])], ignore_index=True)
            counter += 1
len(df_new)

7664

In [101]:
df_new.duplicated().sum()

4364

In [103]:
df_new.to_csv("test_ILI.csv", index=False)

In [104]:
df_new = pd.read_csv("test_ILI.csv")

In [94]:
df1 = pd.read_csv("ILI_estimate.csv")
df2 = df_new
len(df1)

  df1 = pd.read_csv("ILI_estimate.csv")


10404

In [267]:
df1.drop_duplicates(inplace=True)
df2.drop_duplicates(inplace=True)

In [268]:
len(df1)

3634

In [269]:
len(df2)

3272

In [270]:
import pandas as pd

# Assuming df1 and df2 are your DataFrames

# Merge df1 and df2 with an indicator and perform an outer join
merged_df = pd.merge(df1, df2, how='outer', indicator=True)

# Filter rows that are only in df1
df1_not_in_df2 = merged_df[merged_df['_merge'] == 'left_only']

# Drop the indicator column
df1_not_in_df2 = df1_not_in_df2.drop(columns=['_merge'])

# If needed, reset the index
df1_not_in_df2.reset_index(drop=True, inplace=True)

# df1_not_in_df2 now contains rows from df1 that are not in df2

df2_not_in_df1 = merged_df[merged_df['_merge'] == 'right_only']
df2_not_in_df1 = df2_not_in_df1.drop(columns=['_merge'])
df2_not_in_df1.reset_index(drop=True, inplace=True)


In [271]:
df1_not_in_df2

Unnamed: 0.1,Unnamed: 0,id,Date of Encounter,Type of encounter,Purpose of encounter,Age Range at Consultation,Is the patient wearing a face mask during the consulation?,Encounter ID 2,Retire Encounter Record?,Reason for Retiring Record,...,Other Adverse Effects,Adverse Events,Did the patient get well?,Patient Disposition Summary,Type of Consultation,Income Classification,Follow-up Schedule,Identifying Physical Qualities,Record ID,Sex
0,4987,64a8e86ec976c10026b63b81,2023-06-12T00:00:00+08:00,Outpatient,Check-up,30 - 39,False,4713-MvITsMpQj5,False,,...,,|,,Released | | | Did the patient get well:,,,2023-07-08T00:00:00+08:00,,4713,Male
1,4768,62e17bbd20a92c0024e6b6a2,2022-07-28T00:00:00+08:00,Outpatient,New health issue,10 - 19,False,1157-umIxx1IkrS,False,,...,,|,,Released | | | Did the patient get well:,,,2022-07-28T00:00:00+08:00,,1157,Female
2,6622,62b423885b6141001e9caf76,2022-06-23T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,683-TrG7TDOZ0X,False,,...,,|,,| | | Did the patient get well:,,,2022-06-23T00:00:00+08:00,,683,Male
3,4230,64700a94c76b6f002a40b941,2023-05-05T00:00:00+08:00,Outpatient,Check-up,60 - 69,False,4437-8Fn4VR1AW7,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-05-26T00:00:00+08:00,,4437,Male
4,5958,63bf9f4ab25151001282e4c4,2023-01-12T00:00:00+08:00,Outpatient,New health issue,0 - 9,False,3048-F1PdYeN2cq,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-01-12T00:00:00+08:00,,3048,Female
5,6329,6456ebcfefd5bb0027295992,2023-05-02T00:00:00+08:00,Outpatient,New health issue,60 - 69,False,4296-TUxORZiYgg,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-05-07T00:00:00+08:00,,4296,Male
6,977,63801d67e1f9870021ece6ba,2022-11-04T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,500-vg68vdH4XV,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2022-11-25T00:00:00+08:00,,500,Female
7,1913,6426a60b336d5100283c5077,2023-03-22T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,27-g320oKpT6b,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-03-31T00:00:00+08:00,,27,Female
8,2177,62f8db749b4f6d0021114578,2022-08-09T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,307-Z3LGDUIVjU,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2022-08-14T00:00:00+08:00,,307,Female
9,6578,660e6bc794db0a0029c4a6b7,2024-04-03T00:00:00+08:00,Outpatient,Check-up,70 - 79,False,8067-1Yf0nwJluS,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2024-04-04T00:00:00+08:00,,8067,Male


In [272]:
doc = nlp(df1_not_in_df2["History of Illness"][0])
print([(ent.text, ent.label_) for ent in doc.ents])

[]


In [273]:
df1_not_in_df2["History of Illness"][0]

'2 weeks PTC patient had feverish episodes and body pains. '