In [32]:
import spacy
import pandas as pd
import numpy as np
import re

In [240]:
# Load the dataset
df = pd.read_csv("view_2078.csv")
len(df)

7585

In [241]:
list(df.columns)

['id',
 'Date of Encounter',
 'Type of encounter',
 'Purpose of encounter',
 'Age Range at Consultation',
 'Is the patient wearing a face mask during the consulation?',
 'Encounter ID 2',
 'Retire Encounter Record?',
 'Reason for Retiring Record',
 'Is Archived?',
 'Date Archived',
 'First Created',
 'Last Updated',
 'Complaint/s',
 'History of Illness',
 'Image for Complaint/Hx',
 'Neurologic',
 'Other Neurologic',
 'Pulmonary',
 'Other Pulmonary',
 'Cardiovascular',
 'Other Cardiovascular',
 'Skin/Gland',
 'Other Skin/Gland',
 'Gastrointestinal',
 'Other Gastrointestinal',
 'Musculoskeletal',
 'Other Musculoskeletal',
 'Psychiatric',
 'Other Psychiatric',
 'Temperature (C)',
 'Blood Pressure',
 'Pulse Rate (bpm)',
 'Oxygenation (%)',
 'Respiratory Rate',
 'Height (cm)',
 'Weight (kg)',
 'BMI',
 'Eye response',
 'Verbal response',
 'Motor response',
 'GCS',
 'Skin Status',
 'Skin Findings',
 'HEENT Status',
 'HEENT Findings',
 'Cardiovascular System Status',
 'Cardiovascular System Fi

In [242]:
# Filter the dataset to only include date of Encounter, Complaint/s, History of Illness
df = df[['Date of Encounter', 'Complaint/s', 'History of Illness', 'Preliminary Diagnosis/Final Diagnosis', 'Temperature (C)']]
df.head()

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
0,2023-09-15T00:00:00+08:00,pain left eyebrow,Patient complained of pain at the left eyebrow...,,36.5
1,2023-07-22T00:00:00+08:00,cough,"3 days PTC patient had cough, nasal catarrh an...",,36.3
2,2023-07-22T00:00:00+08:00,for general check up,Patient came in for general check up.,,36.4
3,2023-07-22T00:00:00+08:00,cough,1 week PTC patient had cough and nasal catarrh.,,36.0
4,2022-06-30T00:00:00+08:00,Non-healing wound\nBody weakness,1 month ago patient has doing farming when he ...,,


[WHO defines ILI](https://www.who.int/teams/global-influenza-programme/surveillance-and-monitoring/case-definitions-for-ili-and-sari)  as "acute respiratory infection with: measured fever of ≥38 ⁰C and cough with onset within the last 10 days." 

The objectives of this analysis are as follows:
1. Filter cases according to the following rules:
- Mention of 'URTI', 'Upper Respiratory Tract Infection', 'CAP', 'Community Acquired Pneumonia', 'PCAP', 'Pediatric Community Acquired Pneumonia', 'fever', 'cough', or 'colds' in the chief complaint, history of patient illness, and diagnosis fields.
- Body temperature >= 37.8 deg. C. 

In [243]:
df[df["Preliminary Diagnosis/Final Diagnosis"].notna()]

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
5,2022-06-30T00:00:00+08:00,Infected wound,The patient had an insect bite while farming h...,Non healing wound,36.2
9,2023-09-19T00:00:00+08:00,Nasal catarrh,,Common colds,37.0
10,2023-10-25T00:00:00+08:00,Epigastric pain,"3 days PTC - epigastric pain, always feeling h...",Gastritis,36.7
12,2023-07-27T00:00:00+08:00,cough,6 days PTC patient started to have productive ...,PCAP,37.3
14,2023-10-07T00:00:00+08:00,Abdominal pain,1 day PTC - abdominal pain associated with LBM...,Acute Gastroenteritis,36.0
...,...,...,...,...,...
7580,2023-07-04T00:00:00+08:00,cough and colds,,URTI,
7581,2023-05-06T00:00:00+08:00,vomiting,patient vomits ocassionally and needs to be ad...,G6PD\nNon-ulcerative dyspepsia,35.0
7582,2022-07-26T00:00:00+08:00,whitish discharge on neck,1 year history of whitish discharge from a pim...,Inclusion cyst,36.5
7583,2022-07-12T00:00:00+08:00,poor weight gain,"Patient was born full term, at local lying in ...",Wasted at 5th percentile,


# Method of Training the Spacy Model

To train the spacy model, we will use the following steps:
- Perform regex matching to perform labeling of the entities. The regex matching will be done on the 'Complaint/s', 'History of Illness', and 'Preliminary Diagnosis/Final Diagnosis' fields.
- Use the labeled data to train the spacy model.


In [244]:
# Load the spacy model
nlp = spacy.load("en_core_web_sm")

In [284]:
# take a sample of the dataset
df_sample = df.sample(1000)
df_sample

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
1339,2022-12-03T00:00:00+08:00,Patient came back with labs.,Her Breast UTZ showed solid nodules on the rig...,,36.7
2216,2024-04-08T00:00:00+08:00,cough,7 days PTC patient started to have cough and c...,PCAP-B,36.9
891,2023-09-21T00:00:00+08:00,dizziness,2 days PTC patient had dizziness.,,36.5
5057,2023-06-23T00:00:00+08:00,headache,,"Malnutrition (underweight, less than the 5th p...",36.0
4332,2023-07-06T00:00:00+08:00,for MTV,,General Adult Medical Examination with Normal ...,
...,...,...,...,...,...
2133,2023-04-22T00:00:00+08:00,Cough,,Cap,
6580,2022-06-11T00:00:00+08:00,Right Foot Swelling and Inflammation,"Two weeks prior to consult, patient noted swel...",,36.6
3628,2023-10-20T00:00:00+08:00,Ff up,,Cap,
4186,2023-04-30T00:00:00+08:00,fever,2 days fever and cough. good appetite. active....,URTI,36.4


In [341]:
# Concatenate the column complaint
df_sample['Complaint/s'] = df_sample['Complaint/s'].fillna('')
df_sample['History of Illness'] = df_sample['History of Illness'].fillna('')
df_sample['Preliminary Diagnosis/Final Diagnosis'] = df_sample['Preliminary Diagnosis/Final Diagnosis'].fillna('')
df_sample['text'] = df_sample['Complaint/s'] + ' ' + df_sample['History of Illness'] + ' ' + df_sample['Preliminary Diagnosis/Final Diagnosis']

In [342]:
df_sample['text'] = df_sample['text'].apply(lambda x: re.sub(r'\s+', ' ', x))

In [361]:
df_sample['text']

1339    Patient came back with labs. Her Breast UTZ sh...
2216    cough 7 days PTC patient started to have cough...
891          dizziness 2 days PTC patient had dizziness. 
5057    headache Malnutrition (underweight, less than ...
4332    for MTV General Adult Medical Examination with...
                              ...                        
2133                                            Cough Cap
6580    Right Foot Swelling and Inflammation Two weeks...
3628                                            Ff up Cap
4186    fever 2 days fever and cough. good appetite. a...
5972                                               Cough 
Name: text, Length: 1000, dtype: object

In [362]:
# join the text in df_sample
text = ' '.join(df_sample['text'])
len(text)

108941

96 101
137 142
263 268
520 525
549 554
599 604
935 940
1203 1208
1255 1260
1458 1463
1534 1539
1640 1645
1745 1750
2146 2151
2533 2538
2577 2582
2673 2678
2783 2788
2812 2817
3097 3102
3126 3131
4093 4098
5081 5086
5147 5152
5637 5642
5981 5986
6053 6058
6059 6064
6145 6150
6192 6197
6216 6221
6326 6331
6369 6374
6424 6429
6438 6443
6522 6527
6547 6552
6588 6593
6628 6633
7031 7036
7060 7065
8342 8347
8355 8360
9145 9150
9582 9587
9908 9913
9932 9937
10168 10173
10449 10454
10479 10484
10572 10577
10953 10958
11033 11038
11062 11067
11098 11103
11197 11202
11277 11282
11306 11311
11372 11377
11700 11705
11934 11939
12107 12112
12159 12164
12282 12287
12446 12451
13125 13130
13307 13312
13636 13641
13811 13816
14109 14114
14563 14568
14592 14597
15088 15093
15123 15128
15378 15383
15458 15463
15586 15591
15662 15667
15974 15979
16421 16426
16569 16574
16680 16685
16759 16764
17081 17086
17256 17261
17271 17276
17294 17299
17432 17437
17665 17670
18199 18204
19102 19107
19236 19241
19320

In [363]:
def get_matches(text, pattern):
    if pattern == "CAP":
        # Define a pattern to match whole words; assume "CAP" is a placeholder for a word.
        pattern = r'\bCAP\b'
    else:
        # Ensure that pattern matches whole words
        pattern = r'\b' + pattern + r'\w*'
    # Perform the match with case insensitivity
    matches = re.finditer(pattern, text, re.IGNORECASE)
    
    # Return the start and end positions of the matches
    return [(match.start(), match.end()) for match in matches]


In [364]:

def annotate_entities(text, pattern, label):
    matches = get_matches(text, pattern)
    entities = []
    for match in matches:
        start, end = match
        entities.append((start, end, label))
    return entities

In [365]:
# Annotate the entities
symptoms = ['fever', 'cough', 'colds']
diseases = ['URTI', 'Upper Respiratory Tract Infection', 'CAP', 'Community Acquired Pneumonia', 'PCAP']
ILI = symptoms + diseases
entities = []
for disease in ILI:
    entities += annotate_entities(text, disease, 'ILI')
entities

[(188, 193, 'ILI'),
 (559, 564, 'ILI'),
 (642, 647, 'ILI'),
 (1152, 1157, 'ILI'),
 (1181, 1186, 'ILI'),
 (1245, 1250, 'ILI'),
 (1335, 1340, 'ILI'),
 (1375, 1380, 'ILI'),
 (1484, 1489, 'ILI'),
 (1512, 1517, 'ILI'),
 (2171, 2176, 'ILI'),
 (2258, 2263, 'ILI'),
 (2627, 2632, 'ILI'),
 (2686, 2691, 'ILI'),
 (3645, 3650, 'ILI'),
 (5095, 5100, 'ILI'),
 (5189, 5194, 'ILI'),
 (6257, 6262, 'ILI'),
 (6412, 6417, 'ILI'),
 (6722, 6727, 'ILI'),
 (8391, 8396, 'ILI'),
 (8469, 8474, 'ILI'),
 (8499, 8504, 'ILI'),
 (9115, 9120, 'ILI'),
 (9208, 9213, 'ILI'),
 (9383, 9388, 'ILI'),
 (9465, 9470, 'ILI'),
 (9494, 9499, 'ILI'),
 (9562, 9567, 'ILI'),
 (10509, 10514, 'ILI'),
 (10539, 10544, 'ILI'),
 (10619, 10624, 'ILI'),
 (10925, 10930, 'ILI'),
 (10963, 10968, 'ILI'),
 (10971, 10976, 'ILI'),
 (11001, 11006, 'ILI'),
 (11217, 11222, 'ILI'),
 (11478, 11483, 'ILI'),
 (11740, 11745, 'ILI'),
 (11796, 11801, 'ILI'),
 (12207, 12212, 'ILI'),
 (12710, 12715, 'ILI'),
 (12740, 12745, 'ILI'),
 (13238, 13243, 'ILI'),
 (13285,

In [348]:
unique_entities = list(set(entities))

In [366]:
text[47860:47865]

'colds'

In [350]:
train_data = [(text, {'entities': entities})]
train_data

[('Patient came back with labs. Her Breast UTZ showed solid nodules on the right breast, BIRADS 4  cough 7 days PTC patient started to have cough and colds that occurs throughout the day. No fever, DOB or loss of appetite. Patient can sleep at night well despite of cough. No meds taken. No allergies. No other associated signs and symptoms. PCAP-B dizziness 2 days PTC patient had dizziness.  headache Malnutrition (underweight, less than the 5th percentile) for MTV General Adult Medical Examination with Normal Findings cough 3 days PTC patient had cough and fever.    Cough 1 week PTC, patient had cough associated with nasal catarrh but no fever and DOB. cap vs urti dizziness occ poor sleep occ dizziness no associated s/sx ff-up: no recurrence of dizziness (patient given Vit B complex during the activity). Vit b not yet taken. the symptoms stopped after sleeping and water intake T/C Heat Exhaustion Cough 3 days PTC patient had cough associated with throat itchiness. She also complained of

In [351]:
ner=nlp.get_pipe("ner")
ner

<spacy.pipeline.ner.EntityRecognizer at 0x77268592a880>

In [352]:
for _, annotations in train_data:
    for ent in annotations.get('entities'):
        ner.add_label(ent[2])

In [353]:
ner.labels

('CARDINAL',
 'DATE',
 'EVENT',
 'FAC',
 'GPE',
 'ILI',
 'LANGUAGE',
 'LAW',
 'LOC',
 'MONEY',
 'NORP',
 'ORDINAL',
 'ORG',
 'PERCENT',
 'PERSON',
 'PRODUCT',
 'QUANTITY',
 'SYMPTOM',
 'TIME',
 'WORK_OF_ART')

In [370]:
import spacy
from spacy.training import Example
import random
from spacy.util import minibatch, compounding


# Example function to segment text into smaller parts
def segment_text(text, max_length=2000):
    segments = []
    start = 0
    while start < len(text):
        end = start + max_length
        if end < len(text):
            end = text.rfind(' ', start, end) + 1
        segments.append(text[start:end])
        start = end
    return segments

def remove_overlaps(entities):
    # Sort entities by start position
    entities = sorted(entities, key=lambda x: x[0])
    non_overlapping = []
    last_end = -1
    for start, end, label in entities:
        if start >= last_end:
            non_overlapping.append((start, end, label))
            last_end = end
    return non_overlapping

# Training loop
pipe_exceptions = ["ner"]
unaffected_pipes = [pipe for pipe in nlp.pipe_names if pipe not in pipe_exceptions]

# Disable other pipes and train
with nlp.disable_pipes(*unaffected_pipes):
    sizes = compounding(1.0, 4.0, 1.001)
    for itn in range(100):
        random.shuffle(train_data)
        losses = {}
        for text, annotations in train_data:
            segments = segment_text(text)
            offset = 0
            for segment in segments:
                doc = nlp.make_doc(segment)
                # Adjust annotations for the segment
                segment_entities = [(start - offset, end - offset, label) for start, end, label in annotations['entities']
                                    if start >= offset and end <= offset + len(segment)]
                segment_entities = remove_overlaps(segment_entities)  # Remove overlaps
                example = Example.from_dict(doc, {"entities": segment_entities})
                nlp.update([example], drop=0.5, losses=losses)
                offset += len(segment)
        print("Losses", losses)

# Save the trained model
nlp.to_disk("/path/to/save/your/model")




Losses {'ner': 11914.420468069702}
Losses {'ner': 376.2153969591185}
Losses {'ner': 119.74668226939829}
Losses {'ner': 210.63590770999483}
Losses {'ner': 75.57628937094489}
Losses {'ner': 62.0288227612691}
Losses {'ner': 52.37429807287735}
Losses {'ner': 54.991385410218186}
Losses {'ner': 51.90661361699725}
Losses {'ner': 44.80430559796472}
Losses {'ner': 55.417295115175015}
Losses {'ner': 50.07335967144262}
Losses {'ner': 46.32179139329423}
Losses {'ner': 38.976448428624295}
Losses {'ner': 18.742992680607117}
Losses {'ner': 12.453287274454238}
Losses {'ner': 25.990797487864977}
Losses {'ner': 19.43895729661347}
Losses {'ner': 26.634799139728536}
Losses {'ner': 26.23208937549709}
Losses {'ner': 14.32487297004396}
Losses {'ner': 14.040888854433419}
Losses {'ner': 17.930279439835495}
Losses {'ner': 30.720057645236224}
Losses {'ner': 8.868079185416171}
Losses {'ner': 15.762089536317113}
Losses {'ner': 15.742817211428694}
Losses {'ner': 20.905766416178526}
Losses {'ner': 15.445969827986294

FileNotFoundError: [Errno 2] No such file or directory: '/path/to/save/your/model'

In [371]:
# add testing data
df_test = df.sample(100)
df_test['Complaint/s'] = df_test['Complaint/s'].fillna('')
df_test['History of Illness'] = df_test['History of Illness'].fillna('')
df_test['Preliminary Diagnosis/Final Diagnosis'] = df_test['Preliminary Diagnosis/Final Diagnosis'].fillna('')
df_test['text'] = df_test['Complaint/s'] + ' . ' + df_test['History of Illness'] + ' . ' + df_test['Preliminary Diagnosis/Final Diagnosis']
df_test['text'] = df_test['text'].apply(lambda x: re.sub(r'\s+', ' ', x))


In [372]:
# Testing the model
for text in df_test['text']:
    doc = nlp(text)
    print("Entities", [(ent.text, ent.label_) for ent in doc.ents])

Entities [('Patient', 'ILI'), ('came', 'ILI'), ('back', 'ILI'), ('with', 'ILI'), ('labs', 'ILI'), ('Her', 'ILI'), ('Breast', 'ILI'), ('UTZ', 'ILI'), ('showed', 'ILI'), ('solid', 'ILI'), ('nodules', 'ILI'), ('on', 'ILI'), ('the', 'ILI'), ('right', 'ILI'), ('breast', 'ILI'), ('BIRADS', 'ILI'), ('4', 'ILI')]
Entities [('cough', 'ILI'), ('4', 'ILI'), ('days', 'ILI'), ('PTC', 'ILI'), ('patient', 'ILI'), ('started', 'ILI'), ('to', 'ILI'), ('cough', 'ILI'), ('productive', 'ILI'), ('that', 'ILI'), ('occurs', 'ILI'), ('throughout', 'ILI'), ('the', 'ILI'), ('day', 'ILI'), ('No', 'ILI'), ('meds', 'ILI'), ('taken', 'ILI'), ('ODC', 'ILI'), ('persistence', 'ILI'), ('of', 'ILI'), ('above', 'ILI'), ('s', 'ILI'), ('sx', 'ILI'), ('no', 'ILI'), ('fever', 'ILI'), ('dob', 'ILI'), ('or', 'ILI'), ('chestpain', 'ILI'), ('URTI', 'ILI')]
Entities [('cough', 'ILI'), ('5', 'ILI'), ('days', 'ILI'), ('whitish', 'ILI'), ('phlegm', 'ILI'), ('nasal', 'ILI'), ('congestion', 'ILI'), ('no', 'ILI'), ('fever', 'ILI'), ('no

In [373]:
doc = nlp("The patient has Fever and Cough. The patient has URTI. The patient is suffering from BPPV. Upper back pain")
print(doc.ents)

(The, patient, has, Fever, and, Cough, The, patient, has, URTI, The, patient, is, suffering, from, BPPV, Upper, back, pain)


In [374]:
keywords = ['fever', 'cough', 'colds', 'URTI', 'Upper Respiratory Tract Infection', 'CAP', 'Community Acquired Pneumonia', 'PCAP', 'Pediatric Community Acquired Pneumonia']
df_new = pd.DataFrame(columns=df.columns)

df_new

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)


In [375]:
for rows in df.itertuples():
    text = str(rows[2]) + ' . ' + str(rows[3]) + ' . ' + str(rows[4])
    doc = nlp(text)
    entities = [ent.text.lower() for ent in doc.ents]  # Convert entity texts to lowercase
    if any(keyword.lower() in entities for keyword in keywords):  # Convert keywords to lowercase for comparison
        df_new = pd.concat([df_new, pd.DataFrame([df.loc[rows.Index]])], ignore_index=True) 

In [376]:
df_new

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C)
0,2023-07-22T00:00:00+08:00,cough,"3 days PTC patient had cough, nasal catarrh an...",,36.3
1,2023-07-22T00:00:00+08:00,cough,1 week PTC patient had cough and nasal catarrh.,,36.0
2,2023-09-19T00:00:00+08:00,Nasal catarrh,,Common colds,37.0
3,2023-07-27T00:00:00+08:00,cough,6 days PTC patient started to have productive ...,PCAP,37.3
4,2022-10-10T00:00:00+08:00,"""gina atake sang kulba""","~ 20 yrs PTC , pc witnessed a fight as she was...",Anxiety Reaction Secondary to A General Medica...,35.7
...,...,...,...,...,...
3583,2023-05-13T00:00:00+08:00,Cough,,,
3584,2023-09-12T00:00:00+08:00,Fever,,Urti,38.0
3585,2023-10-04T00:00:00+08:00,Cough,,CAP,
3586,2023-07-04T00:00:00+08:00,cough and colds,,URTI,


In [377]:
columns = [
    'Complaint/s', 
    'History of Illness', 
    'Preliminary Diagnosis/Final Diagnosis'
]
col = df_new.columns
df_new = pd.DataFrame(columns=col)
counter = 0
for col in columns:
    for i in range(len(df)):
        text = str(df[col][i])
        doc = nlp(text)
        entities = [ent.text.lower() for ent in doc.ents]
        if any(keyword.lower() in entities for keyword in keywords):
            df_new = pd.concat([df_new, pd.DataFrame([df.loc[i]])], ignore_index=True)
            counter += 1
len(df_new)

7599

In [265]:
df_new.duplicated().sum()

4090

In [232]:
df_new

Unnamed: 0,Date of Encounter,Complaint/s,History of Illness,Preliminary Diagnosis/Final Diagnosis,Temperature (C),id,Type of encounter,Purpose of encounter,Age Range at Consultation,Is the patient wearing a face mask during the consulation?,...,Other Adverse Effects,Adverse Events,Did the patient get well?,Patient Disposition Summary,Type of Consultation,Income Classification,Follow-up Schedule,Identifying Physical Qualities,Record ID,Sex
0,2023-07-22T00:00:00+08:00,cough,"3 days PTC patient had cough, nasal catarrh an...",,36.3,64c3894ccebb4d0028cd91e9,Outpatient,Check-up,0 - 9,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-07-28T00:00:00+08:00,,5785,Male
1,2023-07-22T00:00:00+08:00,cough,1 week PTC patient had cough and nasal catarrh.,,36.0,64ca17a02be41a0027d286ed,Outpatient,Check-up,0 - 9,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-08-02T00:00:00+08:00,,5699,Male
2,2023-07-27T00:00:00+08:00,cough,6 days PTC patient started to have productive ...,PCAP,37.3,64f2c036c8efaa00298e4feb,Outpatient,New health issue,0 - 9,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-09-02T00:00:00+08:00,,6028,Male
3,2023-10-25T00:00:00+08:00,Cough,"1 week PTC - cough with yellowish phlegm, back...",CAP,36.3,6540e5aaca8723002551948d,Outpatient,Check-up,60 - 69,False,...,,|,,| | | Did the patient get well:,,,2023-10-31T00:00:00+08:00,,7649,Female
4,2023-08-27T00:00:00+08:00,Headache\nFever\nPainful urination,"2 days PTC - on and off fever, headache, painf...",UTI\nDM Type 2,37.6,64f1f433b2767d0028a35601,Outpatient,Check-up,60 - 69,False,...,,|,,| | | Did the patient get well:,,,2023-09-01T00:00:00+08:00,,6512,Female
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7238,2024-02-12T00:00:00+08:00,follow up,patient has regression of s/sx from previous c...,URTI\nTooth eruption,37.9,65c96679fd5f2f002881b747,Outpatient,Follow up on previous health issue,0 - 9,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2024-02-12T00:00:00+08:00,,7023,Male
7239,2023-09-12T00:00:00+08:00,Fever,,Urti,38.0,651c1bb8917b21002939831d,Outpatient,Check-up,0 - 9,False,...,,|,,Released | | | Did the patient get well:,,,2023-10-03T00:00:00+08:00,,3465,Female
7240,2023-10-04T00:00:00+08:00,Cough,,CAP,,652b51ef3179000027d229a2,Outpatient,Check-up,0 - 9,False,...,,|,,Released | | | Did the patient get well:,,,2023-10-15T00:00:00+08:00,,3465,Female
7241,2023-07-04T00:00:00+08:00,cough and colds,,URTI,,64cfb2fb83ee5e00274d00fa,Outpatient,New health issue,10 - 19,False,...,,|,,| | | Did the patient get well:,,,2023-08-06T00:00:00+08:00,,5991,Female


In [228]:
df

Unnamed: 0,id,Date of Encounter,Type of encounter,Purpose of encounter,Age Range at Consultation,Is the patient wearing a face mask during the consulation?,Encounter ID 2,Retire Encounter Record?,Reason for Retiring Record,Is Archived?,...,Other Adverse Effects,Adverse Events,Did the patient get well?,Patient Disposition Summary,Type of Consultation,Income Classification,Follow-up Schedule,Identifying Physical Qualities,Record ID,Sex
0,6516480964fa61002775fdf3,2023-09-15T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,7027-C4A3TDeUkE,False,,False,...,,|,,Released | | | Did the patient get well:,,,2023-09-29T00:00:00+08:00,,7027,Female
1,64c3894ccebb4d0028cd91e9,2023-07-22T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,5785-V5KGFNiHHz,False,,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-07-28T00:00:00+08:00,,5785,Male
2,64c38a2517e5110027622a8f,2023-07-22T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,5785-eI30WnP0WC,False,,False,...,,|,,Released | | | Did the patient get well:,,,2023-07-28T00:00:00+08:00,,5785,Male
3,64ca17a02be41a0027d286ed,2023-07-22T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,5699-4LLCGE0XPw,False,,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-08-02T00:00:00+08:00,,5699,Male
4,62bd47a9cb2f88001f93a9d6,2022-06-30T00:00:00+08:00,Outpatient,Check-up,60 - 69,False,775-nIwhR3zhzp,False,,False,...,,|,,| | | Did the patient get well:,,,2022-06-30T00:00:00+08:00,,775,Male
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7580,64cfb2fb83ee5e00274d00fa,2023-07-04T00:00:00+08:00,Outpatient,New health issue,10 - 19,False,5991-NuGkwCh8f0,False,,False,...,,|,,| | | Did the patient get well:,,,2023-08-06T00:00:00+08:00,,5991,Female
7581,6470ccfaeec5650027c1a906,2023-05-06T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,4456-oP4BHJRUHH,False,,False,...,,|,,Released - under observation | | | Did the p...,,,2023-05-26T00:00:00+08:00,,4456,Male
7582,62e193cd76092f002152680f,2022-07-26T00:00:00+08:00,Outpatient,New health issue,0 - 9,False,1289-KETfqONNXk,False,,False,...,,|,,Released | | | Did the patient get well:,,,2022-07-28T00:00:00+08:00,,1289,Male
7583,62cd14dd7aee070021b0d07a,2022-07-12T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,939-dUTc9c1Q8c,False,,False,...,,|,,Need to follow up | | | Did the patient get ...,,,2022-07-12T00:00:00+08:00,,939,Female


In [266]:
df1 = pd.read_csv("ILI_estimate.csv")
df2 = df_new

  df1 = pd.read_csv("ILI_estimate.csv")


In [267]:
df1.drop_duplicates(inplace=True)
df2.drop_duplicates(inplace=True)

In [268]:
len(df1)

3634

In [269]:
len(df2)

3272

In [270]:
import pandas as pd

# Assuming df1 and df2 are your DataFrames

# Merge df1 and df2 with an indicator and perform an outer join
merged_df = pd.merge(df1, df2, how='outer', indicator=True)

# Filter rows that are only in df1
df1_not_in_df2 = merged_df[merged_df['_merge'] == 'left_only']

# Drop the indicator column
df1_not_in_df2 = df1_not_in_df2.drop(columns=['_merge'])

# If needed, reset the index
df1_not_in_df2.reset_index(drop=True, inplace=True)

# df1_not_in_df2 now contains rows from df1 that are not in df2

In [271]:
df1_not_in_df2

Unnamed: 0.1,Unnamed: 0,id,Date of Encounter,Type of encounter,Purpose of encounter,Age Range at Consultation,Is the patient wearing a face mask during the consulation?,Encounter ID 2,Retire Encounter Record?,Reason for Retiring Record,...,Other Adverse Effects,Adverse Events,Did the patient get well?,Patient Disposition Summary,Type of Consultation,Income Classification,Follow-up Schedule,Identifying Physical Qualities,Record ID,Sex
0,4987,64a8e86ec976c10026b63b81,2023-06-12T00:00:00+08:00,Outpatient,Check-up,30 - 39,False,4713-MvITsMpQj5,False,,...,,|,,Released | | | Did the patient get well:,,,2023-07-08T00:00:00+08:00,,4713,Male
1,4768,62e17bbd20a92c0024e6b6a2,2022-07-28T00:00:00+08:00,Outpatient,New health issue,10 - 19,False,1157-umIxx1IkrS,False,,...,,|,,Released | | | Did the patient get well:,,,2022-07-28T00:00:00+08:00,,1157,Female
2,6622,62b423885b6141001e9caf76,2022-06-23T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,683-TrG7TDOZ0X,False,,...,,|,,| | | Did the patient get well:,,,2022-06-23T00:00:00+08:00,,683,Male
3,4230,64700a94c76b6f002a40b941,2023-05-05T00:00:00+08:00,Outpatient,Check-up,60 - 69,False,4437-8Fn4VR1AW7,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-05-26T00:00:00+08:00,,4437,Male
4,5958,63bf9f4ab25151001282e4c4,2023-01-12T00:00:00+08:00,Outpatient,New health issue,0 - 9,False,3048-F1PdYeN2cq,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-01-12T00:00:00+08:00,,3048,Female
5,6329,6456ebcfefd5bb0027295992,2023-05-02T00:00:00+08:00,Outpatient,New health issue,60 - 69,False,4296-TUxORZiYgg,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-05-07T00:00:00+08:00,,4296,Male
6,977,63801d67e1f9870021ece6ba,2022-11-04T00:00:00+08:00,Outpatient,Check-up,0 - 9,False,500-vg68vdH4XV,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2022-11-25T00:00:00+08:00,,500,Female
7,1913,6426a60b336d5100283c5077,2023-03-22T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,27-g320oKpT6b,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2023-03-31T00:00:00+08:00,,27,Female
8,2177,62f8db749b4f6d0021114578,2022-08-09T00:00:00+08:00,Outpatient,Check-up,20 - 29,False,307-Z3LGDUIVjU,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2022-08-14T00:00:00+08:00,,307,Female
9,6578,660e6bc794db0a0029c4a6b7,2024-04-03T00:00:00+08:00,Outpatient,Check-up,70 - 79,False,8067-1Yf0nwJluS,False,,...,,|,,Need to follow up | | | Did the patient get ...,,,2024-04-04T00:00:00+08:00,,8067,Male


In [272]:
doc = nlp(df1_not_in_df2["History of Illness"][0])
print([(ent.text, ent.label_) for ent in doc.ents])

[]


In [273]:
df1_not_in_df2["History of Illness"][0]

'2 weeks PTC patient had feverish episodes and body pains. '