In [1]:
import pandas as pd
import requests
import re
import json
import os
from IPython.display import display

In [2]:
file_path = r"C:\Users\Dell\Desktop\UGM\PRPL\DataRaw1.csv"
df = pd.read_csv(file_path)

In [3]:
# Check for empty cells
empty_cells = df.isnull().sum()
print(empty_cells)

# Check for Duplicates
duplicates = df.duplicated().sum()
print("Number of duplicates:", duplicates)

Free_Text    0
dtype: int64
Number of duplicates: 0


In [4]:
from IPython.display import display

with pd.option_context('display.max_colwidth', None):
    display(df.head())

Unnamed: 0,Free_Text
0,"Pasien Rahmat Widodo, KTP 3276010101010004, 52 tahun, perempuan, alamat Jl. Imam Bonjol No. 78, Surabaya, nomor telepon 081987654321, golongan darah A, nomor rekam medis 727106, datang pada 10/09/2023. Diperiksa oleh Dr. Farida Lestari, KTP 3173010202020004. Keluhan: hipertensi, keluhan tambahan: sesak napas. disarankan mengubah pola makan. Dokter rujukan: Dr. Hasan. keluhan masih tetap dirasakan, perlu observasi lebih lanjut"
1,"Pasien Wawan Purnomo, KTP 3276010101010009, 62 tahun, perempuan, alamat Jl. Sudirman No. 56, Bandung, nomor telepon 081234567890, golongan darah B, nomor rekam medis 870412, datang pada 11/09/2023. Diperiksa oleh Dr. Andika Pratama, KTP 3173010202020003. Keluhan: migrain, keluhan tambahan: nyeri kepala. diberikan suplemen zat besi. Dokter rujukan: Dr. Hasan. direkomendasikan untuk menjaga pola makan dan olahraga"
2,"Pasien Lestari Nugroho, KTP 3276010101010007, 41 tahun, perempuan, alamat Jl. Imam Bonjol No. 78, Surabaya, nomor telepon 081987654321, golongan darah AB, nomor rekam medis 151697, datang pada 27/09/2023. Diperiksa oleh Dr. Sumarno Hadi, KTP 3173010202020007. Keluhan: hipertensi, keluhan tambahan: pusing. diberikan obat antihipertensi. Dokter rujukan: Dr. Hasan. direkomendasikan untuk menjaga pola makan dan olahraga"
3,"Pasien Yudi Raharjo, KTP 3276010101010004, 20 tahun, laki-laki, alamat Jl. Sudirman No. 56, Bandung, nomor telepon 081987654321, golongan darah A, nomor rekam medis 583165, datang pada 3/08/2023. Diperiksa oleh Dr. Wahyu Nugraha, KTP 3173010202020010. Keluhan: infeksi saluran pernapasan, keluhan tambahan: nyeri kepala. diberikan obat antihipertensi. Dokter rujukan: Dr. Dian. perlu dilakukan tes darah ulang"
4,"Pasien Wawan Purnomo, KTP 3276010101010004, 45 tahun, perempuan, alamat Jl. Diponegoro No. 12, Medan, nomor telepon 081234567890, golongan darah A, nomor rekam medis 522131, datang pada 25/08/2023. Diperiksa oleh Dr. Sari Utami, KTP 3173010202020009. Keluhan: migrain, keluhan tambahan: sakit pinggang. diberikan antibiotik selama 5 hari. Dokter rujukan: Dr. Dian. diminta mengurangi konsumsi garam dan lemak"


#### **SIMPLE ENTITY EXTRACTION**

In [5]:
import re

#SIMPLE ENTITY EXTRACTION
def extract_simple(text):
    # KTP
    ktp_matches = re.findall(r'KTP (\d{16})', text)
    patient_ktp = ktp_matches[0] if len(ktp_matches) > 0 else None
    doctor_ktp = ktp_matches[1] if len(ktp_matches) > 1 else None
    
    # AGE
    age_match = re.search(r'(\d+)\s*tahun', text)
    age = int(age_match.group(1)) if age_match else None
    
    # GENDER
    gender_match = re.search(r'(laki-laki|perempuan)', text, re.IGNORECASE)
    gender = gender_match.group(1).lower() if gender_match else None
    
    # PHONE
    phone_match = re.search(r'nomor telepon\s+(08\d{8,11})', text)
    phone = phone_match.group(1) if phone_match else None
    
    # ASSUMING THE MEDICAL RECORD NO IS ALWAYSSS 6 DIGITS
    rekam_medis_match = re.search(r'\b(\d{6})\b', text)
    rekam_medis = rekam_medis_match.group(1) if rekam_medis_match else None
    
    # BLOOD TYPE
    blood_type_match = re.search(r'\b([ABO]|AB)[+-]?\b', text)
    blood_type = blood_type_match.group(0) if blood_type_match else None
    
    # PRIMARY AND ADDITIONAL COMPLAINTS
    complaint_match = re.search(r'keluhan\s*:\s*([^,.]+)', text, re.IGNORECASE)
    additional_complaint_match = re.search(r'keluhan\s*tambahan\s*:\s*([^,.]+)', text, re.IGNORECASE)

    primary_complaint = [complaint_match.group(1).strip()] if complaint_match else []
    additional_complaint = [additional_complaint_match.group(1).strip()] if additional_complaint_match else []

    # PRIMARY AND ADDITIONAL RECOMMENDATIONS & NOTES
    recommendation_matches = re.findall(r'(diberikan|disarankan|direkomendasikan)\s([^,.]+)', text, re.IGNORECASE)
    recommendation = [f"{match[0]} {match[1].strip()}" for match in recommendation_matches]

    notes_match = re.search(r'Dokter rujukan\s*:\s*[^.]+\.?\s*(.*)', text, re.IGNORECASE)
    notes = notes_match.group(1).strip() if notes_match else ""
    notes = re.sub(r'^[A-Za-z]+\b', '', notes).strip()
    notes = notes.split('. ') if notes else []
    notes = [note.strip() for note in notes if note.strip()]  # Clean up and remove empty strings

    # Deduplicate recommendations and notes
    recommendation = list(set(recommendation))  # Remove duplicates within recommendations
    notes = list(set(notes))  # Remove duplicates within notes

    # Ensure recommendations don't overlap with notes
    filtered_recommendation = []
    for rec in recommendation:
        if rec in notes:
            continue  # If the recommendation already exists in notes, exclude it from recommendations
        filtered_recommendation.append(rec)

    # Finalize recommendations and notes
    recommendation = filtered_recommendation

    return {
        "PATIENT_KTP": patient_ktp,
        "DOCTOR_KTP": doctor_ktp,
        "AGE": age,
        "GENDER": gender,
        "PHONE": phone,
        "MEDICAL_RECORD_NUMBER": rekam_medis,
        "BLOOD_TYPE": blood_type,
        "MAIN_COMPLAINT": primary_complaint,
        "ADDITIONAL_COMPLAINT": additional_complaint,
        "RECOMMENDATION": recommendation,
        "NOTES": notes
    }

In [6]:
df['Simple_Fields'] = df['Free_Text'].apply(extract_simple)
df_SimpleFields = pd.json_normalize(df['Simple_Fields'])
df = pd.concat([df, df_SimpleFields], axis=1)

In [7]:
df_SimpleFields.head()

Unnamed: 0,PATIENT_KTP,DOCTOR_KTP,AGE,GENDER,PHONE,MEDICAL_RECORD_NUMBER,BLOOD_TYPE,MAIN_COMPLAINT,ADDITIONAL_COMPLAINT,RECOMMENDATION,NOTES
0,3276010101010004,3173010202020004,52,perempuan,81987654321,727106,A,[hipertensi],[sesak napas],[disarankan mengubah pola makan],"[keluhan masih tetap dirasakan, perlu observas..."
1,3276010101010009,3173010202020003,62,perempuan,81234567890,870412,B,[migrain],[nyeri kepala],[diberikan suplemen zat besi],[direkomendasikan untuk menjaga pola makan dan...
2,3276010101010007,3173010202020007,41,perempuan,81987654321,151697,AB,[hipertensi],[pusing],[diberikan obat antihipertensi],[direkomendasikan untuk menjaga pola makan dan...
3,3276010101010004,3173010202020010,20,laki-laki,81987654321,583165,A,[infeksi saluran pernapasan],[nyeri kepala],[diberikan obat antihipertensi],[perlu dilakukan tes darah ulang]
4,3276010101010004,3173010202020009,45,perempuan,81234567890,522131,A,[migrain],[sakit pinggang],[diberikan antibiotik selama 5 hari],[diminta mengurangi konsumsi garam dan lemak]


#### **COMPLEX ENTITIES EXTRACTION**

In [8]:
from transformers import AutoTokenizer, AutoModelForTokenClassification, pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [9]:
tokenizer = AutoTokenizer.from_pretrained("syafiqfaray/indobert-model-ner")
model = AutoModelForTokenClassification.from_pretrained("syafiqfaray/indobert-model-ner")

ner_pipeline = pipeline("ner", model=model, tokenizer=tokenizer, grouped_entities=True)



In [10]:
# Define the extraction function
def extract_complex(text):
    ner_results = ner_pipeline(text)
    extracted_info = {
        "PATIENT_NAME": None,
        "DOCTOR_NAME": None,
        "REFERRAL": None,
        "ADDRESS": [],
        "CITY": None,
        "DATE_VISIT": []
    }

    unclassified_names = []

    for entity in ner_results:
        label = entity.get('entity_group', 'Unknown')
        word = entity['word'].strip()

        if label == "PER":                      # NAMES
            unclassified_names.append(word)
        elif label == "LOC":                    # ADDRESS
            extracted_info['ADDRESS'].append(word)
        elif label == "GPE":                    # CITY
            extracted_info['CITY'] = word 
        elif label == "DAT":                    # DATE
            extracted_info['DATE_VISIT'].append(word)

    # Classify names based on order or context
    if unclassified_names:
        if len(unclassified_names) >= 1:
            extracted_info['PATIENT_NAME'] = unclassified_names[0]  # Assume first name is patient
        if len(unclassified_names) >= 2:
            extracted_info['DOCTOR_NAME'] = unclassified_names[1]  # Assume second name is doctor
        if len(unclassified_names) >= 3:
            extracted_info['REFERRAL'] = unclassified_names[2]  # Assume third name is referring doctor
            
    # ADDRESS & CITY
    if extracted_info['ADDRESS']:  # Combine address tokens into a single string
        extracted_info['ADDRESS'] = ' '.join(extracted_info['ADDRESS']).strip()
    else:
        extracted_info['ADDRESS'] = None

    if not extracted_info['CITY']:  # If city is not detected, set it to None
        extracted_info['CITY'] = None

    # Clean and finalize extracted information
    for key in extracted_info:
        if isinstance(extracted_info[key], list) and extracted_info[key]:  # Clean list entries
            extracted_info[key] = ' '.join(extracted_info[key]).strip()
        elif not extracted_info[key]:  # None if empty
            extracted_info[key] = None
    
    return extracted_info

In [11]:
# Apply the NER extraction on each row
df['Complex_Fields'] = df['Free_Text'].apply(extract_complex)
df_ComplexFields = pd.json_normalize(df['Complex_Fields'])
data = pd.concat([df, df_ComplexFields], axis=1)

Asking to truncate to max_length but no maximum length is provided and the model has no predefined maximum length. Default to no truncation.


In [12]:
df_ComplexFields.head()

Unnamed: 0,PATIENT_NAME,DOCTOR_NAME,REFERRAL,ADDRESS,CITY,DATE_VISIT
0,rahmat widodo,dr. farida lestari,dr. hasan,jl. imam bonjol no. 78,surabaya,10 / 09 / 2023
1,wawan purnomo,dr. andika pratama,dr. hasan,jl. sudirman no. 56,bandung,11 / 09 / 2023
2,lestari nugroho,dr. sumarno hadi,dr. hasan,jl. imam bonjol no. 78,surabaya,27 / 09 / 2023
3,yudi raharjo,dr. wahyu nugraha,dr. dian,jl. sudirman no. 56,bandung,3 / 08 / 2023
4,wawan purnomo,dr. sari utami,dr. dian,jl. diponegoro no. 12,medan,25 / 08 / 2023


##### **EXTRACTED INFORMATION STANDARDIZATION AND NORMALIZATION**

In [13]:
simple_expand = pd.json_normalize(df['Simple_Fields'])
complex_expand = pd.json_normalize(df['Complex_Fields'])

Extracted_Info = pd.concat([simple_expand, complex_expand], axis=1)

In [14]:
from IPython.display import display

with pd.option_context('display.max_colwidth', None):
    display(Extracted_Info.head())

Unnamed: 0,PATIENT_KTP,DOCTOR_KTP,AGE,GENDER,PHONE,MEDICAL_RECORD_NUMBER,BLOOD_TYPE,MAIN_COMPLAINT,ADDITIONAL_COMPLAINT,RECOMMENDATION,NOTES,PATIENT_NAME,DOCTOR_NAME,REFERRAL,ADDRESS,CITY,DATE_VISIT
0,3276010101010004,3173010202020004,52,perempuan,81987654321,727106,A,[hipertensi],[sesak napas],[disarankan mengubah pola makan],"[keluhan masih tetap dirasakan, perlu observasi lebih lanjut]",rahmat widodo,dr. farida lestari,dr. hasan,jl. imam bonjol no. 78,surabaya,10 / 09 / 2023
1,3276010101010009,3173010202020003,62,perempuan,81234567890,870412,B,[migrain],[nyeri kepala],[diberikan suplemen zat besi],[direkomendasikan untuk menjaga pola makan dan olahraga],wawan purnomo,dr. andika pratama,dr. hasan,jl. sudirman no. 56,bandung,11 / 09 / 2023
2,3276010101010007,3173010202020007,41,perempuan,81987654321,151697,AB,[hipertensi],[pusing],[diberikan obat antihipertensi],[direkomendasikan untuk menjaga pola makan dan olahraga],lestari nugroho,dr. sumarno hadi,dr. hasan,jl. imam bonjol no. 78,surabaya,27 / 09 / 2023
3,3276010101010004,3173010202020010,20,laki-laki,81987654321,583165,A,[infeksi saluran pernapasan],[nyeri kepala],[diberikan obat antihipertensi],[perlu dilakukan tes darah ulang],yudi raharjo,dr. wahyu nugraha,dr. dian,jl. sudirman no. 56,bandung,3 / 08 / 2023
4,3276010101010004,3173010202020009,45,perempuan,81234567890,522131,A,[migrain],[sakit pinggang],[diberikan antibiotik selama 5 hari],[diminta mengurangi konsumsi garam dan lemak],wawan purnomo,dr. sari utami,dr. dian,jl. diponegoro no. 12,medan,25 / 08 / 2023


In [None]:
# STANDARDIZING & CHANGING NECESSARY COLUMNS TO SATUSEHAT AND FHIR4 COMPLIANT FORMAT
Extracted_Info['GENDER'] = Extracted_Info['GENDER'].replace({"perempuan": "female", "laki-laki": "male"})
Extracted_Info['PHONE'] = Extracted_Info['PHONE'].apply(lambda x: "+62" + x[1:] if x.startswith("0") else x)
Extracted_Info["DATE_VISIT"] = pd.to_datetime(Extracted_Info["DATE_VISIT"], format="%d / %m / %Y", dayfirst=True).dt.strftime("%Y-%m-%d")

In [18]:
Extracted_Info.head()

Unnamed: 0,PATIENT_KTP,DOCTOR_KTP,AGE,GENDER,PHONE,MEDICAL_RECORD_NUMBER,BLOOD_TYPE,MAIN_COMPLAINT,ADDITIONAL_COMPLAINT,RECOMMENDATION,NOTES,PATIENT_NAME,DOCTOR_NAME,REFERRAL,ADDRESS,CITY,DATE_VISIT
0,3276010101010004,3173010202020004,52,female,6281987654321,727106,A,[hipertensi],[sesak napas],[disarankan mengubah pola makan],"[keluhan masih tetap dirasakan, perlu observas...",rahmat widodo,dr. farida lestari,dr. hasan,jl. imam bonjol no. 78,surabaya,2023-09-10
1,3276010101010009,3173010202020003,62,female,6281234567890,870412,B,[migrain],[nyeri kepala],[diberikan suplemen zat besi],[direkomendasikan untuk menjaga pola makan dan...,wawan purnomo,dr. andika pratama,dr. hasan,jl. sudirman no. 56,bandung,2023-09-11
2,3276010101010007,3173010202020007,41,female,6281987654321,151697,AB,[hipertensi],[pusing],[diberikan obat antihipertensi],[direkomendasikan untuk menjaga pola makan dan...,lestari nugroho,dr. sumarno hadi,dr. hasan,jl. imam bonjol no. 78,surabaya,2023-09-27
3,3276010101010004,3173010202020010,20,male,6281987654321,583165,A,[infeksi saluran pernapasan],[nyeri kepala],[diberikan obat antihipertensi],[perlu dilakukan tes darah ulang],yudi raharjo,dr. wahyu nugraha,dr. dian,jl. sudirman no. 56,bandung,2023-08-03
4,3276010101010004,3173010202020009,45,female,6281234567890,522131,A,[migrain],[sakit pinggang],[diberikan antibiotik selama 5 hari],[diminta mengurangi konsumsi garam dan lemak],wawan purnomo,dr. sari utami,dr. dian,jl. diponegoro no. 12,medan,2023-08-25


In [19]:
Extracted_Info["VISIT_YEAR"] = pd.to_datetime(Extracted_Info["DATE_VISIT"]).dt.year
AssumedYear = 2023
Extracted_Info["BIRTH_YEAR"] = AssumedYear - Extracted_Info["AGE"]

In [20]:
# VISUALIZATION PURPOSES
ColOrder = ["PATIENT_NAME", "PATIENT_KTP", "MEDICAL_RECORD_NUMBER", "BIRTH_YEAR", "GENDER", "ADDRESS", "CITY", "PHONE",
            "DOCTOR_NAME", "DOCTOR_KTP", "DATE_VISIT", "MAIN_COMPLAINT", "ADDITIONAL_COMPLAINT", "RECOMMENDATION",
            "REFERRAL", "NOTES"]

Extracted_Clean = Extracted_Info[ColOrder] # BLOOD TYPE IS LEFT OUT SINCE IT IS NOT NECESSARY IN FHIR R4 FORMAT

In [21]:
Extracted_Clean.head()

Unnamed: 0,PATIENT_NAME,PATIENT_KTP,MEDICAL_RECORD_NUMBER,BIRTH_YEAR,GENDER,ADDRESS,CITY,PHONE,DOCTOR_NAME,DOCTOR_KTP,DATE_VISIT,MAIN_COMPLAINT,ADDITIONAL_COMPLAINT,RECOMMENDATION,REFERRAL,NOTES
0,rahmat widodo,3276010101010004,727106,1971,female,jl. imam bonjol no. 78,surabaya,6281987654321,dr. farida lestari,3173010202020004,2023-09-10,[hipertensi],[sesak napas],[disarankan mengubah pola makan],dr. hasan,"[keluhan masih tetap dirasakan, perlu observas..."
1,wawan purnomo,3276010101010009,870412,1961,female,jl. sudirman no. 56,bandung,6281234567890,dr. andika pratama,3173010202020003,2023-09-11,[migrain],[nyeri kepala],[diberikan suplemen zat besi],dr. hasan,[direkomendasikan untuk menjaga pola makan dan...
2,lestari nugroho,3276010101010007,151697,1982,female,jl. imam bonjol no. 78,surabaya,6281987654321,dr. sumarno hadi,3173010202020007,2023-09-27,[hipertensi],[pusing],[diberikan obat antihipertensi],dr. hasan,[direkomendasikan untuk menjaga pola makan dan...
3,yudi raharjo,3276010101010004,583165,2003,male,jl. sudirman no. 56,bandung,6281987654321,dr. wahyu nugraha,3173010202020010,2023-08-03,[infeksi saluran pernapasan],[nyeri kepala],[diberikan obat antihipertensi],dr. dian,[perlu dilakukan tes darah ulang]
4,wawan purnomo,3276010101010004,522131,1978,female,jl. diponegoro no. 12,medan,6281234567890,dr. sari utami,3173010202020009,2023-08-25,[migrain],[sakit pinggang],[diberikan antibiotik selama 5 hari],dr. dian,[diminta mengurangi konsumsi garam dan lemak]


In [96]:
Extracted_Info.columns

Index(['PATIENT_KTP', 'DOCTOR_KTP', 'AGE', 'GENDER', 'PHONE',
       'MEDICAL_RECORD_NUMBER', 'BLOOD_TYPE', 'MAIN_COMPLAINT',
       'ADDITIONAL_COMPLAINT', 'RECOMMENDATION', 'NOTES', 'PATIENT_NAME',
       'DOCTOR_NAME', 'REFERRAL', 'ADDRESS', 'CITY', 'DATE_VISIT',
       'VISIT_YEAR', 'BIRTH_YEAR'],
      dtype='object')

In [95]:
Extracted_Clean.columns

Index(['PATIENT_NAME', 'PATIENT_KTP', 'MEDICAL_RECORD_NUMBER', 'BIRTH_YEAR',
       'GENDER', 'ADDRESS', 'CITY', 'PHONE', 'DOCTOR_NAME', 'DOCTOR_KTP',
       'DATE_VISIT', 'MAIN_COMPLAINT', 'ADDITIONAL_COMPLAINT',
       'RECOMMENDATION', 'REFERRAL', 'NOTES'],
      dtype='object')

#### **JSON MAPPING**

##### **PATIENT RESOURCES**

In [97]:
def map_patient(row, patient_ID):
    return {
     "resourceType": "Patient",
     "id": patient_ID,
     "identifier": [
         {
             "system": "http://satusehat.kemkes.go.id/nik",
             "value": row["PATIENT_KTP"]
         },
         {
             "system": "https://fhir.kemkes.go.id/id/ihs-number", 
             "value": row["MEDICAL_RECORD_NUMBER"]
         }, 
     ],
     "active": True,
     "name": [
         {
             "use": "official",
             "text": row["PATIENT_NAME"]
         }
     ],
     "gender": row["GENDER"],
     "birthDate": str(row["BIRTH_YEAR"]),
     "telecom": [
         {
             "system": "phone",
             "value": row["PHONE"]
         }
     ],
     "address": [
         {
             "line": [row["ADDRESS"]],
             "city": row["CITY"],
             "country": "ID"
         }
     ],
     "meta": {
         "profile": [
             "http://hl7.org/fhir/StructuredDefinition/Patient" # DUMMY LINK
         ]
     }
    }

##### **PRACTITIONER RESOURCES**

In [23]:
def map_practitioner(row, practitioner_ID):
    return {
        "resourceType": "Practitioner",
        "id": practitioner_ID,
        "identifier": [
            {
                "system": "http://satusehat.kemkes.go.id/nik",
                "value": row["DOCTOR_KTP"]
            }
        ],
        "active": True,
        "name": [
            {
                "text": row["DOCTOR_NAME"]
            }
        ],
        "meta": {
            "profile": [
                "http://hl7.org/fhir/StructuredDefinition/Practitioner" # DUMMY LINK
            ]
        }
    }

##### **CONDITION RESOURCES**

In [24]:
def map_condition(row, condition_ID):
    return {
        "resourceType": "Condition",
        "id": condition_ID,
        "clinicalStatus": {
            "coding": [
                {
                    "system": "http://terminology.hl7.org/CodeSystem/condition-clinical",
                    "code": "active",
                    "display": "Active"
                }
            ]
        },
        "code": {
            "text": ", ".join(row.get("MAIN_COMPLAINT", []) + row.get("ADDITIONAL_COMPLAINT", []))
        },
        "note": {
            "text": row["RECOMMENDATION"]  
        },
        "subject": {
            "reference": f"Patient/{row['PATIENT_KTP']}"
        },
        "onsetDateTime": row["DATE_VISIT"]
    }

##### **ENCOUNTER RESOURCES**

In [83]:
def map_encounter(row, encounter_ID):
    return{
        "resourceType": "Encounter",
        "id": encounter_ID,
        "subject": {
            "reference": f"Patient/{row['PATIENT_KTP']}"
        },
        "period": {
            "start": row["DATE_VISIT"]
        },
        "participant": [
            {
                "individual": {
                    "reference": f"Referral Doctor/{row['REFERRAL']}"
                }
            }
        ],
        "note": [
            {
                "text": row["NOTES"]
            }
        ]
    }

##### **COMBINING RESOURCES**

In [84]:
import uuid

def generate_uuid():
    return str(uuid.uuid4())

def map_all(row):
    # Generate a consistent ID for the patient
    patient_id = generate_uuid()
    practitioner_id = patient_id
    encounter_id = patient_id
    condition_id = patient_id
    
    return {
        "Patient": map_patient(row, patient_id),
        "Practitioner": map_practitioner(row, practitioner_id),
        "Condition": map_condition(row, condition_id),
        "Encounter": map_encounter(row, encounter_id)
    }

In [85]:
# SAVING ALL ROWS INTO INDIVIDUAL .JSON FILES
import os
import json

def saveJSON (df, output_dir):
    os.makedirs(output_dir, exist_ok=True)
    
    for index, row in df.iterrows():
        row_data = map_all(row)
        
        output_file = os.path.join(output_dir, f"row_{index+1}.json")
        
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(row_data, f, indent=4, ensure_ascii=False)
            
        print(f"Saved JSON for row {index+1} to {output_file} successfully.")

In [None]:
# output_dir = r"C:\Users\Dell\Desktop\UGM\PRPL\OUTPUT2"
# saveJSON(Extracted_Clean, output_dir)

#### **FREE_TEXT INPUT**

In [112]:
def free_input():
    print("Enter your text: ")
    text = input("> ").strip()
    
    print("\nInput Text:")
    print(text)
    
    simple_data = extract_simple(text)
    complex_data = extract_complex(text)
    combined_data = {**simple_data, **complex_data}
    
    df_Free = pd.DataFrame([combined_data])
    
    df_Free['GENDER'] = df_Free['GENDER'].replace({"perempuan": "female", "laki-laki": "male"})
    df_Free['PHONE'] = df_Free['PHONE'].apply(lambda x: "+62" + x[1:] if x.startswith("0") else x)
    df_Free['DATE_VISIT'] = pd.to_datetime(df_Free['DATE_VISIT'], format="%d / %m / %Y", dayfirst=True).dt.strftime("%Y-%m-%d")
    df_Free['VISIT_YEAR'] = pd.to_datetime(df_Free['DATE_VISIT']).dt.year
    AssumedYear = 2024
    df_Free['BIRTH_YEAR'] = AssumedYear - df_Free['AGE']
    
    standardized_data = df_Free.to_dict(orient="records")[0]
    
    # Display the results
    print("\nStandardized Information:")
    for key, value in standardized_data.items():
        print(f"{key}: {value}")
        
    # Apply `map_all` to the DataFrame row
    mapped_data = map_all(df_Free.iloc[0])  # Pass the first (and only) row as a Series to `map_all`
    
    # DEBUGGING PURPOSES - SHOW FHIR FORMAT
    print("\nMapped FHIR Resources:")
    for resource_type, resource_data in mapped_data.items():
        print(f"\n{resource_type}:")
        print(json.dumps(resource_data, indent=4))
    
    # Save to JSON if desired
    save_choice = input("\nDo you want to save the standardized data to a JSON file? (yes/no): ").strip().lower()
    if save_choice == 'yes':
        output_file = f"JSON_FORMAT_{generate_uuid()}.json"
        with open(output_file, 'w', encoding='utf-8') as f:
            json.dump(mapped_data, f, indent=4, ensure_ascii=False)
        print(f"File saved to {output_file}")

In [110]:
# Define the main function
def main():
    print("Type 'exit' at any time to quit.\n")
    
    while True:
        user_input = input("Do you want to process a new text? (yes/no): ").strip().lower()
        if user_input == "yes":
            free_input()
        elif user_input == "no" or user_input == "exit":
            print("Exit")
            break
        else:
            print("Invalid input. Please type 'yes' or 'no'.")

In [116]:
# Run the program
if __name__ == "__main__":
    main()

Type 'exit' at any time to quit.

Enter your text: 

Input Text:
Alice Queen, perempuan, nomor telepon 081923938478, 25 tahun, KTP 12348782909991992, alamat Jl. Sains No. 18, Serang, golongan darah AB, nomor rekam medis 782912, datang pada 07/12/2023, Diperiksa oleh Dr. Matt, KTP 2177727817828889, keluhan: sakit kepala, keluhan tambahan: sakit gigi. disarankan untuk tidur. dokter rujukan: Dr. Steve. tidur and makan

Standardized Information:
PATIENT_KTP: 1234878290999199
DOCTOR_KTP: 2177727817828889
AGE: 25
GENDER: female
PHONE: +6281923938478
MEDICAL_RECORD_NUMBER: 782912
BLOOD_TYPE: AB
MAIN_COMPLAINT: ['sakit kepala']
ADDITIONAL_COMPLAINT: ['sakit gigi']
RECOMMENDATION: ['disarankan untuk tidur']
NOTES: ['tidur and makan']
PATIENT_NAME: alice queen
DOCTOR_NAME: dr. matt
REFERRAL: dr. steve
ADDRESS: jl. sains no. 18
CITY: serang
DATE_VISIT: 2023-12-07
VISIT_YEAR: 2023
BIRTH_YEAR: 1999

Mapped FHIR Resources:

Patient:
{
    "resourceType": "Patient",
    "id": "5bfba629-303c-4c80-a1dd