## Creating sample_medical_reports

In [3]:
from fpdf import FPDF
import os

# Create output folder
os.makedirs("sample_medical_reports", exist_ok=True)

reports = {
    "report_1": """Patient Name: John Doe
Age: 45 years
Gender: Male
Report Date: 15/08/2025
Test Results:
Blood Test: Normal
ECG: Abnormal
X-ray Chest: Clear
MRI Brain: No abnormalities
Diagnosis:
Hypertension, Mild anemia""",

    "report_2": """Name: Sarah Lee
Age: 32 yrs
Sex: Female
Date: 20-08-2025
Laboratory Results:
Complete Blood Count (CBC): WBC 6.5x10^3/uL, RBC 4.8x10^6/uL
Thyroid Function Test: TSH 3.0 mIU/L
Ultrasound Abdomen: Normal
Observations:
Diabetes Mellitus Type 2, Hypothyroidism""",

    "report_3": """Patient Name: Michael Smith
Age: 60
Gender: M
Report Date: 10/08/2025
Test Results:
CT Scan Chest: Signs of pneumonia
Blood Test: Elevated WBC
ECG: Normal
Diagnosis:
Pneumonia""",

    "report_4": """Patient Name: Priya Kumar
Age: 28 years
Gender: Female
Report Date: 18/08/2025
Test Results:
Ultrasound Pelvis: Normal
Pap Smear: Negative
Blood Test: Hemoglobin 12.5 g/dL
Diagnosis:
Normal""",

    "report_5": """Patient Name: Ahmed Ali
Age: 55 y/o
Sex: Male
Date: 22/08/2025
Laboratory Results:
Liver Function Test: ALT 45 U/L, AST 38 U/L
Kidney Function Test: Creatinine 1.2 mg/dL
CT Scan Abdomen: Fatty liver
Diagnosis:
Fatty Liver, Hypertension""",

    "report_6": """Patient Name: Emily Davis
Age: 40
Gender: F
Report Date: 19/08/2025
Test Results:
MRI Spine: Mild disc protrusion at L4-L5
X-ray Spine: Normal
Blood Test: Normal
Diagnosis:
Lower back pain due to disc protrusion""",

    "report_7": """Patient Name: Rajesh Nair
Age: 50 years
Gender: Male
Report Date: 17/08/2025
Test Results:
ECG: Abnormal
Stress Test: Positive
Echocardiogram: Mild LV dysfunction
Blood Test: Troponin Normal
Diagnosis:
Coronary Artery Disease""",

    "report_8": """Patient Name: Ananya Singh
Age: 35
Sex: Female
Date: 21/08/2025
Laboratory Results:
Blood Sugar Fasting: 110 mg/dL
HbA1c: 6.8%
Lipid Panel: Cholesterol 210 mg/dL, LDL 140 mg/dL
Ultrasound Abdomen: Normal
Observations:
Prediabetes, Hyperlipidemia""",

    "report_9": """Patient Name: Robert Brown
Age: 65
Gender: M
Report Date: 16/08/2025
Test Results:
Pulmonary Function Test: FEV1 70%
X-ray Chest: Mild fibrosis
Blood Test: Normal
Diagnosis:
COPD""",

    "report_10": """Patient Name: Fatima Zahra
Age: 30 years
Gender: Female
Report Date: 23/08/2025
Test Results:
Pregnancy Test: Positive
Ultrasound Pelvis: Single intrauterine pregnancy
Blood Test: Normal
Diagnosis:
Early Pregnancy"""
}


# Styles for different reports
styles = [
    {"font": "Arial", "size": 12},
    {"font": "Times", "size": 14},
    {"font": "Courier", "size": 12},
    {"font": "Arial", "size": 10},
    {"font": "Times", "size": 12},
    {"font": "Courier", "size": 14},
    {"font": "Arial", "size": 13},
    {"font": "Times", "size": 11},
    {"font": "Courier", "size": 13},
    {"font": "Arial", "size": 15},
]

# Ensure output folder exists
os.makedirs(r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples", exist_ok=True)

for i, (report_name, content) in enumerate(reports.items()):
    pdf = FPDF()
    pdf.add_page()
    style = styles[i % len(styles)]
    pdf.set_font(style["font"], '', style["size"])
    pdf.multi_cell(0, 8, content)
    
    # Correct f-string for dynamic file names
    output_path = f"C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\{report_name}.pdf"
    pdf.output(output_path)

print("PDF reports generated successfully.")



PDF reports generated successfully.


## Importing Required Libraries

In [5]:

import os
import re
import json
import pandas as pd
import pdfplumber
import pytesseract
from PIL import Image
from pathlib import Path
import spacy
from typing import Dict, List, Union
import fitz  
import pytesseract
import io
from PIL import Image

nlp_general = spacy.load("en_core_web_sm")       # For general entities (names)
nlp_medical = spacy.load("en_core_sci_sm")       # For medical domain entities (tests, diseases, etc.)

  deserializers["tokenizer"] = lambda p: self.tokenizer.from_disk(  # type: ignore[union-attr]


# Text Exrtraction

In [6]:

def extract_text_from_pdfs(pdf_paths: List[str]) -> Dict[str, str]:
    """
    Extract text from multiple PDF files using pdfplumber for digital PDFs and OCR for image-based PDFs.
    """
    results = {}
    
    for pdf_path in pdf_paths:
        text = ""
        try:
            # Try extracting with pdfplumber first (digital PDFs)
            with pdfplumber.open(pdf_path) as pdf:
                for page in pdf.pages:
                    page_text = page.extract_text()
                    if page_text:
                        text += page_text + "\n"
            
            # If nothing extracted (image-based PDF), do OCR
            if not text.strip():
                with pdfplumber.open(pdf_path) as pdf:
                    for page in pdf.pages:
                        pil_img = page.to_image(resolution=300).original
                        text += pytesseract.image_to_string(pil_img) + "\n"
            
            results[pdf_path] = text.strip()
        
        except Exception as e:
            print(f"Error reading PDF {pdf_path}: {e}")
            results[pdf_path] = ""  # Store empty string for failed PDFs
    
    return results


In [None]:

sample_pdfs = [r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_1.pdf",
               r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_2.pdf",
               r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_3.pdf",
               r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_4.pdf",
               r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_5.pdf",
               r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_6.pdf"]
extracted_texts = extract_text_from_pdfs(sample_pdfs)

all_text = "\n".join(extracted_texts.values())

# Preview first 500 characters of each PDF's extracted text
for pdf_path, text in extracted_texts.items():
    print(f"\nText from {pdf_path} (first 500 chars):")
    print(text[:500])


print("\n Combined all text (first 500 chars):")
print(all_text[:500])


Text from C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_1.pdf :
Patient Name: John Doe
Age: 45 years
Gender: Male
Report Date: 15/08/2025
Test Results:
Blood Test: Normal
ECG: Abnormal
X-ray Chest: Clear
MRI Brain: No abnormalities
Diagnosis:
Hypertension, Mild anemia

Text from C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_2.pdf :
Name: Sarah Lee
Age: 32 yrs
Sex: Female
Date: 20-08-2025
Laboratory Results:
Complete Blood Count (CBC): WBC 6.5x10^3/uL, RBC 4.8x10^6/uL
Thyroid Function Test: TSH 3.0 mIU/L
Ultrasound Abdomen: Normal
Observations:
Diabetes Mellitus Type 2, Hypothyroidism

Text from C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\samples\report_3.pdf :
Patient Name: Michael Smith
Age: 60
Gender: M
Report Date: 10/08/2025
Test Results:
CT Scan Chest: Signs of pneumonia
Blood Test: Elevated WBC
ECG: Normal
Diagnosis:
Pneumonia

Text from C:\Users\pessh\Desktop\Medical Report Data Extraction\

In [8]:
print(extracted_texts)

{'C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_1.pdf': 'Patient Name: John Doe\nAge: 45 years\nGender: Male\nReport Date: 15/08/2025\nTest Results:\nBlood Test: Normal\nECG: Abnormal\nX-ray Chest: Clear\nMRI Brain: No abnormalities\nDiagnosis:\nHypertension, Mild anemia', 'C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_2.pdf': 'Name: Sarah Lee\nAge: 32 yrs\nSex: Female\nDate: 20-08-2025\nLaboratory Results:\nComplete Blood Count (CBC): WBC 6.5x10^3/uL, RBC 4.8x10^6/uL\nThyroid Function Test: TSH 3.0 mIU/L\nUltrasound Abdomen: Normal\nObservations:\nDiabetes Mellitus Type 2, Hypothyroidism', 'C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_3.pdf': 'Patient Name: Michael Smith\nAge: 60\nGender: M\nReport Date: 10/08/2025\nTest Results:\nCT Scan Chest: Signs of pneumonia\nBlood Test: Elevated WBC\nECG: Normal\nDiagnosis:\nPneumonia', 'C:\\Users\\pessh\\Desktop\\Medic

## Preprocess and Extract

In [9]:
import re
import json
import spacy
from typing import Dict, List, Union

# Load models
nlp_name = spacy.load("en_core_web_sm")        # For name detection
nlp_medical = spacy.load("en_ner_bc5cdr_md")   # For medical NER (diseases)

def preprocess_text(text: str) -> str:
    """
    Clean up text by removing extra spaces and line breaks.
    """
    if not text:
        return ""
    text = re.sub(r"\n+", " ", text)  # Replace multiple newlines with space
    text = re.sub(r"\s{2,}", " ", text)  # Replace multiple spaces with single space
    return text.strip()

def extract_patient_info(text: str) -> Dict[str, str]:
    """
    Extract basic patient information using regex patterns.
    """
    info = {}
    
    # Extract name
    name_match = re.search(r"(?:Patient Name|Name)[:\s]+([A-Za-z\s]+?)(?=\n|Age|$)", text, re.IGNORECASE)
    if name_match:
        info["Patient Name"] = name_match.group(1).strip()
    
    # Extract age
    age_match = re.search(
        r"\bAge\b[:=\s]{0,8}(\d{1,3})(?:\s*(?:years?|yrs?|y/o|year-old))?", 
        text, 
        re.IGNORECASE
    )

    if age_match:
        info["Age"] = age_match.group(1)

    # Extract gender
    gender_match = re.search(r"(?:Gender|Sex)[:\s]*(Male|Female|M|F)", text, re.IGNORECASE)
    if gender_match:
        gender = gender_match.group(1)
        # Standardize gender format
        if gender.upper() in ["M", "MALE"]:
            info["Gender"] = "Male"
        elif gender.upper() in ["F", "FEMALE"]:
            info["Gender"] = "Female"
    
    # Extract date
    date_match = re.search(r"(?:Report Date|Date)[:\s]*(\d{2}[/-]\d{2}[/-]\d{4})", text, re.IGNORECASE)
    if date_match:
        info["Report Date"] = date_match.group(1)
    
    return info

def extract_tests(text: str) -> List[Dict[str, str]]:
    """
    Extract test results + mentions of known tests from text.
    """
    tests = []
    

    test_list = [
        "X-ray", "CT scan", "MRI", "ultrasound", "blood test", "ECG", "echocardiogram", "biopsy",
        "urinalysis", "stool test", "colonoscopy", "endoscopy", "pulmonary function test",
        "mammogram", "Pap smear", "bone density test", "angiogram", "PET scan",
        "lumbar puncture", "stress test", "Holter monitor", "EEG", "EMG", "allergy test",
        "genetic testing", "vital signs", "physical examination", "glucose tolerance test",
        "thyroid function test", "lipid panel", "liver function test", "kidney function test",
        "coagulation test", "C-reactive protein test", "troponin test", "arterial blood gas",
        "culture and sensitivity", "COVID-19 test", "influenza test", "pregnancy test"
    ] 
    
    test_section_match = re.search(
        r"(?:Test Results|Laboratory Results)[:\s]*(.*?)(?=Diagnosis|Observations|$)", 
        text, re.IGNORECASE | re.DOTALL
    )
    
    if test_section_match:
        test_text = test_section_match.group(1)
        
        
        test_pattern = r"([A-Za-z\s\(\)/\-]+?):\s*([\d\.\-x^/]+|Normal|Abnormal|Positive|Negative)\s*([a-zA-Z/%\^\-\u00B5]*)(?:\s*\([^\)]+\))?"
        
        test_matches = re.findall(test_pattern, test_text, re.IGNORECASE)
        
        for match in test_matches:
            test_name, value, unit = match
            tests.append({
                "Test": test_name.strip(),
                "Value": value.strip(),
                "Unit": unit.strip()
            })
    

    for test_name in test_list:
        pattern = r"\b" + re.escape(test_name) + r"\b"
        if re.search(pattern, text, re.IGNORECASE):
            tests.append({
                "Test": test_name,
                "Value": "Detected",
                "Unit": ""
            })
    
    return tests

def extract_diagnosis(text: str) -> List[str]:
    """
    Extract diagnoses using both regex patterns and medical NER.
    """
    diagnoses = []
    
   
    diag_section_match = re.search(r"(?:Diagnosis|Diagnosis/Observations|Observations|Impression)[:\s]*(.*?)(?=Test Results|Patient|$)", text, re.IGNORECASE | re.DOTALL)
    
    if diag_section_match:
        diag_text = diag_section_match.group(1)
        
        # Use medical NER to extract diseases
        doc = nlp_medical(diag_text)
        for ent in doc.ents:
            if ent.label_ == "DISEASE":
                diagnoses.append(ent.text)
        
        
        common_diagnoses = [
            "diabetes", "hypertension", "asthma", "pneumonia", "cancer", "stroke",
            "myocardial infarction", "heart failure", "arrhythmia", "coronary artery disease",
            "COPD", "bronchitis", "influenza", "tuberculosis", "HIV", "AIDS", "hepatitis",
            "cirrhosis", "GERD", "IBS", "Crohn's disease", "ulcerative colitis", "appendicitis",
            "rheumatoid arthritis", "osteoarthritis", "osteoporosis", "lupus", "fibromyalgia",
            "migraine", "epilepsy", "Alzheimer's", "Parkinson's", "multiple sclerosis",
            "anxiety", "depression", "bipolar disorder", "schizophrenia", "anemia",
            "hyperthyroidism", "hypothyroidism", "chronic kidney disease", "psoriasis", "eczema","Early Pregnancy"
        ]
        
        for diag in common_diagnoses:
            if re.search(r"\b" + diag + r"\b", diag_text, re.IGNORECASE):
                if diag not in diagnoses:
                    diagnoses.append(diag)
    
    return list(set(diagnoses))  # Remove duplicates

def extract_medical_info(text: str) -> List[Dict[str, Union[str, List[Dict[str, str]], List[str]]]]:
    """
    Main function to extract all medical information from text.
    """
    # Preprocess the text
    clean_text = preprocess_text(text)
    
    # Extract patient information
    patient_info = extract_patient_info(clean_text)
    
    # Extract tests
    tests = extract_tests(clean_text)
    
    # Extract diagnosis
    diagnosis = extract_diagnosis(clean_text)
    
    # Combine all information
    result = {
        "Patient Name": patient_info.get("Patient Name", ""),
        "Age": patient_info.get("Age", ""),
        "Gender": patient_info.get("Gender", ""),
        "Report Date": patient_info.get("Report Date", ""),
        "Tests": tests,
        "Diagnosis": diagnosis
    }
    
    return [result]


# result = extract_medical_info(extracted_texts)


# print("Extracted Medical Information:")
# print(json.dumps(result, indent=2))

all_results = {}

for file_path, text in extracted_texts.items():
    all_results[file_path] = extract_medical_info(text)

# Print the results
print("Extracted Medical Information:")
print(json.dumps(all_results, indent=2))


Extracted Medical Information:
{
  "C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_1.pdf": [
    {
      "Patient Name": "John Doe",
      "Age": "45",
      "Gender": "Male",
      "Report Date": "15/08/2025",
      "Tests": [
        {
          "Test": "Blood Test",
          "Value": "Normal",
          "Unit": "ECG"
        },
        {
          "Test": "X-ray",
          "Value": "Detected",
          "Unit": ""
        },
        {
          "Test": "MRI",
          "Value": "Detected",
          "Unit": ""
        },
        {
          "Test": "blood test",
          "Value": "Detected",
          "Unit": ""
        },
        {
          "Test": "ECG",
          "Value": "Detected",
          "Unit": ""
        }
      ],
      "Diagnosis": [
        "Hypertension",
        "anemia",
        "hypertension"
      ]
    }
  ],
  "C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_2.pdf": [
    {
    

## Save to csv

In [None]:
import pandas as pd

def save_medical_info(all_results: dict, output_csv: str):
    """
    Save extracted medical information to a CSV file with one row per patient.
    """
    rows = []

    for file_path, reports in all_results.items():
        for report in reports:
            patient_name = report.get("Patient Name", "")
            age = report.get("Age", "")
            gender = report.get("Gender", "")
            report_date = report.get("Report Date", "")

            # Combine tests into a single string
            if report.get("Tests"):
                tests_combined = "; ".join([f"{t['Test']}: {t['Value']} {t['Unit']}".strip() for t in report["Tests"]])
            else:
                tests_combined = ""

            # Combine diagnoses into a single string
            diagnoses_combined = ", ".join(report.get("Diagnosis", []))

            rows.append({
                "File": file_path,
                "Patient Name": patient_name,
                "Age": age,
                "Gender": gender,
                "Report Date": report_date,
                "Tests": tests_combined,
                "Diagnosis": diagnoses_combined
            })

    # Convert to DataFrame and save
    df = pd.DataFrame(rows)
    df.to_csv(output_csv, index=False)
    print(f"CSV saved at: {output_csv}")


In [11]:
save_medical_info(
    all_results,
    r"C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\medical_info_sixth.csv"
)

CSV saved at: C:\Users\pessh\Desktop\Medical Report Data Extraction\Med_a\data\medical_info_sixth.csv


In [12]:
all_results

{'C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_1.pdf': [{'Patient Name': 'John Doe',
   'Age': '45',
   'Gender': 'Male',
   'Report Date': '15/08/2025',
   'Tests': [{'Test': 'Blood Test', 'Value': 'Normal', 'Unit': 'ECG'},
    {'Test': 'X-ray', 'Value': 'Detected', 'Unit': ''},
    {'Test': 'MRI', 'Value': 'Detected', 'Unit': ''},
    {'Test': 'blood test', 'Value': 'Detected', 'Unit': ''},
    {'Test': 'ECG', 'Value': 'Detected', 'Unit': ''}],
   'Diagnosis': ['Hypertension', 'anemia', 'hypertension']}],
 'C:\\Users\\pessh\\Desktop\\Medical Report Data Extraction\\Med_a\\data\\samples\\report_2.pdf': [{'Patient Name': 'Sarah Lee',
   'Age': '32',
   'Gender': 'Female',
   'Report Date': '20-08-2025',
   'Tests': [{'Test': 'mIU/L Ultrasound Abdomen',
     'Value': 'Normal',
     'Unit': ''},
    {'Test': 'ultrasound', 'Value': 'Detected', 'Unit': ''},
    {'Test': 'thyroid function test', 'Value': 'Detected', 'Unit': ''}],
   'Diagnosis': ['