## 1. Overview

- Input: Lab report file (PDF or image)
- Output: Structured JSON with test names, values, units, patient details
- Machine learning: Adapts to new formats

## 2. Required Libraries

Install and import necessary libraries for PDF/image reading, OCR, and NLP.

In [None]:
%pip install pytesseract
%pip install pdfplumber
%pip install spacy

# Install required packages (uncomment if needed)
# !pip install pytesseract pdfplumber pillow spacy

import pytesseract
from PIL import Image
import pdfplumber
import spacy
import re
import json


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.

[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Additional imports for file handling and type detection
import pandas as pd
import requests
import os
import mimetypes
from pathlib import Path

# Read the report URLs from 'Medical report.xlsx'.
# The URL column is the first one whose header contains "url" (case-insensitive).
sheet = pd.read_excel('Medical report.xlsx')
# str(col) guards against non-string headers (e.g. numeric column labels).
url_col = next((col for col in sheet.columns if 'url' in str(col).lower()), None)
if url_col is None:
    # Fail with an actionable message instead of an opaque IndexError.
    raise ValueError(
        "No column containing 'url' found in 'Medical report.xlsx'; "
        f"available columns: {list(sheet.columns)}"
    )
# Empty cells are dropped so only real entries are downloaded.
urls = sheet[url_col].dropna().tolist()

# Download files
def download_files(urls, download_dir='lab_reports', timeout=30):
    """Download each URL into ``download_dir`` and return the local paths.

    Args:
        urls: Iterable of URL strings to fetch.
        download_dir: Directory the files are written to; created if missing.
        timeout: Seconds to wait for the server before giving up on a request.

    Returns:
        List of local file paths, in the same order as ``urls``.

    Raises:
        ValueError: If no file name can be derived from a URL's path.
        requests.HTTPError: If the server answers with an error status.
    """
    from urllib.parse import urlsplit

    os.makedirs(download_dir, exist_ok=True)
    local_paths = []
    for url in urls:
        # Take the name from the URL *path* only, so a query string or
        # fragment (e.g. "?token=...") never ends up in the file name.
        name = os.path.basename(urlsplit(url).path)
        if not name:
            raise ValueError(f'Cannot derive a file name from URL: {url!r}')
        filename = os.path.join(download_dir, name)
        # Without a timeout a stalled server would hang the notebook forever.
        r = requests.get(url, timeout=timeout)
        # Do not save an HTML error page as if it were a lab report.
        r.raise_for_status()
        with open(filename, 'wb') as f:
            f.write(r.content)
        local_paths.append(filename)
    return local_paths

# Fetch every URL collected above and show where each file was saved.
local_files = download_files(urls)
print('Downloaded files:', local_files)

Downloaded files: ['lab_reports/82226685_Report_50636000175_RATHNA%20GABRIAL.pdf', 'lab_reports/72829849_MultiLabIdReport.pdf', 'lab_reports/17756177_50641000301.pdf', 'lab_reports/32187653_MRS.%20SATHYAVATHY.pdf', 'lab_reports/53132939_MR.%20PRABHAKARAN.pdf', 'lab_reports/28396850_prabhakaran%20nair.pdf', 'lab_reports/85496534_77e377bf-8723-488e-b33d-cbde2becae3c%20NAGASHANKAR%20RAO.pdf', 'lab_reports/91681339_f3b8288c-4bcf-4c1c-b242-e05f152b8e62%20SWARNA%20LATA.pdf', 'lab_reports/97308261_Mr.%20MANISH%20BAJPIE.pdf']


## 3. Read Lab Report File

Functions to extract text from PDF or image files.

In [None]:
# Extract text from PDF
def extract_text_from_pdf(pdf_path):
    """Return the text of every page of a PDF, one newline after each page.

    Args:
        pdf_path: Path of the PDF file to read.

    Returns:
        The concatenated page texts; a page without extractable text
        contributes only its trailing newline.
    """
    with pdfplumber.open(pdf_path) as pdf:
        # extract_text() may give None for a page with no text layer
        # (e.g. a scanned page); treat that as empty instead of crashing
        # on None + "\n".
        page_texts = [(page.extract_text() or "") + "\n" for page in pdf.pages]
    return "".join(page_texts)

# Extract text from image
def extract_text_from_image(image_path):
    """Run OCR on an image file and return the recognised text.

    Args:
        image_path: Path of the image file to read.

    Returns:
        The string produced by ``pytesseract.image_to_string``.
    """
    # The context manager closes the underlying file handle once OCR is
    # done, so processing many reports does not leak open files.
    with Image.open(image_path) as img:
        return pytesseract.image_to_string(img)

## 4. Information Extraction

Extract test names, values, units, and patient details using regex and NLP.

In [None]:
# Improved regex-based extraction for patient name and tests

# Patient label ('Patient Name', 'PatientName', 'Name' or 'Patient') followed
# by the name. The lookbehind stops 'Name' from matching inside a longer word
# such as 'FileName'.
_PATIENT_RE = re.compile(
    r"(?<![A-Za-z])(?:Patient\s*Name|Name|Patient)[:\s]+([A-Za-z ,.'-]+)"
)
# 'TestName: value unit' or 'TestName value unit'. The name must start with a
# letter and the value must contain at least one digit, so separators like
# '...' are never reported as results.
_TEST_RE = re.compile(
    r"([A-Za-z][A-Za-z ]*)[\s:]+(\d+(?:\.\d*)?|\.\d+)\s*([A-Za-z/%]+)"
)


def extract_lab_info(text):
    """Extract the patient name and test results from report text.

    Args:
        text: Plain text of a lab report. ``None`` or an empty string is
            treated as a report with nothing to extract.

    Returns:
        A dict with keys:
            ``patient_name``: the matched name, or ``None`` if not found.
            ``tests``: list of dicts with ``test_name``, ``value`` (kept as
                the matched string), ``unit`` and ``confidence``.
            ``confidence``: 1.0 when a patient name was found, else 0.5.
    """
    if not text:
        return {"patient_name": None, "tests": [], "confidence": 0.5}

    results = [
        {
            "test_name": match.group(1).strip(),
            "value": match.group(2),
            "unit": match.group(3),
            "confidence": 1.0,  # Rule-based extraction assumed perfect match
        }
        for match in _TEST_RE.finditer(text)
    ]
    patient = _PATIENT_RE.search(text)
    patient_name = patient.group(1).strip() if patient else None
    return {
        "patient_name": patient_name,
        "tests": results,
        "confidence": 1.0 if patient_name else 0.5,
    }

## 5. Convert to JSON

Load a spaCy NER model, combine its entities with the rule-based results, and save each report's extracted information as structured JSON.

In [None]:
# Example: Use spaCy for NER (customize with training data)
%pip install spacy
import spacy
from spacy.tokens import DocBin

# Load the pre-trained spaCy pipeline, downloading it first if loading fails
_SPACY_MODEL = 'en_core_web_sm'
try:
    nlp = spacy.load(_SPACY_MODEL)
except OSError:
    # Loading failed with OSError (presumably the model package is not
    # installed yet): fetch it, then load again.
    from spacy.cli import download
    download(_SPACY_MODEL)
    nlp = spacy.load(_SPACY_MODEL)

def extract_entities_with_spacy(text):
    """Run the loaded spaCy pipeline on ``text`` and list its named entities.

    Each entity is returned as a dict with its ``text``, its ``label`` and a
    heuristic ``confidence``: 1.0 for PERSON/ORG/DATE/GPE labels, 0.8 for
    every other label. The score is a fixed rule of thumb, not a model
    probability.
    """
    high_confidence_labels = ['PERSON', 'ORG', 'DATE', 'GPE']
    return [
        {
            'text': span.text,
            'label': span.label_,
            'confidence': 1.0 if span.label_ in high_confidence_labels else 0.8,
        }
        for span in nlp(text).ents
    ]

# Example usage:
# text = extract_text_from_pdf(local_files[0])
# entities = extract_entities_with_spacy(text)
# print(entities)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [None]:
# Example usage for full pipeline with human-in-the-loop review
# Process all downloaded files and convert extracted info to JSON

# extract_entities_with_spacy is already defined in a previous cell

# Imported before the loop: the final pprint below must also work when the
# loop body never runs (no files, or only unsupported file types).
import pprint

# Check the prerequisites once, up front, instead of on every iteration.
if 'local_files' not in globals():
    raise NameError("Variable 'local_files' is not defined. Please run the cell that downloads the files first.")
if 'extract_entities_with_spacy' not in globals():
    raise NameError("Function 'extract_entities_with_spacy' is not defined. Please run the cell that defines it first.")

results = []
for file_path in local_files:
    # Detect file type
    mime_type, _ = mimetypes.guess_type(file_path)
    if mime_type and 'pdf' in mime_type:
        text = extract_text_from_pdf(file_path)
    elif (mime_type and 'image' in mime_type) or file_path.lower().endswith(('.png', '.jpg', '.jpeg')):
        # The extension check is a fallback for when no MIME type is guessed.
        text = extract_text_from_image(file_path)
    else:
        print(f'Unsupported file type: {file_path}')
        continue
    # Extract info
    info = extract_lab_info(text)
    entities = extract_entities_with_spacy(text)
    # Combine results
    result = {
        'file': file_path,
        'extracted_info': info,
        'entities': entities
    }
    # Human-in-the-loop review: prompt user to confirm or edit extracted info
    print(f'Extracted info for {file_path}:')
    pprint.pprint(result)
    user_confirm = input('Is the extracted information correct? (y/n): ')
    # Record the reviewer's answer so unconfirmed extractions can be told
    # apart later instead of silently passing as reviewed data.
    confirmed = user_confirm.strip().lower() == 'y'
    result['user_confirmed'] = confirmed
    if not confirmed:
        print('Please review and edit the extracted information below:')
        # Optionally, allow user to edit the result dictionary here
        # For simplicity, you can manually update the result in the notebook
    results.append(result)
    # Save as JSON
    json_path = file_path + '.json'
    with open(json_path, 'w') as f:
        json.dump(result, f, indent=2)
    print(f'Extracted and saved JSON for {file_path}')

# Display all results
pprint.pprint(results)

Extracted info for lab_reports/82226685_Report_50636000175_RATHNA%20GABRIAL.pdf:
{'entities': [{'confidence': 1.0,
               'label': 'ORG',
               'text': 'MC-6072\nLABORATORY REPORT - FINAL\nName'},
              {'confidence': 1.0, 'label': 'DATE', 'text': '50636000175'},
              {'confidence': 0.8, 'label': 'EVENT', 'text': 'Years Mob'},
              {'confidence': 1.0, 'label': 'PERSON', 'text': 'Loc'},
              {'confidence': 1.0, 'label': 'ORG', 'text': 'Time'},
              {'confidence': 0.8, 'label': 'CARDINAL', 'text': '03'},
              {'confidence': 1.0, 'label': 'ORG', 'text': 'Time'},
              {'confidence': 0.8, 'label': 'CARDINAL', 'text': '03'},
              {'confidence': 1.0,
               'label': 'ORG',
               'text': 'KA-Kasturinagar Ref'},
              {'confidence': 1.0,
               'label': 'PERSON',
               'text': 'Sample Collected'},
              {'confidence': 1.0,
               'label': 'PERSON',
  

## 6. Machine Learning Component

Use NLP models (e.g., spaCy NER) to adapt extraction for new formats. Train on labeled data to recognize entities.

In [None]:
# Continuous Learning: Save reviewed data for future model training
import json

# Append each entry of `results` to the training dataset file as one JSON
# object per line (JSON Lines); the file is opened in append mode, so
# earlier runs are kept.
with open('reviewed_extractions.jsonl', 'a') as train_file:
    train_file.writelines(json.dumps(result) + '\n' for result in results)
print('Reviewed extractions saved for future model retraining.')

# Outline for retraining spaCy NER model
# 1. Convert reviewed_extractions.jsonl to spaCy training format
# 2. Use spaCy CLI or API to retrain/fine-tune the NER model
# 3. Replace the old model with the new one for improved extraction

# Example (not executed here):
# !python -m spacy train config.cfg --paths.train ./train.spacy --paths.dev ./dev.spacy --output ./output_model

Reviewed extractions saved for future model retraining.


In [None]:
# Demo UI in Jupyter using Gradio
# Install gradio if not already installed
%pip install gradio
import gradio as gr

# Import extraction functions from notebook
# (Assume extract_text_from_pdf, extract_text_from_image, extract_lab_info, extract_entities_with_spacy are defined above)

def extract_report(file):
    """Extract structured information from an uploaded lab report.

    Args:
        file: The uploaded file, given either as a path (``str`` or
            ``os.PathLike``) or as an object exposing the path via ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        ``{'extracted_info': ..., 'entities': ...}`` on success, or
        ``{'error': <message>}`` when no file was given or its type is
        neither PDF nor image.
    """
    import mimetypes
    import os

    # Submitting the form without a file passes None; report that instead of
    # crashing with an AttributeError on `.name`.
    if file is None:
        return {'error': 'No file provided'}
    # Accept plain paths as well, so the function can also be called directly.
    if isinstance(file, (str, os.PathLike)):
        filename = os.fspath(file)
    else:
        filename = file.name
    mime_type, _ = mimetypes.guess_type(filename)
    if mime_type and 'pdf' in mime_type:
        text = extract_text_from_pdf(filename)
    elif (mime_type and 'image' in mime_type) or filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        # The extension check is a fallback for when no MIME type is guessed.
        text = extract_text_from_image(filename)
    else:
        return {'error': 'Unsupported file type'}
    info = extract_lab_info(text)
    entities = extract_entities_with_spacy(text)
    return {'extracted_info': info, 'entities': entities}

# Build the demo UI: one file upload as input, the extraction result shown as JSON
report_upload = gr.File(label="Upload Lab Report (PDF or Image)")
iface = gr.Interface(
    fn=extract_report,
    inputs=report_upload,
    outputs="json",
    title="Lab Report Extractor",
    description="Upload a lab report to extract structured information.",
)
# share=False is passed explicitly; the launch output below describes
# share=True as the way to create a public link.
iface.launch(share=False)


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m25.1.1[0m[39;49m -> [0m[32;49m25.2[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpython -m pip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


  from .autonotebook import tqdm as notebook_tqdm


* Running on local URL:  http://127.0.0.1:7862
* To create a public link, set `share=True` in `launch()`.




In [None]:
# REST API endpoint in Jupyter using Gradio Blocks
# This allows you to POST files and get JSON results
import gradio as gr

def api_extract(file):
    """Extract structured information from a lab report sent to the API.

    Args:
        file: The uploaded file, given either as a path (``str`` or
            ``os.PathLike``) or as an object exposing the path via ``.name``.
            ``None`` means nothing was uploaded.

    Returns:
        ``{'extracted_info': ..., 'entities': ...}`` on success, or
        ``{'error': <message>}`` when no file was given or its type is
        neither PDF nor image.
    """
    import mimetypes
    import os

    # A request without a file passes None; report that instead of crashing
    # with an AttributeError on `.name`.
    if file is None:
        return {'error': 'No file provided'}
    # Accept plain paths as well, so the function can also be called directly.
    if isinstance(file, (str, os.PathLike)):
        filename = os.fspath(file)
    else:
        filename = file.name
    mime_type, _ = mimetypes.guess_type(filename)
    if mime_type and 'pdf' in mime_type:
        text = extract_text_from_pdf(filename)
    elif (mime_type and 'image' in mime_type) or filename.lower().endswith(('.png', '.jpg', '.jpeg')):
        # The extension check is a fallback for when no MIME type is guessed.
        text = extract_text_from_image(filename)
    else:
        return {'error': 'Unsupported file type'}
    info = extract_lab_info(text)
    entities = extract_entities_with_spacy(text)
    return {'extracted_info': info, 'entities': entities}

# Wrap the extraction interface in a Blocks app and document how to call it
# programmatically.
with gr.Blocks() as demo:
    gr.Interface(
        fn=api_extract,
        inputs=gr.File(label="Upload Lab Report (PDF or Image)"),
        outputs="json",
        title="Lab Report Extraction API",
        description="POST a file to this endpoint to get extracted JSON."
    )
    # The usage note recommends gradio_client: a bare multipart POST to the
    # app's root URL is not routed to the extraction function.
    gr.Markdown("""
    ### API Usage
    Call the extraction function from Python with the `gradio_client` package,
    using the local URL printed by `launch()`:
    ```python
    from gradio_client import Client, handle_file
    client = Client("http://127.0.0.1:7860/")  # replace with the URL printed by launch()
    result = client.predict(handle_file("your_report.pdf"), api_name="/predict")
    print(result)
    ```
    The exact endpoint name and parameters are listed under "Use via API" in the app footer.
    """)
demo.launch(share=False)

* Running on local URL:  http://127.0.0.1:7863
* To create a public link, set `share=True` in `launch()`.


