In [29]:
!pip install fastapi uvicorn pytesseract pdf2image pydantic




In [30]:
from fastapi import FastAPI, File, UploadFile
from pydantic import BaseModel
from pdf2image import convert_from_path
import pytesseract
import re
import io
from typing import List, Dict, Any
from fastapi.responses import JSONResponse

# FastAPI app setup
app = FastAPI()

# Helper function to extract text from PDF or image using OCR
def extract_text_from_image(file: io.BytesIO):
    # Convert image to text using pytesseract
    text = pytesseract.image_to_string(file)
    return text.strip()

# Function to parse the text and extract lab test data
def extract_lab_tests(text: str) -> List[Dict[str, Any]]:
    lab_tests = []
    # Regex pattern to find test names, values, and reference ranges (example pattern)
    pattern = r"(?P<test_name>[\w\s]+)\s+([\d\.\-]+)\s*(?:\((?P<ref_range>[^\)]+)\))?"
    matches = re.finditer(pattern, text)

    for match in matches:
        test_name = match.group("test_name").strip()
        test_value = match.group(2).strip()
        ref_range = match.group("ref_range") if match.group("ref_range") else None

        # Calculate if the test value is within the reference range
        lab_test_out_of_range = False
        if ref_range:
            # Split reference range (assuming range is like "10-20")
            try:
                min_range, max_range = map(float, ref_range.split('-'))
                if float(test_value) < min_range or float(test_value) > max_range:
                    lab_test_out_of_range = True
            except ValueError:
                pass  # If reference range is not valid, we skip this check

        lab_tests.append({
            "test_name": test_name,
            "test_value": test_value,
            "bio_reference_range": ref_range,
            "lab_test_out_of_range": lab_test_out_of_range
        })

    return lab_tests

# Define response model for the API
class LabTestResponse(BaseModel):
    lab_tests: List[Dict[str, Any]]
    is_success: bool

# Define the POST endpoint
@app.post("/get-lab-tests", response_model=LabTestResponse)
async def get_lab_tests(file: UploadFile = File(...)):
    try:
        # Read image file
        file_content = await file.read()

        # Convert the image file to a BytesIO object
        image_file = io.BytesIO(file_content)

        # Extract text from the image
        extracted_text = extract_text_from_image(image_file)

        # Parse the text to extract lab test data
        lab_tests = extract_lab_tests(extracted_text)

        return JSONResponse(
            content={
                "lab_tests": lab_tests,
                "is_success": True
            },
            status_code=200
        )
    except Exception as e:
        return JSONResponse(
            content={
                "message": str(e),
                "is_success": False
            },
            status_code=400
        )



In [32]:
!uvicorn main:app --reload


[32mINFO[0m:     Will watch for changes in these directories: ['/content']
[32mINFO[0m:     Uvicorn running on [1mhttp://127.0.0.1:8000[0m (Press CTRL+C to quit)
[32mINFO[0m:     Started reloader process [[36m[1m11423[0m] using [36m[1mStatReload[0m
[32mINFO[0m:     Started server process [[36m11425[0m]
[32mINFO[0m:     Waiting for application startup.
[32mINFO[0m:     Application startup complete.
[32mINFO[0m:     Shutting down
[32mINFO[0m:     Waiting for application shutdown.
[32mINFO[0m:     Application shutdown complete.
[32mINFO[0m:     Finished server process [[36m11425[0m]
[32mINFO[0m:     Stopping reloader process [[36m[1m11423[0m]


In [35]:
!pip install requests




In [61]:
import pytesseract
from PIL import Image
import re
import json
from transformers import pipeline
import numpy as np

# Load BioBERT pipeline
bio_bert_pipe = pipeline("feature-extraction", model="dmis-lab/biobert-v1.1")

# Function to extract text from image
def extract_text_from_image(image_path):
    image = Image.open(image_path)
    return pytesseract.image_to_string(image)

# Heuristic: Check if a line is likely a lab test (placeholder)
def is_probable_lab_test_line(line, threshold=0.5):
    if len(line.strip()) < 10:
        return False
    features = bio_bert_pipe(line)[0]  # [tokens, 768]
    avg_vec = np.mean(features, axis=0)
    vector_norm = np.linalg.norm(avg_vec)
    # Heuristic threshold (empirically chosen — tune this)
    return vector_norm > 12

# Function to extract lab tests from lines
def extract_lab_tests_from_lines(lines):
    lab_tests = []
    pattern = r"([A-Za-z0-9\s\(\)\-]+?)\s+([0-9]+(?:\.[0-9]+)?)\s*(mg/dL|g/dL|pg/mL|unit|mmol/L)?\s*\(?(\d+)\s*[-–]\s*(\d+)\)?"

    for line in lines:
        if not is_probable_lab_test_line(line):
            continue
        match = re.search(pattern, line)
        if match:
            test_name = match.group(1).strip()
            value = float(match.group(2))
            unit = match.group(3) if match.group(3) else "unit"
            ref_min = int(match.group(4))
            ref_max = int(match.group(5))
            lab_test_out_of_range = value < ref_min or value > ref_max

            lab_tests.append({
                "test_name": test_name,
                "test_value": value,
                "bio_reference_range": f"{ref_min} - {ref_max}",
                "test_unit": unit,
                "lab_test_out_of_range": lab_test_out_of_range
            })
    return lab_tests

# Main lab report processor
def process_lab_report(file_path):
    if file_path.endswith('.pdf'):
        extracted_text = extract_text_from_pdf(file_path)
    else:
        image = Image.open(file_path)
        extracted_text = pytesseract.image_to_string(image)

    lab_tests = extract_lab_tests(extracted_text)
    return {
        "is_success": True,
        "lab_tests": lab_tests
    }
# Run on given image
image_path = "/content/lab_reports_samples/lbmaske/GUR-0325-PA-0043338_Q-DINESHIPDFILE11zon1_250422_1309@F.pdf_page_33.png"
result = process_lab_report(image_path)

# Print output as JSON

result = process_lab_report(image_path)

# Final Output: Print the full result as JSON
print(json.dumps(result, indent=4))


Device set to use cpu


{
    "is_success": true,
    "lab_tests": [
        {
            "test_name": "dl",
            "test_value": 13.0,
            "bio_reference_range": "0 - 17",
            "test_unit": "unit",
            "lab_test_out_of_range": false
        },
        {
            "test_name": "emm",
            "test_value": 4000.0,
            "bio_reference_range": "0 - 1100",
            "test_unit": "unit",
            "lab_test_out_of_range": true
        },
        {
            "test_name": "mm",
            "test_value": 4.5,
            "bio_reference_range": "0 - 6",
            "test_unit": "unit",
            "lab_test_out_of_range": false
        },
        {
            "test_name": "fl",
            "test_value": 76.0,
            "bio_reference_range": "0 - 96",
            "test_unit": "unit",
            "lab_test_out_of_range": false
        },
        {
            "test_name": "umm",
            "test_value": 1.5,
            "bio_reference_range": "0 - 4",
            "tes

In [17]:
import pytesseract
from PIL import Image
import re
import json
from transformers import pipeline
import numpy as np
import cv2
import os

# Load BioBERT pipeline
bio_bert_pipe = pipeline("feature-extraction", model="dmis-lab/biobert-v1.1")

# Image preprocessing to clean up the text
def preprocess_image_for_ocr(image_path):
    image = cv2.imread(image_path, cv2.IMREAD_GRAYSCALE)
    image = cv2.resize(image, None, fx=2, fy=2, interpolation=cv2.INTER_LINEAR)
    image = cv2.GaussianBlur(image, (3, 3), 0)
    image = cv2.adaptiveThreshold(image, 255,
                                  cv2.ADAPTIVE_THRESH_GAUSSIAN_C,
                                  cv2.THRESH_BINARY, 11, 2)
    temp_path = "/tmp/processed_image.png"
    cv2.imwrite(temp_path, image)
    return temp_path

# Extract text from image using pytesseract
def extract_text_from_image(image_path):
    processed_path = preprocess_image_for_ocr(image_path)
    text = pytesseract.image_to_string(Image.open(processed_path))
    return text

# BioBERT heuristic to validate if line is related to lab tests
def is_probable_lab_test_line(line):
    try:
        # BioBERT feature extraction
        features = bio_bert_pipe(line)[0]
        avg_vec = np.mean(features, axis=0)
        vector_norm = np.linalg.norm(avg_vec)
        return vector_norm > 12  # Threshold for identifying probable lab tests
    except Exception as e:
        return False

# Main function to extract lab tests
def extract_lab_tests_from_lines(lines):
    lab_tests = []
    pattern = r"([A-Za-z0-9\s\(\)\-]+?)\s+([0-9]+(?:\.[0-9]+)?)\s*(mg/dL|g/dL|pg/mL|unit|mmol/L)?\s*\(?(\d+)\s*[-–]\s*(\d+)\)?"

    for idx, line in enumerate(lines):
        line = line.strip()
        if not line:
            continue


        # Run BioBERT evaluation
        bio_bert_passed = is_probable_lab_test_line(line)

        # Run regex match
        match = re.search(pattern, line)
        if match:
            test_name = match.group(1).strip()
            value = float(match.group(2))
            unit = match.group(3) if match.group(3) else "unit"
            ref_min = int(match.group(4))
            ref_max = int(match.group(5))
            out_of_range = value < ref_min or value > ref_max


            lab_tests.append({
                "test_name": test_name,
                "test_value": value,
                "bio_reference_range": f"{ref_min} - {ref_max}",
                "test_unit": unit,
                "lab_test_out_of_range": out_of_range
            })


    return lab_tests

# Full pipeline to process lab report
def process_lab_report(image_path):

    extracted_text = extract_text_from_image(image_path)



    lines = extracted_text.splitlines()
    lab_tests = extract_lab_tests_from_lines(lines)

    result = {
        "is_success": True,
        "lab_tests": lab_tests
    }

    print(json.dumps(result, indent=4))



# Test on image
image_path = "/content/lab_reports_samples/lbmaske/GUR-0325-PA-0043338_Q-DINESHIPDFILE11zon1_250422_1309@F.pdf_page_33.png"
process_lab_report(image_path)


Device set to use cpu


{
    "is_success": true,
    "lab_tests": [
        {
            "test_name": "MOV",
            "test_value": 0.0,
            "bio_reference_range": "7600 - 9600",
            "test_unit": "unit",
            "lab_test_out_of_range": true
        },
        {
            "test_name": "ORO",
            "test_value": 50.0,
            "bio_reference_range": "0 - 3600",
            "test_unit": "unit",
            "lab_test_out_of_range": false
        },
        {
            "test_name": "PlateletCount lacumm",
            "test_value": 13.0,
            "bio_reference_range": "0 - 450",
            "test_unit": "unit",
            "lab_test_out_of_range": false
        },
        {
            "test_name": "ROW",
            "test_value": 8805.0,
            "bio_reference_range": "0 - 9450",
            "test_unit": "unit",
            "lab_test_out_of_range": false
        }
    ]
}


In [18]:
!pip install supabase


Collecting supabase
  Downloading supabase-2.15.1-py3-none-any.whl.metadata (11 kB)
Collecting gotrue<3.0.0,>=2.11.0 (from supabase)
  Downloading gotrue-2.12.0-py3-none-any.whl.metadata (6.1 kB)
Collecting postgrest<1.1,>0.19 (from supabase)
  Downloading postgrest-1.0.1-py3-none-any.whl.metadata (3.5 kB)
Collecting realtime<2.5.0,>=2.4.0 (from supabase)
  Downloading realtime-2.4.3-py3-none-any.whl.metadata (6.7 kB)
Collecting storage3<0.12,>=0.10 (from supabase)
  Downloading storage3-0.11.3-py3-none-any.whl.metadata (1.8 kB)
Collecting supafunc<0.10,>=0.9 (from supabase)
  Downloading supafunc-0.9.4-py3-none-any.whl.metadata (1.2 kB)
Collecting pytest-mock<4.0.0,>=3.14.0 (from gotrue<3.0.0,>=2.11.0->supabase)
  Downloading pytest_mock-3.14.0-py3-none-any.whl.metadata (3.8 kB)
Collecting deprecation<3.0.0,>=2.1.0 (from postgrest<1.1,>0.19->supabase)
  Downloading deprecation-2.1.0-py2.py3-none-any.whl.metadata (4.6 kB)
Collecting aiohttp<4.0.0,>=3.11.18 (from realtime<2.5.0,>=2.4.0-