In [1]:
!pip install transformers torch pillow pytesseract python-docx openpyxl pandas
!pip install sentence-transformers chromadb langchain langchain-community
!pip install Pillow pdf2image zipfile36

Collecting pytesseract
  Downloading pytesseract-0.3.13-py3-none-any.whl.metadata (11 kB)
Collecting python-docx
  Downloading python_docx-1.2.0-py3-none-any.whl.metadata (2.0 kB)
Downloading pytesseract-0.3.13-py3-none-any.whl (14 kB)
Downloading python_docx-1.2.0-py3-none-any.whl (252 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m253.0/253.0 kB[0m [31m2.5 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: python-docx, pytesseract
Successfully installed pytesseract-0.3.13 python-docx-1.2.0
Collecting chromadb
  Downloading chromadb-1.4.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (7.2 kB)
Collecting langchain-community
  Downloading langchain_community-0.4.1-py3-none-any.whl.metadata (3.0 kB)
Collecting build>=1.0.3 (from chromadb)
  Downloading build-1.4.0-py3-none-any.whl.metadata (5.8 kB)
Collecting pybase64>=1.4.1 (from chromadb)
  Downloading pybase64-1.4.3-cp312-cp312-manylinux1_x86_64.manylinux2014_x86_64.manylinux_2_

In [2]:
import os
import zipfile
import json
from pathlib import Path
from typing import List, Dict, Any
import warnings
warnings.filterwarnings('ignore')

# Document processing
from PIL import Image
import pytesseract
from docx import Document
import openpyxl
import pandas as pd

# ML and NLP
import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    AutoModelForSeq2SeqLM,
    pipeline
)
from sentence_transformers import SentenceTransformer

# Vector storage
import chromadb
from chromadb.config import Settings




In [3]:
class Config:
    """Central settings for the medical-records agent.

    All values are plain class attributes, so they can be read either from the
    class itself or from an instance.
    """

    # --- Models -----------------------------------------------------------
    # Text-generation model. Alternatives noted by the author:
    # "microsoft/phi-2", "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    SLM_MODEL = "google/flan-t5-large"

    # Sentence-embedding model used for the vector store
    EMBEDDING_MODEL = "sentence-transformers/all-MiniLM-L6-v2"

    # --- Processing parameters -------------------------------------------
    # CHUNK_SIZE / CHUNK_OVERLAP are handed to TextChunker, which counts
    # whitespace-separated words.
    CHUNK_SIZE = 512
    CHUNK_OVERLAP = 50
    # NOTE(review): MAX_LENGTH is not read by any cell visible in this notebook.
    MAX_LENGTH = 512
    # Sampling temperature read by MedicalAIAgent.generate_response
    TEMPERATURE = 0.7

    # --- Directories ------------------------------------------------------
    UPLOAD_DIR = "./medical_records"
    VECTOR_DB_DIR = "./vector_db"

    # --- Device -----------------------------------------------------------
    # Use the GPU when torch reports one, otherwise fall back to the CPU.
    DEVICE = "cpu" if not torch.cuda.is_available() else "cuda"

# Shared settings object used by the rest of the notebook.
config = Config()
print(f"✓ Configuration set. Using device: {config.DEVICE}")

# Make sure both working directories exist before anything writes to them.
for _directory in (config.UPLOAD_DIR, config.VECTOR_DB_DIR):
    os.makedirs(_directory, exist_ok=True)

✓ Configuration set. Using device: cpu


In [4]:
class DocumentExtractor:
    """Extract plain text from images (via OCR), Word documents and Excel workbooks.

    The public ``extract_from_<format>`` helpers never raise: on failure they
    return a string of the form ``"Error extracting from <format>: <reason>"``.
    ``extract_from_file`` dispatches on the file extension and reports success
    or failure explicitly in the returned dictionary.
    """

    @staticmethod
    def _read_image(file_path: str) -> str:
        """OCR an image file and return the recognized text; raises on failure."""
        # The context manager closes the underlying file handle once OCR is done.
        with Image.open(file_path) as image:
            return pytesseract.image_to_string(image)

    @staticmethod
    def _read_docx(file_path: str) -> str:
        """Return the paragraph text of a DOCX file, one paragraph per line; raises on failure."""
        doc = Document(file_path)
        return "\n".join(paragraph.text for paragraph in doc.paragraphs)

    @staticmethod
    def _read_xlsx(file_path: str) -> str:
        """Return every sheet of a workbook rendered as text; raises on failure."""
        # sheet_name=None makes pandas return a mapping of sheet name -> DataFrame.
        sheets = pd.read_excel(file_path, sheet_name=None)
        parts = []
        for sheet_name, sheet_data in sheets.items():
            parts.append(f"\n--- Sheet: {sheet_name} ---\n")
            parts.append(sheet_data.to_string())
        return "".join(parts)

    @staticmethod
    def extract_from_image(file_path: str) -> str:
        """Extract text from an image using OCR.

        Returns the recognized text, or an ``"Error extracting from image: ..."``
        string if the image cannot be opened or processed.
        """
        try:
            return DocumentExtractor._read_image(file_path)
        except Exception as e:
            return f"Error extracting from image: {str(e)}"

    @staticmethod
    def extract_from_docx(file_path: str) -> str:
        """Extract paragraph text from a DOCX file.

        Returns the text, or an ``"Error extracting from DOCX: ..."`` string on failure.
        """
        try:
            return DocumentExtractor._read_docx(file_path)
        except Exception as e:
            return f"Error extracting from DOCX: {str(e)}"

    @staticmethod
    def extract_from_xlsx(file_path: str) -> str:
        """Extract the contents of every sheet of an Excel file as text.

        Returns the text, or an ``"Error extracting from Excel: ..."`` string on failure.
        """
        try:
            return DocumentExtractor._read_xlsx(file_path)
        except Exception as e:
            return f"Error extracting from Excel: {str(e)}"

    @staticmethod
    def extract_from_file(file_path: str) -> Dict[str, Any]:
        """Extract text from any supported file, chosen by its (case-insensitive) extension.

        Returns a dict with ``file_name``, ``file_type``, ``content`` and
        ``success``. When ``success`` is False an ``error`` key describes the
        problem; for extraction failures ``content`` carries the same message.
        """
        path = Path(file_path)
        file_ext = path.suffix.lower()

        # extension -> (reader that raises on failure, label used in error messages)
        readers = {
            '.png': (DocumentExtractor._read_image, "image"),
            '.jpg': (DocumentExtractor._read_image, "image"),
            '.jpeg': (DocumentExtractor._read_image, "image"),
            '.docx': (DocumentExtractor._read_docx, "DOCX"),
            '.xlsx': (DocumentExtractor._read_xlsx, "Excel"),
            '.xls': (DocumentExtractor._read_xlsx, "Excel"),
        }

        result: Dict[str, Any] = {'file_name': path.name, 'file_type': file_ext}

        if file_ext not in readers:
            result.update(content='', success=False, error='Unsupported file type')
            return result

        reader, label = readers[file_ext]
        try:
            text = reader(file_path)
        except Exception as e:
            # Success is decided by whether extraction raised, not by inspecting
            # the text, so a document that legitimately starts with "Error" is
            # no longer misreported as a failure.
            message = f"Error extracting from {label}: {str(e)}"
            result.update(content=message, success=False, error=message)
            return result

        result.update(content=text, success=True)
        return result

# Status message only: DocumentExtractor exposes static methods, so no instance is created here.
print("✓ Document Extractor initialized")

✓ Document Extractor initialized


In [6]:
class TextChunker:
    """Split text into overlapping chunks measured in whitespace-separated words."""

    def __init__(self, chunk_size: int = 512, overlap: int = 50):
        """Configure the chunker.

        Args:
            chunk_size: Maximum number of words per chunk; must be positive.
            overlap: Number of words shared by consecutive chunks; must satisfy
                ``0 <= overlap < chunk_size``.

        Raises:
            ValueError: If the sizes would produce a zero or negative stride.
        """
        if chunk_size <= 0:
            raise ValueError(f"chunk_size must be positive, got {chunk_size}")
        if not 0 <= overlap < chunk_size:
            raise ValueError(
                f"overlap must satisfy 0 <= overlap < chunk_size, "
                f"got overlap={overlap}, chunk_size={chunk_size}"
            )
        self.chunk_size = chunk_size
        self.overlap = overlap

    def chunk_text(self, text: str) -> List[str]:
        """Split ``text`` into overlapping chunks of at most ``chunk_size`` words.

        Returns an empty list for empty or whitespace-only text.
        """
        words = text.split()
        chunks = []
        stride = self.chunk_size - self.overlap

        for start in range(0, len(words), stride):
            end = start + self.chunk_size
            chunks.append(" ".join(words[start:end]))
            # Once a chunk reaches the last word, any further window would be
            # fully contained in this one; stop instead of emitting a duplicate tail.
            if end >= len(words):
                break

        return chunks

    def chunk_documents(self, documents: List[Dict]) -> List[Dict]:
        """Chunk several documents.

        Each input dict must provide ``file_name``, ``file_type`` and ``content``.
        Each output dict carries ``file_name``, ``chunk_id`` (position within
        its document), ``content`` and a ``metadata`` dict with ``file_type``
        and ``total_chunks``.
        """
        chunked_docs = []

        for doc in documents:
            chunks = self.chunk_text(doc['content'])
            for idx, chunk in enumerate(chunks):
                chunked_docs.append({
                    'file_name': doc['file_name'],
                    'chunk_id': idx,
                    'content': chunk,
                    'metadata': {
                        'file_type': doc['file_type'],
                        'total_chunks': len(chunks)
                    }
                })

        return chunked_docs

# Shared chunker instance, sized from the notebook configuration.
chunker = TextChunker(chunk_size=config.CHUNK_SIZE, overlap=config.CHUNK_OVERLAP)
print("✓ Text Chunker initialized")

✓ Text Chunker initialized


In [7]:
class VectorStore:
    """Embed text chunks and store/query them in a persistent Chroma collection."""

    def __init__(self, model_name: str, db_path: str):
        """Load the embedding model and open (or create) the ``medical_records`` collection.

        Args:
            model_name: Name passed to ``SentenceTransformer``.
            db_path: Directory used by the persistent Chroma client.
        """
        self.embedding_model = SentenceTransformer(model_name)
        self.client = chromadb.PersistentClient(path=db_path)
        self.collection = self.client.get_or_create_collection(
            name="medical_records",
            metadata={"description": "Patient medical records embeddings"}
        )

    def add_documents(self, documents: List[Dict]):
        """Embed chunk dicts and write them to the collection.

        Each dict must provide ``content``, ``file_name``, ``chunk_id`` and
        ``metadata``. An empty list is a no-op.
        """
        # Nothing to embed; also avoids handing an empty batch to the collection.
        if not documents:
            return

        texts = [doc['content'] for doc in documents]
        embeddings = self.embedding_model.encode(texts).tolist()

        # NOTE(review): IDs are built from the bare file name, so two files with
        # the same name would produce colliding IDs.
        ids = [f"{doc['file_name']}_{doc['chunk_id']}" for doc in documents]
        metadatas = [doc['metadata'] for doc in documents]

        # upsert (rather than add) so re-indexing the same files against the
        # persistent collection refreshes existing entries.
        self.collection.upsert(
            ids=ids,
            embeddings=embeddings,
            documents=texts,
            metadatas=metadatas
        )

    def search(self, query: str, n_results: int = 5) -> Dict[str, Any]:
        """Return the raw collection query result for the chunks closest to ``query``.

        The result is the mapping returned by ``collection.query``; callers in
        this notebook read ``result['documents'][0]``.
        """
        query_embedding = self.embedding_model.encode([query]).tolist()

        results = self.collection.query(
            query_embeddings=query_embedding,
            n_results=n_results
        )

        return results

# Shared vector store backed by the configured embedding model and database directory.
vector_store = VectorStore(model_name=config.EMBEDDING_MODEL, db_path=config.VECTOR_DB_DIR)
print("✓ Vector Store initialized")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

✓ Vector Store initialized


In [8]:
class MedicalAIAgent:
    """Seq2seq language-model wrapper with prompt helpers for medical-record analysis."""

    def __init__(self, model_name: str, device: str = "cpu"):
        """Load the tokenizer and model named ``model_name`` and move the model to ``device``."""
        self.device = device
        print(f"Loading model: {model_name}...")

        # Half precision when running on CUDA, full precision otherwise.
        if device == "cuda":
            weights_dtype = torch.float16
        else:
            weights_dtype = torch.float32

        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        loaded_model = AutoModelForSeq2SeqLM.from_pretrained(
            model_name,
            torch_dtype=weights_dtype
        )
        self.model = loaded_model.to(device)

        print("✓ Model loaded successfully")

    def generate_response(self, prompt: str, max_length: int = 512) -> str:
        """Run ``prompt`` through the model and return the decoded text.

        The prompt is truncated to 512 tokens; ``max_length`` is forwarded to
        ``model.generate``. Sampling is enabled, using the temperature from the
        module-level ``config``.
        """
        encoded = self.tokenizer(
            prompt,
            return_tensors="pt",
            truncation=True,
            max_length=512
        )
        encoded = encoded.to(self.device)

        sampling_options = {
            "max_length": max_length,
            "temperature": config.TEMPERATURE,
            "do_sample": True,
            "top_p": 0.9,
            "num_return_sequences": 1,
        }
        generated = self.model.generate(**encoded, **sampling_options)

        # Only the first (and only) returned sequence is decoded.
        return self.tokenizer.decode(generated[0], skip_special_tokens=True)

    def analyze_patient_summary(self, context: str) -> str:
        """Ask the model for a concise patient summary of ``context``."""
        return self.generate_response(f"""Based on the following medical records, provide a concise patient summary including:
- Key demographic information
- Current symptoms
- Medical history highlights

Medical Records:
{context}

Summary:""")

    def generate_diagnosis(self, context: str) -> str:
        """Ask the model for potential diagnoses supported by ``context``."""
        return self.generate_response(f"""Analyze these medical records and identify potential diagnoses based on symptoms, test results, and medical history.

Medical Records:
{context}

Potential Diagnoses:""")

    def recommend_treatment(self, context: str, diagnosis: str) -> str:
        """Ask the model for treatments and medications given ``diagnosis`` and ``context``."""
        return self.generate_response(f"""Based on the diagnosis and medical records, suggest appropriate treatments and medications.

Diagnosis: {diagnosis}

Medical Records:
{context}

Treatment Recommendations:""")

    def identify_risks(self, context: str) -> str:
        """Ask the model for concerning risk factors or patterns in ``context``."""
        return self.generate_response(f"""Analyze these medical records and identify any concerning risk factors or patterns.

Medical Records:
{context}

Risk Factors:""")

# Build the agent from the configured model name and device.
agent = MedicalAIAgent(model_name=config.SLM_MODEL, device=config.DEVICE)


Loading model: google/flan-t5-large...


tokenizer_config.json: 0.00B [00:00, ?B/s]

spiece.model:   0%|          | 0.00/792k [00:00<?, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json: 0.00B [00:00, ?B/s]

config.json:   0%|          | 0.00/662 [00:00<?, ?B/s]

`torch_dtype` is deprecated! Use `dtype` instead!


model.safetensors:   0%|          | 0.00/3.13G [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/147 [00:00<?, ?B/s]

✓ Model loaded successfully


In [9]:
class MedicalAgentOrchestrator:
    """Coordinate extraction, indexing, retrieval and model analysis of medical records."""

    # Character budgets applied to the combined record text before prompting.
    CONTEXT_CHARS = 2000
    # Smaller budget for the treatment prompt, which also embeds the diagnosis text.
    TREATMENT_CONTEXT_CHARS = 1500

    def __init__(self, agent, vector_store, extractor, chunker):
        """Store the collaborating components.

        Args:
            agent: Object providing ``analyze_patient_summary``, ``generate_diagnosis``,
                ``recommend_treatment`` and ``identify_risks``.
            vector_store: Object providing ``add_documents`` and ``search``.
            extractor: Object (or class) providing ``extract_from_file``.
            chunker: Object providing ``chunk_documents``.
        """
        self.agent = agent
        self.vector_store = vector_store
        self.extractor = extractor
        self.chunker = chunker

    def process_zip_file(self, zip_path: str) -> List[Dict]:
        """Unpack ``zip_path`` into ``config.UPLOAD_DIR`` and extract text from each file.

        Returns the extraction results that succeeded. Entries under
        ``__MACOSX`` and directory entries are ignored; files that are
        unsupported or fail to extract are reported on stdout and skipped.
        """
        extracted_docs = []

        with zipfile.ZipFile(zip_path, 'r') as zip_ref:
            zip_ref.extractall(config.UPLOAD_DIR)
            member_names = zip_ref.namelist()

        for file_name in member_names:
            if file_name.startswith('__MACOSX'):
                continue
            file_path = os.path.join(config.UPLOAD_DIR, file_name)
            if not os.path.isfile(file_path):
                continue

            doc = self.extractor.extract_from_file(file_path)
            if doc['success']:
                extracted_docs.append(doc)
            else:
                # Surface skipped files instead of dropping them silently.
                reason = doc.get('error') or doc.get('content') or 'unknown error'
                print(f"⚠️ Skipped {file_name}: {reason}")

        return extracted_docs

    def index_documents(self, documents: List[Dict]):
        """Chunk ``documents`` and add the chunks to the vector store.

        Returns the list of chunk dicts (possibly empty).
        """
        chunked_docs = self.chunker.chunk_documents(documents)
        # Skip the store call when there is nothing to index.
        if chunked_docs:
            self.vector_store.add_documents(chunked_docs)
        return chunked_docs

    def retrieve_context(self, query: str, n_results: int = 5) -> str:
        """Return the best-matching chunks for ``query`` joined by blank lines."""
        results = self.vector_store.search(query, n_results)
        context = "\n\n".join(results['documents'][0])
        return context

    def run_analysis(self, all_content: str) -> Dict[str, str]:
        """Run the four analysis steps on the leading portion of ``all_content``.

        Returns a dict with ``patient_summary``, ``diagnosis``, ``treatment``
        and ``risk_factors``.
        """
        print("🤖 Agent: Starting comprehensive analysis...")

        records = all_content[:self.CONTEXT_CHARS]

        # Step 1: Patient Summary
        print("📋 Agent: Generating patient summary...")
        summary = self.agent.analyze_patient_summary(records)

        # Step 2: Diagnosis
        print("🔍 Agent: Analyzing for potential diagnoses...")
        diagnosis = self.agent.generate_diagnosis(records)

        # Step 3: Treatment Recommendations (conditioned on the diagnosis from step 2)
        print("💊 Agent: Generating treatment recommendations...")
        treatment = self.agent.recommend_treatment(
            all_content[:self.TREATMENT_CONTEXT_CHARS], diagnosis
        )

        # Step 4: Risk Assessment
        print("⚠️ Agent: Identifying risk factors...")
        risks = self.agent.identify_risks(records)

        print("✓ Analysis complete!")

        return {
            'patient_summary': summary,
            'diagnosis': diagnosis,
            'treatment': treatment,
            'risk_factors': risks
        }

# Wire the components together. DocumentExtractor is passed as a class because
# its methods are static.
orchestrator = MedicalAgentOrchestrator(
    agent=agent,
    vector_store=vector_store,
    extractor=DocumentExtractor,
    chunker=chunker,
)
print("✓ Agentic Orchestrator initialized")


✓ Agentic Orchestrator initialized


In [10]:
def process_medical_records(zip_file_path: str):
    """Run the full pipeline on a ZIP of medical records and print a report.

    Steps: extract documents, chunk and index them, combine their text, run
    the agent analysis, then print the report.

    Args:
        zip_file_path: Path to the ZIP archive to process.

    Returns:
        The analysis dict with ``patient_summary``, ``diagnosis``,
        ``treatment`` and ``risk_factors``.

    Raises:
        ValueError: If no text could be extracted from the archive, so that no
            report is generated from empty input.
    """
    banner = "=" * 70
    rule = "-" * 70

    print(banner)
    print("MEDICAL RECORDS AI AGENT - PROCESSING PIPELINE")
    print(banner)

    # Step 1: Extract documents
    print("\n[1/5] Extracting documents from ZIP file...")
    documents = orchestrator.process_zip_file(zip_file_path)
    print(f"✓ Extracted {len(documents)} documents")

    # Stop before indexing/analysis when there is nothing to analyze; otherwise
    # the model would be prompted with empty records.
    if not any(doc['content'].strip() for doc in documents):
        raise ValueError(
            f"No readable text could be extracted from {zip_file_path!r}; nothing to analyze."
        )

    # Step 2: Index documents
    print("\n[2/5] Chunking and indexing documents...")
    chunked_docs = orchestrator.index_documents(documents)
    print(f"✓ Created {len(chunked_docs)} chunks")

    # Step 3: Combine all content
    print("\n[3/5] Preparing content for analysis...")
    all_content = "\n\n".join([doc['content'] for doc in documents])

    # Step 4: Run agentic analysis
    print("\n[4/5] Running AI agent analysis...")
    analysis = orchestrator.run_analysis(all_content)

    # Step 5: Display results
    print("\n[5/5] Generating report...")
    print("\n" + banner)
    print("MEDICAL ANALYSIS REPORT")
    print(banner)

    # (heading, key into the analysis dict) for each report section, in print order
    sections = [
        ("\n📋 PATIENT SUMMARY", 'patient_summary'),
        ("\n\n🔍 DIAGNOSIS ANALYSIS", 'diagnosis'),
        ("\n\n💊 TREATMENT RECOMMENDATIONS", 'treatment'),
        ("\n\n⚠️ RISK FACTORS", 'risk_factors'),
    ]
    for heading, key in sections:
        print(heading)
        print(rule)
        print(analysis[key])

    print("\n" + banner)
    print("⚕️ DISCLAIMER: This AI analysis is for informational purposes only.")
    print("Always consult with qualified healthcare professionals.")
    print(banner)

    return analysis


In [11]:
# Upload your ZIP file with medical records, then run this cell.

# Option 1: Process from ZIP file
zip_path = "/Case_Report_files.zip"
results = process_medical_records(zip_file_path=zip_path)



MEDICAL RECORDS AI AGENT - PROCESSING PIPELINE

[1/5] Extracting documents from ZIP file...
✓ Extracted 6 documents

[2/5] Chunking and indexing documents...
✓ Created 8 chunks

[3/5] Preparing content for analysis...

[4/5] Running AI agent analysis...
🤖 Agent: Starting comprehensive analysis...
📋 Agent: Generating patient summary...
🔍 Agent: Analyzing for potential diagnoses...
💊 Agent: Generating treatment recommendations...
⚠️ Agent: Identifying risk factors...
✓ Analysis complete!

[5/5] Generating report...

MEDICAL ANALYSIS REPORT

📋 PATIENT SUMMARY
----------------------------------------------------------------------
The patient presented with a case of limited myocardial necrosis occurring just after completion of anti-malarial treatment.


🔍 DIAGNOSIS ANALYSIS
----------------------------------------------------------------------
Heart Failure


💊 TREATMENT RECOMMENDATIONS
----------------------------------------------------------------------
Artemether/lumefantrine (artemet

In [16]:
# Option 3: Query specific information
# Retrieve the indexed chunks closest to the query and show them as-is.
query = "Based on the extracted and processed medical documents, identify and list all medications currently or previously prescribed to the patient, including medication names, dosages, frequency, duration, and any noted changes or discontinuations."
context = orchestrator.retrieve_context(query=query)
print(context)


Time after last dose 7 hi5 103 h45 Artemether <detection limit <detection limit Dihydroartemisinin 26.5 ng/ml <detection limit Lumefantrine 4308 ng/ml 443 ng/ml Desbutyllumefantrine 40 ng/ml 18 ng/ml

Additional tests Convalescent microbiological tests Auto-immune antibodies Toxicology test Inflammatory parameters adenovirus, RS-virus, influenza virus, mycoplasma pneumonia, Chlamydia, Q-fever, parainfluenza, en- terovirus ANCA, anticardiolipin antibodies, 82 glycoprotein, lupus anticoagulans, anti-heart (rat) antibodies tetrahydrocannabinol, amphetamine-derivatives, co- caine, diazepam, methadone, tramadol, opiates Interleukin 6, Interleukin 8 Negative Negative Negative <detection limit

--- Sheet: Sheet1 --- Haematology and biochemistry tests Reference range Screening Admission Adm +1 Adm +2 Adm +3 Adm +4 = Discharge 0 Haemoglobin (mmol/l) 7.3 - 9.7 7.1 6.90 NaN 7.1 NaN 6.9 1 Leukocytes (*109/l) 3.5-11.0 6.6 7.90 NaN 3.9 NaN 5.4 2 Platelets (*109/l) 120-350 259 160.00 NaN 254.0 NaN 30

In [18]:
# Option 4: Interactive Q&A
def ask_medical_question(question: str):
    """Answer ``question`` using the three most relevant indexed chunks as context.

    Args:
        question: Free-text question about the indexed records.

    Returns:
        The text generated by the agent.
    """
    context = orchestrator.retrieve_context(question, n_results=3)
    # The question goes first: generate_response truncates the prompt to 512
    # tokens, so with a long retrieved context anything placed after it
    # (previously the question itself) was cut off before reaching the model.
    prompt = f"Question: {question}\n\nContext: {context}\n\nAnswer:"
    return agent.generate_response(prompt)


In [22]:
# Example question: ask the agent and print its generated answer.
answer = ask_medical_question(question="What treatment and medication plan should be continued or adjusted for the patient based on the analyzed medical findings, and why?")
print(answer)

A case of limited myocardial necrosis after completion of anti-malarial treatment in an experimentally infected volunteer without obvious risk factors for cardiovascular disease.
