In [2]:
import os
import getpass
import re
import numpy as np
import faiss
from typing import List, Dict, Tuple
from sentence_transformers import SentenceTransformer
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.prompts import ChatPromptTemplate
from langchain_groq import ChatGroq
import gradio as gr
import pickle

class MedicalComplianceChecker:
    """Two-stage compliance checker for medical marketing text.

    Stage 1 is a fast regex screen for obviously prohibited wording. Text that
    passes the screen goes to stage 2: the most similar chunks of the loaded
    regulatory PDFs are retrieved from a FAISS index and handed, together with
    the text, to a Groq-hosted chat model for a final verdict.
    """

    # Matches the "Classification: ..." line of the model reply, tolerating
    # markdown emphasis ("**Classification:** ...") and "[...]" brackets.
    _CLASSIFICATION_RE = re.compile(
        r"\bclassification\s*\**\s*:[\s*\[]*(non[-\s]?compliant|compliant)",
        re.IGNORECASE,
    )
    # Matches the "Explanation:" label plus any markdown asterisks and
    # whitespace trailing it, so the extracted text starts at the first real
    # character of the explanation.
    _EXPLANATION_RE = re.compile(r"\bexplanation\s*\**\s*:[\s*]*", re.IGNORECASE)

    def __init__(self, model_name="all-MiniLM-L6-v2", groq_model="deepseek-r1-distill-llama-70b"):
        """Build the embedder, splitter, LLM client, prompt and regex rules.

        Args:
            model_name: SentenceTransformer model used to embed chunks/queries.
            groq_model: Groq model identifier passed to ``ChatGroq``.
        """
        self.embedding_model_name = model_name
        self.groq_model_name = groq_model
        self.embedder = SentenceTransformer(self.embedding_model_name)
        self.text_splitter = RecursiveCharacterTextSplitter(
            chunk_size=500,
            chunk_overlap=50,
            length_function=len,
            is_separator_regex=False,
        )
        # Both stay empty until load_documents() succeeds.
        self.faiss_index = None
        self.document_chunks = []
        self.llm = self._setup_llm()
        self.compliance_prompt = self._setup_prompt()
        self.regex_patterns = self._setup_regex_patterns()
        self.examples = self._create_examples()

    def _setup_llm(self):
        """Create the ChatGroq client, prompting for an API key if unset."""
        if "GROQ_API_KEY" not in os.environ:
            os.environ["GROQ_API_KEY"] = getpass.getpass("Enter your Groq API key: ")

        return ChatGroq(
            model=self.groq_model_name,
            temperature=0,
            max_tokens=None,
            reasoning_format="parsed",
            timeout=None,
            max_retries=2,
        )

    def _setup_prompt(self):
        """Return the chat prompt template with ``{context}`` and ``{text}`` slots."""
        # The "backed by trials" case is stated as a separate note: it used to
        # sit inside the non-compliant bullet list, which contradicted itself.
        system_message = """You are a medical regulatory compliance expert. Analyze the given medical text and determine if it complies with FDA/EMA regulations.

Non-compliant patterns include:
- Absolute claims (100%, guarantees, cure, prevents)
- Superlative claims without evidence (best, most advanced, revolutionary)
- Unsubstantiated medical claims
- Comparative claims without evidence
- Missing disclaimers for medical claims

Note: claims stating "backed by trials" are compliant and acceptable.

Relevant regulatory context:
{context}

Respond with:
Classification: [Compliant/Non-Compliant]
Explanation: [Brief explanation]"""

        return ChatPromptTemplate([
            ("system", system_message),
            ("user", "Analyze this medical text: {text}")
        ])

    def _setup_regex_patterns(self):
        """Return ``(pattern, reason)`` pairs checked in order by ``_regex_check``."""
        return [
            # "100%" is matched outside the trailing \b group: there is no word
            # boundary between "%" and a following space or end of string, so
            # r'\b(100%|...)\b' could never match a standalone "100%".
            (r'\b100\s*%|\b(?:guarantee[ds]?|cure[ds]?|prevent[s]?)\b', "Absolute claims not allowed"),
            (r'\b(best|most advanced|revolutionary|superior)\b', "Superlative claims need evidence"),
            (r'\bbetter than all\b', "Comparative claims must be evidence-based"),
            (r'\b(will cure|will prevent|eliminates)\b', "Medical claims require disclaimers"),
        ]

    def _create_examples(self):
        """Return the fixed demo sentences used by ``run_examples``."""
        return [
            "Our supplement guarantees 100% weight loss in 30 days without side effects",
            "This treatment is backed by clinical trials and may help reduce symptoms",
            "Revolutionary breakthrough medicine that cures all diseases instantly",
            "Studies suggest this medication may improve patient outcomes when used as directed",
            "Best cancer treatment available - eliminates tumors completely",
            "This device is backed by trials and FDA approved for pain management",
            "Miracle cure prevents all heart attacks with our special formula",
            "Clinical research indicates potential benefits for joint health support",
            "Superior to all competitors - will prevent diabetes forever",
            "This therapy is backed by trials and shows promising results in studies"
        ]

    def load_documents(self, file_paths: List[str]):
        """Load PDFs, split every page into chunks and rebuild the FAISS index.

        A file that fails to load is reported on stdout and skipped; the
        remaining files are still processed.

        Args:
            file_paths: Paths of the PDF files to ingest.
        """
        all_chunks = []

        for file_path in file_paths:
            try:
                print(f"Processing PDF: {file_path}")
                loader = PyPDFLoader(file_path)
                documents = loader.load()

                for doc in documents:
                    chunks = self.text_splitter.create_documents([doc.page_content])
                    all_chunks.extend(chunk.page_content for chunk in chunks)

            except Exception as e:
                # Best-effort ingestion: one unreadable file must not abort the rest.
                print(f"Error loading {file_path}: {e}")

        self.document_chunks = all_chunks
        self._create_faiss_index()

    def _create_faiss_index(self):
        """Embed ``self.document_chunks`` and store them in a flat L2 index."""
        if not self.document_chunks:
            # Drop any index from a previous load so its positions can never
            # be used against the (now empty) chunk list.
            self.faiss_index = None
            print("No documents loaded")
            return

        embeddings = self.embedder.encode(self.document_chunks)
        dimension = embeddings.shape[1]

        self.faiss_index = faiss.IndexFlatL2(dimension)
        self.faiss_index.add(embeddings.astype('float32'))

        print(f"Created FAISS index with {len(self.document_chunks)} chunks")

    def _retrieve_context(self, query: str, k: int = 3) -> str:
        """Return up to ``k`` chunks nearest to ``query``, joined by blank lines.

        Args:
            query: Text whose embedding is searched for in the index.
            k: Maximum number of chunks to return.

        Returns:
            The joined chunk texts, or ``"No regulatory context available"``
            when nothing is indexed.
        """
        if self.faiss_index is None:
            return "No regulatory context available"

        # Never ask for more neighbours than the index holds.
        k = min(k, self.faiss_index.ntotal)
        if k <= 0:
            return "No regulatory context available"

        query_embedding = self.embedder.encode([query])
        _, indices = self.faiss_index.search(query_embedding.astype('float32'), k)

        # FAISS pads missing results with -1, which as a Python index would
        # silently select the last chunk; keep only real positions.
        relevant_chunks = [
            self.document_chunks[idx]
            for idx in indices[0]
            if 0 <= idx < len(self.document_chunks)
        ]
        return "\n\n".join(relevant_chunks)

    def _regex_check(self, text: str) -> Tuple[bool, str]:
        """Screen ``text`` against the prohibited-wording patterns.

        Returns:
            ``(False, reason)`` for the first pattern that matches, otherwise
            ``(True, "Passes basic regex checks")``.
        """
        for pattern, reason in self.regex_patterns:
            if re.search(pattern, text, re.IGNORECASE):
                return False, reason

        return True, "Passes basic regex checks"

    @classmethod
    def _parse_response(cls, content: str) -> Tuple[str, str]:
        """Extract ``(classification, explanation)`` from the model reply.

        The verdict is read from the "Classification:" line when present, so a
        "Non-Compliant" mentioned only inside the explanation does not flip
        the result. If that line is missing, the reply is scanned for the
        literal "Non-Compliant" as a fallback.
        """
        verdict = cls._CLASSIFICATION_RE.search(content)
        if verdict:
            is_non_compliant = verdict.group(1).lower().startswith("non")
        else:
            is_non_compliant = "Non-Compliant" in content
        classification = "Non-Compliant" if is_non_compliant else "Compliant"

        label = cls._EXPLANATION_RE.search(content)
        explanation = content[label.end():].strip() if label else content.strip()
        return classification, explanation

    def check_compliance(self, text: str) -> Dict[str, str]:
        """Classify ``text`` as compliant or not.

        Args:
            text: The medical claim to analyse.

        Returns:
            A dict with keys ``"classification"`` ("Compliant",
            "Non-Compliant" or "Error"), ``"explanation"`` and ``"method"``.
        """
        if not text or not text.strip():
            # Nothing to analyse; avoid spending an LLM call on empty input.
            return {
                "classification": "Error",
                "explanation": "Analysis failed: no text provided",
                "method": "Error"
            }

        regex_compliant, regex_reason = self._regex_check(text)

        if not regex_compliant:
            return {
                "classification": "Non-Compliant",
                "explanation": regex_reason,
                "method": "Regex Detection"
            }

        try:
            context = self._retrieve_context(text)
            prompt = self.compliance_prompt.invoke({
                "text": text,
                "context": context
            })

            response = self.llm.invoke(prompt)
            classification, explanation = self._parse_response(response.content)

            return {
                "classification": classification,
                "explanation": explanation,
                "method": "LLM Analysis with RAG"
            }

        except Exception as e:
            # Boundary handler: callers (CLI demo, UI) expect a result dict,
            # not an exception, when retrieval or the remote model fails.
            return {
                "classification": "Error",
                "explanation": f"Analysis failed: {str(e)}",
                "method": "Error"
            }

    def run_examples(self):
        """Run ``check_compliance`` on every built-in example and print results."""
        print("Running 10 random examples:")
        print("=" * 80)

        for i, example in enumerate(self.examples, 1):
            print(f"\nExample {i}:")
            print(f"Text: {example}")
            result = self.check_compliance(example)
            print(f"Classification: {result['classification']}")
            print(f"Explanation: {result['explanation']}")
            print(f"Method: {result['method']}")
            print("-" * 60)

class ComplianceUI:
    """Gradio front end that wraps a ``MedicalComplianceChecker``."""

    def __init__(self, checker: MedicalComplianceChecker):
        """Keep a reference to the checker that performs the analysis."""
        self.checker = checker

    def analyze_text(self, text: str):
        """Run the checker on ``text`` and return the three displayed fields.

        Returns:
            A ``(classification, explanation, method)`` tuple, in the order of
            the output widgets wired up in ``create_interface``.
        """
        verdict = self.checker.check_compliance(text)
        return tuple(
            verdict[field] for field in ("classification", "explanation", "method")
        )

    def create_interface(self):
        """Build and return the Blocks app (the caller is expected to launch it)."""
        with gr.Blocks(title="Medical Compliance Checker") as app:
            gr.Markdown("# Medical Text Compliance Checker")
            gr.Markdown("Analyze medical claims for regulatory compliance (FDA/EMA standards)")

            with gr.Row():
                # Left column: the text to check and the trigger button.
                with gr.Column():
                    claim_box = gr.Textbox(
                        label="Medical Text to Analyze",
                        placeholder="Enter medical claim or text...",
                        lines=5,
                    )
                    run_button = gr.Button("Analyze Compliance", variant="primary")

                # Right column: read-only result fields.
                with gr.Column():
                    verdict_box = gr.Textbox(label="Classification", interactive=False)
                    reason_box = gr.Textbox(label="Explanation", lines=4, interactive=False)
                    method_box = gr.Textbox(label="Analysis Method", interactive=False)

            run_button.click(
                fn=self.analyze_text,
                inputs=[claim_box],
                outputs=[verdict_box, reason_box, method_box],
            )

        return app

def main():
    """Index the regulatory PDFs, run the demo examples, then serve the UI."""
    compliance_checker = MedicalComplianceChecker()

    regulatory_pdfs = [
        "/kaggle/input/textsforrules/EMA-1.pdf",
        "/kaggle/input/textsforrules/FDA -1.pdf",
        "/kaggle/input/textsforrules/FDA -2.pdf",
        "/kaggle/input/textsforrules/FDA -3.pdf",
        "/kaggle/input/textsforrules/FDA -4.pdf",
        "/kaggle/input/textsforrules/HSA -1.pdf",
    ]

    print("Loading regulatory documents...")
    compliance_checker.load_documents(regulatory_pdfs)

    # Console smoke test over the built-in sentences before the UI starts.
    compliance_checker.run_examples()

    print("\nLaunching Gradio interface...")
    app = ComplianceUI(compliance_checker).create_interface()
    # share=True is passed through to Gradio's launch().
    app.launch(share=True)

# Run the demo only when this module is executed directly, not when imported.
if __name__ == "__main__":
    main()

2025-06-26 14:38:26.727457: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1750948707.046477      35 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1750948707.096648      35 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered


modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/116 [00:00<?, ?B/s]

README.md: 0.00B [00:00, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/53.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/612 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/90.9M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/350 [00:00<?, ?B/s]

vocab.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Enter your Groq API key:  ········


Loading regulatory documents...
Processing PDF: /kaggle/input/textsforrules/EMA-1.pdf
Processing PDF: /kaggle/input/textsforrules/FDA -1.pdf
Processing PDF: /kaggle/input/textsforrules/FDA -2.pdf
Error loading /kaggle/input/textsforrules/FDA -2.pdf: File path /kaggle/input/textsforrules/FDA -2.pdf is not a valid file or url
Processing PDF: /kaggle/input/textsforrules/FDA -3.pdf
Error loading /kaggle/input/textsforrules/FDA -3.pdf: File path /kaggle/input/textsforrules/FDA -3.pdf is not a valid file or url
Processing PDF: /kaggle/input/textsforrules/FDA -4.pdf
Error loading /kaggle/input/textsforrules/FDA -4.pdf: File path /kaggle/input/textsforrules/FDA -4.pdf is not a valid file or url
Processing PDF: /kaggle/input/textsforrules/HSA -1.pdf
Error loading /kaggle/input/textsforrules/HSA -1.pdf: File path /kaggle/input/textsforrules/HSA -1.pdf is not a valid file or url


Batches:   0%|          | 0/14 [00:00<?, ?it/s]

Created FAISS index with 422 chunks
Running 10 random examples:

Example 1:
Text: Our supplement guarantees 100% weight loss in 30 days without side effects
Classification: Non-Compliant
Explanation: Absolute claims not allowed
Method: Regex Detection
------------------------------------------------------------

Example 2:
Text: This treatment is backed by clinical trials and may help reduce symptoms


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Classification: Compliant
Explanation: ** The text uses the phrase "backed by clinical trials," which is acceptable, and the claim "may help reduce symptoms" is appropriately qualified, indicating possibility without certainty. There are no absolute, superlative, or unsubstantiated claims, and no missing disclaimers are evident in the provided context.
Method: LLM Analysis with RAG
------------------------------------------------------------

Example 3:
Text: Revolutionary breakthrough medicine that cures all diseases instantly
Classification: Non-Compliant
Explanation: Absolute claims not allowed
Method: Regex Detection
------------------------------------------------------------

Example 4:
Text: Studies suggest this medication may improve patient outcomes when used as directed


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Classification: Compliant
Explanation: ** The text uses qualified language ("may improve") and references studies, avoiding absolute or superlative claims. It implies conditional effectiveness with "when used as directed," serving as a disclaimer. No prohibited claims are made, ensuring compliance with regulatory standards.
Method: LLM Analysis with RAG
------------------------------------------------------------

Example 5:
Text: Best cancer treatment available - eliminates tumors completely
Classification: Non-Compliant
Explanation: Superlative claims need evidence
Method: Regex Detection
------------------------------------------------------------

Example 6:
Text: This device is backed by trials and FDA approved for pain management


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Classification: Compliant
Explanation: ** The text complies with FDA/EMA regulations as it avoids absolute or superlative claims, is supported by references to trials and FDA approval, and includes appropriate disclaimers.
Method: LLM Analysis with RAG
------------------------------------------------------------

Example 7:
Text: Miracle cure prevents all heart attacks with our special formula
Classification: Non-Compliant
Explanation: Absolute claims not allowed
Method: Regex Detection
------------------------------------------------------------

Example 8:
Text: Clinical research indicates potential benefits for joint health support


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Classification: Compliant
Explanation: ** The text avoids absolute or superlative claims, instead using cautious language like "potential benefits" and referencing clinical research, which provides substantiation without overstepping regulatory guidelines.
Method: LLM Analysis with RAG
------------------------------------------------------------

Example 9:
Text: Superior to all competitors - will prevent diabetes forever
Classification: Non-Compliant
Explanation: Absolute claims not allowed
Method: Regex Detection
------------------------------------------------------------

Example 10:
Text: This therapy is backed by trials and shows promising results in studies


Batches:   0%|          | 0/1 [00:00<?, ?it/s]

Classification: Compliant
Explanation: ** The text states that the therapy is "backed by trials," which is compliant. It also mentions "promising results in studies," a general statement without absolute or comparative claims, thus adhering to regulatory standards.
Method: LLM Analysis with RAG
------------------------------------------------------------

Launching Gradio interface...
* Running on local URL:  http://127.0.0.1:7860
* Running on public URL: https://1d17ecc5cc8cae30bd.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Batches:   0%|          | 0/1 [00:00<?, ?it/s]