### Importing relevant libraries

In [1]:
import requests
import os
import urllib3
import json
from enum import Enum
from pydantic import BaseModel, Field

### Creating get_text() using pdf/text API call

In [2]:
def get_text(file_path, token=None, timeout=120):
    """Extract the text of a PDF by uploading it to the pdf/text API.

    Parameters
    ----------
    file_path : str
        Path of the PDF file to upload.
    token : str, optional
        Bearer token sent in the ``Authorization`` header. When omitted, the
        ``GRUPMEDAI_API_TOKEN`` environment variable is used; if that is not
        set either, the placeholder ``YOUR_TOKEN_HERE`` is sent.
    timeout : float, optional
        Seconds to wait for the API before giving up (default 120), so a
        stalled server cannot hang the notebook indefinitely.

    Returns
    -------
    str or dict or None
        The value of the ``content`` field of the JSON reply on success.
        If a 200 reply has no ``content`` key, the whole parsed reply is
        returned after printing an error. ``None`` is returned when the file
        is missing, the API answers with a non-200 status, or any other
        exception occurs (the error is printed, not raised).
    """
    # Keep credentials out of the notebook source: prefer an explicit argument,
    # then the environment, and only then the placeholder.
    if token is None:
        token = os.environ.get('GRUPMEDAI_API_TOKEN', 'YOUR_TOKEN_HERE')

    headers = {
        'Authorization': f'Bearer {token}',
    }

    try:
        with open(file_path, 'rb') as file:
            files = {
                'file': (os.path.basename(file_path), file, 'application/pdf')
            }

            # NOTE(review): TLS verification is switched off (verify=False) and
            # the matching urllib3 warning is silenced process-wide. Confirm
            # this is acceptable for the target environment.
            urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

            response = requests.post(
                'https://grupmedai-api-des.itcomb.cat/pdf/text',
                headers=headers,
                files=files,
                verify=False,
                timeout=timeout,
            )

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            # Error bodies are not guaranteed to be JSON; fall back to the raw
            # text so the status code above is not masked by a decode error.
            try:
                error_body = response.json()
            except ValueError:
                error_body = response.text
            print(f"Response: {error_body}")
            return None

        response_data = response.json()
        if 'content' in response_data:
            return response_data['content']

        print("Error: 'content' key not found in the response.")
        return response_data

    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [3]:
# Sanity check: extract the text of one sample document and display it.
sample_pdf_path = r"C:\Users\Sophie\Final deliverables\documents_simulated\HISTORIAL_MEDIC.pdf"
get_text(sample_pdf_path)

'HISTORIAL MÈDIC \nHospital Clínic de Barcelona \nDepartament de Medicina Interna \n \nInformació del Pacient \n• \nNom complet: Marc Ferrer Puig \n• \nData de naixement: 5 d’abril de 1982 \n• \nEdat: 42 anys \n• \nGènere: Masculí \n• \nTelèfon: +34 678 123 456 \n• \nCorreu electrònic: marcferrer82@email.com \n• \nAdreça: Carrer Balmes, 123, 4t 2a, 08008 Barcelona \n• \nContacte d’emergència:  \no Nom: Laura Martí Soler (Esposa) \no Telèfon: +34 654 987 321 \n \nAntecedents Mèdics \n• \nMalalties cròniques: \no Hipertensió arterial diagnosticada el 2018 \no Diabetis tipus 2 diagnosticada el 2021 \no Asma lleu des de la infància \n• \nCirurgies prèvies: \no Apendicectomia (2005) \no Reparació d’hèrnia inguinal (2017) \n• \nAl·lèrgies conegudes: \no Penicil·lina (reacció anafilàctica) \no Fruits secs (urticària moderada) \n• \nHistorial familiar: \no Pare: Cardiopatia isquèmica (defunció als 65 anys) \no Mare: Diabetis tipus 2 \no Germà: Hipertensió arterial \n \nMedicació Actual \n• \nE

### Creating get_doc_tags_1() to get the primary tag

In [4]:
def get_doc_tags_1(doc_path,model,timeout=600):
    """Assign the primary tag (Medical, Legal/Procedure, Email or Attachment) to a document.

    The document text is extracted with ``get_text`` and sent to the chat
    endpoint together with the JSON schema of ``DocTags`` as the ``format``
    field, so the reply can be validated against that schema.

    Parameters
    ----------
    doc_path : str
        Path of the PDF document to classify.
    model : str
        Name of the model passed in the ``model`` field of the request.
    timeout : float, optional
        Seconds to wait for the chat endpoint (default 600).

    Returns
    -------
    DocTags
        Validated model whose ``tag`` attribute is a ``TagsEnum`` member.

    Raises
    ------
    ValueError
        If no text could be extracted from ``doc_path``.
    requests.HTTPError
        If the chat endpoint answers with an error status.
    """
    text = get_text(doc_path)
    # get_text reports failures by returning None (or a raw dict); classifying
    # that would silently tag the literal text "None", so stop here instead.
    if not isinstance(text, str):
        raise ValueError(f"Could not extract text from {doc_path}")

    class TagsEnum(str, Enum):
        medical = 'Medical'
        legal_procedure = 'Legal/Procedure'
        email = 'Email'
        attachment = 'Attachment'

    class DocTags(BaseModel):
        tag: TagsEnum = Field(..., description="Tag of the document")

    api_url = "http://ollama-api-des.itcomb.cat/api/chat"

    prompt = f"""
    You are classifying the type of document based on its overall purpose and context.

    Context: These documents are part of a **medical malpractice legal case**, and may include clinical records, legal filings, emails, or miscellaneous attachments. Your goal is to assign the document to the most appropriate category based on its **intended role or function in the case**, rather than its format or specific terms.

    Available categories:
    - Medical: Primarily related to patient care, diagnoses, prescriptions, medical procedures, or insurance matters.
    - Legal/Procedure: Focused on legal rights, responsibilities, procedural documents, formal notices, or agreements.
    - Email: A form of communication, either formal or informal, often includes greetings, sender/receiver details, email-specific elements (e.g., '@', '.com'), and may contain varied content.
    - Attachment: Any other type of document not fitting the above, often supplemental or miscellaneous in nature.

    Document:\n
    {text}\n\n"""

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "format": DocTags.model_json_schema(),
        "stream": False
    }

    response = requests.post(api_url, json=payload, verify=False, timeout=timeout)
    # Surface HTTP errors directly rather than as a KeyError on "message" below.
    response.raise_for_status()
    response_data = response.json()

    tag = DocTags.model_validate_json(response_data["message"]["content"])
    return tag

### Creating get_doc_tags_2() to get the secondary tag

In [5]:
def get_doc_tags_2(doc_path,category,model,timeout=600):
    """Assign the secondary tag (sub-type) to a document within its primary category.

    Parameters
    ----------
    doc_path : str
        Path of the PDF document to classify.
    category : str
        Primary tag of the document: 'Medical', 'Legal/Procedure', 'Email'
        or 'Attachment'.
    model : str
        Name of the model passed in the ``model`` field of the request.
    timeout : float, optional
        Seconds to wait for the chat endpoint (default 600).

    Returns
    -------
    pydantic.BaseModel
        A ``MedicalTag``, ``LegalTag``, ``EmailTag`` or ``AttachmentTag``
        instance; in every case the chosen value is in its ``sub_tag``
        attribute. For 'Attachment' the result is always 'Other' and neither
        the text-extraction API nor the chat endpoint is called.

    Raises
    ------
    ValueError
        If ``category`` is not one of the four known categories, or if no
        text could be extracted from ``doc_path``.
    requests.HTTPError
        If the chat endpoint answers with an error status.
    """
    known_categories = ('Medical', 'Legal/Procedure', 'Email', 'Attachment')
    # Reject unknown categories up front; otherwise no branch below would bind
    # the schema/prompt and the function would fail with an UnboundLocalError.
    if category not in known_categories:
        raise ValueError(
            f"Unknown category {category!r}; expected one of {', '.join(known_categories)}"
        )

    if category == 'Attachment':

        class AttachmentSubTags(str, Enum):
            other = 'Other'

        class AttachmentTag(BaseModel):
            sub_tag: AttachmentSubTags = Field(..., description='Type of attachment')

        # Attachments have a single possible sub-tag, so no text extraction or
        # model call is needed.
        return AttachmentTag(sub_tag="Other")

    text = get_text(doc_path)
    # get_text reports failures by returning None (or a raw dict); classifying
    # that would silently tag the literal text "None", so stop here instead.
    if not isinstance(text, str):
        raise ValueError(f"Could not extract text from {doc_path}")

    if category == 'Medical':

        class MedicalSubTags(str,Enum):
            reports = 'Reports'
            medical_history = 'Medical History'
            medical_certificate = 'Medical Certificate'
            initial_clinical_diagnosis = 'Initial Clinical Diagnosis'
            sequelae_stabilization = 'Sequelae Stabilization'
            other = 'Other'

        class MedicalTag(BaseModel):
            sub_tag: MedicalSubTags = Field(...,description='Type of medical document')

        model_class = MedicalTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Reports: Summaries or findings from clinical evaluations or diagnostics.
        - Medical History: Records of a patient’s past health conditions or treatments.
        - Medical Certificate: Formal statements issued by a doctor for administrative/legal purposes.
        - Initial Clinical Diagnosis: First clinical assessment of a condition.
        - Sequelae Stabilization: Documents related to the stabilization of after-effects of prior medical issues.
        - Other: Any medical document that doesn’t fit the above.

        Document:\n
        {text}\n\n
        """

    elif category == 'Legal/Procedure':

        class LegalSubTags(str,Enum):
            acts = 'Acts'
            receipt = 'Acknowledgment of Receipt'
            resolutions = 'Resolutions or Closure'
            notices = 'Notices'
            reports = 'Reports'
            deposits = 'Guarantees or Deposits'
            administrative = 'Administrative'
            prelim_proceedings = 'Preliminary Proceedings'
            payments = 'Payments or Compensations'
            appeal = 'Appeal'
            power_attorney = 'Power of Attorney'
            sentence = 'Sentence'
            other = 'Other'

        class LegalTag(BaseModel):
            sub_tag: LegalSubTags = Field(...,description='Type of legal document')

        model_class = LegalTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Acts: Formal legislative or regulatory documents.
        - Acknowledgment of Receipt: Confirmations that a party has received a document or notice.
        - Resolutions or Closure: Documents marking the end or resolution of a legal process or case.
        - Notices: Formal communications informing parties of legal procedures or rights.
        - Reports: Legal assessments or statements generated during proceedings or investigations.
        - Guarantees or Deposits: Documents related to financial sureties or collateral.
        - Administrative: Internal or procedural legal communications.
        - Preliminary Proceedings: Initial steps taken in a legal case or investigation.
        - Payments or Compensations: Documents involving settlements or financial reimbursements.
        - Appeal: Requests for review or reconsideration of a prior legal decision.
        - Power of Attorney: Documents granting legal authority to act on another’s behalf.
        - Sentence: Final decisions or rulings issued by a court or authority.
        - Other: Any legal document that does not clearly fit into the above categories.

        Document:\n
        {text}\n\n
        """

    elif category == 'Email':

        class EmailSubTags(str,Enum):
            legal = 'Legal'
            medical = 'Medical'

        class EmailTag(BaseModel):
            sub_tag: EmailSubTags = Field(...,description='Type of email')

        model_class = EmailTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Legal: The email discusses or contains legal matters, contracts, notices, or procedures.
        - Medical: The email contains or references medical content, such as diagnoses, treatments, or health records.

        Document:\n
        {text}\n\n
        """

    schema = model_class.model_json_schema()

    api_url = "http://ollama-api-des.itcomb.cat/api/chat"

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "format": schema,
        "stream": False
    }

    response = requests.post(api_url, json=payload, verify=False, timeout=timeout)
    # Surface HTTP errors directly rather than as a KeyError on "message" below.
    response.raise_for_status()
    response_data = response.json()

    tag = model_class.model_validate_json(response_data["message"]["content"])
    return tag

In [6]:
# Smoke test: run both tagging stages on a single simulated document.
file_path = r"C:\Users\Sophie\Final deliverables\documents_simulated\attachment.pdf"
model_name = "qwen2.5:32b"

# Stage 1: primary category.
tag_1 = get_doc_tags_1(file_path, model=model_name)
print(f"    Primary Tag: {tag_1.tag.value}")

# Stage 2: sub-type within the primary category found above.
tag_2 = get_doc_tags_2(file_path, category=tag_1.tag.value, model=model_name)
print(f"    Secondary Tag: {tag_2.sub_tag.value}")

    Primary Tag: Attachment
    Secondary Tag: Other


### Testing functions on our own simulated documents

In [7]:
folder_path = r"c:\Users\Sophie\Final deliverables\documents_simulated"

# Walk the whole folder tree and tag every PDF; any other file type is ignored.
# A failure on one file is reported and does not stop the run.
for root, dirs, files in os.walk(folder_path):
    for filename in files:
        file_path = os.path.join(root, filename)
        if file_path.lower().endswith('.pdf'):
            try:
                print(f"Processing file: {filename}")
                tag_1 = get_doc_tags_1(file_path, model="qwen2.5:32b")
                print(f"    Primary Tag: {tag_1.tag.value}")
                tag_2 = get_doc_tags_2(
                    file_path,
                    category=tag_1.tag.value,
                    model="qwen2.5:32b",
                )
                print(f"    Secondary Tag: {tag_2.sub_tag.value}")
                print('*' * 75)
            except Exception as e:
                print(f"Error processing {filename}: {e}")

Processing file: attachment.pdf
    Primary Tag: Attachment
    Secondary Tag: Other
***************************************************************************
Processing file: claim_medical_malpractice.pdf
    Primary Tag: Legal/Procedure
    Secondary Tag: Preliminary Proceedings
***************************************************************************
Processing file: diagnostic_clinic_inicial.pdf
    Primary Tag: Medical
    Secondary Tag: Initial Clinical Diagnosis
***************************************************************************
Processing file: doctor_file.pdf
    Primary Tag: Medical
    Secondary Tag: Other
***************************************************************************
Processing file: doctor_report.pdf
    Primary Tag: Medical
    Secondary Tag: Reports
***************************************************************************
Processing file: email_legal.pdf
    Primary Tag: Legal/Procedure
    Secondary Tag: Preliminary Proceedings
***************

### Testing functions on initial documents provided by Medidedalia

In [8]:
parent_folder = r"c:\Users\Sophie\Final deliverables\Test Documents"

# Each sub-folder of parent_folder is processed as one group of documents;
# entries directly under parent_folder that are not folders are ignored.
for subfolder_name in os.listdir(parent_folder):
    subfolder_path = os.path.join(parent_folder, subfolder_name)

    if not os.path.isdir(subfolder_path):
        continue

    print(f"\nProcessing folder: {subfolder_name}")

    for root, dirs, files in os.walk(subfolder_path):
        for filename in files:
            # get_text uploads every file as 'application/pdf', so anything
            # that is not a PDF is skipped instead of being sent and mis-tagged.
            if not filename.lower().endswith('.pdf'):
                continue

            file_path = os.path.join(root, filename)

            # A failure on one file is reported and does not stop the run.
            try:
                print(75*'*')
                tag_1 = get_doc_tags_1(file_path, model="qwen2.5:32b")
                print(f"    {filename} -- Primary Tag: {tag_1.tag.value}")
                tag_2 = get_doc_tags_2(file_path,category=tag_1.tag.value,model="qwen2.5:32b")
                print(f"    {filename} -- Secondary Tag: {tag_2.sub_tag.value}")
            except Exception as e:
                print(f"Error processing {filename}: {e}")


Processing folder: General surgery Ana Garcia Lopez
***************************************************************************
    Claim details.pdf -- Primary Tag: Legal/Procedure
    Claim details.pdf -- Secondary Tag: Preliminary Proceedings
***************************************************************************
    Defense Strategy email.pdf -- Primary Tag: Legal/Procedure
    Defense Strategy email.pdf -- Secondary Tag: Preliminary Proceedings
***************************************************************************
    Doctor Information.pdf -- Primary Tag: Medical
    Doctor Information.pdf -- Secondary Tag: Other
***************************************************************************
    Hospital and Clinical Data.pdf -- Primary Tag: Medical
    Hospital and Clinical Data.pdf -- Secondary Tag: Reports
***************************************************************************
    Medical Malpractice Claim Report.pdf -- Primary Tag: Legal/Procedure
    Medical Malpra