### Importing relevant libraries

In [1]:
import requests
import os
import urllib3
import json
from enum import Enum
from pydantic import BaseModel, Field
import pandas as pd

### Creating get_text() using pdf/text API call

In [2]:
def get_text(file_path):
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    headers = {
        'Authorization': 'Bearer YOUR_TOKEN_HERE',
    }

    try:
        with open(file_path, 'rb') as file:
            files = {
                'file': (os.path.basename(file_path), file, 'application/pdf')
            }

            response = requests.post('https://grupmedai-api-des.itcomb.cat/pdf/text', headers=headers, files=files, verify=False)

            if response.status_code == 200:
                response_data = response.json()
                if 'content' in response_data:
                    text = response_data['content']
                    return text
                else:
                    print("Error: 'content' key not found in the response.")
                    return response_data
            else:
                print(f"Error: Received status code {response.status_code}")
                print(f"Response: {response.json()}")
                return None

    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [3]:
get_text(r"C:\Users\Sophie\Final deliverables\documents_simulated\HISTORIAL_MÈDIC.pdf")

Error: File C:\Users\Sophie\Final deliverables\documents_simulated\HISTORIAL_MÈDIC.pdf not found.


### Creating get_doc_tags_1() to get the primary tag

In [4]:
def get_doc_tags_1(doc_path,model):

    text = get_text(doc_path)
   
    class TagsEnum(str, Enum):
        medical = 'Medical'
        legal_procedure = 'Legal/Procedure'
        email = 'Email'
        attachment = 'Attachment'

    class DocTags(BaseModel):
        tag: TagsEnum = Field(..., description="Tag of the document")

   
    #print("json schema", DocTags.model_json_schema())

    api_url = "http://ollama-api-des.itcomb.cat/api/chat"

    prompt = f"""
    You are classifying the type of document based on its overall purpose and context.

    Context: These documents are part of a **medical malpractice legal case**, and may include clinical records, legal filings, emails, or miscellaneous attachments. Your goal is to assign the document to the most appropriate category based on its **intended role or function in the case**, rather than its format or specific terms.

    Available categories:
    - Medical: Primarily related to patient care, diagnoses, prescriptions, medical procedures, or insurance matters.
    - Legal/Procedure: Focused on legal rights, responsibilities, procedural documents, formal notices, or agreements.
    - Email: A form of communication, either formal or informal, often includes greetings, sender/receiver details, email-specific elements (e.g., '@', '.com'), and may contain varied content.
    - Attachment: Any other type of document not fitting the above, often supplemental or miscellaneous in nature.

    Document:\n
    {text}\n\n"""

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": prompt
            }
        ],
        "format": DocTags.model_json_schema(),
        "stream": False
    }
   
    response = requests.post(api_url, json=payload, verify=False)
    #print(response.text)
    response_data = response.json()
   
    tag = DocTags.model_validate_json(response_data["message"]["content"])
    return tag

### Creating get_doc_tags_2() to get the secondary tag

In [5]:
def get_doc_tags_2(doc_path,category,model):

    text = get_text(doc_path)

    if category == 'Medical':

        class MedicalSubTags(str,Enum):
            medical_history = 'Medical History'
            informed_consent = 'Informed Consent'
            other = 'Other'
        
        class MedicalTag(BaseModel):
            sub_tag: MedicalSubTags = Field(...,description='Type of medical document')

        model_class = MedicalTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Medical History: records of a patient’s past health conditions or treatments.
        - Informed consent: a document where a patient agrees to a procedure after being informed of risks and benefits.
        - Other: any medical document that doesn’t fit the above.

        Document:\n
        {text}\n\n
        """

    elif category == 'Legal/Procedure':

        class LegalSubTags(str,Enum):
            doctor_file = "Doctor's file"
            insurance_certificate = 'Insurance certificate'
            claim = 'Claim'
            complaint = 'Complaint'
            citation = 'Citation'
            other = 'Other'

        class LegalTag(BaseModel):
            sub_tag: LegalSubTags = Field(...,description='Type of legal/procedure document')
        
        model_class = LegalTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Doctor's file: information about the doctor.
        - Insurance certificate: proof of insurance certificate from any involved party.
        - Claim: a formal request for payment or reimbursement.
        - Complaint: a formal grievance.
        - Citation: a legal reference or notice.
        - Other: any legal document that does not clearly fit into the above categories.

        Document:\n
        {text}\n\n
        """
    
    elif category == 'Email':

        class EmailSubTags(str,Enum):
            legal = 'Legal'
            medical = 'Medical'
        
        class EmailTag(BaseModel):
            sub_tag: EmailSubTags = Field(...,description='Type of email')
        
        model_class = EmailTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Legal/Procedure: the email discusses or contains legal matters, contracts, notices, or procedures.
        - Medical: the email contains or references medical content, such as diagnoses, treatments, or health records.

        Document:\n
        {text}\n\n
        """

    elif category == 'Attachment':

        class AttachmentSubTags(str, Enum):
            other = 'Other'

        class AttachmentTag(BaseModel):
            sub_tag: AttachmentSubTags = Field(..., description='Type of attachment')

        model_class = AttachmentTag

        return model_class(sub_tag="Other")

    else:
        raise ValueError(f"Unknown category: {category}")
    
    schema = model_class.model_json_schema()

    api_url = "http://ollama-api-des.itcomb.cat/api/chat"

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "format": schema,
        "stream": False
    }

    response = requests.post(api_url, json=payload, verify=False)
    #print(response.text)
    response_data = response.json()
   
    # Parse the response
    tag = model_class.model_validate_json(response_data["message"]["content"])
    return tag

### Testing functions on own simulated documents

In [6]:
records = []

In [7]:
folder_path = r"c:\Users\Sophie\Final deliverables\documents_simulated"

for root, dirs, files in os.walk(folder_path):
    for filename in files:
        file_path = os.path.join(root, filename)
        if not file_path.lower().endswith('.pdf'):
            #print(f"Skipping unsupported file type: {file_path}")
            continue
        else:
            try:
                print(f"Processing file: {filename}")
                tag_1 = get_doc_tags_1(file_path, model="qwen2.5:32b")
                print(f"    Primary Tag: {tag_1.tag.value}")
                tag_2 = get_doc_tags_2(file_path,category=tag_1.tag.value,model="qwen2.5:32b")
                print(f"    Secondary Tag: {tag_2.sub_tag.value}")
                print(75*'*')

                records.append({
                "filepath": file_path,
                "primary_tag": tag_1.tag.value,
                "secondary_tag": tag_2.sub_tag.value
                })
            except Exception as e:
                print(f"Error processing {filename}: {e}")

Processing file: attachment.pdf
    Primary Tag: Attachment
    Secondary Tag: Other
***************************************************************************
Processing file: claim_medical_malpractice.pdf
    Primary Tag: Legal/Procedure
    Secondary Tag: Claim
***************************************************************************
Processing file: diagnostic_clinic_inicial.pdf
    Primary Tag: Medical
    Secondary Tag: Medical History
***************************************************************************
Processing file: doctor_file.pdf
    Primary Tag: Attachment
    Secondary Tag: Other
***************************************************************************
Processing file: doctor_report.pdf
    Primary Tag: Medical
    Secondary Tag: Medical History
***************************************************************************
Processing file: email_legal.pdf
    Primary Tag: Email
    Secondary Tag: Legal
*************************************************************

### Testing functions on intial documents provided by Medidedalia

In [8]:
parent_folder = r"c:\Users\Sophie\Final deliverables\Test Documents"

for subfolder_name in os.listdir(parent_folder):
    subfolder_path = os.path.join(parent_folder, subfolder_name)

    if os.path.isdir(subfolder_path):
        print(f"\nProcessing folder: {subfolder_name}")

        for root, dirs, files in os.walk(subfolder_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                try:
                    print(75*'*')
                    tag_1 = get_doc_tags_1(file_path, model="qwen2.5:32b")
                    print(f"    {filename} -- Primary Tag: {tag_1.tag.value}")
                    tag_2 = get_doc_tags_2(file_path,category=tag_1.tag.value,model="qwen2.5:32b")
                    print(f"    {filename} -- Secondary Tag: {tag_2.sub_tag.value}")

                    records.append({
                    "filepath": file_path,
                    "primary_tag": tag_1.tag.value,
                    "secondary_tag": tag_2.sub_tag.value
                    })
                except Exception as e:
                    print(f"Error processing {filename}: {e}")


Processing folder: General surgery Ana Garcia Lopez
***************************************************************************
    Claim details.pdf -- Primary Tag: Legal/Procedure
    Claim details.pdf -- Secondary Tag: Claim
***************************************************************************
    Defense Strategy email.pdf -- Primary Tag: Legal/Procedure
    Defense Strategy email.pdf -- Secondary Tag: Claim
***************************************************************************
    Doctor Information.pdf -- Primary Tag: Medical
    Doctor Information.pdf -- Secondary Tag: Other
***************************************************************************
    Hospital and Clinical Data.pdf -- Primary Tag: Medical
    Hospital and Clinical Data.pdf -- Secondary Tag: Medical History
***************************************************************************
    Medical Malpractice Claim Report.pdf -- Primary Tag: Legal/Procedure
    Medical Malpractice Claim Report.pdf -- Se

### Testing functions on anonymized documents provided by Medidedalia

!!! Important to note: these tags are 100% wrong because, though the documents are PDFs, they are scans and cannot be read by the API when calling the get_text() function.

In [9]:
parent_folder = r"c:\Users\Sophie\Final deliverables\documents_anonymized"

for subfolder_name in os.listdir(parent_folder):
    subfolder_path = os.path.join(parent_folder, subfolder_name)

    if os.path.isdir(subfolder_path):
        print(f"\nProcessing folder: {subfolder_name}")

        for root, dirs, files in os.walk(subfolder_path):
            for filename in files:
                file_path = os.path.join(root, filename)
                
                try:
                    print(75*'*')
                    tag_1 = get_doc_tags_1(file_path, model="qwen2.5:32b")
                    print(f"    {filename} -- Primary Tag: {tag_1.tag.value}")
                    tag_2 = get_doc_tags_2(file_path,category=tag_1.tag.value,model="qwen2.5:32b")
                    print(f"    {filename} -- Secondary Tag: {tag_2.sub_tag.value}")
                except Exception as e:
                    print(f"Error processing {filename}: {e}")


Processing folder: 195
***************************************************************************
    20250602103930696.pdf -- Primary Tag: Medical
    20250602103930696.pdf -- Secondary Tag: Other
***************************************************************************
    20250602103939095.pdf -- Primary Tag: Medical
    20250602103939095.pdf -- Secondary Tag: Other
***************************************************************************
    CONSENTIMENT.pdf -- Primary Tag: Medical
    CONSENTIMENT.pdf -- Secondary Tag: Other
***************************************************************************
    HC.pdf -- Primary Tag: Medical
    HC.pdf -- Secondary Tag: Other
***************************************************************************
    RECLAMACIO.pdf -- Primary Tag: Medical
    RECLAMACIO.pdf -- Secondary Tag: Other
***************************************************************************
    SOCIETAT.pdf -- Primary Tag: Attachment
    SOCIETAT.pdf -- Secondary 

### Turning records[] into a DataFrame

In [10]:
df_tags = pd.DataFrame(records)

In [11]:
df_tags.head()

Unnamed: 0,filepath,primary_tag,secondary_tag
0,c:\Users\Sophie\Final deliverables\documents_s...,Attachment,Other
1,c:\Users\Sophie\Final deliverables\documents_s...,Legal/Procedure,Claim
2,c:\Users\Sophie\Final deliverables\documents_s...,Medical,Medical History
3,c:\Users\Sophie\Final deliverables\documents_s...,Attachment,Other
4,c:\Users\Sophie\Final deliverables\documents_s...,Medical,Medical History


In [12]:
output_path = r"C:\Users\Sophie\Final deliverables\tagged_documents.csv"
#df_tags.to_csv(output_path, index=False, encoding='utf-8')

In [13]:
df_correct = pd.read_csv(r"C:\Users\Sophie\Final deliverables\tagged_documents_correct.csv")

In [14]:
df_correct.head()

Unnamed: 0,filepath,primary_correct,secondary_correct
0,c:\Users\Sophie\Final deliverables\documents_s...,Legal/Procedure,Claim
1,c:\Users\Sophie\Final deliverables\documents_s...,Medical,Medical History
2,c:\Users\Sophie\Final deliverables\documents_s...,Legal/Procedure,Doctor's file
3,c:\Users\Sophie\Final deliverables\documents_s...,Medical,Medical History
4,c:\Users\Sophie\Final deliverables\documents_s...,Email,Legal/Procedure


In [15]:
df = pd.merge(df_tags, df_correct, on="filepath", how="inner")

In [16]:
df.head()

Unnamed: 0,filepath,primary_tag,secondary_tag,primary_correct,secondary_correct
0,c:\Users\Sophie\Final deliverables\documents_s...,Legal/Procedure,Claim,Legal/Procedure,Claim
1,c:\Users\Sophie\Final deliverables\documents_s...,Medical,Medical History,Medical,Medical History
2,c:\Users\Sophie\Final deliverables\documents_s...,Attachment,Other,Legal/Procedure,Doctor's file
3,c:\Users\Sophie\Final deliverables\documents_s...,Medical,Medical History,Medical,Medical History
4,c:\Users\Sophie\Final deliverables\documents_s...,Email,Legal,Email,Legal/Procedure


In [17]:
df_filtered = df[df["primary_correct"] != "?"]

In [18]:
primary_match = (df_filtered["primary_tag"] == df_filtered["primary_correct"])
primary_match_pct = primary_match.mean() * 100

In [19]:
secondary_match = (df_filtered["secondary_tag"] == df_filtered["secondary_correct"])

In [20]:
both_match_pct = ((primary_match & secondary_match).mean()) * 100

In [21]:
print(f"Primary tag match:   {primary_match_pct:.2f}%")
print(f"Both tags match:     {both_match_pct:.2f}%")

Primary tag match:   66.67%
Both tags match:     50.00%


Caveat: there's a small number of secondary tags represented (especially for legal).

In theory, there are 12 secondary tags:

For Medical:
- Medical History
- Informed consent
- Other

For Legal/Procedure:
- Doctor's file
- Insurance certificate
- Claim
- Complaint
- Citation
- Other

For Email:
- Legal/Procedure
- Medical

For Attachment:
- Other

In [22]:
unique_pairs = df_filtered[["primary_correct", "secondary_correct"]].drop_duplicates()
num_unique_pairs = len(unique_pairs)

print(f"Number of unique (primary_correct, secondary_correct) pairs: {num_unique_pairs}")
print(unique_pairs)

Number of unique (primary_correct, secondary_correct) pairs: 5
   primary_correct secondary_correct
0  Legal/Procedure             Claim
1          Medical   Medical History
2  Legal/Procedure     Doctor's file
4            Email   Legal/Procedure
5       Attachment             Other
