In [1]:
import requests
import os
import urllib3
import json
from enum import Enum
from pydantic import BaseModel, Field

In [2]:
def get_text(file_path):
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    headers = {
        'Authorization': 'Bearer YOUR_TOKEN_HERE',
    }

    try:
        with open(file_path, 'rb') as file:
            files = {
                'file': (os.path.basename(file_path), file, 'application/pdf')
            }

            response = requests.post('https://grupmedai-api-des.itcomb.cat/pdf/text', headers=headers, files=files, verify=False)

            if response.status_code == 200:
                response_data = response.json()
                if 'content' in response_data:
                    text = response_data['content']
                    return text
                else:
                    print("Error: 'content' key not found in the response.")
                    return response_data
            else:
                print(f"Error: Received status code {response.status_code}")
                print(f"Response: {response.json()}")
                return None

    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [3]:
# get_text(r"C:\Users\Sophie\Tagging code\documents\HISTORIAL_MÈDIC.pdf")

In [4]:
def get_doc_tags_1(doc_path,model):

    text = get_text(doc_path)
   
    class TagsEnum(str, Enum):
        medical = 'Medical'
        legal = 'Legal'
        email = 'Email'

    class DocTags(BaseModel):
        tag: TagsEnum = Field(..., description="Tag of the document")

   
    #print("json schema", DocTags.model_json_schema())

     # Define API endpoint
    api_url = "http://ollama-api-des.itcomb.cat/api/chat"  # Default Ollama API endpoint

    prompt = f'What category is the best fit for this document:\n{text}\n\n'
    test_prompt =  f"""
    You are classifying the type of document based on its overall purpose and context, not just the words used.

    Available categories:
    - Medical: primarily about diagnoses, prescriptions, patient care, or insurance.
    - Legal: primarily about rights, obligations, regulations, or formal agreements.
    - Email: informal or formal communication, regardless of content, especially with greeting lines, sender/receiver info, and signatures.

    Document:\n
    {text}\n\n"""

    # Prepare the payload
    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": test_prompt
            }
        ],
        "format": DocTags.model_json_schema(),
        "stream": False
    }
   
    # Make the API request
    response = requests.post(api_url, json=payload, verify=False)
    #print(response.text)
    response_data = response.json()
   
    # Parse the response
    tag = DocTags.model_validate_json(response_data["message"]["content"])
    return tag

In [5]:
tag_test = get_doc_tags_1(r"C:\Users\Sophie\Tagging code\documents\HISTORIAL_MÈDIC.pdf",model="qwen2.5:32b")
print(tag_test)
print(tag_test.tag.value)

tag=<TagsEnum.medical: 'Medical'>
Medical


In [6]:
def get_doc_tags_2(doc_path,category,model):

    text = get_text(doc_path)

    if category == 'Medical':

        class MedicalSubTags(str,Enum):
            reports = 'Reports'
            medical_history = 'Medical History'
            medical_certificate = 'Medical Certificate'
            initial_clinical_diagnosis = 'Initial Clinical Diagnosis'
            sequelae_stabilization = 'Sequelae Stabilization'
            other = 'Other'
        
        class MedicalTag(BaseModel):
            sub_tag: MedicalSubTags = Field(...,description='Type of medical document')

        model_class = MedicalTag

    elif category == 'Legal':

        class LegalSubTags(str,Enum):
            acts = 'Acts'
            receipt = 'Acknowledgment of Receipt'
            resolutions = 'Resolutions or Closure'
            notices = 'Notices'
            reports = 'Reports'
            deposits = 'Guarantees or Deposits'
            administrative = 'Administrative'
            prelim_proceedings = 'Preliminary Proceedings'
            payments = 'Payments or Compensations'
            appeal = 'Appeal'
            power_attorney = 'Power of Attorney'
            sentence = 'Sentence'
            other = 'Other'

        class LegalTag(BaseModel):
            sub_tag: LegalSubTags = Field(...,description='Type of legal document')
        
        model_class = LegalTag
    
    elif category == 'Email':

        class EmailSubTags(str,Enum):
            legal = 'Legal'
            medical = 'Medical'
        
        class EmailTag(BaseModel):
            sub_tag: EmailSubTags = Field(...,description='Type of email')
        
        model_class = EmailTag
    
    schema = model_class.model_json_schema()
    #expected_field = 'sub_tag'

    api_url = "http://ollama-api-des.itcomb.cat/api/chat"

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": f'This document was categorized as {category}. What is its specific type:\n{text}\n\n'
            }
        ],
        "format": schema,
        "stream": False
    }

    response = requests.post(api_url, json=payload, verify=False)
    #print(response.text)
    response_data = response.json()
   
    # Parse the response
    tag = model_class.model_validate_json(response_data["message"]["content"])
    return tag

In [7]:
def get_doc_tags_2_iter2(doc_path,category,model):

    text = get_text(doc_path)

    if category == 'Medical':

        class MedicalSubTags(str,Enum):
            reports = 'Reports'
            medical_history = 'Medical History'
            medical_certificate = 'Medical Certificate'
            initial_clinical_diagnosis = 'Initial Clinical Diagnosis'
            sequelae_stabilization = 'Sequelae Stabilization'
            other = 'Other'
        
        class MedicalTag(BaseModel):
            sub_tag: MedicalSubTags = Field(...,description='Type of medical document')

        model_class = MedicalTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Reports: Summaries or findings from clinical evaluations or diagnostics.
        - Medical History: Records of a patient’s past health conditions or treatments.
        - Medical Certificate: Formal statements issued by a doctor for administrative/legal purposes.
        - Initial Clinical Diagnosis: First clinical assessment of a condition.
        - Sequelae Stabilization: Documents related to the stabilization of after-effects of prior medical issues.
        - Other: Any medical document that doesn’t fit the above.

        Document:\n
        {text}\n\n
        """

    elif category == 'Legal':

        class LegalSubTags(str,Enum):
            acts = 'Acts'
            receipt = 'Acknowledgment of Receipt'
            resolutions = 'Resolutions or Closure'
            notices = 'Notices'
            reports = 'Reports'
            deposits = 'Guarantees or Deposits'
            administrative = 'Administrative'
            prelim_proceedings = 'Preliminary Proceedings'
            payments = 'Payments or Compensations'
            appeal = 'Appeal'
            power_attorney = 'Power of Attorney'
            sentence = 'Sentence'
            other = 'Other'

        class LegalTag(BaseModel):
            sub_tag: LegalSubTags = Field(...,description='Type of legal document')
        
        model_class = LegalTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Acts: Formal legislative or regulatory documents.
        - Acknowledgment of Receipt: Confirmations that a party has received a document or notice.
        - Resolutions or Closure: Documents marking the end or resolution of a legal process or case.
        - Notices: Formal communications informing parties of legal procedures or rights.
        - Reports: Legal assessments or statements generated during proceedings or investigations.
        - Guarantees or Deposits: Documents related to financial sureties or collateral.
        - Administrative: Internal or procedural legal communications.
        - Preliminary Proceedings: Initial steps taken in a legal case or investigation.
        - Payments or Compensations: Documents involving settlements or financial reimbursements.
        - Appeal: Requests for review or reconsideration of a prior legal decision.
        - Power of Attorney: Documents granting legal authority to act on another’s behalf.
        - Sentence: Final decisions or rulings issued by a court or authority.
        - Other: Any legal document that does not clearly fit into the above categories.

        Document:\n
        {text}\n\n
        """
    
    elif category == 'Email':

        class EmailSubTags(str,Enum):
            legal = 'Legal'
            medical = 'Medical'
        
        class EmailTag(BaseModel):
            sub_tag: EmailSubTags = Field(...,description='Type of email')
        
        model_class = EmailTag

        content = f"""
        This document was categorized as {category}. What is its specific type?

        - Legal: The email discusses or contains legal matters, contracts, notices, or procedures.
        - Medical: The email contains or references medical content, such as diagnoses, treatments, or health records.

        Document:\n
        {text}\n\n
        """
    
    schema = model_class.model_json_schema()
    #expected_field = 'sub_tag'

    api_url = "http://ollama-api-des.itcomb.cat/api/chat"

    payload = {
        "model": model,
        "messages": [
            {
                "role": "user",
                "content": content
            }
        ],
        "format": schema,
        "stream": False
    }

    response = requests.post(api_url, json=payload, verify=False)
    #print(response.text)
    response_data = response.json()
   
    # Parse the response
    tag = model_class.model_validate_json(response_data["message"]["content"])
    return tag

In [8]:
path = r"C:\Users\Sophie\Tagging code\documents\HISTORIAL_MÈDIC.pdf"
tag_1 = get_doc_tags_1(path,model="qwen2.5:32b")
print(tag_1.tag.value)
tag_2 = get_doc_tags_2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print(tag_2.sub_tag.value)
tag_2_2 = get_doc_tags_2_iter2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print('tag2 alternative',tag_2_2.sub_tag.value)

Medical
Medical History
tag2 alternative Medical History


In [9]:
path = r"C:\Users\Sophie\Tagging code\documents\diagnostic_clinic_inicial.pdf"
tag_1 = get_doc_tags_1(path,model="qwen2.5:32b")
print(tag_1.tag.value)
tag_2 = get_doc_tags_2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print(tag_2.sub_tag.value)
tag_2_2 = get_doc_tags_2_iter2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print('tag2 alternative',tag_2_2.sub_tag.value)

Medical
Medical History
tag2 alternative Initial Clinical Diagnosis


In [10]:
path = r"C:\Users\Sophie\Tagging code\documents\claim_medical_malpractice.pdf"
tag_1 = get_doc_tags_1(path,model="qwen2.5:32b")
print(tag_1.tag.value)
tag_2 = get_doc_tags_2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print(tag_2.sub_tag.value)
tag_2_2 = get_doc_tags_2_iter2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print('tag2 alternative',tag_2_2.sub_tag.value)

Legal
Sentence
tag2 alternative Reports


In [11]:
path = r"C:\Users\Sophie\Tagging code\documents\email_legal.pdf"

tag_1 = get_doc_tags_1(path,model="qwen2.5:32b")
print(tag_1.tag.value)
tag_2 = get_doc_tags_2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print(tag_2.sub_tag.value)
tag_2_2 = get_doc_tags_2_iter2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print('tag2 alternative',tag_2_2.sub_tag.value)

Legal
Resolutions or Closure
tag2 alternative Preliminary Proceedings


In [12]:
path = r"C:\Users\Sophie\Tagging code\documents\doctor_file.pdf"

tag_1 = get_doc_tags_1(path,model="qwen2.5:32b")
print(tag_1.tag.value)
tag_2 = get_doc_tags_2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print(tag_2.sub_tag.value)
tag_2_2 = get_doc_tags_2_iter2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print('tag2 alternative',tag_2_2.sub_tag.value)

Medical
Medical History
tag2 alternative Other


In [13]:
path = r"C:\Users\Sophie\Tagging code\documents\PublicWaterMassMailing.pdf"

tag_1 = get_doc_tags_1(path,model="qwen2.5:32b")
print(tag_1.tag.value)
tag_2 = get_doc_tags_2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print(tag_2.sub_tag.value)
tag_2_2 = get_doc_tags_2_iter2(path,category=tag_1.tag.value,model="qwen2.5:32b")
print('tag2 alternative',tag_2_2.sub_tag.value)

Error: Received status code 413
An error occurred: Expecting value: line 1 column 1 (char 0)
Email
Error: Received status code 413
An error occurred: Expecting value: line 1 column 1 (char 0)
Legal
Error: Received status code 413
An error occurred: Expecting value: line 1 column 1 (char 0)
tag2 alternative Legal
