In [10]:
import requests
import os
import urllib3
import json

In [11]:
def get_text(file_path, timeout=120):
    """Upload a PDF to the text-extraction endpoint and return the extracted text.

    The file is sent as multipart form data (field ``file``, MIME type
    ``application/pdf``) to ``/pdf/text``. Errors are reported with ``print``
    rather than raised.

    Parameters
    ----------
    file_path : str
        Path of the PDF to upload.
    timeout : float or tuple, optional
        Forwarded to ``requests.post`` so a stalled server cannot block the
        notebook forever. Defaults to 120 seconds.

    Returns
    -------
    str or object or None
        * the value stored under ``'content'`` when the server answers 200
          with a JSON object containing that key;
        * the decoded JSON payload itself when ``'content'`` is missing
          (legacy behaviour, kept for existing callers);
        * ``None`` when the file is missing, the status is not 200, or any
          other exception occurs.
    """
    # The request below is sent with verify=False; without this call urllib3
    # emits an InsecureRequestWarning for every request.
    urllib3.disable_warnings(urllib3.exceptions.InsecureRequestWarning)

    # Read the bearer token from the environment so it is not committed with
    # the notebook; the original placeholder is kept as the fallback.
    token = os.environ.get('GRUPMEDAI_API_TOKEN', 'YOUR_TOKEN_HERE')
    headers = {
        'Authorization': f'Bearer {token}',
    }

    try:
        with open(file_path, 'rb') as file:
            files = {
                'file': (os.path.basename(file_path), file, 'application/pdf')
            }

            # NOTE(review): verify=False disables TLS certificate checking --
            # presumably needed for this development host; confirm before
            # pointing the function at any other environment.
            response = requests.post(
                'https://grupmedai-api-des.itcomb.cat/pdf/text',
                headers=headers,
                files=files,
                verify=False,
                timeout=timeout,
            )

        if response.status_code != 200:
            print(f"Error: Received status code {response.status_code}")
            # Error bodies are not guaranteed to be JSON; fall back to the raw
            # text instead of letting the decode error replace the real one.
            try:
                detail = response.json()
            except ValueError:
                detail = response.text
            print(f"Response: {detail}")
            return None

        response_data = response.json()
        # Guard the membership test: a JSON list/string/null payload would
        # otherwise be searched (or raise) instead of being treated as
        # "no content key".
        if isinstance(response_data, dict) and 'content' in response_data:
            return response_data['content']

        print("Error: 'content' key not found in the response.")
        return response_data

    except FileNotFoundError:
        print(f"Error: File {file_path} not found.")
        return None
    except Exception as e:
        print(f"An error occurred: {e}")
        return None

In [12]:
# Extract the text of one PDF; the cell output is the value returned by get_text.
# NOTE(review): absolute, user-specific Windows path -- adjust before running on another machine.
get_text(r"C:\Users\Sophie\Tagging code\documents\HISTORIAL_MÈDIC.pdf")

'HISTORIAL MÈDIC \nHospital Clínic de Barcelona \nDepartament de Medicina Interna \n \nInformació del Pacient \n• \nNom complet: Marc Ferrer Puig \n• \nData de naixement: 5 d’abril de 1982 \n• \nEdat: 42 anys \n• \nGènere: Masculí \n• \nTelèfon: +34 678 123 456 \n• \nCorreu electrònic: marcferrer82@email.com \n• \nAdreça: Carrer Balmes, 123, 4t 2a, 08008 Barcelona \n• \nContacte d’emergència:  \no Nom: Laura Martí Soler (Esposa) \no Telèfon: +34 654 987 321 \n \nAntecedents Mèdics \n• \nMalalties cròniques: \no Hipertensió arterial diagnosticada el 2018 \no Diabetis tipus 2 diagnosticada el 2021 \no Asma lleu des de la infància \n• \nCirurgies prèvies: \no Apendicectomia (2005) \no Reparació d’hèrnia inguinal (2017) \n• \nAl·lèrgies conegudes: \no Penicil·lina (reacció anafilàctica) \no Fruits secs (urticària moderada) \n• \nHistorial familiar: \no Pare: Cardiopatia isquèmica (defunció als 65 anys) \no Mare: Diabetis tipus 2 \no Germà: Hipertensió arterial \n \nMedicació Actual \n• \nE

In [13]:
def _llm_generate(prompt, timeout=300):
    """POST ``prompt`` to the LLM generate endpoint and return the reply's ``'text'`` field.

    Returns ``''`` when the JSON reply has no ``'text'`` key. Raises
    ``Exception`` (message carries status code and body) when the status
    is not 200.
    """
    # Token comes from the environment so it is not stored in the notebook;
    # the original placeholder is kept as the fallback.
    token = os.environ.get('GRUPMEDAI_API_TOKEN', 'YOUR_ACCESS_TOKEN')
    headers = {
        'Authorization': f'Bearer {token}',
        'Content-Type': 'application/json'
    }
    payload = {
        "model": "llama3.1:8b",
        "prompt": prompt
    }

    # NOTE(review): verify=False disables TLS certificate checking --
    # presumably needed for this development host; confirm before reuse.
    response = requests.post(
        "https://grupmedai-api-des.itcomb.cat/llm/generate",
        headers=headers,
        data=json.dumps(payload),
        verify=False,
        timeout=timeout,
    )

    if response.status_code != 200:
        raise Exception(f"API request failed: {response.status_code}, {response.text}")

    return response.json().get('text', '')


def upload_and_process(document_path, prompt=None, timeout=300):
    """Extract a PDF's text and label it with the LLM in up to two steps.

    Step 1 asks for a top-level label. When that label is LEGAL or MEDICAL,
    step 2 asks for a sub-label from the matching list. Errors are reported
    with ``print`` rather than raised.

    Parameters
    ----------
    document_path : str
        Path of the PDF; its text is obtained with ``get_text``.
    prompt : str, optional
        Replacement for the step-1 prompt. It is sent as-is (the extracted
        text is NOT inserted into it). The step-2 prompts are always the
        built-in ones.
    timeout : float or tuple, optional
        Forwarded to each ``requests.post`` call. Defaults to 300 seconds.

    Returns
    -------
    dict or str or None
        * ``{"label": ..., "sublabel": ...}`` for LEGAL / MEDICAL documents;
        * the bare (whitespace-stripped) label string for any other label;
        * ``None`` when text extraction or an API call fails.
    """
    file_data = get_text(document_path)

    # get_text yields None on failure and the raw JSON payload when the
    # 'content' key is missing; neither should be sent to the model as if
    # it were the document.
    if not isinstance(file_data, str):
        print("Error: no text could be extracted from the document; nothing to classify.")
        return None

    if prompt is None:
        prompt = f"""Label this document into one of the following categories: LEGAL, MEDICAL, EMAIL. 
        
        Return only the label. Example: LEGAL
        
        Document information:
        {file_data}
        """

    # Built unconditionally: previously these only existed when the default
    # prompt was used, so a custom prompt that produced LEGAL/MEDICAL ended
    # in a swallowed NameError and a None result.
    sub_prompts = {
        'LEGAL': f"""This document is a LEGAL document. Add an appropriate sub-label:
        - ACTS
        - ACKNOWLEDGMENT OF RECEIPT
        - RESOLUTIONS OR CLOSURE
        - NOTICES
        - REPORTS
        - GUARANTEES OR DEPOSITS
        - ADMINISTRATIVE
        - PRELIMINARY PROCEEDINGS
        - PAYMENTS OR COMPENSATIONS
        - APPEAL
        - POWER OF ATTORNEY
        - SENTENCE
        - OTHER

        If possible, avoid using OTHER and try to find the right category. 

        Return only the sub-label. Example: REPORTS

        Document information:
        {file_data}
        """,
        'MEDICAL': f"""This document is a MEDICAL document. Add an appropriate sub-label:
        - REPORTS
        - MEDICAL HISTORY
        - MEDICAL CERTIFICATE
        - INITIAL CLINICAL DIAGNOSIS
        - SEQUELAE STABILIZATION
        - OTHER

        If possible, avoid using OTHER and try to find the right category.

        Return only the sub-label. Example: MEDICAL HISTORY

        Document information:
        {file_data}
        """,
    }

    try:
        # Model output may carry surrounding whitespace or different casing;
        # normalise before matching so the sub-label step is not skipped.
        label = (_llm_generate(prompt, timeout) or '').strip()
        category = label.upper()

        if category not in sub_prompts:
            return label

        sublabel = (_llm_generate(sub_prompts[category], timeout) or '').strip()
        return {"label": category, "sublabel": sublabel}

    except Exception as e:
        print(f"Error: {e}")
        return None

In [14]:
# Run the two-step labelling (top-level label, then sub-label) on one PDF; the cell output is the returned value.
# NOTE(review): absolute, user-specific Windows path -- adjust before running on another machine.
upload_and_process(r"C:\Users\Sophie\Tagging code\documents\HISTORIAL_MÈDIC.pdf")

{'label': 'MEDICAL', 'sublabel': 'MEDICAL HISTORY'}

In [15]:
# Two-step labelling of another PDF.
# NOTE(review): the recorded output is MEDICAL HISTORY although INITIAL CLINICAL DIAGNOSIS is one of the
# offered sub-labels and the file name hints at it -- check the result against the document.
upload_and_process(r"C:\Users\Sophie\Tagging code\documents\diagnostic_clinic_inicial.pdf")

{'label': 'MEDICAL', 'sublabel': 'MEDICAL HISTORY'}

In [16]:
# Two-step labelling of a PDF that received the LEGAL top-level label in the recorded run.
upload_and_process(r"C:\Users\Sophie\Tagging code\documents\claim_medical_malpractice.pdf")

{'label': 'LEGAL', 'sublabel': 'ADMINISTRATIVE'}

In [17]:
# Two-step labelling of a PDF whose name suggests an e-mail.
# NOTE(review): the recorded output is LEGAL / REPORTS rather than EMAIL -- confirm which category is intended.
upload_and_process(r"C:\Users\Sophie\Tagging code\documents\email_legal.pdf")

{'label': 'LEGAL', 'sublabel': 'REPORTS'}

In [18]:
# Two-step labelling of one more PDF; the cell output is the returned label/sub-label dict.
upload_and_process(r"C:\Users\Sophie\Tagging code\documents\doctor_file.pdf")

{'label': 'MEDICAL', 'sublabel': 'MEDICAL HISTORY'}