In [26]:
import requests
import json
import cohere
import pytesseract
from pdfminer.high_level import extract_text
from PIL import Image
import os

# Read the Cohere API key from a local file so it is not embedded in the notebook.
# Surrounding whitespace is stripped: a trailing newline in the key file would
# otherwise be interpolated into the Authorization header value below.
with open("cohere.key") as f:
    COHERE_API_KEY = f.read().strip()

# SDK client constructed from the same key.
cohere_client = cohere.Client(api_key=COHERE_API_KEY)

# REST endpoint used for the raw HTTP generate request.
api_url = "https://api.cohere.ai/v1/generate"

# Headers for JSON requests authenticated with the bearer token.
headers = {
    'Authorization': f'Bearer {COHERE_API_KEY}',
    'Content-Type': 'application/json'
}

def extract_text_from_file(document):
    """
    Extracts text from a document. Supports:
    - PDFs (both selectable text and scanned PDFs using OCR)
    - Images (JPG, PNG, JPEG, TIFF/TIF) using OCR

    Args:
        document: Path to the file. The handler is chosen from the file
            extension, compared case-insensitively.

    Returns:
        The extracted text, or None if the file type is unsupported or any
        error occurs while reading it (the error is printed, not raised).
    """
    try:
        # splitext inspects only the final path component, so a dot in a
        # directory name, or an extension-less file that happens to be called
        # "pdf", is not mistaken for a real extension.
        file_ext = os.path.splitext(document)[1].lower().lstrip(".")

        if file_ext == "pdf":
            text = extract_text(document).strip()

            # An empty result means the PDF has no text layer (e.g. a scan).
            if not text:
                print("No selectable text found in PDF. Using OCR...")
                text = extract_text_from_images_in_pdf(document)

        elif file_ext in ("jpg", "png", "jpeg", "tiff", "tif"):
            print("Processing image with OCR...")
            # The context manager closes the image's file handle once OCR is done.
            with Image.open(document) as image:
                text = pytesseract.image_to_string(image).strip()

        else:
            raise ValueError("Unsupported file type. Only PDFs and images (JPG, PNG, JPEG, TIFF) are supported.")

        return text

    except Exception as e:
        # Best-effort contract: report the problem and signal failure with None
        # instead of propagating, so callers only need to check for None.
        print(f"Error extracting text: {e}")
        return None

def extract_text_from_images_in_pdf(pdf_path):
    """
    OCR fallback for PDFs: renders the PDF to page images and runs OCR on each.

    Requires `pdf2image` (install via `pip install pdf2image`).

    Args:
        pdf_path: Path to the PDF file.

    Returns:
        The OCR output of all pages, one page after another, each followed by
        a newline, with leading/trailing whitespace stripped from the whole.
    """
    from pdf2image import convert_from_path  # deferred: only this fallback needs it

    page_images = convert_from_path(pdf_path)
    page_texts = []

    for page_number, page_image in enumerate(page_images, start=1):
        print(f"Running OCR on page {page_number}...")
        page_texts.append(pytesseract.image_to_string(page_image) + "\n")

    return "".join(page_texts).strip()

def upload_and_process(document, prompt=None, timeout=60):
    """
    Classifies a document by sending its extracted text to the generate endpoint.

    The text is obtained with `extract_text_from_file`, appended to the prompt
    and POSTed to `api_url` using the module-level `headers`.

    Args:
        document: Path to the PDF or image to classify.
        prompt: Instruction placed before the document text. When None, a
            default prompt asking for a label and sub-label is used.
        timeout: Seconds passed to `requests.post` as its timeout, so a
            stalled connection cannot block forever.

    Returns:
        The generated text with surrounding whitespace removed, or None when
        the document could not be read, the extracted text is too short, or
        the request fails with a `requests` exception (a message is printed).

    Raises:
        Exception: If the API answers with a non-200 status code.
    """
    if prompt is None:
        prompt = """Label this document into one of the following categories: LEGAL, MEDICAL, EMAIL. 
        Add an appropriate sub-label:

        Sub-labels of Legal:
        - ACTS
        - ACKNOWLEDGMENT OF RECEIPT
        - OTHER
        - RESOLUTIONS OR CLOSURE
        - NOTICES
        - REPORTS
        - GUARANTEES OR DEPOSITS
        - ADMINISTRATIVE
        - PRELIMINARY PROCEEDINGS
        - PAYMENTS OR COMPENSATIONS
        - APPEAL
        - POWER OF ATTORNEY
        - SENTENCE

        Sub-labels of Medical:
        - OTHER
        - REPORTS
        - MEDICAL HISTORY
        - MEDICAL CERTIFICATE
        - INITIAL CLINICAL DIAGNOSIS
        - SEQUELAE STABILIZATION

        Return only the label and the sub-label separated by a comma. Example: LEGAL, ACKNOWLEDGMENT OF RECEIPT."""

    try:
        content = extract_text_from_file(document)

        # None (extraction failed) and near-empty text are both rejected
        # before spending an API call on them.
        if not content or len(content) < 5:
            raise ValueError("Extracted text is too short. The document might be empty or unreadable.")

        data = {
            "model": "command-r-plus",
            "prompt": f"{prompt}\n\n{content}",
            "max_tokens": 100,  # low because we only need a short response
            "temperature": 0.3,
        }

        # Without an explicit timeout, requests waits indefinitely for a reply.
        response = requests.post(api_url, headers=headers, json=data, timeout=timeout)

        if response.status_code == 200:
            # Strip stray whitespace/newlines around the generated label so
            # callers can compare or split the result directly.
            return response.json()["generations"][0]["text"].strip()
        else:
            raise Exception(f"API request failed with status code {response.status_code}: {response.text}")

    except FileNotFoundError:
        print(f"Error: File '{document}' not found.")
        return None
    except ValueError as e:
        print(f"Error: {e}")
        return None
    except requests.exceptions.RequestException as e:
        # Covers connection errors and timeouts raised by requests.
        print(f"Error processing the file: {e}")
        return None

In [27]:
result_certificado_medico = upload_and_process('/Users/sophiebald/Desktop/test_documents/certificado_medico.png')

Processing image with OCR...


In [28]:
result_certificado_medico

'MEDICAL, OTHER'

In [29]:
result_arbitration = upload_and_process('/Users/sophiebald/Desktop/test_documents/arbitration_agreement.jpg')

Processing image with OCR...


In [30]:
result_arbitration

'LEGAL, OTHER'

In [31]:
result_public_mass = upload_and_process('/Users/sophiebald/Desktop/test_documents/PublicWaterMassMailing.pdf')

No selectable text found in PDF. Using OCR...
Running OCR on page 1...
Running OCR on page 2...
Running OCR on page 3...
Running OCR on page 4...
Running OCR on page 5...
Running OCR on page 6...
Running OCR on page 7...
Running OCR on page 8...


In [32]:
result_public_mass

'LEGAL, OTHER'

In [33]:
result_medical_report = upload_and_process('/Users/sophiebald/Desktop/test_documents/medical_report_chewy.png')

Processing image with OCR...


In [34]:
result_medical_report

'MEDICAL, REPORTS'

In [35]:
result_acta_previa = upload_and_process('/Users/sophiebald/Desktop/test_documents/acta_previa.jpg')

Processing image with OCR...


In [36]:
result_acta_previa

'LEGAL, ADMINISTRATIVE'

In [37]:
result_correu = upload_and_process('/Users/sophiebald/Desktop/test_documents/correu_condis.png')

Processing image with OCR...


In [38]:
result_correu

'LEGAL, OTHER'

In [39]:
result_doctor_report = upload_and_process('/Users/sophiebald/Desktop/test_documents/doctor_report.pdf')

In [40]:
result_doctor_report

'MEDICAL, REPORTS'

In [41]:
result_doctor_file = upload_and_process('/Users/sophiebald/Desktop/test_documents/doctor_file.pdf')

In [42]:
result_doctor_file

'LEGAL, OTHER'

In [43]:
result_malpractice_claim = upload_and_process('/Users/sophiebald/Desktop/test_documents/claim_medical_malpractice.pdf')

In [44]:
result_malpractice_claim

'LEGAL, OTHER'

In [45]:
result_historial_medic = upload_and_process('/Users/sophiebald/Desktop/test_documents/HISTORIAL_MÈDIC.pdf')

In [46]:
result_historial_medic

'MEDICAL, MEDICAL HISTORY'

In [47]:
result_dci = upload_and_process('/Users/sophiebald/Desktop/test_documents/diagnostic_clinic_inicial.pdf')

The tag below is technically wrong: the sub-label INITIAL CLINICAL DIAGNOSIS exists and should have been chosen. The model seems to struggle when the document does not explicitly contain those words.

In [48]:
result_dci

'MEDICAL, REPORTS'

In [49]:
result_email_legal = upload_and_process('/Users/sophiebald/Desktop/test_documents/email_legal.pdf')

The tag below is wrong: it should be EMAIL. Open question to raise: could EMAIL be a sub-label of LEGAL rather than a top-level category?

In [50]:
result_email_legal

'LEGAL, OTHER'