# Setup

Run the cells below to check that the environment is set up correctly: each one should finish without errors.

In [1]:
from pathlib import Path
import os
from dotenv import load_dotenv
from openai import OpenAI
from mistralai import Mistral

In [2]:
import sys
import torch
import pydantic

# Report the interpreter and key library versions, one per line, so the
# environment can be compared against a known-good setup.
print(
    f"{sys.version=}",
    f"{pydantic.__version__=}",
    f"{torch.__version__=}",
    sep="\n",
)

sys.version='3.12.10 (tags/v3.12.10:0cc8128, Apr  8 2025, 12:21:36) [MSC v.1943 64 bit (AMD64)]'
pydantic.__version__='2.12.5'
torch.__version__='2.10.0+cpu'


# Environment Variables

In [3]:
# Let python-dotenv populate the process environment before the keys are read.
load_dotenv()

# Index os.environ directly (rather than .get) so that a missing key raises
# KeyError here instead of surfacing later as an authentication failure.
OPENAI_API_KEY = os.environ["OPENAI_API_KEY"]
MISTRAL_API_KEY = os.environ["MISTRAL_API_KEY"]

# Docling

In [4]:
# Docling models are stored here: <home>/.cache/docling/models
path_artifacts = Path.home().joinpath(".cache", "docling", "models")

# Displayed as the cell result: True when the directory is already present.
path_artifacts.exists()

True

In [5]:
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import PdfPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption

# Remote sample PDF used as the conversion input for this check.
source = "https://hutchesonlab.fiu.edu/wp-content/uploads/sample-pdf.pdf"

# Hand the PDF pipeline the model directory defined earlier in the notebook
# (path_artifacts), passed as a forward-slash string.
# NOTE(review): presumably this makes Docling use the local models instead of
# fetching them - confirm against the Docling docs.
pipeline_options = PdfPipelineOptions(
    artifacts_path=path_artifacts.as_posix(),
)

# Apply the custom pipeline options to PDF inputs only.
converter = DocumentConverter(
    format_options={
        InputFormat.PDF: PdfFormatOption(pipeline_options=pipeline_options)
    },
)

doc = converter.convert(source).document

# Displayed as the cell result: the converted document rendered as Markdown.
doc.export_to_markdown()


  from .autonotebook import tqdm as notebook_tqdm


'## This is a Sample PDF file'

# OpenAI

In [6]:
# Check the OpenAI credentials by requesting a single embedding.
client_openai = OpenAI(api_key=OPENAI_API_KEY)

response = client_openai.embeddings.create(
    model="text-embedding-3-small",
    input="Your text string goes here",
)

# Print the length of the first returned embedding vector.
print(len(response.data[0].embedding))

1536


# MistralAI

In [7]:
# Check the Mistral credentials with a one-message chat completion.
client_mistral = Mistral(api_key=MISTRAL_API_KEY)

chat_response = client_mistral.chat.complete(
    model="mistral-medium-2505",
    messages=[{"role": "user", "content": "Tell me a joke about devops"}],
)

# Print only the text of the first returned choice.
print(chat_response.choices[0].message.content)

Sure! Here's a DevOps-themed joke for you:

**Why did the DevOps engineer break up with their significant other?**

Because they kept saying, *"It works on my machine!"*

*(Bonus groan-worthy follow-up: And then they tried to fix the relationship with a hotfix, but it just caused more downtime.)*

Hope that gives you a chuckle! 😄


In [8]:
import re

# Sample input for the pattern below.
text = "pag 2 | 15"

# "pag", whitespace, digits, then a literal "|" with optional whitespace on
# either side, then digits.
pattern = r"pag\s+\d+\s*\|\s*\d+"

# The pattern has no capture groups, so each hit is kept as its full match.
matches = [hit.group(0) for hit in re.finditer(pattern, text)]
matches

['pag 2 | 15']