[![Open In Colab](https://colab.research.google.com/assets/colab-badge.svg)](https://colab.research.google.com/github/pdf-tools/components-code-sample-hub/blob/main/jupyter/pdftools_sdk/pdftools_sdk_chat_with_pdf.ipynb)

In [None]:
%pip install openai
%pip install pdftools_sdk
%pip install ipython

# Chat with a PDF
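Ask a question about a PDF and print the answer to the console.
The text is extracted from the PDF with the Pdftools SDK and passed, together with the question, to an OpenAI chat model.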

In [None]:
import io
from openai import OpenAI
from pdftools_sdk.pdf import Document
from pdftools_sdk.extraction import Extractor, TextOptions, TextExtractionFormat

In [None]:
# Download a file from a given URL and save it to the local system
def prepare_file(url: str, path: str, timeout: float = 30.0) -> None:
    """Download ``url`` and write the response body to ``path``.

    Args:
        url: Address of the file to fetch with an HTTP GET request.
        path: Local destination; the file is opened in ``'wb'`` mode, so
            existing content is replaced.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout, ``requests`` can block indefinitely on an
            unresponsive server.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not respond within ``timeout``.
    """
    import requests

    response = requests.get(url, timeout=timeout)
    # Fail loudly instead of saving an error page as if it were the file
    response.raise_for_status()

    with open(path, 'wb') as f:
        f.write(response.content)

In [None]:
# Sample PDF: the local file name and the remote location it is fetched from
input_path = 'PdfPrimerWhitepaper.pdf'
input_url = 'https://pdftools-public-downloads-production.s3.eu-west-1.amazonaws.com/samples/testfiles/PdfPrimerWhitepaper.pdf'

# Download the sample to the local path
prepare_file(input_url, input_path)

# Question to ask about the document
question = 'What is the title?'

In [None]:
def extract_text(input_file_path: str) -> str:
    """Return the text of the PDF stored at ``input_file_path``.

    The text is extracted with the ``DOCUMENT_ORDER`` extraction format into
    an in-memory buffer and decoded as UTF-8.
    """
    # Open the file and the PDF document together; both are closed on exit
    with open(input_file_path, 'rb') as pdf_stream, Document.open(pdf_stream) as pdf_doc:
        # Configure the extraction
        text_options = TextOptions()
        text_options.extraction_format = TextExtractionFormat.DOCUMENT_ORDER

        # Extract into memory and keep the raw bytes before the buffer closes
        with io.BytesIO() as text_buffer:
            Extractor().extract_text(pdf_doc, text_buffer, text_options)
            raw_text = text_buffer.getvalue()

    return raw_text.decode('utf-8')

In [None]:
def answer_question(text: str, question: str) -> str:
    """Ask an OpenAI chat model to answer ``question`` based on ``text``.

    Args:
        text: Source text the answer should be based on.
        question: Question to ask about ``text``.

    Returns:
        The model's reply with surrounding whitespace removed, or an empty
        string if the reply carries no text content.
    """
    # Replace the placeholder with your own OpenAI API key before running
    client = OpenAI(api_key="***insert-open-ai-api-key***")

    # The prompt tells the model to answer from the text only and to say
    # 'Not found' if the text does not contain the answer.
    prompt = (
        "You are a helpful assistant. Use the provided text to answer the "
        "question. If the answer is not in the text, say 'Not found'.\n\n"
        f"Text: {text}\nQuestion: {question}\nAnswer:"
    )

    response = client.chat.completions.create(
        model="gpt-4",
        messages=[
            {"role": "system", "content": "You answer questions based on text."},
            {"role": "user", "content": prompt}
        ],
    )

    # The message content is optional in the API response (it may be None),
    # so guard before calling strip() to avoid an AttributeError.
    content = response.choices[0].message.content
    return content.strip() if content else ""

In [None]:
try:
    # A test license key is active by default, which adds a watermark to the
    # output. If you have a license key, uncomment the following two lines
    # and set the license key.
    # from pdftools_sdk.sdk import Sdk
    # Sdk.initialize("INSERT-LICENSE-KEY")

    # Extract the PDF text, then ask the question about it
    extracted_text = extract_text(input_path)
    answer = answer_question(extracted_text, question)

    # Report the result
    print(f"Question: {question}")
    print(f"Answer: {answer}")

    print("Finished chatting with PDF.")
except Exception as err:
    # Top-level boundary of the notebook: report any failure
    print(f"An error occurred: {err}")