### tiktoken

In [7]:
import tiktoken
from typing import List, Dict

def count_tiktoken_length(messages: List[Dict[str, str]], model_name: str = "gpt-3.5-turbo") -> int:
    """
    Count the tokens in every value of every message using tiktoken.

    Each value of each message dictionary (the "role" string as well as the
    "content" string) is encoded on its own and the lengths are summed.
    Nothing is added for chat formatting, so the result is the raw token
    count of the message texts only.

    Args:
        messages (List[Dict[str, str]]): List of messages, where each message is a
                                         dictionary with keys like "role" and "content".
                                         Values that are None are skipped and
                                         contribute 0 tokens.
        model_name (str): The name of the model whose tokenizer should be used.
                          Default is "gpt-3.5-turbo".

    Returns:
        int: Total number of tokens across all messages (0 for an empty list).

    Raises:
        RuntimeError: If the tokenizer cannot be loaded for ``model_name`` or a
                      value cannot be encoded. The original exception is kept
                      as ``__cause__``.
    """
    try:
        # Load the tokenizer for the specified model
        encoding = tiktoken.encoding_for_model(model_name)

        return sum(
            # disallowed_special=() makes text such as "<|endoftext|>" count as
            # ordinary text; the default would raise ValueError on it.
            len(encoding.encode(value, disallowed_special=()))
            for message in messages
            for value in message.values()
            if value is not None
        )
    except Exception as e:
        # Same exception type and message as before, now chained to the cause.
        raise RuntimeError(f"Error in calculating token length: {e}") from e

# Example usage: count the tokens of a short three-turn conversation
messages = [
    {"role": role, "content": content}
    for role, content in (
        ("system", "You are a helpful assistant."),
        ("user", "What is the weather like today?"),
        ("assistant", "The weather is sunny and warm."),
    )
]

token_count = count_tiktoken_length(messages)
print(f"Total token count: {token_count}")

Total token count: 23


### docling

In [7]:
import os
import sys

# Put the parent of the working directory on sys.path so modules that live
# one level above the notebook can be imported. The membership check keeps a
# re-run of this cell from appending the same entry again.
current_dir = os.getcwd()
parent_dir = os.path.join(current_dir, "..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from docling.document_converter import DocumentConverter

# Confirmation message; reached only when the imports above did not raise.
print("Import successfully!")

Import successfully!


In [8]:
# Extract a page range from the Fortran 77 tutorial PDF into tmp.pdf via the
# project's helper script. Arguments, judging by the recorded output: input PDF,
# output PDF, first page, last page (2 and 3 here).
# NOTE(review): whether the range is 1-based/inclusive is decided by the script - confirm there.
!python ../utils/extract_pdf_by_pages.py ../assets/Fortran77_tutorial.pdf tmp.pdf 2 3

Extracted pages 2 to 3 into 'tmp.pdf'


In [9]:
# Set up the PDF-to-markdown conversion: pick the input document and build a
# converter. The conversion itself runs in the next cell.
source = "tmp.pdf"  # document per local path or URL; here the page extract written by the cell above
converter = DocumentConverter()  # constructed with no arguments, i.e. the library defaults

In [10]:
%%time
# Run the conversion of `source`; the %%time cell magic (which must stay on the
# first line of the cell) reports the CPU and wall-clock time it took.
result = converter.convert(source)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

    There is an imbalance between your GPUs. You may want to exclude GPU 1 which
    has less than 75% of the memory or cores of GPU 0. You can do so by setting
    the device_ids argument to DataParallel, or by setting the CUDA_VISIBLE_DEVICES
    environment variable.


CPU times: user 6.03 s, sys: 262 ms, total: 6.29 s
Wall time: 4.65 s


In [11]:
# Render the converted document as a markdown string and show it as plain text.
markdown_content = result.document.export_to_markdown()
print(markdown_content)

## 1. What is Fortran?

Fortran is a general purpose programming language, mainly intended for mathematical computations in science applications (e.g. physics). Fortran is an acronym for FORmula TRANslation, and was originally capitalized as FORTRAN. However, following the current trend to only capitalize the first letter in acronyms, we will call it Fortran. Fortran was the first high-level programming language. The work on Fortran started in the 1950's at IBM and there have been many versions since. By convention, a Fortran version is denoted by the last two digits of the year the standard was proposed. Thus we have Fortran 66, Fortran 77 and Fortran 90 (95).

The most common Fortran version today is still Fortran 77, although Fortran 90 is growing in popularity. Fortran 95 is a revised version of Fortran 90 which is expected to be approved by ANSI soon (1996). There are also several versions of Fortran aimed at parallel computers. The most important one is High Performance Fortran (

In [12]:
# Save the markdown text to tmp.txt, encoded as UTF-8
with open("tmp.txt", mode="w", encoding="utf-8") as out_file:
    out_file.write(markdown_content)

### openai

In [1]:
import os
import sys

# Put the parent of the working directory on sys.path so the `utils` package
# imported below can be found. The membership check keeps a re-run of this
# cell from appending the same entry again.
current_dir = os.getcwd()
parent_dir = os.path.join(current_dir, "..")
if parent_dir not in sys.path:
    sys.path.append(parent_dir)

from utils.helper import set_openai_key, test_openai_api, create_openai_client

# Confirmation message; reached only when the imports above did not raise.
print("Import successfully!")

# Configure the OpenAI API key through the project helper.
# NOTE(review): where the key is read from is decided inside
# utils.helper.set_openai_key, which is not visible here.
set_openai_key()

Import successfully!
API key set successfully.


In [2]:
# Smoke-test the OpenAI API through the project helper before using a client directly.
# NOTE(review): presumably sends a small request and prints the reply (the
# recorded output is "This is a test.") - confirm in utils.helper.
test_openai_api()

This is a test.


In [3]:
# Create the OpenAI client (built by the project helper) that the request
# cell below calls.
client = create_openai_client()

In [4]:
# Send a single-turn chat request to gpt-4o and keep the full response object
response = client.chat.completions.create(
    model="gpt-4o",
    messages=[{"role": "user", "content": "Say this is a test"}],
)

# Bare last expression: display the response object's repr as the cell output
response

ChatCompletion(id='chatcmpl-Ace9u6rUbil7JuJUJv9q5laSHt6TK', choices=[Choice(finish_reason='stop', index=0, logprobs=None, message=ChatCompletionMessage(content='This is a test.', refusal=None, role='assistant', audio=None, function_call=None, tool_calls=None))], created=1733774198, model='gpt-4o-2024-08-06', object='chat.completion', service_tier=None, system_fingerprint='fp_c7ca0ebaca', usage=CompletionUsage(completion_tokens=5, prompt_tokens=12, total_tokens=17, completion_tokens_details=CompletionTokensDetails(accepted_prediction_tokens=0, audio_tokens=0, reasoning_tokens=0, rejected_prediction_tokens=0), prompt_tokens_details=PromptTokensDetails(audio_tokens=0, cached_tokens=0)))

In [5]:
# Print only the message text of the first choice in the response.
print(response.choices[0].message.content)

This is a test.
