In [None]:

import logging

from docling.datamodel import vlm_model_specs
from docling.datamodel.base_models import InputFormat
from docling.datamodel.pipeline_options import VlmPipelineOptions
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.models.utils.generation_utils import (
    DocTagsRepetitionStopper,
)
from docling.pipeline.vlm_pipeline import VlmPipeline

# Configure logging once so the message logged when repetition stopping is
# triggered is visible. A second basicConfig() call does nothing once the root
# logger already has a handler, so the duplicated call was removed.
logging.basicConfig(level=logging.INFO, format="%(levelname)s:%(name)s:%(message)s")

# Replace with a local path if preferred.
# source = "https://ibm.biz/docling-page-with-table" # Example that shows no repetitions.
# NOTE(review): absolute, machine-specific path - point this at your own document
# (ideally via a configurable relative path) before sharing the notebook.
source = r"D:\agents\assistant\uploads\20251126_134514_17249f00.pdf"  # Example that creates repetitions.
print(f"Processing document: {source}")

###### USING GRANITEDOCLING WITH CUSTOM REPETITION STOPPING

## Using standard Huggingface Transformers (most portable, slowest)
# model_copy() gives us our own options object to modify instead of the shared spec.
custom_vlm_options = vlm_model_specs.GRANITEDOCLING_TRANSFORMERS.model_copy()

# Uncomment this to use MLX-accelerated version on Apple Silicon
# custom_vlm_options = vlm_model_specs.GRANITEDOCLING_MLX.model_copy() # use this for Apple Silicon

# Attach the repetition stopping criteria to the VLM options;
# N=32 checks for repetitions for every 32 new tokens decoded.
custom_vlm_options.custom_stopping_criteria = [DocTagsRepetitionStopper(N=32)]

pipeline_options = VlmPipelineOptions(
    vlm_options=custom_vlm_options,
)

# Register the VLM pipeline for PDF *and* image inputs. Previously only
# InputFormat.IMAGE was registered while `source` is a PDF; the run log showed
# "detected formats: PDF" followed by "Initializing pipeline for
# StandardPdfPipeline", i.e. the VLM pipeline (and with it the repetition
# stopper) was never used for this document.
converter = DocumentConverter(
    format_options={
        input_format: PdfFormatOption(
            pipeline_cls=VlmPipeline,
            pipeline_options=pipeline_options,
        )
        for input_format in (InputFormat.PDF, InputFormat.IMAGE)
    }
)

doc = converter.convert(source=source).document

print(doc.export_to_markdown())

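## Using a remote VLM inference service (for example vLLM) - uncomment to use.
## Also needs this import (not present in the import block above) - verify against your docling version:
## from docling.datamodel.pipeline_options_vlm_model import ApiVlmOptions, ResponseFormat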

# custom_vlm_options = ApiVlmOptions(
#     url="http://localhost:8000/v1/chat/completions",  # LM studio defaults to port 1234, VLLM to 8000
#     params=dict(
#         model=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS.repo_id,
#         max_tokens=8192,
#         skip_special_tokens=True,  # needed for VLLM
#     ),
#     headers={
#         "Authorization": "Bearer YOUR_API_KEY",
#     },
#     prompt=vlm_model_specs.GRANITEDOCLING_TRANSFORMERS.prompt,
#     timeout=90,
#     scale=2.0,
#     temperature=0.0,
#     response_format=ResponseFormat.DOCTAGS,
#     custom_stopping_criteria=[
#         DocTagsRepetitionStopper(N=1)
#     ],  # check for repetitions for every new chunk of the response stream
# )


# pipeline_options = VlmPipelineOptions(
#     vlm_options=custom_vlm_options,
#     enable_remote_services=True, # required when using a remote inference service.
# )

# converter = DocumentConverter(
#     format_options={
#         InputFormat.PDF: PdfFormatOption(  # `source` is a PDF; use InputFormat.IMAGE for image inputs
#             pipeline_cls=VlmPipeline,
#             pipeline_options=pipeline_options,
#         ),
#     }
# )

# doc = converter.convert(source=source).document

# print(doc.export_to_markdown())

  from .autonotebook import tqdm as notebook_tqdm
  converter = DocumentConverter(
2025-11-26 15:08:04,874 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-26 15:08:05,007 - INFO - Going to convert document batch...
2025-11-26 15:08:05,010 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-26 15:08:05,037 - INFO - Loading plugin 'docling_defaults'
2025-11-26 15:08:05,042 - INFO - Registered picture descriptions: ['vlm', 'api']


Processing document: D:\agents\assistant\uploads\20251126_134514_17249f00.pdf


2025-11-26 15:08:05,068 - INFO - Loading plugin 'docling_defaults'
2025-11-26 15:08:05,086 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-26 15:08:05,819 - INFO - Accelerator device: 'cpu'
[32m[INFO] 2025-11-26 15:08:05,849 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-26 15:08:05,875 [RapidOCR] download_file.py:60: File exists and is valid: D:\agents\assistant\env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-26 15:08:05,877 [RapidOCR] main.py:53: Using D:\agents\assistant\env\Lib\site-packages\rapidocr\models\ch_PP-OCRv4_det_infer.onnx[0m
[32m[INFO] 2025-11-26 15:08:06,067 [RapidOCR] base.py:22: Using engine_name: onnxruntime[0m
[32m[INFO] 2025-11-26 15:08:06,071 [RapidOCR] download_file.py:60: File exists and is valid: D:\agents\assistant\env\Lib\site-packages\rapidocr\models\ch_ppocr_mobile_v2.0_cls_infer.onnx[0m
[32m[INFO] 2025-11-26 15:08:06,

## Le Nguyen Quoc Anh

Undergraduate, FPT University, Ho Chi Minh, Vietnam lenguyenquocanh2005@gmail.com - 0909427722 - www.linkedin.com/in/quocanh - Github: lyanh238

## CAREER OBJECTIVE

An AI major student passionate about Machine Learning and Data Science, aiming to explore how machines learn from language and data to create intelligent and insightful systems.

## EDUCATION

FPT University , Ho Chi Minh City, Vietnam

GPA:

8.1/10

The Degree of Bachelor in Artificial Intelligence

## TECHNICAL SKILLS

- Cloud Platforms: Microsoft Azure, Supabase, NeonDB
- Frameworks: Scikit-learn, Tensorflow, Pytorch, Flask, FastAPI
- Languages: Python, R, JavaScript
- Libraries: Numpy, Pandas, Scipy, OpenCV, Matplotlib, Librosa, Seaborn, Statsmodels, Beautiful Soup, Selenium
- IDE: VS code, Pycharm, Jupyter notebook
- Environments: Anaconda, Docker
- OS:

Window

- Database: MySQL, PostgreSQL

- Version Control:

GIT

- Tools: Dataiku, Exel, Power BI, n8n

## WORKING EXPERIENCE

## FPT Software



: 

In [None]:
from sqlalchemy import Column, Integer, String
from sqlalchemy.orm import declarative_base
# Shared declarative base; the mapped class below subclasses it.
Base = declarative_base()


class User(Base):
    """ORM model mapped to the ``users`` table.

    Columns: ``id`` (integer primary key), ``username`` (``String(50)``)
    and ``email`` (``String(100)``).
    """

    __tablename__ = "users"

    id = Column(Integer, primary_key=True)
    username = Column(String(50))
    email = Column(String(100))

    def __repr__(self):
        """Return a debug string showing the username and email."""
        username, email = self.username, self.email
        return f"<User(username='{username}', email='{email}')>"

  Base = declarative_base()
