# **OCR a PDF using Tesseract and PaddleOCR**

In [None]:
from pdf2image import convert_from_path
import pytesseract
import os
import requests
import zipfile
import io
import json
from paddleocr import PaddleOCR
from dotenv import load_dotenv
from utils import (ocr_layout,
                   extract_vn_letters,
                   extract_cn_letters,
                   extract_letters_index,)

## **Prepare Environment**

### Download Tessdata for **Tesseract**

In [None]:
def download_tessdata(lang):
    """Ensure ``<cwd>/tessdata/<lang>.traineddata`` exists, downloading it if needed.

    Args:
        lang: Tesseract language code used as the file stem
            (e.g. ``'vie'``).

    Returns:
        Path of the local ``tessdata`` directory.

    Raises:
        requests.HTTPError: If the server answers with an error status
            (for example an unknown language code).

    Note:
        An already existing file is trusted as-is; its content is not
        re-validated.
    """
    local_dir = os.path.join(os.getcwd(), 'tessdata')
    os.makedirs(local_dir, exist_ok=True)

    file_path = os.path.join(local_dir, f'{lang}.traineddata')

    # Download only when the file is not present yet.
    if not os.path.exists(file_path):
        # Use the "raw" endpoint: the "blob" URL serves GitHub's HTML viewer
        # page, not the binary model file.
        url = f"https://github.com/tesseract-ocr/tessdata/raw/main/{lang}.traineddata"
        response = requests.get(url, timeout=60)
        # Fail loudly instead of saving an error page as a model file.
        response.raise_for_status()
        with open(file_path, 'wb') as f:
            f.write(response.content)
        print(f"Download tessdata for {lang} successfully")
    else:
        print(f"Tessdata for {lang} existed")

    return local_dir

In [None]:
# Fetch the Vietnamese and Simplified Chinese models. Both calls return the
# same local tessdata directory, so reusing the variable is harmless.
tessdata_path = download_tessdata('vie')
# Tesseract's code for Simplified Chinese uses an underscore: chi_sim.
tessdata_path = download_tessdata('chi_sim')

# Passed to Tesseract so it looks for models in the tessdata folder created
# above (the option is spelled --tessdata-dir).
custom_config = f'--tessdata-dir "{tessdata_path}"'

### Download Poppler for **pdf2image**

In [None]:
def install_poppler():
    """Make the Poppler Windows binaries available under ``<cwd>/Poppler``.

    The release archive is downloaded and unpacked only when the expected
    ``bin`` directory is not already present, so repeated calls are cheap.

    Returns:
        Path to ``Poppler/poppler-25.11.0/Library/bin``. This assumes the
        archive unpacks into that layout; the path is not verified after
        extraction.

    Raises:
        requests.HTTPError: If the download answers with an error status.
    """
    local_dir = os.path.join(os.getcwd(), 'Poppler')
    bin_path = os.path.join(local_dir, 'poppler-25.11.0', 'Library', 'bin')

    # A previous run already unpacked the binaries: nothing to download.
    if os.path.isdir(bin_path):
        return bin_path

    url = "https://github.com/oschwartz10612/poppler-windows/releases/download/v25.11.0-0/Release-25.11.0-0.zip"

    response = requests.get(url, timeout=60)
    response.raise_for_status()

    os.makedirs(local_dir, exist_ok=True)

    # The archive is unpacked straight from memory; no temporary zip on disk.
    with zipfile.ZipFile(io.BytesIO(response.content)) as z:
        z.extractall(local_dir)

    return bin_path

In [None]:
# Directory containing the Poppler executables, as returned by install_poppler().
poppler_path = install_poppler()

## **Run OCR**

In [None]:
# Config path
load_dotenv()
poppler_path_windows = poppler_path
tesseract_path = os.getenv('TESSERACT_PATH')
# Override pytesseract's executable only when TESSERACT_PATH is actually set.
# Assigning None would break every later Tesseract invocation instead of
# letting pytesseract use its default command.
if tesseract_path:
    pytesseract.pytesseract.tesseract_cmd = tesseract_path

In [None]:
# Page window of interest. NOTE: it is currently not passed to
# convert_from_path, so every page of the PDF is rasterised.
start_page, end_page = 107, 127

render_options = {
    'poppler_path': poppler_path_windows,
    'dpi': 400,
    # 'first_page': start_page,
    # 'last_page': end_page,
}
pages = convert_from_path('data/raw/pdf1.pdf', **render_options)

In [None]:
# PaddleOCR engine for lang='ch', with the document-orientation,
# unwarping and text-line-orientation options all set to False.
disabled_options = dict.fromkeys(
    (
        'use_doc_orientation_classify',
        'use_doc_unwarping',
        'use_textline_orientation',
    ),
    False,
)
paddle_ocr = PaddleOCR(lang='ch', **disabled_options)

In [None]:
# Per-page OCR results, one list entry per page, keyed by language.
full_text = {'CN': [], 'VN': []}
cn_pages, vn_pages = full_text['CN'], full_text['VN']

for page_index, page_image in enumerate(pages):
    cn_text, vn_text = ocr_layout(page_image, paddle_ocr, page_index)
    cn_pages.append(cn_text)
    # vn_text holds both Vietnamese and pinyin; it is separated later.
    vn_pages.append(vn_text)

In [None]:
# Flatten the items extracted from every page's VN text into one list.
letters_vi = [
    letter
    for page_text in full_text['VN']
    for letter in extract_vn_letters(page_text)
]

In [None]:
# Flatten the items extracted from every page's CN text into one list.
letters_cn = [
    letter
    for page_text in full_text['CN']
    for letter in extract_cn_letters(page_text)
]

In [None]:
# Adjust the index range to extract as needed.
start_num, end_num = 1, 20

# Extract the entries for that index range.
extracted_letters = extract_letters_index(
    letters_vi,
    letters_cn,
    start_num,
    end_num,
)

In [None]:
def save_to_json(data, filename):
    """Write paired Vietnamese/Chinese entries to *filename* as a JSON array.

    Each output record has the keys ``source_id`` (from ``data['id']``),
    ``src_lang`` (from ``data['vi']``) and ``tgt_lang`` (from ``data['cn']``).

    Args:
        data: Mapping with the keys ``'id'``, ``'vi'`` and ``'cn'``, each an
            iterable. Items are paired by position; pairing stops at the
            shortest of the three.
        filename: Destination path. Missing parent directories are created.
    """
    output_data = [
        {
            "source_id": idx,
            "src_lang": vi_text,
            "tgt_lang": cn_text,
        }
        for idx, vi_text, cn_text in zip(data['id'], data['vi'], data['cn'])
    ]

    # Create the target folder on demand so a fresh checkout without the
    # output directory does not fail with FileNotFoundError.
    parent_dir = os.path.dirname(filename)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    with open(filename, 'w', encoding='utf-8') as f:
        # ensure_ascii=False keeps Vietnamese and Chinese text human-readable.
        json.dump(
            output_data,
            f,
            indent=4,
            ensure_ascii=False,
        )

    print("Save file json sucessfully")

In [None]:
# Write the extracted entries to the processed-data folder as JSON.
save_to_json(extracted_letters, "data/processed/output.json")