Ref.:
https://pymupdf.readthedocs.io/en/latest/pymupdf4llm/index.html

In [62]:
# Using py3.11
#!pip install nltk
#!pip install PyMuPDF==1.25.5
#!pip install pymupdf4llm
#!pip install deep-translator

In [63]:
import os
import re
import nltk
import tqdm
import pymupdf4llm
from unicodedata import normalize

In [64]:
from nltk.tokenize import sent_tokenize
from deep_translator import GoogleTranslator

In [65]:
nltk.download('punkt')
nltk.download('punkt_tab')
nltk.download('stopwords') # For improved tokenization, optionally download stopwords
nltk.download('perluni') # To handle Unicode characters effectively
nltk.download('nonbreaking_prefixes')

[nltk_data] Downloading package punkt to /home/matheus/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Error loading perluni: Package 'perluni' not found in
[nltk_data]     index
[nltk_data] Downloading package nonbreaking_prefixes to
[nltk_data]     /home/matheus/nltk_data...
[nltk_data]   Package nonbreaking_prefixes is already up-to-date!


True

In [66]:
def pre_process_md(md_text_):
    md_text = normalize('NFKD', md_text_).encode('ascii','ignore').decode("utf-8")
    md_text = re.sub(r'\n+', '\n', md_text).strip()
    md_text = re.sub(r"-{3,}", "", md_text).strip()

    return md_text

def remove_references(md_text):
    return md_text.split("## **References**").pop(0)

def translate_text(text, src_lang, tgt_lang):
    all_translated = []

    for sentence in tqdm.tqdm(sent_tokenize(text)):
        translated = GoogleTranslator(
            source=src_lang,
            target=tgt_lang
        ).translate(sentence)
        
        all_translated.append(translated)
    
    return ' '.join(all_translated)

def post_process_md(md_text_):
    if md_text_ is None:
        return None
    
    md_text = re.sub(r"\*{1,}", "", md_text_).strip()
    md_text = re.sub(r"#{1,}", "", md_text).strip()
    md_text = re.sub(r"-\* \*", "", md_text).strip()
    md_text = re.sub(r"\* \*", " ", md_text).strip()
    md_text = re.sub(r"\n", " ", md_text).strip()

    return md_text

### Extraindo texto do PDF:

In [67]:
DIR = "./artigos/"
articles_md = []

for fname in os.listdir(DIR):
    full_fname = os.path.join(DIR, fname)
    md_text = pymupdf4llm.to_markdown(full_fname)
    pre_processed_md = pre_process_md(md_text)

    articles_md.append(pre_processed_md)


In [68]:
text = articles_md[0][:]
text = remove_references(text)
text = post_process_md(text)

### Traduzindo texto para Português:

In [69]:
translated = translate_text(
    text, "english", "portuguese"
)

100%|██████████| 227/227 [01:17<00:00,  2.91it/s]


In [70]:
with open("artigo.txt", "w") as arq:
    arq.write(translated)

In [71]:
#from IPython.display import display_markdown
#display_markdown(articles_md[0])