In [92]:
import os
import pymupdf4llm
import re
import json
import tqdm
import spacy

from langdetect import detect # Add to requirements.txt
#from itertools import pairwise
from unicodedata import normalize

from nltk.tokenize import sent_tokenize, word_tokenize
from deep_translator import GoogleTranslator

Language detection relies on the `langdetect` package: https://pypi.org/project/langdetect/

In [9]:
#!pip install langdetect

In [98]:
# Directory whose entries are listed and converted to Markdown by the batch cell.
INPUT_DIR = "../input_raw/"
# Directory that receives one JSON file per processed input document.
OUTPUT_DIR = "../input_parsed/"
# spaCy pipeline ("pt_core_news_sm") handed to the tokenization step, which
# reads each token's text, POS tag and lemma from it.
NLP_MODEL = spacy.load("pt_core_news_sm")

In [None]:
def detect_language(text):
    """Guess the language of ``text`` with ``langdetect``.

    Args:
        text: Text to classify.

    Returns:
        The language code returned by ``langdetect.detect`` (for example
        ``"pt"``), or ``None`` when ``text`` is empty/blank or langdetect
        cannot extract any features from it.
    """
    # Blank input makes langdetect raise; report "unknown" as None, the same
    # way extract_title / extract_abstract report a missing field.
    if not text or not text.strip():
        return None

    # Local import: keeps this function self-contained with respect to the
    # notebook's import cell.
    from langdetect.lang_detect_exception import LangDetectException

    try:
        return detect(text)
    except LangDetectException:
        # Raised e.g. for text made only of digits/punctuation ("no features").
        return None

def pre_process_md(md_text_):
    """Normalise raw Markdown before any field extraction.

    Steps, in order:
      1. NFKD-decompose the text and drop every non-ASCII code point
         (accented letters lose their diacritics).
      2. Collapse runs of newlines into a single newline.
      3. Delete runs of three or more hyphens.
    Leading/trailing whitespace is stripped after steps 2 and 3.

    Args:
        md_text_: Raw Markdown string.

    Returns:
        The cleaned, ASCII-only Markdown string.
    """
    decomposed = normalize('NFKD', md_text_)
    ascii_only = decomposed.encode('ascii', 'ignore').decode("utf-8")

    single_breaks = re.sub(r'\n+', '\n', ascii_only).strip()
    return re.sub(r"-{3,}", "", single_breaks).strip()

def translate_text(text, src_lang, tgt_lang):
    """Translate ``text`` sentence by sentence with ``GoogleTranslator``.

    Args:
        text: Text to translate; it is split with NLTK's ``sent_tokenize``.
        src_lang: Passed to ``GoogleTranslator`` as ``source``.
        tgt_lang: Passed to ``GoogleTranslator`` as ``target``.

    Returns:
        The translated sentences joined with single spaces.
    """
    # Source/target never change inside the loop, so build the translator once.
    translator = GoogleTranslator(source=src_lang, target=tgt_lang)

    all_translated = []
    for sentence in tqdm.tqdm(sent_tokenize(text)):
        translated = translator.translate(sentence)
        # NOTE(review): translate() can come back with None (seen for
        # digit/punctuation-only input); keep the source sentence in that case
        # so the join below cannot fail and no content is dropped.
        all_translated.append(sentence if translated is None else translated)

    return ' '.join(all_translated)

def post_process_md(md_text_):
    """Strip Markdown markup from a fragment and flatten it to one line.

    Args:
        md_text_: Markdown fragment, or ``None``.

    Returns:
        ``None`` when the input is ``None``; otherwise the fragment with
        emphasis asterisks and ``#`` characters removed, newlines replaced by
        spaces, and surrounding whitespace stripped.
    """
    if md_text_ is None:
        return None

    # These two patterns contain literal asterisks, so they have to run BEFORE
    # the blanket asterisk removal -- afterwards they could never match.
    # "-* *": a word hyphenated across two emphasis spans -> glue it together.
    md_text = re.sub(r"-\* \*", "", md_text_)
    # "* *": boundary between two adjacent emphasis spans -> a single space.
    md_text = re.sub(r"\* \*", " ", md_text)

    md_text = re.sub(r"\*{1,}", "", md_text)
    md_text = re.sub(r"#{1,}", "", md_text)
    md_text = re.sub(r"\n", " ", md_text)

    return md_text.strip()


In [50]:
def extract_title(preprocessed_md_text):
    """Pull the document title out of the pre-processed Markdown.

    The title is expected as a ``# **...**`` line followed by a newline.
    The search is not multiline, so ``^`` only matches at the very start of
    the text.

    Returns:
        The captured title run through ``post_process_md``, or ``None`` when
        the text does not start with such a line.
    """
    found = re.search(r"^#\s\*\*(.+)\*\*\n", preprocessed_md_text)
    return post_process_md(found.group(1)) if found else None

def extract_authors(preprocessed_md_text):
    """Return the author line of the document as a list.

    The author line is taken to be the first line of the form ``### <text>``.

    Args:
        preprocessed_md_text: Pre-processed Markdown of the whole document.

    Returns:
        A one-element list holding the text after ``### ``, or an empty list
        when the document contains no such line.
    """
    pat = r"^###\s(.+)$"
    match = re.search(pat, preprocessed_md_text, re.MULTILINE)
    # No "###" line at all: report "no authors" instead of failing on None.
    if match is None:
        return []

    return [match.group(1)]

def extract_abstract(preprocessed_md_text):
    """Pull the abstract out of the pre-processed Markdown.

    Looks for a line that begins with ``### *Abstract. `` and captures
    everything up to the last ``*`` on that same line.

    Returns:
        The captured text run through ``post_process_md``, or ``None`` when no
        such line exists.
    """
    found = re.search(r"^###\s\*Abstract\.\s(.+)\*", preprocessed_md_text, re.M)
    if not found:
        return None

    return post_process_md(found.group(1))

def extract_sections(preprocessed_md_text):
    """Split the document into sections keyed by heading text.

    A heading is a full line of the form ``## **<title>**``. The body of a
    section runs from the end of its heading to the start of the next heading
    (or to the end of the text for the last one) and is cleaned with
    ``post_process_md``.

    Returns:
        Dict mapping heading title -> cleaned body, in document order. When a
        title occurs more than once, the later body replaces the earlier one.
    """
    heading_re = re.compile(r"^##\s\*\*(.+)\*\*$", re.MULTILINE)
    headings = list(heading_re.finditer(preprocessed_md_text))

    # Where each body stops: the next heading's start, or the end of the text.
    stops = [h.start() for h in headings[1:]] + [len(preprocessed_md_text)]

    sections = {}
    for heading, stop in zip(headings, stops):
        body = preprocessed_md_text[heading.end():stop]
        sections[heading.group(1)] = post_process_md(body)

    return sections
    

In [103]:
def split_out_references(sections_dict):
    """Separate reference sections from the rest of the document.

    A section counts as a reference section when its title is exactly
    ``"References"`` or ``"Referencias"``. The input dict is not modified.

    Returns:
        ``(other_sections, reference_sections)`` -- two new dicts, each in the
        original section order.
    """
    reference_titles = ["References", "Referencias"]

    other_sections = {}
    reference_sections = {}
    for title, content in sections_dict.items():
        bucket = reference_sections if title in reference_titles else other_sections
        bucket[title] = content

    return other_sections, reference_sections

def concat_sections(sections_dict):
    """Join all sections into one string.

    Each section contributes its title, a newline, and its body; sections are
    separated by a newline and kept in dict order.
    """
    pieces = []
    for title, body in sections_dict.items():
        pieces.append(f"{title}\n{body}")

    return "\n".join(pieces)

def tokenize_text(postprocessed_text, nlp_model):
    """Run ``nlp_model`` over the text and describe every token.

    Args:
        postprocessed_text: Text to analyse.
        nlp_model: Callable that takes the text and returns an iterable of
            tokens exposing ``text``, ``pos_`` and ``lemma_``.

    Returns:
        A list with one dict per token, with keys ``"token"``, ``"pos"`` and
        ``"lemma"``.
    """
    doc = nlp_model(postprocessed_text)
    return [
        {"token": tok.text, "pos": tok.pos_, "lemma": tok.lemma_}
        for tok in doc
    ]

def post_process_references(referencs_from_sections):
    """Placeholder for reference clean-up; not implemented yet.

    TODO: implement. For now the argument is ignored and ``None`` is returned.
    """
    return None


In [90]:
def adjust_to_template(preprocessed_md_text, nlp_model):
    """Build the output record for one document.

    Args:
        preprocessed_md_text: Pre-processed Markdown of the whole document.
        nlp_model: Model forwarded to ``tokenize_text``.

    Returns:
        Dict with the keys ``titulo``, ``informacoes_url``, ``idioma``,
        ``storage_key``, ``autores``, ``data_publicacao``, ``resumo``,
        ``keywords``, ``referencias``, ``artigo_completo``,
        ``artigo_tokenizado``, ``pos_tagger`` and ``lema`` (in that order).
        Fields this function has no source for are set to ``None``.
    """
    template = {
        "titulo": extract_title(preprocessed_md_text),
        "informacoes_url": None,
        "idioma": detect_language(preprocessed_md_text),
        "storage_key": None,
        "autores": extract_authors(preprocessed_md_text),
        "data_publicacao": None,
        "resumo": extract_abstract(preprocessed_md_text),
        "keywords": None,
    }

    all_sections = extract_sections(preprocessed_md_text)
    body_sections, reference_sections = split_out_references(all_sections)

    # When several reference sections exist, the last one is kept.
    if reference_sections:
        template["referencias"] = list(reference_sections.values())[-1]
    else:
        template["referencias"] = None

    full_text = concat_sections(body_sections)
    template["artigo_completo"] = full_text

    # Three parallel lists, one entry per token of the full text.
    tokens = tokenize_text(full_text, nlp_model)
    for out_key, tok_key in (
        ("artigo_tokenizado", "token"),
        ("pos_tagger", "pos"),
        ("lema", "lemma"),
    ):
        template[out_key] = [tok.get(tok_key) for tok in tokens]

    return template

In [93]:
def write_template_to_file(template: dict, output_dir: str, fname: str):
    """Serialise ``template`` as JSON into ``output_dir/fname``.

    Args:
        template: JSON-serialisable record to write.
        output_dir: Destination directory; created (with parents) if missing.
        fname: Name of the file to create inside ``output_dir``.
    """
    # A fresh checkout may not have the output directory yet; open() would
    # otherwise fail. The empty string means "current directory".
    if output_dir:
        os.makedirs(output_dir, exist_ok=True)

    full_name = os.path.join(output_dir, fname)
    with open(full_name, "w", encoding="utf-8") as out:
        json.dump(template, out)


In [None]:
preprocessed_md_texts = []

# Only PDFs can be converted; anything else in INPUT_DIR (hidden files, notes,
# ...) is skipped. Sorting makes the processing order reproducible.
pdf_fnames = sorted(
    fname for fname in os.listdir(INPUT_DIR)
    if fname.lower().endswith(".pdf")
)

for fname in tqdm.tqdm(pdf_fnames):
    full_input_fname = os.path.join(INPUT_DIR, fname)
    raw_md_text = pymupdf4llm.to_markdown(full_input_fname)
    preprocessed_md_text = pre_process_md(raw_md_text)
    template = adjust_to_template(preprocessed_md_text, NLP_MODEL)

    # Swap only the extension: str.replace(".pdf", ...) would also rewrite a
    # ".pdf" inside the stem and would leave ".PDF" names untouched.
    output_fname = os.path.splitext(fname)[0] + ".json"
    write_template_to_file(template, OUTPUT_DIR, output_fname)
