In [None]:
import os
import pymupdf4llm
import re
import tqdm

from langdetect import detect # Add to requirements.txt
from itertools import pairwise
from unicodedata import normalize

from nltk.tokenize import sent_tokenize
from deep_translator import GoogleTranslator

https://pypi.org/project/langdetect/

In [96]:
#!pip install langdetect

In [3]:
INPUT_DIR = "./input/"

In [18]:
def pre_process_md(md_text_):
    md_text = normalize('NFKD', md_text_).encode('ascii','ignore').decode("utf-8")
    md_text = re.sub(r'\n+', '\n', md_text).strip()
    md_text = re.sub(r"-{3,}", "", md_text).strip()

    return md_text

def translate_text(text, src_lang, tgt_lang):
        all_translated = []

        for sentence in tqdm.tqdm(sent_tokenize(text)):
            translated = GoogleTranslator(
                source=src_lang,
                target=tgt_lang
            ).translate(sentence)
            
            all_translated.append(translated)
        
        return ' '.join(all_translated)

def post_process_md(md_text_):
    if md_text_ is None:
        return None
    
    md_text = re.sub(r"\*{1,}", "", md_text_).strip()
    md_text = re.sub(r"#{1,}", "", md_text).strip()
    md_text = re.sub(r"-\* \*", "", md_text).strip()
    md_text = re.sub(r"\* \*", " ", md_text).strip()
    md_text = re.sub(r"\n", " ", md_text).strip()

    return md_text


In [8]:
preprocessed_md_texts = []

for fname in tqdm.tqdm(os.listdir(INPUT_DIR)):
    full_fname = os.path.join(INPUT_DIR, fname)
    raw_md_text = pymupdf4llm.to_markdown(full_fname)
    preprocessed_md_text = pre_process_md(raw_md_text)
    preprocessed_md_texts.append(preprocessed_md_text)


In [None]:
def detect_language(text):
    return detect(text)

In [None]:
def extract_title(preprocessed_md_text):
    pat = r"^#\s\*\*(.+)\*\*\n"
    match = re.search(pat, preprocessed_md_text)
    if match:
        return post_process_md(match.group(1))
    
    return None

def extract_authors(preprocessed_md_text):
    pat = r"^###\s(.+)$"
    authors = [
        re.search(pat, preprocessed_md_text, re.MULTILINE).group(1)
    ]
    return authors

def extract_abstract(preprocessed_md_text):
    pat = r"^###\s\*Abstract\.\s(.+)\*"
    match = re.search(pat, preprocessed_md_text, re.M)
    if match:
        return post_process_md(match.group(1))
    
    return None

def extract_sections(preprocessed_md_text):
    pat = r"^##\s\*\*(.+)\*\*$"
    section_titles = []
    section_contents = []
    line_starts = []

    for match in re.finditer(pat, preprocessed_md_text, re.MULTILINE):
        section_titles.append(match.group(1))
        line_starts.append(match.start())

    line_starts = line_starts + [len(preprocessed_md_text)]

    for start, end in pairwise(line_starts):
        section_contents.append(preprocessed_md_text[start:end-1])

    sections = {
        title: content\
            for title, content in zip(section_titles, section_contents) 
    }
    return sections


In [105]:
def split_references(md_text, section_name): # Check for Referencias or References
    split = md_text.split(f"## **{section_name}**", 2)
    if len(split) == 2:
        return post_process_md(split[0]), post_process_md(split[1])
    
    return post_process_md(split[0]), ""

def retrieve_references_from_sections(sections_dict):
    SECTION_NAMES = ["References", "Referencias"]
    candidate_keys = [key for key in sections_dict.keys() if key in SECTION_NAMES]
    if not candidate_keys:
        return None
    
    return sections_dict.get(candidate_keys[0])

def post_process_references(referencs_from_sections):
    pass


In [103]:
def adjust_to_template(preprocessed_md_text):
    template = {}
    template["titulo"] = extract_title(preprocessed_md_text)
    template["informacoes_url"] = None
    template["idioma"] = detect_language(preprocessed_md_text)
    template["storage_key"] = None
    template["autores"] = extract_authors(preprocessed_md_text)
    template["data_publicacao"] = None
    template["resumo"] = extract_abstract(preprocessed_md_text)
    template["keywords"] = None

    sections_dict = extract_sections(preprocessed_md_text)
    template["referencias"] = retrieve_references_from_sections(sections_dict)
    template["artigo_completo"] = None
    template["artigo_tokenizado"] = None
    template["pos_tagger"] = None
    template["lema"] = None

    return template

In [104]:
adjust_to_template(preprocessed_md_text)

{'titulo': 'Impact of Shot Noise Estimation on the Secret Key Rate of a CV-QKD System',
 'informacoes_url': None,
 'idioma': 'en',
 'storage_key': None,
 'autores': ['Daniel Pereira [1] [,] [2], Nuno A. Silva [1], Armando N. Pinto [1] [,] [2] 1 Instituto de Telecomunicac  oes, University of Aveiro, Campus Universit ario de Santiago, 3810-193, Aveiro, Portugal 2 Department of Electronics, Telecommunications and Informatics, University of Aveiro, Campus Universit ario de Santiago, 3810-193, Aveiro, Portugal'],
 'data_publicacao': None,
 'resumo': 'In this work we present the impact of the uncertainty of the shot noise estimate on the performance of a continuous variables quantum key distribution system using a probabilistically shaped 128-APSK constellation. We demons- trate that the performance of the system is greatly degraded by the uncertainty of the shot noise estimate, with a total loss of security being possible.',
 'keywords': None,
 'referencias': '## **Referencias**\n### Almeid

In [100]:
retrieve_references_from_sections(extract_sections(preprocessed_md_text))

'## **Referencias**\n### Almeida, M. (2021). Practical security limits of continuous-variable quantum key distri- bution. Masters thesis, University of Aveiro. Almeida, M., Pereira, D., Muga, N., Fac ao, M., Pinto, A. N., and Silva, N. A. (2021). Secret key rate of multi-ring m-apsk continuous variable quantum key distribution. Optics Express . Bennett, C. H. and Brassard, G. (1984). Quantum cryptography: Public key distribution and con tos5. In Proceedings of the International Conference on Computers, Systems and Signal Processing . Denys, A., Brown, P., and Leverrier, A. (2021). Explicit asymptotic secret key rate of continuous-variable quantum key distribution with an arbitrary modulation. Quantum, 5:540. Faruk, M. S. and Savory, S. J. (2017). Digital signal processing for coherent transceivers employing multilevel formats. Journal of Lightwave Technology, 35(5):11251141. Grosshans, F. and Grangier, P. (2002). Continuous variable quantum cryptography using coherent states. Physical 

In [10]:
print(preprocessed_md_text)

# **Impact of Shot Noise Estimation on the Secret Key Rate of a** **CV-QKD System**
### Daniel Pereira [1] [,] [2], Nuno A. Silva [1], Armando N. Pinto [1] [,] [2] 1 Instituto de Telecomunicac  oes, University of Aveiro, Campus Universit ario de Santiago, 3810-193, Aveiro, Portugal 2 Department of Electronics, Telecommunications and Informatics, University of Aveiro, Campus Universit ario de Santiago, 3810-193, Aveiro, Portugal
danielfpereira@ua.pt, nasilva@ua.pt, anp@ua.pt
### *Abstract. In this work we present the impact of the uncertainty of the shot noise* *estimate on the performance of a continuous variables quantum key distribution* *system using a probabilistically shaped 128-APSK constellation. We demons-* *trate that the performance of the system is greatly degraded by the uncertainty* *of the shot noise estimate, with a total loss of security being possible.*
## **1. Introduction**
### The near-future emergence of a practical quantum computer is a threat to classical cryptog