In [130]:
from pypdf import PdfReader
from functools import reduce


def get_pages_in_pdf(path_to_pdf: str) -> int:
    """Return the number of pages in a pdf file."""
    reader = PdfReader(path_to_pdf)
    num_of_pages = len(reader.pages)
    return num_of_pages


def read_text_from_pdf(path_to_pdf: str) -> str:
    """Return the text content of a pdf file."""
    reader = PdfReader(path_to_pdf)
    pages_text = [page.extract_text() for page in reader.pages]

    pdf_text = reduce(lambda x, y: x + y, pages_text)
    return pdf_text

In [143]:
import re
from logs import log_unexpected_num_of_matches


def drop_lines_with_pattern(input_string, pattern, expected_num_of_matches):
    """
    Drop the lines that contain a match to some pattern.
    If the number of lines dropped is not the expected, log it.
    """
    # filter lines that don't contain a match to the pattern
    lines = input_string.split("\n")
    filtered_lines = [line for line in lines if not re.search(pattern, line)]

    # check if the number of matches is not the expected
    num_of_matches = len(lines) - len(filtered_lines)
    if num_of_matches != expected_num_of_matches:
        log_unexpected_num_of_matches(pattern, num_of_matches, expected_num_of_matches)

    # join filtered lines to a single string
    result_string = "\n".join(filtered_lines)

    return result_string


def drop_pattern(input_string, pattern, expected_num_of_matches):
    """
    Substitute the occurrences of the pattern with an empty string.
    If the number of occurrences is not the expected, log it.
    """
    # substitute matches with an empty string
    result_string = re.sub(pattern, "", input_string)

    # check num of matches is the expected number
    num_of_matches = len(re.findall(pattern, input_string))
    if num_of_matches != expected_num_of_matches:
        log_unexpected_num_of_matches(pattern, num_of_matches, expected_num_of_matches)

    return result_string

In [142]:
def drop_headers_and_footnotes(pdf_text: str, num_of_pages: int) -> str:
    """
    Given the text content of a pdf of the BORME registry,
    use regex patterns to find the headers and footnotes inside of the pdf text,
    drop them.
    """
    # Define pattern, expected_num_of_matches for the filler texts that we want to remove
    final_footnote = (
        "https://www.boe.es BOLETÍN OFICIAL DEL REGISTRO MERCANTIL D.L.",
        1,
    )
    section_header = ("SECCIÓN PRIMERA\n Empresarios\n Actos inscritos\n", 1)
    header = ("BOLETÍN OFICIAL DEL REGISTRO MERCANTIL", num_of_pages)
    subheader = (r"Núm. \d+ [A-Za-z]+ \d+ de [A-Za-z]+ de \d+ Pág. \d+", num_of_pages)
    footnote_1 = (r"cve: BORME-[A-Za-z]-\d+-\d+-\d+", num_of_pages)
    footnote_2 = ("Verificable en https://www.boe.es", num_of_pages)

    # Remove lines that contain the specified patterns
    result_text = pdf_text
    for pattern, expected_num_of_matches in [
        final_footnote,
        header,
        subheader,
        footnote_1,
        footnote_2,
    ]:
        result_text = drop_lines_with_pattern(
            result_text, pattern, expected_num_of_matches
        )

    # Remove occurrences of the specified pattern
    result_text = drop_pattern(result_text, *section_header)

    return result_text.strip()

In [207]:
def split_text_by_acts(acts_text: str) -> list[str]:
    """
    Given a text from a pdf of the BORME registry,
    containing only the act information with no headers footnotes or region name,
    split text in a list, where each element of the list corresponds to a different act.
    """
    # Pattern: digit at the start of line followed by ' - ' and a word in all uppercase
    pattern = r"(\n\d+ - [A-Z]+)"
    # Split string by pattern, keep the separators.
    # The separators are kept because the pattern is wrapped in parenthesis.
    splitted_text = re.split(pattern, acts_text)
    # remove empty strings from splitted text
    splitted_text = [e for e in splitted_text if e != ""]

    # The splitted text has the form [pattern match, (...), pattern match, (...), ...]
    # we want to add the odd and even elements
    even_els = splitted_text[0::2]
    odd_els = splitted_text[1::2]
    assert len(even_els) == len(odd_els), "num of even and odd elements do not match"
    acts = [even_els[i] + odd_els[i] for i in range(len(even_els))]

    return acts

In [226]:
def parse_act(act: str, region_name: str) -> dict:
    """
    Parse the string containing the information of a given act,
    return a dictionary with the structured information
    """
    lines = act.strip().split("\n")
    # id, company name are in the 1st line, separated by an '-' character
    id, company_name = lines[0].split("-", 1)
    clean_company_name = company_name.strip().replace(".", "")

    # store the rest of the text in the variable description
    # For more information about this design choice, consult the README
    description = "\n".join(lines[1:])

    return {
        "id": id,
        "company_name": clean_company_name,
        "region_name": region_name,
        "description": description,
    }

In [227]:
parse_act(acts[0], region_name)

{'id': '505995 ',
 'company_name': 'PRODUCTOS JAFEP SOCIEDAD LIMITADA',
 'region_name': 'ALBACETE',
 'description': 'Nombramientos.  Auditor: EUDITA CUSPIDE AUDITORES SOCIEDAD LIMITADA.  Datos registrales. T 1027, L 791, F 55, S 8, H\nAB 2767, I/A 24 (20.11.23).'}

In [230]:
path = "../../data/output/2023-11-28/BORME-A-2023-226-02.pdf"
pdf_text = read_text_from_pdf(path)
num_of_pages = get_pages_in_pdf(path)

cleaned_pdf_text = drop_headers_and_footnotes(pdf_text, num_of_pages)

# The first line of the cleaned pdf text is the region name,
# the rest is the text containing the act information
region_name = cleaned_pdf_text.split("\n")[0]
acts_text = cleaned_pdf_text.replace(region_name, "", 1)

acts = split_text_by_acts(acts_text)
cleaned_acts = [parse_act(act, region_name) for act in acts]

In [231]:
cleaned_acts

[{'id': '505995 ',
  'company_name': 'PRODUCTOS JAFEP SOCIEDAD LIMITADA',
  'region_name': 'ALBACETE',
  'description': 'Nombramientos.  Auditor: EUDITA CUSPIDE AUDITORES SOCIEDAD LIMITADA.  Datos registrales. T 1027, L 791, F 55, S 8, H\nAB 2767, I/A 24 (20.11.23).'},
 {'id': '505996 ',
  'company_name': 'CONSTRUCCIONES LORENZO LINUESA SOCIEDAD LIMITADA',
  'region_name': 'ALBACETE',
  'description': 'Constitución.  Comienzo de operaciones: 7.11.23. Objeto social: - Las obras de construcción y albañilería en general. - La\ncompraventa de terrenos y urbanización de los mismos. - La promoción y compraventa de locales comerciales, viviendas y\napartamentos. - La explotación en régimen de arrendamiento y la adquisición de bienes inmuebles, así como otras actividade.\nDomicilio: C/ JUAN II, 3 02300 (ALCARAZ). Capital: 5.000,00 Euros.  Declaración de unipersonalidad. Socio único: LORENZO\nLINUESA JUAN. Nombramientos.  Adm. Unico: LORENZO LINUESA JUAN.  Datos registrales. T 1071, L 835, F 47

In [74]:
x = [0, 1, 2, 3, 4, 5]
even_els = x[0::2]
odd_els = x[1::2]
[x + y for x in even_els for y in odd_els]

[1, 3, 5, 3, 5, 7, 5, 7, 9]