# Instalar dependências necessárias

In [None]:
import pdfplumber
import pandas as pd
import os
import pdfplumber

# Root folder that is searched (recursively) for PDF files.
pdf_folder = '../data'

# os.walk yields entries in a filesystem-dependent order; sorting makes the
# order of raw_texts (and therefore the DataFrame row order) reproducible.
pdf_files = sorted(
  os.path.join(root, file)
  for root, _dirs, files in os.walk(pdf_folder)
  for file in files
  if file.endswith('.pdf')
)

# One entry per PDF: the text of all its pages, each page followed by '\n'.
raw_texts = []

for pdf_file in pdf_files:
  with pdfplumber.open(pdf_file) as pdf:
    # extract_text() can return None for a page without a text layer (older
    # pdfplumber releases); fall back to '' so the literal string "None"
    # never ends up in the extracted text.
    page_texts = [page.extract_text() or '' for page in pdf.pages]

  raw_texts.append(''.join(f"{text}\n" for text in page_texts))

## Extrair o texto não estruturado do PDF

In [None]:
# One row per PDF; the single 'raw' column holds the unprocessed text.
dataframe = pd.DataFrame(data={'raw': raw_texts})

# DataFrame.count() returns the number of non-null cells per column (a Series),
# not the number of rows, so len() is used for the value labelled "Row count".
row_count = len(dataframe)
columns = dataframe.columns
print(f"Row count: {row_count}")
print(f"Columns: {columns}")

## Functions

In [None]:
def extract_text_based_on_appearance_in_text(text_to_extract: str, text: str, type: str = 'after') -> 'str | None':
  """
  Extract the text found on the same line as a marker, before or after it.

  Parameters:
    text_to_extract (str): Regular-expression pattern of the marker to look
      for. It is interpreted as a pattern, not as a literal, so callers must
      escape metacharacters themselves (e.g. a literal question mark).
    text (str): The input string to search in.
    type (str): The type of extraction. It can be either "before" or "after".
      "after" returns what follows the first occurrence of the marker up to
      the end of that line. "before" returns what precedes the marker on the
      first line containing it (greedy: up to the last occurrence of the
      marker on that line).

  Returns:
    str | None: The extracted text with surrounding whitespace stripped, or
    None when the marker does not occur in the text.

  Raises:
    ValueError: If the type is neither "before" nor "after".
  """

  import re

  if type not in ('before', 'after'):
    raise ValueError('type must be either "before" or "after"')

  if type == 'before':
    # '.' does not match newlines, so the result never spans several lines.
    match = re.search(rf'.*(?={text_to_extract})', text)
    return match.group().strip() if match else None

  # Search for the marker itself and slice after it instead of using a
  # look-behind: look-behinds only accept fixed-width patterns, so a marker
  # such as r'Classe:\s*' would raise re.error.
  marker = re.search(text_to_extract, text)
  if marker is None:
    return None

  # Keep only the remainder of the line on which the marker ends.
  remainder = text[marker.end():]
  return remainder.split('\n', 1)[0].strip()


## Extract all essential data from raw data

In [None]:
import re
import re

def _remove_labeled_line(text, label_pattern, value):
  """
  Remove the first "<label> <value>" line (including its newline) from text.

  label_pattern is inserted into the regular expression as-is (callers pass
  either plain labels or already-escaped patterns). value is extracted from
  the document, so it is escaped to be matched literally. When value is None
  (the field was not found) the text is returned unchanged.
  """
  if value is None:
    return text
  return re.sub(rf'\b{label_pattern} {re.escape(value)}\n\b', '', text, count=1)


# Work on the raw text of the first extracted PDF.
example = dataframe['raw'][0]

# Remove redundant texts that are not useful for the analysis
example = re.sub(r'\bTribunal Regional Eleitoral do Rio Grande do Norte\n\b', '', example, count=1)
example = re.sub(r'\bPJe - Processo Judicial Eletrônico\n\b', '', example, count=1)
example = re.sub(r'\bN úmero: \b', '', example, count=1)

# Extracting the date of process
date_pattern = r'\d{2}/\d{2}/\d{4}'
date_match = re.search(date_pattern, example)
date = date_match.group() if date_match else None
print(date)

# The first date is replaced by a line break rather than dropped.
example = re.sub(date_pattern, '\n', example, count=1)

# Extracting the legal action number
# The dots are escaped so that only a literal '.' is accepted as separator.
legal_action_number_pattern = r'\b\d{7}-\d{2}\.\d{4}\.\d\.\d{2}\.\d{4}\b'
legal_action_number_match = re.search(legal_action_number_pattern, example)
legal_action_number = legal_action_number_match.group() if legal_action_number_match else None
example = re.sub(legal_action_number_pattern, '\n', example, count=1)
print(legal_action_number)


# Extracting the data of raw text: each field is read from the line that
# starts with its label, then that line is removed from the working text.
search = 'Classe:'
legal_class = extract_text_based_on_appearance_in_text(search, example)
print(legal_class)

example = _remove_labeled_line(example, search, legal_class)

search = 'Órgão julgador:'
tribunal = extract_text_based_on_appearance_in_text(search, example)
print(tribunal)

example = _remove_labeled_line(example, search, tribunal)

search = 'Última distribuição :'
last_distribution = extract_text_based_on_appearance_in_text(search, example)
print(last_distribution)

example = _remove_labeled_line(example, search, last_distribution)

search = 'Processo referência:'
reference_legal_action = extract_text_based_on_appearance_in_text(search, example)
print(reference_legal_action)

example = _remove_labeled_line(example, search, reference_legal_action)

search = 'Assuntos:'
matters = extract_text_based_on_appearance_in_text(search, example)
print(matters)

example = _remove_labeled_line(example, search, matters)

search = 'Cargo -'
position = extract_text_based_on_appearance_in_text(search, example)
print(position)

example = _remove_labeled_line(example, search, position)

# The goal of the legal action may span several lines: it is everything
# between its label and the next label ("Segredo de Justiça?"). Both labels
# are escaped so that the '?' is matched literally instead of acting as a
# regex quantifier.
legal_action_goal_string = 'Objeto do processo: '
judicial_secrecy = 'Segredo de Justiça?'
extracted_text = re.search(
  f'{re.escape(legal_action_goal_string)}(.*?){re.escape(judicial_secrecy)}',
  example,
  re.DOTALL,
)
legal_action_goal = extracted_text.group(1).strip() if extracted_text else None
print(legal_action_goal)

if legal_action_goal is not None:
  example = re.sub(
    rf'\b{re.escape(legal_action_goal_string)}{re.escape(legal_action_goal)}\n\b',
    '',
    example,
    count=1,
  )

# The labels below contain a '?', so they are written as escaped patterns in
# raw strings (a '\?' inside a normal string is an invalid escape sequence).
search = r'Segredo de Justiça\?'
judicial_secrecy = extract_text_based_on_appearance_in_text(search, example)
print(judicial_secrecy)

example = _remove_labeled_line(example, search, judicial_secrecy)

search = r'Justiça gratuita\?'
free_judicial = extract_text_based_on_appearance_in_text(search, example)
print(free_judicial)

example = _remove_labeled_line(example, search, free_judicial)

search = r'Pedido de liminar ou antecipação de tutela\?'
formal_request = extract_text_based_on_appearance_in_text(search, example)
print(formal_request)

example = _remove_labeled_line(example, search, formal_request)

example = re.sub(r'\bPartes Advogados\n\b', '', example, count=1)

# Line breaks are flattened to spaces so that everything preceding the
# "REQUERENTE" marker is captured, even when it is spread over several lines.
requerente = extract_text_based_on_appearance_in_text('REQUERENTE', example.replace('\n', ' '), type='before')
print(requerente)

# Collapse the runs of blank lines left behind by the removals above.
example = re.sub(r'\n+', '\n', example)

In [None]:
# Show what is left of the raw text after the extraction cell above has run.
print(example)