# Instalar dependências necessárias

In [None]:
import pdfplumber
import pandas as pd
import os

pdf_folder = '../data'

# Collect the path of every PDF found anywhere below pdf_folder.
pdf_files_path = []
for root, dirs, files in os.walk(pdf_folder):
  for file in files:
    if file.endswith('.pdf'):
      pdf_files_path.append(os.path.join(root, file))

# One entry per PDF, in the same order as pdf_files_path: the text of all
# pages, each page followed by a newline.
raw_texts = []

for pdf_file_path in pdf_files_path:
  with pdfplumber.open(pdf_file_path) as pdf:
    # NOTE(review): depending on the pdfplumber version, extract_text() can
    # return None for a page with no extractable text. Fall back to '' so the
    # literal word "None" never ends up in the corpus.
    page_texts = [page.extract_text() or '' for page in pdf.pages]

  raw_texts.append(''.join(f"{page_text}\n" for page_text in page_texts))

## Extrair o texto não estruturado do PDF

In [None]:
# One row per PDF; the single 'raw' column holds the unprocessed page text.
dataframe = pd.DataFrame(data={'raw': raw_texts})

# len() is the number of rows. DataFrame.count() would instead return a
# per-column Series of non-null counts, which is not a row count.
row_count = len(dataframe)
columns = dataframe.columns
print(f"Row count: {row_count}")
print(f"Columns: {columns}")

## Functions

In [None]:
def extract_text_based_on_appearance_in_text(text_to_extract: str, text: str, type: str = 'after'):
  """
  Extract the text located before or after a marker inside a string.

  Parameters:
    text_to_extract (str): The marker to look for. It is interpolated into a
      regular expression as-is, so any regex metacharacter it contains must
      already be escaped by the caller (e.g. r'gratuita\?'). In "after" mode
      it is placed inside a lookbehind and must match a fixed-width string.
    text (str): The input string.
    type (str): The type of extraction. "after" (the default) returns the
      rest of the line that follows the first occurrence of the marker.
      "before" returns everything, across lines, that precedes the last
      occurrence of the marker. The marker itself is never part of the result.

  Returns:
    str | None: The extracted text with surrounding whitespace stripped, or
      None when the marker does not occur in the text.

  Raises:
    ValueError: If the type is neither "before" nor "after".
  """

  import re

  if type == 'before':
    # A lookahead keeps the marker out of the match. A lookbehind placed at
    # the end of the pattern would leave the marker inside the returned text.
    pattern, flags = rf'.*(?={text_to_extract})', re.DOTALL
  elif type == 'after':
    # No DOTALL here: '.' stops at the newline, so only the remainder of the
    # marker's line is captured.
    pattern, flags = rf'(?<={text_to_extract}).*', 0
  else:
    raise ValueError('type must be either "before" or "after"')

  found = re.search(pattern, text, flags)

  return found.group().strip() if found else None


## Extract all essential data from raw data

In [None]:
import re
import re
import datetime

def _take_labelled_value(label_pattern, text):
  """
  Read the value that follows a label and drop that "<label> <value>" line.

  Parameters:
    label_pattern (str): The label, written as a regex fragment (any
      metacharacter already escaped).
    text (str): The text to search.

  Returns:
    tuple: (value, remaining_text). When the label is absent the value is
      None and the text is returned unchanged.
  """
  value = extract_text_based_on_appearance_in_text(label_pattern, text)
  if value is None:
    return None, text

  # The value is document content, not a pattern: escape it so characters
  # such as parentheses cannot silently prevent the removal.
  remaining = re.sub(rf'\b{label_pattern} {re.escape(value)}\n\b', '', text, count=1)
  return value, remaining


def _take_party(role, text):
  """
  Read the name written before the first "(<role>)" tag and remove both.

  Parameters:
    role (str): Role tag without parentheses, e.g. 'REQUERENTE'.
    text (str): The text to search.

  Returns:
    tuple: (name, remaining_text). The name is None when the tag is absent.
  """
  # The name is searched on a copy with newlines turned into spaces; the
  # removals below still run against the original text.
  found = re.search(rf'(.+?)\s+\({role}\)', text.replace('\n', ' '))
  name = found.group(1).strip() if found else None

  # Drop the role word first; the empty "()" it leaves behind is removed last.
  text = re.sub(rf'\b({role})\b', '', text, count=1)
  if name is not None:
    text = re.sub(rf'\b{re.escape(name)}\b', '', text, count=1)
  text = re.sub(r'\(\)', '', text, count=1)
  return name, text


example = dataframe['raw'][0]
print(pdf_files_path[0])

# Remove redundant texts that are not useful for the analysis
example = re.sub(r'\bTribunal Regional Eleitoral do Rio Grande do Norte\n\b', '', example, count=1)
example = re.sub(r'\bPJe - Processo Judicial Eletrônico\n\b', '', example, count=1)
example = re.sub(r'\bN úmero: \b', '', example, count=1)

# Extracting the date of process (first dd/mm/yyyy found in the text)
date_pattern = r'\d{2}/\d{2}/\d{4}'
date_match = re.search(date_pattern, example)
date = date_match.group() if date_match else None
print(date)

example = re.sub(date_pattern, '\n', example, count=1)

# Extracting the legal action number. The dots are escaped so that only a
# literal '.' separates the groups.
legal_action_number_pattern = r'\b\d{7}-\d{2}\.\d{4}\.\d{1}\.\d{2}\.\d{4}\b'
legal_action_number_match = re.search(legal_action_number_pattern, example)
legal_action_number = legal_action_number_match.group() if legal_action_number_match else None
example = re.sub(legal_action_number_pattern, '\n', example, count=1)
print(legal_action_number)


# Extracting the data of raw text. Each field is read and then removed, so
# the order of the statements below matters.
legal_class, example = _take_labelled_value('Classe:', example)
print(legal_class)

tribunal, example = _take_labelled_value('Órgão julgador:', example)
print(tribunal)

last_distribution, example = _take_labelled_value('Última distribuição :', example)
print(last_distribution)

reference_legal_action, example = _take_labelled_value('Processo referência:', example)
print(reference_legal_action)

matters, example = _take_labelled_value('Assuntos:', example)
print(matters)

position, example = _take_labelled_value('Cargo -', example)
print(position)

# The goal of the legal action may span several lines: it is everything
# between its label and the next label. Both labels are literal text, so
# they are escaped (an unescaped '?' would make the preceding 'a' optional).
legal_action_goal_string = 'Objeto do processo: '
judicial_secrecy_marker = 'Segredo de Justiça?'
extracted_text = re.search(
  rf'{re.escape(legal_action_goal_string)}(.*?){re.escape(judicial_secrecy_marker)}',
  example,
  re.DOTALL,
)
legal_action_goal = extracted_text.group(1).strip() if extracted_text else None
print(legal_action_goal)

if legal_action_goal is not None:
  example = re.sub(
    rf'\b{re.escape(legal_action_goal_string + legal_action_goal)}\n\b', '', example, count=1)

# Raw strings keep '\?' as a regex escape instead of an invalid string escape.
judicial_secrecy, example = _take_labelled_value(r'Segredo de Justiça\?', example)
print(judicial_secrecy)

free_judicial, example = _take_labelled_value(r'Justiça gratuita\?', example)
print(free_judicial)

formal_request, example = _take_labelled_value(r'Pedido de liminar ou antecipação de tutela\?', example)
print(formal_request)

example = re.sub(r'\bPartes Advogados\n\b', '', example, count=1)

# Parties are consumed in the order handled here: applicant, lawyer, then a
# second applicant.
requerente, example = _take_party('REQUERENTE', example)
print(requerente)

advogado, example = _take_party('ADVOGADO', example)
print(advogado)

requerente_2, example = _take_party('REQUERENTE', example)
print(requerente_2)

outros_participantes_search = 'Outros participantes'
fiscal_de_lei_search = r'\(FISCAL DA LEI\)'
extracted_text = re.search(f'{outros_participantes_search}(.*?){fiscal_de_lei_search}', example, re.DOTALL)
fiscal_de_lei_nome = extracted_text.group(1).strip().replace('\n', ' ') if extracted_text else None
print(fiscal_de_lei_nome)

example = re.sub(rf'\b{outros_participantes_search}\n\b', '', example, count=1)
example = re.sub(r'\bFISCAL DA LEI\b', '', example, count=1)
example = re.sub(r'\(\)', '', example, count=1)
example = re.sub(r'\bPROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO\nNORTE\b', '', example, count=1)

example = re.sub(r'\bDocumentos\b', '', example, count=1)

# What is left is tokenised on single spaces and read by fixed position.
# NOTE(review): the positions assume every removal above succeeded, so that
# the first six tokens are the "Id. Data da Documento Tipo Assinatura"
# header; a shorter token list raises IndexError here.
text_treated_for_index = example.strip().replace('\n', ' ').split(' ')
print(text_treated_for_index)
# 'id' shadows the builtin; the name is kept because the next cell reads it.
id = text_treated_for_index[6]
print(id)

data_da_assinatura = text_treated_for_index[7]
print(data_da_assinatura)

hora_da_assinatura = text_treated_for_index[10]
print(hora_da_assinatura)

data_hora_da_assinatura = f'{data_da_assinatura} {hora_da_assinatura}'
data_hora_da_assinatura_timestamp = datetime.datetime.strptime(data_hora_da_assinatura, '%d/%m/%Y %H:%M').isoformat()
print(data_hora_da_assinatura_timestamp)

tipo = text_treated_for_index[9]
print(tipo)

example = re.sub(r'\bId. Data da Documento Tipo Assinatura\b', '', example.replace('\n', ' '), count=1)
example = re.sub(rf'\b{re.escape(id)}\b', '', example, count=1)
example = re.sub(rf'\b{re.escape(data_da_assinatura)}\b', '', example, count=1)
example = re.sub(rf'\b{re.escape(hora_da_assinatura)}\b', '', example, count=1)
example = re.sub(r'\bSentença\b', '', example, count=2)

# Whatever survived the removals is stored as the document body.
documento = example

# The outcome is INDEFERIDO when the word 'indefiro' occurs anywhere in the
# body (case-insensitive), DEFERIDO otherwise.
indeferimento_word_appearance = documento.lower().find('indefiro')
resultado = 'INDEFERIDO' if indeferimento_word_appearance != -1 else 'DEFERIDO'
print(resultado)

In [None]:
import pandas as pd

# Values extracted from the sample document, keyed by output column name.
record = {
  'advogado': advogado,
  'data_hora_da_assinatura_timestamp': data_hora_da_assinatura_timestamp,
  'documento': documento,
  'fiscal_de_lei_nome': fiscal_de_lei_nome,
  'formal_request': formal_request,
  'free_judicial': free_judicial,
  'id': id,
  'indeferimento_word_appearance': indeferimento_word_appearance,
  'judicial_secrecy': judicial_secrecy,
  'last_distribution': last_distribution,
  'legal_action_goal': legal_action_goal,
  'legal_action_number': legal_action_number,
  'legal_class': legal_class,
  'matters': matters,
  'position': position,
  'reference_legal_action': reference_legal_action,
  'requerente': requerente,
  'requerente_2': requerente_2,
  'resultado': resultado,
}

# Wrap every value in a one-element list so the frame has exactly one row.
data = {column: [value] for column, value in record.items()}

df = pd.DataFrame(data=data)

df.head()