# Instalar dependências necessárias

In [1]:
import pdfplumber
import pandas as pd
import os

pdf_folder = '../data'

# Walk pdf_folder recursively and keep the path of every file whose name ends in '.pdf'.
pdf_files_path = [
  os.path.join(folder, file_name)
  for folder, _subfolders, file_names in os.walk(pdf_folder)
  for file_name in file_names
  if file_name.endswith('.pdf')
]

# One concatenated text per PDF, in the same order as pdf_files_path.
raw_texts = []

for pdf_file_path in pdf_files_path:
  page_chunks = []

  with pdfplumber.open(pdf_file_path) as pdf:
    for page_number, page in enumerate(pdf.pages):
      page_text = page.extract_text()
      # Every page after the first is prefixed with a "page: <n>" marker; the first
      # page is stored as-is. Each page chunk ends with a newline.
      if page_number == 0:
        page_chunks.append(f"{page_text}\n")
      else:
        page_chunks.append(f"page: {page_number} {page_text}\n")

  raw_texts.append(''.join(page_chunks))

## Extrair o texto não estruturado do PDF

In [2]:
# Wrap the extracted texts in a single-column dataframe: column 'raw', one row per PDF.
dataframe = pd.DataFrame(data={'raw': raw_texts})

# count() yields a per-column Series of non-null counts (not a scalar), which is why
# the printed "Row count" shows a column label and a dtype line.
row_count = dataframe.count()
columns = dataframe.columns
print(f"Row count: {row_count}")
print(f"Columns: {columns}")
print(dataframe.head())

Row count: raw    211
dtype: int64
Columns: Index(['raw'], dtype='object')
                                                 raw
0  Tribunal Regional Eleitoral do Rio Grande do N...
1  Tribunal Regional Eleitoral do Rio Grande do N...
2  Tribunal Regional Eleitoral do Rio Grande do N...
3  Tribunal Regional Eleitoral do Rio Grande do N...
4  Tribunal Regional Eleitoral do Rio Grande do N...


## Functions

In [3]:
def extract_text_based_on_appearance_in_text(text_to_extract: str, text: str, type: str = 'after'):
  """
  Returns the part of `text` located relative to a marker.

  The marker is interpolated verbatim into a regular-expression look-behind, so it
  must already be a valid fixed-width regex fragment (callers escape characters
  such as "?" themselves).

  Parameters:
    text_to_extract (str): The marker (regex fragment) to look for.
    text (str): The input string.
    type (str): "after" returns the rest of the line following the first marker
      occurrence; "before" returns everything from the start of the string up to
      and including the last marker occurrence.

  Returns:
    str | None: The stripped matched text, or None when the marker is not found.

  Raises:
    ValueError: If the type is neither "before" nor "after".
  """

  import re

  if type not in ('before', 'after'):
    raise ValueError('type must be either "before" or "after"')

  if type == 'after':
    # No DOTALL here: the match stops at the end of the marker's line.
    pattern, flags = rf'(?<={text_to_extract}).*', 0
  else:
    # DOTALL lets the greedy prefix span several lines.
    pattern, flags = rf'.*(?<={text_to_extract})', re.DOTALL

  found = re.search(pattern, text, flags)
  if found is None:
    return None

  return found.group().strip()


## Extract all essential data from raw data

In [4]:
import re
import datetime

def _strip_words_once(text, phrase):
  """Delete the first occurrence of each space-separated word of `phrase` from `text`.

  Each word is removed together with one directly following newline, if present.
  Words are regex-escaped so names containing metacharacters are matched literally.
  A `phrase` of None leaves `text` untouched.
  """
  if phrase is None:
    return text
  for word in phrase.split(' '):
    text = re.sub(rf'{re.escape(word)}\n?', '', text, count=1)
  return text


def _strip_phrase_once(text, phrase):
  """Delete the first word-boundary-delimited occurrence of the whole `phrase` from `text`."""
  if phrase is None:
    return text
  return re.sub(rf'\b{re.escape(phrase)}\b', '', text, count=1)


def _find_party(text, role, strip=True):
  """Return the text preceding the first "(ROLE)" tag in `text`, or None if the tag is absent.

  Newlines are flattened to spaces before searching; `strip` controls whether the
  flattened text is stripped before the search.
  """
  flattened = text.replace('\n', ' ')
  if strip:
    flattened = flattened.strip()
  match = re.search(rf'(.+?)\s+\({role}\)', flattened)
  return match.group(1).strip() if match else None


def _drop_role(text, role):
  """Delete the first occurrence of the bare ROLE word (its parentheses stay behind)."""
  return re.sub(rf'\b({role})\b', '', text, count=1)


def _drop_empty_parens(text):
  """Delete the first empty "()" pair, typically left behind by _drop_role."""
  return re.sub(r'\(\)', '', text, count=1)


def _take_labelled_line(text, label, trailing=r'\n'):
  """Read the value that follows `label` and delete the "label value" span from `text`.

  `label` is a regex fragment (already escaped by the caller where needed).
  NOTE: the extracted value is interpolated into the removal pattern unescaped.

  Returns:
    tuple: (value or None, text with the first matching span removed).
  """
  value = extract_text_based_on_appearance_in_text(label, text)
  text = re.sub(rf'\b{label} {value}{trailing}\b', '', text, count=1)
  return value, text


def extract_needed_information_from_pdf_text(text: str):
  """
  Extracts the needed information from a given PDF text.

  The text is consumed progressively: each field is located with a regular
  expression and then removed from the working text, so that what remains at the
  end is the documents table followed by the page contents ("page: <n>" markers
  separate the first page from the rest). Intermediate values are printed as
  they are extracted.

  Parameters:
    text (str): The input text.

  Returns:
    dict: The extracted information (parties, lawyers, process metadata, the
      signature timestamp in ISO format, 'documento', 'detalhamento' and the
      derived 'resultado': 'DEFERIDO', 'INDEFERIDO' or 'NÃO DEFINIDO').

  Raises:
    ValueError: If the documents-table header words, the signature time or the
      'page:' marker cannot be found, or the signature date/time does not match
      the '%d/%m/%Y %H:%M' format.
  """

  # Remove redundant texts that are not useful for the analysis
  text = re.sub(r'\bTribunal Regional Eleitoral do Rio Grande do Norte\n\b', '', text, count=1)
  text = re.sub(r'\bPJe - Processo Judicial Eletrônico\n\b', '', text, count=1)
  text = re.sub(r'\bN úmero: \b', '', text, count=1)

  # Extracting the date of process (printed only; it is not part of the returned dict)
  date_pattern = r'\d{2}/\d{2}/\d{4}'
  date_match = re.search(date_pattern, text)
  date = date_match.group() if date_match else None
  print(date)

  text = re.sub(date_pattern, '\n', text, count=1)

  # Extracting the legal action number
  legal_action_number_pattern = r'\b\d{7}-\d{2}.\d{4}.\d{1}.\d{2}.\d{4}\b'
  legal_action_number_match = re.search(legal_action_number_pattern, text)
  legal_action_number = legal_action_number_match.group() if legal_action_number_match else None
  text = re.sub(legal_action_number_pattern, '\n', text, count=1)
  print(legal_action_number)

  # Extracting the labelled header fields ("<label> <value>" lines)
  legal_class, text = _take_labelled_line(text, 'Classe:')
  print(legal_class)

  tribunal, text = _take_labelled_line(text, 'Órgão julgador:')
  print(tribunal)

  last_distribution, text = _take_labelled_line(text, 'Última distribuição :')
  print(last_distribution)

  cause_cost, text = _take_labelled_line(text, 'Valor da causa:')
  print(f'cause cost: {cause_cost}')
  text = re.sub(r'R\$', '', text, count=1)

  reference_legal_action, text = _take_labelled_line(text, 'Processo referência:')
  print(reference_legal_action)

  matters, text = _take_labelled_line(text, 'Assuntos:')
  print(matters)

  position, text = _take_labelled_line(text, 'Cargo -')
  print(position)

  # The goal of the legal action spans several lines, up to the "Segredo de Justiça?" label.
  # The "?" is escaped so it is matched literally instead of making the "a" optional.
  goal_match = re.search(r'Objeto do processo: (.*?)Segredo de Justiça\?', text, re.DOTALL)
  legal_action_goal = goal_match.group(1).strip() if goal_match else None
  print(legal_action_goal)

  text = re.sub(r'\bObjeto do processo: \b', '', text.strip(), count=1)
  if legal_action_goal is not None:
    legal_action_goal = legal_action_goal.replace('\n', ' ').strip()
  text = _strip_words_once(text, legal_action_goal).strip()

  # Yes/no questions: the answer sits on the same line as the (regex-escaped) label.
  judicial_secrecy, text = _take_labelled_line(text, r'Segredo de Justiça\?', trailing='')
  print(judicial_secrecy)

  free_judicial, text = _take_labelled_line(text, r'Justiça gratuita\?', trailing='')
  print(free_judicial)

  formal_request, text = _take_labelled_line(text, r'Pedido de liminar ou antecipação de tutela\?', trailing='')
  print(formal_request)

  text = re.sub(r'\bPartes Advogados\n\b', '', text, count=1)

  # Parties: each step reads the name in front of the next "(ROLE)" tag and removes
  # the name and tag from the working text. The order of the removal steps differs
  # between roles and is kept exactly as originally written.
  requerente = _find_party(text, 'REQUERENTE', strip=False)
  print(requerente)
  text = _drop_role(text, 'REQUERENTE')
  text = _strip_words_once(text, requerente)
  text = _drop_empty_parens(text)

  impugnante = _find_party(text, 'IMPUGNANTE')
  print(f'impugnante: {impugnante}')
  text = _drop_role(text, 'IMPUGNANTE').strip()
  text = _strip_words_once(text, impugnante)
  text = _drop_empty_parens(text)

  impugnante_2 = _find_party(text, 'IMPUGNANTE')
  print(f'impugnante: {impugnante_2}')
  text = _drop_role(text, 'IMPUGNANTE').strip()
  text = _strip_words_once(text, impugnante_2)
  text = _drop_empty_parens(text)

  impugnado = _find_party(text, 'IMPUGNADO')
  print(f"impugnado: {impugnado}")
  text = _drop_role(text, 'IMPUGNADO')
  text = _strip_phrase_once(text, impugnado)
  text = _drop_empty_parens(text)

  advogado = _find_party(text, 'ADVOGADO', strip=False)
  print(advogado)
  text = _strip_words_once(text, advogado)
  text = _drop_role(text, 'ADVOGADO')
  text = _strip_phrase_once(text, advogado)
  text = _drop_empty_parens(text)

  impugnado_2 = _find_party(text, 'IMPUGNADO')
  print(f"impugnado 2: {impugnado_2}")
  text = _drop_role(text, 'IMPUGNADO').strip()
  text = _strip_words_once(text, impugnado_2)
  text = _drop_empty_parens(text)

  requerente_2 = _find_party(text, 'REQUERENTE')
  print(requerente_2)
  text = _strip_words_once(text, requerente_2)
  text = _drop_role(text, 'REQUERENTE')
  text = _strip_phrase_once(text, requerente_2)
  text = _drop_empty_parens(text)

  advogado_2 = _find_party(text, 'ADVOGADO', strip=False)
  print(f"advogado 2: {advogado_2}")
  text = _drop_role(text, 'ADVOGADO')
  text = _strip_phrase_once(text, advogado_2)
  text = _drop_empty_parens(text)

  # Up to six further REQUERENTE entries (requerente_3 .. requerente_8).
  extra_requerentes = {}
  for suffix in range(3, 9):
    extra_requerente = _find_party(text, 'REQUERENTE')
    print(extra_requerente)
    text = _strip_words_once(text, extra_requerente)
    text = _drop_role(text, 'REQUERENTE')
    text = _strip_phrase_once(text, extra_requerente)
    text = _drop_empty_parens(text)
    extra_requerentes[f'requerente_{suffix}'] = extra_requerente

  # Other participants: the name between "Outros participantes" and "(FISCAL DA LEI)".
  outros_participantes_search = 'Outros participantes'
  fiscal_de_lei_search = r'\(FISCAL DA LEI\)'
  extracted_text = re.search(f'{outros_participantes_search}(.*?){fiscal_de_lei_search}', text, re.DOTALL)
  fiscal_de_lei_nome = extracted_text.group(1).strip().replace('\n', ' ') if extracted_text else None
  print(fiscal_de_lei_nome)

  text = re.sub(rf'\b{outros_participantes_search}\n\b', '', text, count=1)
  text = _drop_role(text, 'FISCAL DA LEI')
  text = _drop_empty_parens(text)
  text = re.sub(r'\bPROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO\nNORTE\b', '', text, count=1)

  text = re.sub(r'\bDocumentos\b', '', text, count=1)

  # From here on the remaining text is handled as a flat list of space-separated tokens.
  tokens = text.strip().replace('\n', ' ').split(' ')

  print(tokens)

  # Drop the documents-table header words; list.remove raises ValueError when one is missing.
  for header_word in ('Id.', 'Data', 'da', 'Documento', 'Tipo', 'Assinatura'):
    tokens.remove(header_word)

  print(tokens)

  if len(tokens) < 2:
    raise ValueError('documents table has no id/date tokens')

  document_id = tokens.pop(0)
  print(f"id: {document_id}")

  data_da_assinatura = tokens.pop(0)
  print(f"data: {data_da_assinatura}")

  # The signature time is the first token starting with HH:MM.
  hora_da_assinatura = next((word for word in tokens if re.match(r'\d{2}:\d{2}', word)), None)
  if hora_da_assinatura is None:
    raise ValueError('signature time (HH:MM) not found in the documents table')
  tokens.remove(hora_da_assinatura)

  data_hora_da_assinatura = f'{data_da_assinatura} {hora_da_assinatura}'
  data_hora_da_assinatura_timestamp = datetime.datetime.strptime(data_hora_da_assinatura, '%d/%m/%Y %H:%M').isoformat()
  print(data_hora_da_assinatura_timestamp)

  # Document type: the first position where one of the known type phrases starts.
  # Slice comparison is bounds-safe and checks every word of multi-word phrases
  # (the previous code compared the same token against both 'da' and 'Procuradoria').
  document_type_phrases = (
    ('Sentença',),
    ('Petição',),
    ('Outros', 'documentos'),
    ('Parecer', 'da', 'Procuradoria'),
    ('Cota', 'ministerial'),
  )
  tipo_words = next(
    (
      phrase
      for index in range(len(tokens))
      for phrase in document_type_phrases
      if tuple(tokens[index:index + len(phrase)]) == phrase
    ),
    (),
  )
  tipo = ' '.join(tipo_words)

  print(tipo)
  # Remove the first occurrence of each word of the detected type.
  for tipo_word in tipo_words:
    tokens.remove(tipo_word)

  # Everything before the first "page:" marker is the document title; the marker and
  # the page number after it are skipped, and the rest is the document body.
  index = tokens.index('page:')

  documento = ' '.join(tokens[:index])
  print(documento)

  detalhamento = ' '.join(tokens[index+2:])
  print(detalhamento)

  # Keyword-based outcome. The rejection keywords are checked last and win, which
  # also covers "defiro"/"deferimento" being substrings of "indefiro"/"indeferimento".
  detalhamento_lower = detalhamento.lower()
  approval_keywords = ('defiro', 'deferimento', 'manifesta-se pelo deferimento', 'homologo')
  rejection_keywords = (
    'indefiro',
    'indeferindo-se',
    'indeferimento',
    'litispendência',
    'não foram preenchidas',
    'extinguo',
    'ausência de documentos exigidos',
  )

  resultado = 'NÃO DEFINIDO'
  if any(keyword in detalhamento_lower for keyword in approval_keywords):
    resultado = 'DEFERIDO'
  if any(keyword in detalhamento_lower for keyword in rejection_keywords):
    resultado = 'INDEFERIDO'

  print(resultado)

  data = {
    'advogado': advogado,
    'advogado_2': advogado_2,
    'data_hora_da_assinatura_timestamp': data_hora_da_assinatura_timestamp,
    'documento': documento,
    'detalhamento': detalhamento,
    'fiscal_de_lei_nome': fiscal_de_lei_nome,
    'formal_request': formal_request,
    'free_judicial': free_judicial,
    'id': document_id,
    'judicial_secrecy': judicial_secrecy,
    'last_distribution': last_distribution,
    'legal_action_goal': legal_action_goal,
    'legal_action_number': legal_action_number,
    'legal_class': legal_class,
    'matters': matters,
    'position': position,
    'reference_legal_action': reference_legal_action,
    'requerente': requerente,
    'requerente_2': requerente_2,
    **extra_requerentes,
    'impugnante': impugnante,
    'impugnante_2': impugnante_2,
    'impugnado': impugnado,
    'impugnado_2': impugnado_2,
    'resultado': resultado,
  }

  return data



In [5]:
import pandas as pd

# Extract one record per PDF text. Records are collected in a list and turned into a
# dataframe once at the end, instead of re-concatenating the frame on every iteration.
records = []

# Paths of the PDFs whose text could not be parsed (extraction raised ValueError).
pdf_files_path_with_error = []

for index, raw_text in enumerate(dataframe['raw']):
  try:
    data = extract_needed_information_from_pdf_text(raw_text)
  except ValueError:
    pdf_files_path_with_error.append(pdf_files_path[index])
    continue

  # dataframe['raw'] was built in the same order as pdf_files_path.
  data['pdf_file_path'] = pdf_files_path[index]
  records.append(data)

df = pd.DataFrame(records)



30/07/2024
0600130-68.2020.6.20.0035
REGISTRO DE CANDIDATURA
035ª ZONA ELEITORAL DE APODI RN
07/10/2020
cause cost: None
06000804220206200035
Registro de Candidatura - Substituição de Candidato - Por Cancelamento de Registro,
Vereador
Registro de Candidatura - Substituição de Candidato - Por Cancelamento de
Registro - 77 - SOLIDARIEDADE - COMISSAO PROVISORIA MUNICIPAL APODI/RN - TEODORIA
INGRID TORRES CORTEZ
NÃO
NÃO
NÃO
TEODORIA INGRID TORRES CORTEZ
impugnante: None
impugnante: None
impugnado: None
None
impugnado 2: None
77 - SOLIDARIEDADE - COMISSAO PROVISORIA MUNICIPAL APODI/RN
advogado 2: None
None
None
None
None
None
None
PROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO NORTE
['Id.', 'Data', 'da', 'Documento', 'Tipo', 'Assinatura', '18538928', '20/10/2020', 'Sentença', 'Sentença', '09:44', 'page:', '1', 'JUSTIÇA', 'ELEITORAL', '035ª', 'ZONA', 'ELEITORAL', 'DE', 'APODI', 'RN', 'REGISTRO', 'DE', 'CANDIDATURA', '(11532)', 'Nº', '0600130-68.2020.6.20.0035', '/', '035ª', 'ZONA', 'ELEITORAL

In [7]:
df.head(3)

Unnamed: 0,advogado,advogado_2,data_hora_da_assinatura_timestamp,documento,detalhamento,fiscal_de_lei_nome,formal_request,free_judicial,id,judicial_secrecy,...,requerente_4,requerente_5,requerente_6,requerente_7,requerente_8,impugnante,impugnante_2,impugnado,impugnado_2,resultado
0,,,2020-10-20T09:44:00,Sentença,JUSTIÇA ELEITORAL 035ª ZONA ELEITORAL DE APODI...,PROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO ...,NÃO,NÃO,18538928,NÃO,...,,,,,,,,,,DEFERIDO
1,,,2020-10-05T15:37:00,AIRC - Não filiação - Edna Maria Félix,MINISTÉRIO PÚBLICO ELEITORAL PROMOTORIA ELEITO...,PROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO ...,NÃO,NÃO,12423847,NÃO,...,,,,,,DEMOCRATAS (REQUERENTE) PROMOTOR ELEITORAL DO ...,,EDNA MARIA FELIX,,INDEFERIDO
2,,,2020-10-20T09:38:00,Sentença,JUSTIÇA ELEITORAL 035ª ZONA ELEITORAL DE APODI...,PROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO ...,NÃO,NÃO,18480714,NÃO,...,PARTIDO VERDE - APODI - RN - MUNICIPAL,,,,,,,,,DEFERIDO


In [8]:
df.count()

advogado                              48
advogado_2                             6
data_hora_da_assinatura_timestamp    211
documento                            211
detalhamento                         211
fiscal_de_lei_nome                   211
formal_request                       211
free_judicial                        211
id                                   211
judicial_secrecy                     211
last_distribution                    211
legal_action_goal                    211
legal_action_number                  211
legal_class                          211
matters                              211
position                              96
reference_legal_action               193
requerente                           161
requerente_2                         143
requerente_3                          10
requerente_4                           8
requerente_5                           5
requerente_6                           3
requerente_7                           2
requerente_8    

In [9]:
print(pdf_files_path_with_error)

[]


In [None]:
# Rows whose outcome was left at the 'NÃO DEFINIDO' default by the extraction step.
not_defined = df[df.resultado == 'NÃO DEFINIDO']
not_defined

### Vetorizar documento

In [10]:
import nltk
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Downloads required by NLTK
nltk.download('stopwords')  # Downloads the stop-word lists (common words) used in text processing
nltk.download('punkt')  # Downloads the Punkt tokenizer, required for sentence tokenization

# Carregar o modelo de português para o spaCy
!python -m spacy download pt_core_news_sm

In [12]:
def remove_noise(text):
    """Return `text` lower-cased with URLs, punctuation, underscores and digits removed.

    URLs (anything starting with a lower-case "http" up to the next whitespace) are
    removed before lower-casing. Whitespace is left untouched, so removed items can
    leave consecutive spaces behind.
    """
    without_urls = re.sub(r'http\S+', '', text)
    cleaned = without_urls.lower()

    # Applied in order:
    #   [^\w\s]  every character that is neither a word character nor whitespace (punctuation)
    #   _+       runs of underscores (they count as word characters, so they survive the step above)
    #   \d+      runs of digits
    for pattern in (r'[^\w\s]', r'_+', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned

In [13]:
def remove_stopwords(text):
  """Return `text` with NLTK's Portuguese stop words removed.

  The text is split on whitespace, tokens found in the stop-word list are dropped,
  and the remaining tokens are re-joined with single spaces.
  """
  # A set gives constant-time membership checks while filtering.
  portuguese_stop_words = set(stopwords.words('portuguese'))

  kept_tokens = []
  for token in text.split():
    if token not in portuguese_stop_words:
      kept_tokens.append(token)

  return ' '.join(kept_tokens)

In [None]:
# Keep only the rows with a resolved outcome. The explicit copy makes `data` an
# independent frame: later cells assign new columns to it, which on a filtered slice
# triggers pandas' SettingWithCopyWarning.
data = df[df.resultado != 'NÃO DEFINIDO'].copy()
data

In [None]:
# Clean the document body in place: remove_noise is applied to every 'detalhamento' value.
data['detalhamento'] = data['detalhamento'].apply(remove_noise)
data.head()

In [None]:
data.iloc[1]['detalhamento']

In [None]:
# Second cleaning pass: drop stop words from every 'detalhamento' value.
data['detalhamento'] = data['detalhamento'].apply(remove_stopwords)
data.head()

In [None]:
# Pick one cleaned document as a sample.
# NOTE(review): 267 is looked up as an index label, while the recorded df.count()
# output shows 211 rows — confirm this label exists in the current data.
text = data['detalhamento'][267]
text

In [19]:
# Use spaCy's Portuguese model for tokenization; `nlp` is the pipeline that
# tokenize_text calls later in the notebook.
import spacy
nlp = spacy.load('pt_core_news_sm')

# # Passa o texto para o pipeline de processamento do spaCy. O resultado é um objeto doc, que contém as palavras e sentenças tokenizadas, além de outras informações linguísticas.
# doc = nlp(text)

# # Extrair sentenças como tokens
# sentence_tokens = [sent.text for sent in doc.sents]

# print("Tokens:", sentence_tokens)

In [20]:
# tokens = [token.text for token in doc]
# print('Tokens:', tokens)

In [None]:
def tokenize_text(text):
    """Run `text` through the module-level spaCy pipeline `nlp` and return the token strings."""
    return [token.text for token in nlp(text)]

# One token list per cleaned document.
data['tokens'] = data['detalhamento'].apply(tokenize_text)

In [None]:
data['tokens'][267]

## Stemming e Lematização

In [None]:
# Tokenizer resources for NLTK's word_tokenize ('punkt' is also requested in an earlier cell).
nltk.download('punkt')
nltk.download('punkt_tab')

In [24]:
from nltk.stem import PorterStemmer

In [25]:
def apply_stemming(tokens):
    """Return the stem of every token in `tokens`, using a Portuguese stemmer.

    The documents are Portuguese, so NLTK's Snowball stemmer for Portuguese is used
    instead of the Porter stemmer, whose suffix rules target English.

    Parameters:
        tokens (list[str]): Tokens of one document.

    Returns:
        list[str]: The stemmed tokens, in the same order.
    """
    # Imported locally because the notebook-level imports only bring in PorterStemmer.
    from nltk.stem import SnowballStemmer

    stemmer = SnowballStemmer('portuguese')
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Stem every token list, then show the result for one sample document.
# NOTE(review): 267 is an index label; the recorded df.count() output shows 211 rows — confirm it exists.
data['stemming'] = data['tokens'].apply(apply_stemming)
data['stemming'][267]

In [None]:
# Stemming demonstration on a single document, using NLTK's word_tokenize.
# NOTE(review): PorterStemmer is generally described as an English stemmer — confirm
# it is the intended choice for this Portuguese text.
# NOTE(review): 267 is an index label; the recorded df.count() output shows 211 rows — confirm it exists.
stemmer = PorterStemmer()

# Tokenize the sentence into words
palavras = word_tokenize(data['detalhamento'][267])

# Apply stemming to each word
stemmed_words = [stemmer.stem(word) for word in palavras]

print("Frase original:", data['detalhamento'][267])
print("Palavras após Stemming:", stemmed_words)
print()

In [28]:
def apply_lemming(doc):
  """Return the lemma of every token in `doc`.

  The tokens are re-joined with spaces and parsed by the module-level spaCy pipeline
  `nlp` (the one tokenize_text uses), instead of reloading the model on every call —
  this function is applied once per dataframe row.

  Parameters:
    doc (list[str]): Tokens of one document.

  Returns:
    list[str]: The lemmas produced by spaCy for the re-joined text.
  """
  parsed = nlp(' '.join(doc))

  return [token.lemma_ for token in parsed]

In [None]:
# Lemmatize every token list; the result is stored as a list of lemmas per document.
data['lemming'] = data['tokens'].apply(apply_lemming)

In [None]:
data['detalhamento'][267]

In [None]:
data['pdf_file_path'][267]

In [None]:
data['lemming'][267]

In [None]:
data['lemming'][2]


In [None]:
data.iloc[2]

In [None]:
data

In [None]:
%pip install scikit-learn

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer

vectorizer_tfidf = TfidfVectorizer()
# Fit on the lemmatized documents (lemma lists re-joined into strings) and build the TF-IDF matrix
X_tfidf = vectorizer_tfidf.fit_transform(data['lemming'].apply(lambda x: ' '.join(x)))
vocab = vectorizer_tfidf.get_feature_names_out()

# Densify the sparse matrix once and reuse it for both the overview and the per-document listing.
tfidf_dense = X_tfidf.toarray()
print("Representação TF-IDF:\n", tfidf_dense)
print("Vocabulário TF-IDF:\n", vocab)

# Print, for each document, only the terms with a non-zero TF-IDF weight
print("Matriz TF-IDF:")
for i, weights in enumerate(tfidf_dense):
    print(f"Documento {i+1}:", end="   ")
    for word, tfidf in zip(vocab, weights):
        if tfidf == 0:
            continue
        print(f"{word}: {tfidf:.4f}")
    print()

In [None]:
data.count()

## Vetorizar o dataframe usando TF-IDF

In [None]:
# Replace every missing value with an empty string so all text columns hold strings.
data = data.fillna('')

In [None]:
# Overwrite 'detalhamento' with the space-joined lemmas of each document.
data['detalhamento'] = data['lemming'].apply(lambda x: ' '.join(x))

In [None]:
# Numeric representation of the dataframe: helper columns and the target are dropped,
# then every remaining text column is reduced to one number per row.
data_tfidf = data.drop(columns=['requerente_8', 'lemming', 'stemming', 'tokens', 'pdf_file_path', 'resultado'])
columns = data_tfidf.columns

for column in columns:
  print(f'== {column} ==')
  if data_tfidf[column].str.strip().any():
    # Column has some text: fit TF-IDF on it and keep the mean weight of each row.
    tfidf_matrix = vectorizer_tfidf.fit_transform(data_tfidf[column])

    data_tfidf[column] = tfidf_matrix.toarray().mean(axis=1)
  else:
    # Column is empty for every row: encode it as 0. This previously read the values
    # back from `df`, whose missing entries were never filled, yielding NaN/None.
    data_tfidf[column] = 0.0

data_tfidf

In [None]:
data_tfidf[100:110]

## Dividir treinamento e teste

In [44]:
from sklearn.model_selection import train_test_split

# Target column: the outcome assigned to each document during extraction.
labels = data['resultado']

In [None]:
# Encode the outcome as a binary target: DEFERIDO -> 1, INDEFERIDO -> 0.
labels = labels.replace({'DEFERIDO': 1, 'INDEFERIDO': 0})

In [46]:
# X_train, X_test, y_train, y_test = train_test_split(data_tfidf, labels, test_size=0.4, random_state=42)

In [47]:
# X_train.shape

In [48]:
# X_test.shape

### Aprendizado não supervisionado

#### Funções

##### De Pré-Processamento

In [49]:
def get_correlation_df(df, num_features):
  """Return the `num_features` columns of `df` with the largest summed Pearson correlation.

  The Pearson correlation matrix of `df` is summed per column, the columns are
  ranked by that sum in descending order, and the top `num_features` are selected.

  Parameters:
    df (pd.DataFrame): Numeric dataframe.
    num_features (int): Number of columns to keep.

  Returns:
    pd.DataFrame: `df` restricted to the selected columns, in ranking order.
  """
  correlation_totals = df.corr(method="pearson").sum()
  ranked_totals = correlation_totals.sort_values(ascending=False)
  selected_columns = list(ranked_totals.index[:num_features])

  return df[selected_columns]


  # correlation_matrix = df.corr()

  # average_correlation = correlation_matrix.abs().mean().sort_values()

  # smallest_average_correlations = average_correlation.head(num_features).index.tolist()

  # return df[smallest_average_correlations]


In [50]:
def get_pca_df(df, num_components):
    """Project `df` onto `num_components` principal components and return them as a dataframe.

    The returned frame has a fresh default index and integer column labels.
    """
    from sklearn.decomposition import PCA

    projected = PCA(n_components=num_components).fit_transform(df)
    return pd.DataFrame(projected)

In [51]:
def get_k_nn_accuracy(df, labels):
  """Return the hold-out accuracy of a 1-nearest-neighbour classifier on `df`.

  The data is split 70/30 with a fixed random_state of 42; the classifier uses the
  Euclidean metric and is scored on the 30% test part.
  """
  from sklearn.metrics import accuracy_score
  from sklearn.neighbors import KNeighborsClassifier

  features_train, features_test, target_train, target_test = train_test_split(
    df, labels, test_size=0.3, random_state=42
  )

  classifier = KNeighborsClassifier(n_neighbors=1, metric='euclidean')
  classifier.fit(features_train, target_train)

  return accuracy_score(target_test, classifier.predict(features_test))

In [52]:
def get_best_representation(df1, df2, labels):
  """Return whichever of `df1` / `df2` gives the higher k-NN accuracy for `labels`.

  Both accuracies are printed. `df2` is returned on a tie.
  """
  first_accuracy, second_accuracy = (
    get_k_nn_accuracy(candidate, labels) for candidate in (df1, df2)
  )

  print(f'1: {first_accuracy} x 2: {second_accuracy}')

  if first_accuracy > second_accuracy:
    return df1
  return df2

##### De Algoritmos de Agrupamento

In [53]:
def get_kmean_indices(df, labels, num_clusters):
    """Cluster `df` with K-Means and return three clustering quality indices.

    Parameters:
        df: Feature matrix to cluster.
        labels: Reference labels, used only for the adjusted Rand index.
        num_clusters (int): Number of clusters.

    Returns:
        tuple: (Davies-Bouldin index, silhouette score, adjusted Rand index).
    """
    from sklearn.cluster import KMeans
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    km = KMeans(n_clusters=num_clusters, init='k-means++', max_iter=300, n_init=10, random_state=42)

    # fit_predict fits the model and returns the assignments; with the fixed
    # random_state a separate fit() beforehand only repeated the same computation.
    km_labels = km.fit_predict(df)

    indice_db = davies_bouldin_score(df, km_labels)
    indice_sil = silhouette_score(df, km_labels, metric='euclidean')
    indice_cr = adjusted_rand_score(labels, km_labels)

    return indice_db, indice_sil, indice_cr

In [54]:
def get_hierarquico_indices(df, labels, num_clusters, linkage):
    """Cluster `df` with agglomerative clustering and return three quality indices.

    Parameters:
        df: Feature matrix to cluster.
        labels: Reference labels, used only for the adjusted Rand index.
        num_clusters (int): Number of clusters.
        linkage (str): Linkage criterion passed to AgglomerativeClustering.

    Returns:
        tuple: (Davies-Bouldin index, silhouette score, adjusted Rand index).
    """
    from sklearn.cluster import AgglomerativeClustering
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    hiera_aglo = AgglomerativeClustering(n_clusters=num_clusters, metric='euclidean', linkage=linkage)

    # fit_predict already fits the model; the extra fit() call beforehand was redundant.
    ha_labels = hiera_aglo.fit_predict(df)

    indice_db = davies_bouldin_score(df, ha_labels)
    indice_sil = silhouette_score(df, ha_labels, metric='euclidean')
    indice_cr = adjusted_rand_score(labels, ha_labels)

    return indice_db, indice_sil, indice_cr

In [55]:
def get_em_indices(df, labels, num_components, cov_type, random_state=42):
    """Cluster `df` with a Gaussian mixture (EM) and return three quality indices.

    Parameters:
        df: Feature matrix to cluster.
        labels: Reference labels, used only for the adjusted Rand index.
        num_components (int): Number of mixture components.
        cov_type (str): Covariance type passed to GaussianMixture.
        random_state (int | None): Seed for the mixture initialisation. Defaults to
            42, matching the K-Means helper, so repeated runs give the same indices.

    Returns:
        tuple: (Davies-Bouldin index, silhouette score, adjusted Rand index).
    """
    from sklearn.mixture import GaussianMixture
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    gmm = GaussianMixture(n_components=num_components, covariance_type=cov_type, random_state=random_state)

    # fit_predict fits the mixture itself; the previous extra fit() was discarded work.
    gmm_labels = gmm.fit_predict(df)

    indice_db = davies_bouldin_score(df, gmm_labels)
    indice_sil = silhouette_score(df, gmm_labels, metric='euclidean')
    indice_cr = adjusted_rand_score(labels, gmm_labels)

    return indice_db, indice_sil, indice_cr

In [56]:
def get_dbscan_indices(df, labels, minimum_samples, eps_value):
    """
    Cluster ``df`` with DBSCAN and score the resulting partition.

    Parameters:
        df: Feature matrix to cluster.
        labels: Reference labels, used only for the adjusted Rand score.
        minimum_samples (int): ``min_samples`` forwarded to ``DBSCAN``.
        eps_value (float): ``eps`` forwarded to ``DBSCAN``.

    Returns:
        tuple: ``(davies_bouldin, silhouette, adjusted_rand)``. The first two
        are ``nan`` when DBSCAN produces fewer than 2 distinct labels or as
        many labels as samples, because those indices are undefined there.
    """
    from sklearn.cluster import DBSCAN
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    dbscan = DBSCAN(eps=eps_value, min_samples=minimum_samples)

    dbscan_labels = dbscan.fit_predict(df)

    # The noise label (-1) is counted as a label of its own, exactly as the
    # metric functions would see it.
    distinct_labels = len(set(dbscan_labels))

    # Both internal indices require 2 <= n_labels <= n_samples - 1 and raise
    # "Number of labels is 1 ..." otherwise (e.g. every point flagged as noise).
    if 2 <= distinct_labels <= len(dbscan_labels) - 1:
        indice_db = davies_bouldin_score(df, dbscan_labels)
        indice_sil = silhouette_score(df, dbscan_labels, metric='euclidean')
    else:
        indice_db = indice_sil = float('nan')

    # The external index is defined for any pair of labelings.
    indice_cr = adjusted_rand_score(labels, dbscan_labels)

    return indice_db, indice_sil, indice_cr

##### De Execução

In [57]:
def plot_cluster_indices(df_original, df_corr, df_pca, labels, cluster_alg, **kwargs):
    import pandas as pd
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    db_list, sil_list, cr_list = [], [], []

    for index in range(2, 21):
        db_orig, sil_orig, cr_orig = {}, {}, {}
        db_corr, sil_corr, cr_corr = {}, {}, {}
        db_pca, sil_pca, cr_pca = {}, {}, {}
        
        match cluster_alg:
            case 'kmeans':
                db_orig, sil_orig, cr_orig = get_kmean_indices(df_original, labels, index)
                db_corr, sil_corr, cr_corr = get_kmean_indices(df_corr, labels, index)
                db_pca, sil_pca, cr_pca = get_kmean_indices(df_pca, labels, index)
            case 'hierarquico':
                db_orig, sil_orig, cr_orig = get_hierarquico_indices(df_original, labels, index, linkage=kwargs.get('linkage'))
                db_corr, sil_corr, cr_corr = get_hierarquico_indices(df_corr, labels, index, linkage=kwargs.get('linkage'))
                db_pca, sil_pca, cr_pca = get_hierarquico_indices(df_pca, labels, index, linkage=kwargs.get('linkage'))
            case 'em':
                db_orig, sil_orig, cr_orig = get_em_indices(df_original, labels, index, cov_type=kwargs.get('cov_type'))
                db_corr, sil_corr, cr_corr = get_em_indices(df_corr, labels, index, cov_type=kwargs.get('cov_type'))
                db_pca, sil_pca, cr_pca = get_em_indices(df_pca, labels, index, cov_type=kwargs.get('cov_type'))
            case 'dbscan':
                db_orig, sil_orig, cr_orig = get_dbscan_indices(df_original, labels, index, eps_value=kwargs.get('eps_value'))
                # db_corr, sil_corr, cr_corr = get_dbscan_indices(df_corr, labels, index, eps_value=kwargs.get('eps_value'))
                db_pca, sil_pca, cr_pca = get_dbscan_indices(df_pca, labels, index, eps_value=kwargs.get('eps_value'))

        db_value = {'Grupos': index, 'Base original': db_orig, 'Correlação': db_corr, 'PCA': db_pca}
        sil_value = {'Grupos': index, 'Base original': sil_orig, 'Correlação': sil_corr, 'PCA': sil_pca}
        cr_value = {'Grupos': index, 'Base original': cr_orig, 'Correlação': cr_corr, 'PCA': cr_pca}
        db_list.append(db_value)
        sil_list.append(sil_value)
        cr_list.append(cr_value)

    df_db = pd.DataFrame(db_list)
    df_sil = pd.DataFrame(sil_list)
    df_cr = pd.DataFrame(cr_list)

    df_indice_list = [df_db, df_sil, df_cr]
    fig = make_subplots(rows=1, cols=3, subplot_titles=('Índice - Davies-Bouldin', 'Índice - Silhouette', 'Índice - Adjusted Rand Score'))
    
    for index, df in enumerate(df_indice_list, 1):
      fig.add_trace(go.Scatter(x=df['Grupos'], y=df['Base original'], name='Base original'), row=1, col=index)
      fig.add_trace(go.Scatter(x=df['Grupos'], y=df['Correlação'], name='Correlação'), row=1, col=index)
      fig.add_trace(go.Scatter(x=df['Grupos'], y=df['PCA'], name='PCA'), row=1, col=index)

    fig.update_layout(title=f'Gráfico de {cluster_alg.upper()}', showlegend=True, boxmode='group')
    fig.show()

#### Aplicação dos modelos

In [None]:
# Count how many samples carry each value of the reference labels passed to the evaluation helpers below.
labels.value_counts()

In [59]:
# Columns excluded from the feature set before attribute selection and clustering.
# errors='ignore' keeps this cell idempotent: data_tfidf is reassigned in place,
# so a second run would otherwise raise KeyError for columns already removed.
data_tfidf = data_tfidf.drop(columns=['advogado_2', 'requerente_3', 'requerente_4',
                                      'requerente_5', 'requerente_6', 'requerente_7',
                                      'impugnado_2', 'impugnado', 'fiscal_de_lei_nome',
                                      'formal_request', 'free_judicial', 'judicial_secrecy', 'id'],
                             errors='ignore')

##### Seleção de atributos pela correlação

In [None]:
# Correlation-based reduction of data_tfidf; the second argument (3) is presumably the
# number of attributes to keep — TODO confirm against get_correlation_df.
reduced_dataset_corr_3 = get_correlation_df(data_tfidf, 3)
reduced_dataset_corr_3.head()

In [None]:
# Same reduction with the second argument set to 9 (presumably nine attributes kept —
# TODO confirm against get_correlation_df), as the alternative candidate.
reduced_dataset_corr_9 = get_correlation_df(data_tfidf, 9)
reduced_dataset_corr_9.head()

In [None]:
# Keep whichever correlation-based candidate get_best_representation scores higher against `labels`.
selected_df_corr = get_best_representation(reduced_dataset_corr_3, reduced_dataset_corr_9, labels)
selected_df_corr.head()

In [None]:
# Display data_tfidf to visually check it for missing (NA) values.
data_tfidf

In [64]:
# for column in data_tfidf.columns:
#     print(data_tfidf[column].value_counts())

##### Redução de dimensionalidade pelo PCA

In [None]:
# Two PCA candidates built from data_tfidf. Judging by the variable names, 0.9 is
# presumably a retained-variance fraction and 3 a number of components — TODO confirm
# against get_pca_df.
dataset_pca_90_cov = get_pca_df(data_tfidf, 0.9)

dataset_pca_3_cps = get_pca_df(data_tfidf, 3)
dataset_pca_3_cps.head()

In [None]:
# Keep whichever PCA candidate get_best_representation scores higher against `labels`.
selected_df_pca = get_best_representation(dataset_pca_90_cov, dataset_pca_3_cps, labels)
selected_df_pca.head()

##### Geração de gráficos

In [None]:
%pip install --upgrade jupyter notebook jupyterlab plotly

In [None]:
import nbformat
print(nbformat.__version__)

In [None]:
# K-Means indices for 2 to 20 clusters on the original, correlation-selected and PCA bases.
plot_cluster_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'kmeans')

In [None]:
# Agglomerative clustering with Ward linkage, same 2 to 20 sweep over the three bases.
plot_cluster_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'hierarquico', linkage='ward')

In [None]:
# Gaussian mixture (EM) with full covariance matrices, 2 to 20 components over the three bases.
plot_cluster_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'em', cov_type='full')

In [None]:
# DBSCAN sweep: the 2 to 20 range is used as min_samples, with eps fixed at 0.1.
# The error 'Number of labels is 1. Valid values are 2 to n_samples - 1 (inclusive)'
# means DBSCAN returned a single label (for instance every point flagged as noise);
# the silhouette and Davies-Bouldin indices are only defined for 2 to n_samples - 1
# distinct labels, so such partitions have to be skipped or eps / min_samples changed.
plot_cluster_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'dbscan', eps_value=.1)

In [None]:
# from sklearn.cluster import DBSCAN

# for i in [.1, .2, .3, .4, .5, .6, .7, .8]:
#     for index in range(2, 21):
#         dbscan = DBSCAN(eps=i, min_samples=index)
#         dbscan_labels = dbscan.fit_predict(data_tfidf)
        
#         n_noise = list(dbscan_labels).count(-1)
#         n_clusters = len(set(dbscan_labels)) - (1 if -1 in dbscan_labels else 0)
#         print(f'EPS: {i}, MIN_SAMPLES: {index}: {n_clusters} clusters, {n_noise} noise points')


#### Comitê de Agrupamento

##### Funções

In [74]:
from sklearn.cluster import KMeans, DBSCAN, AgglomerativeClustering
import numpy as np
import pandas as pd

def build_binary_matrix(clabels):
    """
    Build the co-association matrix of a single clustering.

    Parameters:
        clabels: 1-D sequence or array with one cluster label per sample.

    Returns:
        numpy.ndarray: Float matrix of shape ``(n, n)`` where entry ``(i, j)``
        is 1.0 when samples ``i`` and ``j`` carry the same label and 0.0
        otherwise. Identical labels always match, so two samples that both
        carry a noise marker such as -1 are reported as co-clustered.
    """
    # asarray lets plain lists work too: comparing a list with a scalar gives
    # a single bool instead of an element-wise result.
    clabels = np.asarray(clabels)

    # Broadcasting a column against a row compares every pair in one step.
    return (clabels[:, None] == clabels[None, :]).astype(float)

In [75]:
def build_similarity_matrix(models_labels):
    """
    Average the co-association matrices of several clusterings.

    Parameters:
        models_labels: 2-D array-like of shape ``(n_runs, n_data)``; row ``r``
            holds the labels that run ``r`` assigned to every sample.

    Returns:
        numpy.ndarray: ``(n_data, n_data)`` matrix whose entry ``(i, j)`` is the
        fraction of runs that put samples ``i`` and ``j`` in the same cluster.

    Raises:
        ValueError: If ``models_labels`` is not 2-D or contains no runs.
    """
    models_labels = np.asarray(models_labels)
    if models_labels.ndim != 2 or models_labels.shape[0] == 0:
        raise ValueError('models_labels must be a non-empty 2-D array of shape (n_runs, n_data)')

    n_runs, n_data = models_labels.shape

    sim_matrix = np.zeros((n_data, n_data))

    for run_labels in models_labels:
        sim_matrix += build_binary_matrix(run_labels)

    # Normalise by the number of runs. The previous divisor, `n_clusters`, is not
    # defined in this function and only resolved through leftover kernel state.
    return sim_matrix / n_runs

In [76]:
def get_kmeans_models(df, model_sizes):
    """
    Fit one K-Means model per requested size.

    Parameters:
        df: Feature matrix every model is fitted on.
        model_sizes: Iterable with the number of clusters of each model.

    Returns:
        list: Fitted ``KMeans`` models, in the order of ``model_sizes``.
    """
    from sklearn.cluster import KMeans

    clt_models = [KMeans(n_clusters=size, n_init=4, random_state=214)
                  for size in model_sizes]

    for model in clt_models:
        # Fit on the frame that was passed in. This used to read the notebook-level
        # `data_tfidf`, so every base produced the same ensemble.
        model.fit(df)

    return clt_models

In [77]:
def get_hierarquico_models(df, model_sizes):
    """
    Fit one agglomerative clustering model per requested size.

    Parameters:
        df: Feature matrix every model is fitted on.
        model_sizes: Iterable with the number of clusters of each model.

    Returns:
        list: Fitted ``AgglomerativeClustering`` models, in the order of
        ``model_sizes``.
    """
    from sklearn.cluster import AgglomerativeClustering

    fitted_models = []
    for size in model_sizes:
        fitted_models.append(AgglomerativeClustering(n_clusters=size))

    # Fitting happens only after every model has been constructed.
    for hierarchy in fitted_models:
        hierarchy.fit(df)

    return fitted_models
    

In [96]:
def get_em_models(df: pd.DataFrame, model_sizes):
    """
    Fit one Gaussian mixture per requested size and label the data with each.

    Unlike the other ``get_*_models`` helpers this returns the predicted labels,
    not the fitted estimators.

    Parameters:
        df: Feature matrix every mixture is fitted on and then predicts.
        model_sizes: Iterable with the number of components of each mixture.

    Returns:
        list: One label array per entry of ``model_sizes``, in the same order.
    """
    from sklearn.mixture import GaussianMixture

    mixtures = [GaussianMixture(n_components=size, random_state=214) for size in model_sizes]

    def _fit_then_label(mixture):
        mixture.fit(df)
        return mixture.predict(df)

    return [_fit_then_label(mixture) for mixture in mixtures]


In [116]:
def get_dbscan_models(df, NUM_MODELS, random_state=None):
    """
    Fit a set of DBSCAN models with randomly drawn hyper-parameters.

    ``eps`` is drawn uniformly from [0.1, 1.0) and ``min_samples`` uniformly
    from the integers 2 to 19, one pair per model.

    Parameters:
        df: Feature matrix every model is fitted on.
        NUM_MODELS: Either the number of models (int) or a sequence whose
            fourth entry (index 3) is that number, as the notebook passes it.
        random_state: Optional seed for the hyper-parameter draw. When ``None``
            (the default) numpy's global random state is used, as before.

    Returns:
        list: Fitted ``DBSCAN`` models.
    """
    from sklearn.cluster import DBSCAN

    if isinstance(NUM_MODELS, (int, np.integer)):
        n_models = NUM_MODELS
    else:
        n_models = NUM_MODELS[3]

    # np.random and RandomState expose the same uniform/randint API, so the
    # unseeded path still consumes the global stream exactly as it used to.
    rng = np.random if random_state is None else np.random.RandomState(random_state)

    eps_values = rng.uniform(.1, 1.0, size=n_models)
    min_samples_values = rng.randint(2, 20, size=n_models)

    dbscan_models = [DBSCAN(eps=eps, min_samples=ms) for eps, ms in zip(eps_values, min_samples_values)]

    for model in dbscan_models:
        model.fit(df)

    return dbscan_models

In [117]:
def get_similarity_matrix(clt_models):
    """
    Build the ensemble similarity matrix from fitted clustering models.

    Parameters:
        clt_models: Iterable of fitted models exposing a ``labels_`` attribute.

    Returns:
        The result of ``build_similarity_matrix`` applied to the stacked labels
        (one row per model).
    """
    per_model_labels = []
    for fitted in clt_models:
        per_model_labels.append(fitted.labels_)

    return build_similarity_matrix(np.array(per_model_labels))

In [None]:
def get_similarity_matrix_em(clt_labels):
    """
    Build the ensemble similarity matrix from already-predicted label arrays.

    Counterpart of ``get_similarity_matrix`` for ``get_em_models``, which
    returns labels instead of fitted models.

    Parameters:
        clt_labels: Sequence of label arrays, one per mixture, all of the same
            length.

    Returns:
        The result of ``build_similarity_matrix`` applied to the stacked labels.
    """
    # get_em_models returns a plain list, while build_similarity_matrix reads
    # `.shape`; stack the runs into a 2-D array first.
    models_labels = np.asarray(clt_labels)

    return build_similarity_matrix(models_labels)

In [118]:
def get_final_labels(sim_matrix, num_clusters):
    """
    Derive the consensus partition from an ensemble similarity matrix.

    Parameters:
        sim_matrix: Square similarity matrix, used as a precomputed affinity.
        num_clusters (int): Number of clusters in the consensus partition.

    Returns:
        The labels returned by ``SpectralClustering.fit_predict``.
    """
    from sklearn.cluster import SpectralClustering

    spectral_settings = {
        'n_clusters': num_clusters,
        'affinity': 'precomputed',
        'n_init': 5,
        'random_state': 214,
    }

    return SpectralClustering(**spectral_settings).fit_predict(sim_matrix)

In [119]:
def get_ensemble_indices(df, final_labels, labels):
    """
    Score a consensus partition.

    Parameters:
        df: Feature matrix the internal indices are computed on.
        final_labels: Consensus cluster labels to evaluate.
        labels: Reference labels, used only for the adjusted Rand score.

    Returns:
        tuple: ``(davies_bouldin, silhouette, adjusted_rand)``.
    """
    from sklearn.metrics import davies_bouldin_score, silhouette_score, adjusted_rand_score

    return (
        davies_bouldin_score(df, final_labels),
        silhouette_score(df, final_labels, metric='euclidean'),
        adjusted_rand_score(labels, final_labels),
    )

In [120]:
def plot_ensemble_indices(df_original, df_corr, df_pca, labels, cluster_type, **kwargs):
    import pandas as pd
    import plotly.graph_objects as go
    from plotly.subplots import make_subplots

    NUM_MODELS = [5, 10, 15, 25]
    MIN_NUM_CLUSTERS = 2
    MAX_NUM_CLUSTERS = 20

    model_sizes = np.random.randint(MIN_NUM_CLUSTERS, MAX_NUM_CLUSTERS+1, size=NUM_MODELS[3])

    db_list, sil_list, cr_list = [], [], []

    for index in range(2, 21):
        orig_models, corr_models, pca_models = [], [], []
        db_orig, sil_orig, cr_orig = {}, {}, {}
        db_corr, sil_corr, cr_corr = {}, {}, {}
        db_pca, sil_pca, cr_pca = {}, {}, {}

        labels_orig, labels_corr, labels_pca = [], [], []
        
        match cluster_type:
            case 'kmeans':
                orig_models = get_kmeans_models(df_original, model_sizes)
                corr_models = get_kmeans_models(df_corr, model_sizes)
                pca_models = get_kmeans_models(df_pca, model_sizes)
            case 'hierarquico':
                orig_models = get_hierarquico_models(df_original, model_sizes)
                corr_models = get_hierarquico_models(df_corr, model_sizes)
                pca_models = get_hierarquico_models(df_pca, model_sizes)
            case 'em':
                orig_models = get_em_models(df_original, model_sizes)
                corr_models = get_em_models(df_corr, model_sizes)
                pca_models = get_em_models(df_pca, model_sizes)
            case 'dbscan':
                orig_models = get_dbscan_models(df_original, NUM_MODELS)
                corr_models = get_dbscan_models(df_corr, NUM_MODELS)
                pca_models = get_dbscan_models(df_pca, NUM_MODELS)


        orig_sim_matrix = get_similarity_matrix(orig_models)
        corr_sim_matrix = get_similarity_matrix(corr_models)
        pca_sim_matrix = get_similarity_matrix(pca_models)

        orig_final_labels = get_final_labels(orig_sim_matrix, index)
        corr_final_labels = get_final_labels(corr_sim_matrix, index)
        pca_final_labels = get_final_labels(pca_sim_matrix, index)

        db_orig, sil_orig, cr_orig = get_ensemble_indices(df_original, orig_final_labels, labels)
        db_corr, sil_corr, cr_corr = get_ensemble_indices(df_corr, corr_final_labels, labels)
        db_pca, sil_pca, cr_pca = get_ensemble_indices(df_pca, pca_final_labels, labels)

        db_value = {'Grupos': index, 'Base original': db_orig, 'Correlação': db_corr, 'PCA': db_pca}
        sil_value = {'Grupos': index, 'Base original': sil_orig, 'Correlação': sil_corr, 'PCA': sil_pca}
        cr_value = {'Grupos': index, 'Base original': cr_orig, 'Correlação': cr_corr, 'PCA': cr_pca}
        db_list.append(db_value)
        sil_list.append(sil_value)
        cr_list.append(cr_value)

    df_db = pd.DataFrame(db_list)
    df_sil = pd.DataFrame(sil_list)
    df_cr = pd.DataFrame(cr_list)

    df_indice_list = [df_db, df_sil, df_cr]
    fig = make_subplots(rows=1, cols=3, subplot_titles=('Índice - Davies-Bouldin', 'Índice - Silhouette', 'Índice - Adjusted Rand Score'))
    
    for index, df in enumerate(df_indice_list, 1):
      fig.add_trace(go.Scatter(x=df['Grupos'], y=df['Base original'], name='Base original'), row=1, col=index)
      fig.add_trace(go.Scatter(x=df['Grupos'], y=df['Correlação'], name='Correlação'), row=1, col=index)
      fig.add_trace(go.Scatter(x=df['Grupos'], y=df['PCA'], name='PCA'), row=1, col=index)

    fig.update_layout(title=f'Gráfico de comitê de agrupamento com {cluster_type.upper()}', showlegend=True, boxmode='group')
    fig.show()

##### Execução

In [121]:
# Cluster ensemble built from K-Means members, evaluated on the three bases.
plot_ensemble_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'kmeans')


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


In [122]:
# Cluster ensemble built from agglomerative clustering members, evaluated on the three bases.
plot_ensemble_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'hierarquico')


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


Graph is not fully connected, spectral embedding may not work as expected.


In [123]:
# plot_ensemble_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'em')

In [124]:
# Cluster ensemble built from DBSCAN members with randomly drawn eps / min_samples.
plot_ensemble_indices(data_tfidf, selected_df_corr, selected_df_pca, labels, 'dbscan')

#### Teste Estatístico