# Instalar dependências necessárias

In [None]:
import pdfplumber
import pandas as pd
import os

pdf_folder = '../data'

# Recursively collect the path of every file under pdf_folder whose name ends in ".pdf"
pdf_files_path = [
  os.path.join(root, file)
  for root, dirs, files in os.walk(pdf_folder)
  for file in files
  if file.endswith('.pdf')
]

# raw_texts[i] holds the text extracted from pdf_files_path[i]
raw_texts = []

for pdf_file_path in pdf_files_path:
  with pdfplumber.open(pdf_file_path) as pdf:
    page_chunks = []
    for index, page in enumerate(pdf.pages):
      text = page.extract_text()
      # Every page except the first is prefixed with "page: <index>" (index is 0-based);
      # extract_needed_information_from_pdf_text later looks for the 'page:' token.
      prefix = f"page: {index} " if index > 0 else ''
      page_chunks.append(f"{prefix}{text}\n")

  raw_texts.append(''.join(page_chunks))

## Extrair o texto não estruturado do PDF

In [None]:
# One row per PDF, with the whole extracted text in the single "raw" column
dataframe = pd.DataFrame({'raw': raw_texts})

# Quick look at what was loaded. Note that DataFrame.count() returns the
# non-null count of each column (a Series), not a single number.
row_count, columns = dataframe.count(), dataframe.columns
print(f"Row count: {row_count}")
print(f"Columns: {columns}")
print(dataframe.head())

## Functions

In [None]:
def extract_text_based_on_appearance_in_text(text_to_extract: str, text: str, type: str = 'after'):
  """
  Extracts the part of a string that lies before or after a marker.

  The marker is interpolated into a regex lookbehind, so it is interpreted as a
  regular expression and must have a fixed width.

  Parameters:
    text_to_extract (str): The marker (regex) to look for.
    text (str): The input string.
    type (str): "after" returns the rest of the line that follows the first
      occurrence of the marker. "before" returns everything from the start of
      the string up to and including the last occurrence of the marker.

  Returns:
    str: The extracted text with surrounding whitespace stripped, or None when
      the marker does not occur.

  Raises:
    ValueError: If the type is neither "before" nor "after".
  """

  import re

  # Reject unknown modes up front so no pattern is built for them
  if type not in ('before', 'after'):
    raise ValueError('type must be either "before" or "after"')

  if type == 'before':
    # DOTALL lets the greedy ".*" run across line breaks
    found = re.search(rf'.*(?<={text_to_extract})', text, re.DOTALL)
  else:
    # Without DOTALL the match stops at the end of the marker's line
    found = re.search(rf'(?<={text_to_extract}).*', text)

  if found is None:
    return None

  return found.group().strip()


## Extract all essential data from raw data

In [None]:
import re
import datetime

def _text_before_role(flat_text: str, role_pattern: str):
  """Return the stripped text that precedes the first whitespace-separated role marker, or None."""
  match = re.search(rf'(.+?)\s+{role_pattern}', flat_text)
  return match.group(1).strip() if match else None


def _remove_words(text: str, phrase) -> str:
  """Delete the first occurrence of each space-separated word of phrase (plus one trailing newline)."""
  words = phrase.split(' ') if phrase is not None else []
  for word in words:
    # re.escape makes punctuation inside a word match literally instead of being
    # interpreted (or rejected with re.error) as regex syntax.
    text = re.sub(rf'{re.escape(word)}\n?', '', text, count=1)
  return text


def _remove_phrase(text: str, phrase) -> str:
  """Delete the first occurrence of phrase between word boundaries; a None phrase is a no-op."""
  if phrase is None:
    return text
  return re.sub(rf'\b{re.escape(phrase)}\b', '', text, count=1)


def _consume_party(text: str, role: str, strip_flat: bool = True):
  """Read the name in front of "(<role>)", print it, and strip name and role from text.

  Returns a (name, remaining_text) tuple; name is None when the marker is absent.
  """
  flat_text = text.replace('\n', ' ')
  if strip_flat:
    flat_text = flat_text.strip()
  name = _text_before_role(flat_text, rf'\({role}\)')
  print(name)

  text = _remove_words(text, name)
  # Drops the bare role word; the "()" it leaves behind is removed two lines below
  text = re.sub(rf'\b({role})\b', '', text, count=1)
  text = _remove_phrase(text, name)
  text = re.sub(r'\(\)', '', text, count=1)
  return name, text


def extract_needed_information_from_pdf_text(text: str):
  """
  Extracts the needed information from a given PDF text.

  The text is consumed progressively: each field is located with a regex and
  then removed from the working copy, so that what is left at the end is the
  document table and the body of the decision. Because every step works on
  what the previous steps left behind, the order of the statements matters.

  Parameters:
    text (str): The input text.

  Returns:
    dict: The extracted information. Fields that could not be found are None.
      'resultado' is 'DEFERIDO', 'INDEFERIDO' or 'NÃO DEFINIDO'.

  Raises:
    ValueError: If one of the document table header words ('Id.', 'Data', 'da',
      'Documento', 'Tipo', 'Assinatura') is missing, if the id, signature date
      or signature time cannot be found or parsed, or if there is no 'page:'
      marker in the remaining text.
  """

  # Remove redundant texts that are not useful for the analysis
  text = re.sub(r'\bTribunal Regional Eleitoral do Rio Grande do Norte\n\b', '', text, count=1)
  text = re.sub(r'\bPJe - Processo Judicial Eletrônico\n\b', '', text, count=1)
  text = re.sub(r'\bN úmero: \b', '', text, count=1)

  # Extracting the date of process
  date_pattern = r'\d{2}/\d{2}/\d{4}'
  date_match = re.search(date_pattern, text)
  date = date_match.group() if date_match else None
  print(date)

  text = re.sub(date_pattern, '\n', text, count=1)

  # Extracting the legal action number
  legal_action_number_pattern = r'\b\d{7}-\d{2}.\d{4}.\d{1}.\d{2}.\d{4}\b'
  legal_action_number_match = re.search(legal_action_number_pattern, text)
  legal_action_number = legal_action_number_match.group() if legal_action_number_match else None
  text = re.sub(legal_action_number_pattern, '\n', text, count=1)
  print(legal_action_number)

  # Extracting the labelled header fields.
  # NOTE(review): the values are interpolated into the removal patterns unescaped, so a
  # value that contains regex metacharacters is not removed here; it stays in the text.
  search = 'Classe:'
  legal_class = extract_text_based_on_appearance_in_text(search, text)
  print(legal_class)

  text = re.sub(rf'\b{search} {legal_class}\n\b', '', text, count=1)

  search = 'Órgão julgador:'
  tribunal = extract_text_based_on_appearance_in_text(search, text)
  print(tribunal)

  text = re.sub(rf'\b{search} {tribunal}\n\b', '', text, count=1)

  search = 'Última distribuição :'
  last_distribution = extract_text_based_on_appearance_in_text(search, text)
  print(last_distribution)

  text = re.sub(rf'\b{search} {last_distribution}\n\b', '', text, count=1)

  search = 'Valor da causa:'
  cause_cost = extract_text_based_on_appearance_in_text(search, text)
  print(f'cause cost: {cause_cost}')

  text = re.sub(rf'\b{search} {cause_cost}\n\b', '', text, count=1)
  text = re.sub(r'R\$', '', text, count=1)

  search = 'Processo referência:'
  reference_legal_action = extract_text_based_on_appearance_in_text(search, text)
  print(reference_legal_action)

  text = re.sub(rf'\b{search} {reference_legal_action}\n\b', '', text, count=1)

  search = 'Assuntos:'
  matters = extract_text_based_on_appearance_in_text(search, text)
  print(matters)

  text = re.sub(rf'\b{search} {matters}\n\b', '', text, count=1)

  search = 'Cargo -'
  position = extract_text_based_on_appearance_in_text(search, text)
  print(position)

  text = re.sub(rf'\b{search} {position}\n\b', '', text, count=1)

  # The goal of the legal action is everything between its label and the secrecy question
  legal_action_goal_string = 'Objeto do processo: '
  goal_end_marker = 'Segredo de Justiça?'
  extracted_text = re.search(f'{legal_action_goal_string}(.*?){goal_end_marker}', text, re.DOTALL)
  legal_action_goal = extracted_text.group(1).strip() if extracted_text else None
  print(legal_action_goal)

  text = re.sub(rf'\b{legal_action_goal_string}\b', '', text.strip(), count=1)
  if legal_action_goal is not None:
    # A missing goal stays None instead of raising AttributeError on .replace()
    legal_action_goal = legal_action_goal.replace('\n', ' ').strip()
    text = _remove_words(text, legal_action_goal)

  text = text.strip()

  # Raw strings: "\?" is not a valid escape sequence in a regular string literal
  search = r'Segredo de Justiça\?'
  judicial_secrecy = extract_text_based_on_appearance_in_text(search, text)
  print(judicial_secrecy)

  text = re.sub(rf'\b{search} {judicial_secrecy}\b', '', text, count=1)

  search = r'Justiça gratuita\?'
  free_judicial = extract_text_based_on_appearance_in_text(search, text)
  print(free_judicial)

  text = re.sub(rf'\b{search} {free_judicial}\b', '', text, count=1)

  search = r'Pedido de liminar ou antecipação de tutela\?'
  formal_request = extract_text_based_on_appearance_in_text(search, text)
  print(formal_request)

  text = re.sub(rf'\b{search} {formal_request}\b', '', text, count=1)

  text = re.sub(r'\bPartes Advogados\n\b', '', text, count=1)

  # Parties. Each lookup uses a lazy "(.+?)" that starts at the earliest possible
  # position, so it captures whatever is still left in front of the role marker.
  # The removal steps differ slightly from party to party and are kept as they were.
  requerente = _text_before_role(text.replace('\n', ' '), r'\(REQUERENTE\)')
  print(requerente)

  text = re.sub(r'\b(REQUERENTE)\b', '', text, count=1)
  text = _remove_words(text, requerente)
  text = re.sub(r'\(\)', '', text, count=1)

  impugnante = _text_before_role(text.replace('\n', ' ').strip(), r'\(IMPUGNANTE\)')
  print(f'impugnante: {impugnante}')

  text = re.sub(r'\b(IMPUGNANTE)\b', '', text, count=1)
  text = text.strip()
  text = _remove_words(text, impugnante)
  text = re.sub(r'\(\)', '', text, count=1)

  impugnante_2 = _text_before_role(text.replace('\n', ' ').strip(), r'\(IMPUGNANTE\)')
  print(f'impugnante: {impugnante_2}')

  text = re.sub(r'\b(IMPUGNANTE)\b', '', text, count=1)
  text = text.strip()
  text = _remove_words(text, impugnante_2)
  text = re.sub(r'\(\)', '', text, count=1)

  impugnado = _text_before_role(text.replace('\n', ' ').strip(), r'\(IMPUGNADO\)')
  print(f"impugnado: {impugnado}")

  text = re.sub(r'\b(IMPUGNADO)\b', '', text, count=1)
  text = _remove_phrase(text, impugnado)
  text = re.sub(r'\(\)', '', text, count=1)

  advogado, text = _consume_party(text, 'ADVOGADO', strip_flat=False)

  impugnado_2 = _text_before_role(text.replace('\n', ' ').strip(), r'\(IMPUGNADO\)')
  print(f"impugnado 2: {impugnado_2}")

  text = re.sub(r'\b(IMPUGNADO)\b', '', text, count=1)
  text = text.strip()
  text = _remove_words(text, impugnado_2)
  text = re.sub(r'\(\)', '', text, count=1)

  requerente_2, text = _consume_party(text, 'REQUERENTE')

  advogado_2 = _text_before_role(text.replace('\n', ' '), r'\(ADVOGADO\)')
  print(f"advogado 2: {advogado_2}")

  text = re.sub(r'\b(ADVOGADO)\b', '', text, count=1)
  text = _remove_phrase(text, advogado_2)
  text = re.sub(r'\(\)', '', text, count=1)

  requerente_3, text = _consume_party(text, 'REQUERENTE')
  requerente_4, text = _consume_party(text, 'REQUERENTE')
  requerente_5, text = _consume_party(text, 'REQUERENTE')
  requerente_6, text = _consume_party(text, 'REQUERENTE')
  requerente_7, text = _consume_party(text, 'REQUERENTE')
  requerente_8, text = _consume_party(text, 'REQUERENTE')

  # Name listed between "Outros participantes" and the "(FISCAL DA LEI)" marker
  outros_participantes_search = 'Outros participantes'
  fiscal_de_lei_search = r'\(FISCAL DA LEI\)'
  extracted_text = re.search(f'{outros_participantes_search}(.*?){fiscal_de_lei_search}', text, re.DOTALL)
  fiscal_de_lei_nome = extracted_text.group(1).strip().replace('\n', ' ') if extracted_text else None
  print(fiscal_de_lei_nome)

  text = re.sub(rf'\b{outros_participantes_search}\n\b', '', text, count=1)
  text = re.sub(r'\bFISCAL DA LEI\b', '', text, count=1)
  text = re.sub(r'\(\)', '', text, count=1)
  text = re.sub(r'\bPROMOTOR ELEITORAL DO ESTADO DO RIO GRANDE DO\nNORTE\b', '', text, count=1)

  text = re.sub(r'\bDocumentos\b', '', text, count=1)

  # From here on the remaining text is handled as a list of space-separated tokens
  text_treated_for_index = text.strip().replace('\n', ' ').split(' ')

  print(text_treated_for_index)

  # Drop the header words of the document table. list.remove raises ValueError when
  # a word is missing, which is how an unexpected layout is reported to the caller.
  for header_word in ('Id.', 'Data', 'da', 'Documento', 'Tipo', 'Assinatura'):
    text_treated_for_index.remove(header_word)

  print(text_treated_for_index)

  if len(text_treated_for_index) < 2:
    raise ValueError('document table is missing its id and/or signature date')

  document_id = text_treated_for_index.pop(0)
  print(f"id: {document_id}")

  data_da_assinatura = text_treated_for_index.pop(0)
  print(f"data: {data_da_assinatura}")

  # The signature time is the first token that starts with HH:MM
  hora_da_assinatura = next(
    (word for word in text_treated_for_index if re.match(r'\d{2}:\d{2}', word)),
    None,
  )
  if hora_da_assinatura is None:
    raise ValueError('no HH:MM signature time found in the document table')

  text_treated_for_index.remove(hora_da_assinatura)

  data_hora_da_assinatura = f'{data_da_assinatura} {hora_da_assinatura}'
  data_hora_da_assinatura_timestamp = datetime.datetime.strptime(data_hora_da_assinatura, '%d/%m/%Y %H:%M').isoformat()
  print(data_hora_da_assinatura_timestamp)

  # Document type: the first position at which one of the known type phrases starts.
  # Slice comparison cannot run past the end of the list, and deleting the matched
  # slice removes exactly the tokens that were recognised.
  type_phrases = (
    ['Sentença'],
    ['Petição'],
    ['Outros', 'documentos'],
    ['Parecer', 'da', 'Procuradoria'],
    ['Cota', 'ministerial'],
  )
  tipo = ''

  for start in range(len(text_treated_for_index)):
    matched_phrase = next(
      (phrase for phrase in type_phrases
       if text_treated_for_index[start:start + len(phrase)] == phrase),
      None,
    )
    if matched_phrase is not None:
      tipo = ' '.join(matched_phrase)
      del text_treated_for_index[start:start + len(matched_phrase)]
      break

  print(tipo)

  # Tokens before the 'page:' marker form the document name
  index = text_treated_for_index.index('page:')

  documento = ' '.join(text_treated_for_index[:index])
  print(documento)

  # Skip 'page:' and the token that follows it; the rest is the decision text
  detalhamento = ' '.join(text_treated_for_index[index+2:])
  print(detalhamento)

  # A denial keyword always wins over a granting keyword: "defiro" and "deferimento"
  # are substrings of "indefiro" and "indeferimento".
  detalhamento_lower = detalhamento.lower()
  if any(keyword in detalhamento_lower for keyword in ('indefiro', 'indeferindo-se', 'indeferimento')):
    resultado = 'INDEFERIDO'
  elif any(keyword in detalhamento_lower for keyword in ('defiro', 'deferimento', 'homologo')):
    resultado = 'DEFERIDO'
  else:
    resultado = 'NÃO DEFINIDO'

  print(resultado)

  data = {
    'advogado': advogado,
    'advogado_2': advogado_2,
    'data_hora_da_assinatura_timestamp': data_hora_da_assinatura_timestamp,
    'documento': documento,
    'detalhamento': detalhamento,
    'fiscal_de_lei_nome': fiscal_de_lei_nome,
    'formal_request': formal_request,
    'free_judicial': free_judicial,
    'id': document_id,
    'judicial_secrecy': judicial_secrecy,
    'last_distribution': last_distribution,
    'legal_action_goal': legal_action_goal,
    'legal_action_number': legal_action_number,
    'legal_class': legal_class,
    'matters': matters,
    'position': position,
    'reference_legal_action': reference_legal_action,
    'requerente': requerente,
    'requerente_2': requerente_2,
    'requerente_3': requerente_3,
    'requerente_4': requerente_4,
    'requerente_5': requerente_5,
    'requerente_6': requerente_6,
    'requerente_7': requerente_7,
    'requerente_8': requerente_8,
    'impugnante': impugnante,
    'impugnante_2': impugnante_2,
    'impugnado': impugnado,
    'impugnado_2': impugnado_2,
    'resultado': resultado,
  }

  return data



In [None]:
import pandas as pd

# Parse every raw text. Rows are collected in a plain list and the DataFrame is
# built once at the end: concatenating one row at a time copies the whole frame on
# every iteration, and the first concat onto an empty frame triggers a FutureWarning
# in recent pandas releases.
parsed_rows = []

# Paths of the PDFs whose text could not be parsed
pdf_files_path_with_error = []

for index, raw_text in enumerate(dataframe['raw']):
  try:
    parsed_rows.append(extract_needed_information_from_pdf_text(raw_text))
  except ValueError:
    # dataframe['raw'] was built from raw_texts, which is aligned with pdf_files_path
    pdf_files_path_with_error.append(pdf_files_path[index])

df = pd.DataFrame(parsed_rows)



In [None]:
# Show up to the first 300 parsed rows
df.head(300)

In [None]:
# Number of non-null values in each column
df.count()

In [None]:
# Paths of the PDFs for which the extraction raised ValueError
print(pdf_files_path_with_error)

In [None]:
# Rows whose resultado is 'NÃO DEFINIDO', i.e. no granting or denying keyword was found
not_defined = df[df.resultado == 'NÃO DEFINIDO']
not_defined

### Vetorizar documento

In [None]:
import nltk
import spacy
from nltk.corpus import stopwords 
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize

In [None]:
# Downloads required by NLTK
nltk.download('stopwords')  # Downloads the stop word list (common words) used when processing text
nltk.download('punkt')  # Downloads the Punkt tokenizer, needed for sentence tokenization

# Carregar o modelo de português para o spaCy
!python -m spacy download pt_core_news_sm

In [None]:
def remove_noise(text):
    """Return text lowercased, with URLs, punctuation, underscores and digits removed.

    Removed characters are deleted, not replaced by a space, so surrounding
    whitespace is left exactly as it was.
    """
    # URLs are stripped before lowercasing, so only a lowercase "http" prefix is
    # recognised as the start of a URL.
    without_urls = re.sub(r'http\S+', '', text)
    cleaned = without_urls.lower()

    # Applied in order:
    #   [^\w\s]  anything that is neither a word character nor whitespace (punctuation)
    #   _+       underscores, which \w counts as word characters
    #   \d+      runs of digits
    for pattern in (r'[^\w\s]', r'_+', r'\d+'):
        cleaned = re.sub(pattern, '', cleaned)

    return cleaned

In [None]:
def remove_stopwords(text):
  """Return text with NLTK's Portuguese stop words dropped and the remaining words joined by single spaces."""
  # A set gives constant-time membership tests
  portuguese_stopwords = set(stopwords.words('portuguese'))

  kept_words = []
  for word in text.split():
    if word in portuguese_stopwords:
      continue
    kept_words.append(word)

  return ' '.join(kept_words)

In [None]:
# Keep only the rows with a known outcome. The explicit copy makes `data` an
# independent frame: later cells assign new columns to it, which on a filtered
# slice of `df` would raise pandas' SettingWithCopyWarning.
data = df[df.resultado != 'NÃO DEFINIDO'].copy()
data

In [None]:
# Overwrite each detalhamento with the output of remove_noise
data['detalhamento'] = data['detalhamento'].apply(remove_noise)
data.head()

In [None]:
# Inspect the cleaned text of the second row (positional index 1)
data.iloc[1]['detalhamento']

In [None]:
# Overwrite each detalhamento with the output of remove_stopwords
data['detalhamento'] = data['detalhamento'].apply(remove_stopwords)
data.head()

In [None]:
# Sample document used for inspection.
# NOTE(review): 267 is looked up as an index label; this raises KeyError if that
# row was filtered out of `data` - confirm the label exists for the current data set.
text = data['detalhamento'][267]
text

In [None]:
# Use spaCy's Portuguese model for tokenization
nlp = spacy.load('pt_core_news_sm')

# # Pass the text through spaCy's processing pipeline. The result is a doc object, which holds the tokenized words and sentences along with other linguistic information.
# doc = nlp(text)

# # Extract sentences as tokens
# sentence_tokens = [sent.text for sent in doc.sents]

# print("Tokens:", sentence_tokens)

In [None]:
# tokens = [token.text for token in doc]
# print('Tokens:', tokens)

In [None]:
def tokenize_text(text):
    """Return the token strings that the module-level spaCy pipeline `nlp` produces for text."""
    return [piece.text for piece in nlp(text)]

# One list of tokens per document
data['tokens'] = data['detalhamento'].apply(tokenize_text)

In [None]:
# Tokens of the sample document (index label 267)
data['tokens'][267]

## Stemming e Lematização

In [None]:
# NOTE(review): 'punkt' is already downloaded in the earlier NLTK setup cell; this repeat looks redundant
nltk.download('punkt')

In [None]:
from nltk.stem import PorterStemmer

In [None]:
def apply_stemming(tokens):
    """Return the stem of every token, using NLTK's Snowball stemmer for Portuguese.

    The documents processed in this notebook are in Portuguese, while
    PorterStemmer implements suffix-stripping rules for English only.
    """
    # Function-scope import: SnowballStemmer is not among the names imported at cell level
    from nltk.stem import SnowballStemmer

    stemmer = SnowballStemmer('portuguese')
    return [stemmer.stem(token) for token in tokens]

In [None]:
# Stem every token list, then show the result for the sample document (index label 267)
data['stemming'] = data['tokens'].apply(apply_stemming)
data['stemming'][267]

In [None]:
# Stemming demo on a single sentence.
# NOTE(review): `frase` is not assigned anywhere in the visible notebook, so this
# cell raises NameError unless it is defined elsewhere - confirm where it comes from.
stemmer = PorterStemmer()

# Tokenize the sentence into words
palavras = word_tokenize(frase)

# Apply stemming to each word
stemmed_words = [stemmer.stem(word) for word in palavras]

print("Frase original:", frase)
print("Palavras após Stemming:", stemmed_words)
print()

In [None]:
def apply_lemming(doc):
  """
  Lemmatizes a list of tokens.

  Parameters:
    doc (list): The tokens of one document.

  Returns:
    list: The lemma of every token spaCy finds in the space-joined tokens.
  """
  # Reuse the module-level `nlp` pipeline (the same 'pt_core_news_sm' model that
  # tokenize_text uses) instead of calling spacy.load() again for every row.
  parsed = nlp(' '.join(doc))

  return [token.lemma_ for token in parsed]

In [None]:
# One list of lemmas per document
data['lemming'] = data['tokens'].apply(apply_lemming)

In [None]:
# Lemmas of the sample document (index label 267)
data['lemming'][267]

In [None]:
# Show every column of the third row (positional index 2)
data.iloc[2]

In [None]:
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer


# Join each document's lemmas back into a single space-separated string
lemma_documents = data['lemming'].apply(lambda lemmas: ' '.join(lemmas))

# Fit the vectorizer and transform the documents into a TF-IDF matrix
vectorizer_tfidf = TfidfVectorizer()
X_tfidf = vectorizer_tfidf.fit_transform(lemma_documents)
vocab = vectorizer_tfidf.get_feature_names_out()

# Dense copy of the sparse matrix, computed once and reused for both printouts
tfidf_dense = X_tfidf.toarray()
print("Representação TF-IDF:\n", tfidf_dense)
print("Vocabulário TF-IDF:\n", vocab)

print("Vocabulário TF-IDF:", vocab)
# Print the TF-IDF matrix with row and column labels
print("Matriz TF-IDF:")
print("Documento |", end="    ")
print("".join(f"{palavra}  " for palavra in vocab))
for numero, linha in enumerate(tfidf_dense, start=1):
    valores = "".join(f"{valor:.2f}    " for valor in linha)
    print(f"Documento {numero}:   {valores}")