In [20]:
import re
import string
import nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords

# Download the NLTK resources this cell uses into the Codespace data
# directory, then add that directory to NLTK's data search path.
for _package in (
    'stopwords',
    'wordnet',
    'omw-1.4',  # Open Multilingual WordNet
):
    nltk.download(_package, download_dir='/home/codespace/nltk_data')
nltk.data.path.append('/home/codespace/nltk_data')

def basic_cleaning(text):
    """
    Normalize raw text for tokenization.

    Lowercases the text, deletes every character listed in
    ``string.punctuation``, deletes runs of digits, and collapses any run of
    whitespace into a single space (trimming both ends). Returns the
    cleaned string.
    """
    # Punctuation is deleted outright (not replaced by a space), so
    # "well-known" becomes "wellknown".
    punctuation_table = str.maketrans('', '', string.punctuation)
    lowered = text.lower().translate(punctuation_table)

    # Drop digit runs before whitespace is normalized so the gaps they
    # leave behind are collapsed by the final step.
    without_digits = re.sub(r'\d+', '', lowered)

    return re.sub(r'\s+', ' ', without_digits).strip()

def simple_tokenize(text):
    """
    Split a text into tokens on runs of whitespace.

    Returns a list of the whitespace-separated pieces; an empty or
    whitespace-only text yields an empty list.
    """
    tokens = text.split()
    return tokens

def remove_stopwords(tokens):
    """
    Filter English stop words out of a token list.

    Returns a new list holding, in their original order, the tokens that
    are not in NLTK's English stop-word list. The comparison is exact, so
    tokens are matched as given (no case folding is done here).
    """
    english_stopwords = frozenset(stopwords.words('english'))
    kept = []
    for token in tokens:
        if token in english_stopwords:
            continue
        kept.append(token)
    return kept

def lemmatize_tokens(tokens, pos='n'):
    """
    Lemmatize a list of tokens with NLTK's WordNetLemmatizer.

    Parameters:
        tokens: iterable of word strings to lemmatize.
        pos: WordNet part-of-speech tag applied to every token: 'n' (noun),
            'v' (verb), 'a' (adjective), 'r' (adverb) or 's' (satellite
            adjective). Defaults to 'n', which is what the lemmatizer uses
            when no tag is given, so existing callers get the same result.

    Returns:
        A list with one lemma per input token, in the same order.
    """
    lemmatizer = WordNetLemmatizer()
    # With the noun default, inflected verbs such as "playing" are returned
    # unchanged; pass pos='v' to reduce them to their base form.
    return [lemmatizer.lemmatize(token, pos=pos) for token in tokens]

# Usage example
# The sample text is written as adjacent literals (one sentence per line);
# Python joins them into a single string.
sentence = (
    'The children were playing in the leaves yesterday. '
    'She studies computer science and is taking three courses. '
    'The wolves howled at the moon while mice scurried in the grass. '
    'He was driving faster than the cars around him. '
    'The chefs used sharp knives to prepare the tastiest dishes=/'
)

# Step 1: basic cleaning
cleaned_sentence = basic_cleaning(sentence)
print("Após limpeza básica:", cleaned_sentence)

# Step 2: simple tokenization
tokens = simple_tokenize(cleaned_sentence)
print("Após tokenização:", tokens)

# Step 3: stop-word removal
filtered_tokens = remove_stopwords(tokens)
print("Após remoção de stopwords:", filtered_tokens)

# Step 4: lemmatization
lemmatized_tokens = lemmatize_tokens(filtered_tokens)
print("Após lematização:", lemmatized_tokens)

Após limpeza básica: the children were playing in the leaves yesterday she studies computer science and is taking three courses the wolves howled at the moon while mice scurried in the grass he was driving faster than the cars around him the chefs used sharp knives to prepare the tastiest dishes
Após tokenização: ['the', 'children', 'were', 'playing', 'in', 'the', 'leaves', 'yesterday', 'she', 'studies', 'computer', 'science', 'and', 'is', 'taking', 'three', 'courses', 'the', 'wolves', 'howled', 'at', 'the', 'moon', 'while', 'mice', 'scurried', 'in', 'the', 'grass', 'he', 'was', 'driving', 'faster', 'than', 'the', 'cars', 'around', 'him', 'the', 'chefs', 'used', 'sharp', 'knives', 'to', 'prepare', 'the', 'tastiest', 'dishes']
Após remoção de stopwords: ['children', 'playing', 'leaves', 'yesterday', 'studies', 'computer', 'science', 'taking', 'three', 'courses', 'wolves', 'howled', 'moon', 'mice', 'scurried', 'grass', 'driving', 'faster', 'cars', 'around', 'chefs', 'used', 'sharp', 'kni

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /home/codespace/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
