<a href="https://colab.research.google.com/github/nMishelRamirez/Project_IR/blob/preprocess/Preprocesamiento_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive into the Colab filesystem at /content/drive so that
# later cells can read and write files under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%writefile /content/drive/MyDrive/preprocessing_prueba.py
# ======================= #
#    LIBRERÍA NLP BASE   #
# ======================= #
import nltk
import contractions
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Fetch the NLTK data packages this module relies on (tokenizer models,
# stopword lists and WordNet data). quiet=True suppresses progress output.
for _resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# ======================= #
#  PREPROCESAMIENTO NLP   #
# ======================= #

def preprocess(corpus, lang='english', return_all=False):
    """
    Run the full preprocessing pipeline over a corpus of documents.

    Steps applied to every document, in order:
        1. Expand contractions with ``contractions.fix``.
        2. Lowercase the expanded text and tokenize it with ``word_tokenize``
           (so the ``tokenized`` output is already lowercase).
        3. Normalize: keep only tokens for which ``str.isalpha()`` is true.
           This drops punctuation, numbers and mixed tokens.
        4. Remove the stopwords of ``lang``.
        5. Stem the filtered tokens with ``PorterStemmer``.
        6. Lemmatize the filtered tokens with ``WordNetLemmatizer``.

    Stemming and lemmatization are both computed from the filtered tokens;
    the lemmatizer does not receive the stemmed output.

    Args:
        corpus: Iterable of strings, one per document. A single string is
            treated as a corpus containing that one document instead of
            being iterated character by character.
        lang: Language name passed to ``stopwords.words`` (default
            ``'english'``). It only selects the stopword list; the
            tokenizer is called with its default settings.
        return_all: If True, return the output of every intermediate step.
            If False, return only the stemmed and lemmatized results.

    Returns:
        If ``return_all`` is True:
            ``(tokenized, normalized, filtered, stemmed, lemmatized)``
        Otherwise:
            ``(stemmed, lemmatized)``
        Each element is a list with one list of tokens per document, in
        the same order as ``corpus``.
    """
    # A bare string is itself iterable; without this guard every character
    # would be processed as a separate document.
    if isinstance(corpus, str):
        corpus = [corpus]

    stop_words = set(stopwords.words(lang))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    tokenized, normalized, filtered, stemmed, lemmatized = [], [], [], [], []

    for doc in corpus:
        # Expand contractions before tokenizing so they become full words.
        expanded = contractions.fix(doc)

        # 1. Tokenization (on lowercased text).
        tokens = word_tokenize(expanded.lower())
        tokenized.append(tokens)

        # 2. Normalization: alphabetic tokens only.
        normalized_tokens = [token for token in tokens if token.isalpha()]
        normalized.append(normalized_tokens)

        # 3. Stopword removal. No separate punctuation check is needed:
        # every token reaching this point already passed str.isalpha().
        filtered_tokens = [
            token for token in normalized_tokens
            if token not in stop_words
        ]
        filtered.append(filtered_tokens)

        # 4. Stemming.
        stemmed.append([stemmer.stem(token) for token in filtered_tokens])

        # 5. Lemmatization (from the filtered tokens, not the stems).
        lemmatized.append(
            [lemmatizer.lemmatize(token) for token in filtered_tokens]
        )

    if return_all:
        return tokenized, normalized, filtered, stemmed, lemmatized
    return stemmed, lemmatized


Overwriting /content/drive/MyDrive/preprocessing_prueba.py
