<a href="https://colab.research.google.com/github/nMishelRamirez/Project_IR/blob/main/Preprocesamiento_.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive; a later cell writes
# preprocessing.py under /content/drive/MyDrive.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [None]:
%%writefile /content/drive/MyDrive/preprocessing.py
# ======================= #
#    LIBRERÍA NLP BASE   #
# ======================= #
import nltk
import contractions
import string
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
import pandas as pd

# Download the NLTK resources used by this module.
# Every download is quiet so that importing the module prints no
# download logs.
for _resource in ('punkt', 'stopwords', 'wordnet', 'omw-1.4', 'punkt_tab'):
    nltk.download(_resource, quiet=True)

# ======================= #
#  PREPROCESAMIENTO NLP   #
# ======================= #

def preprocess(corpus, lang='english', return_all=False):
    """Tokenize, normalize, stop-word filter, stem and lemmatize documents.

    Each document has its line breaks replaced by spaces, is stripped, and
    has its contractions expanded before tokenization. Missing values
    (``None``/NaN/NA) are treated as empty documents, and any other
    non-string value is converted with ``str()``.

    Note: ``lang`` is only passed to the stop-word list. The tokenizer is
    called without a language argument, and stemming/lemmatization use
    ``PorterStemmer`` and ``WordNetLemmatizer`` regardless of ``lang``.

    Args:
        corpus: Iterable of documents (one entry per document).
        lang: Language name passed to ``stopwords.words``.
        return_all: When true, also return the intermediate stages.

    Returns:
        If ``return_all`` is true, the tuple
        ``(tokenized, normalized, filtered, stemmed, lemmatized)``;
        otherwise ``(stemmed, lemmatized)``. Every element is a list with
        one token list per input document, in input order.
    """
    stop_words = set(stopwords.words(lang))
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()

    tokenized, normalized, filtered, stemmed, lemmatized = [], [], [], [], []

    for doc in corpus:
        doc = _as_text(doc).replace("\n", " ").replace("\r", " ").strip()
        doc = contractions.fix(doc)

        # 1. Tokenization
        tokens = word_tokenize(doc)
        tokenized.append(tokens)

        # 2. Normalization: keep purely alphabetic tokens, lower-cased.
        # isalpha() already rejects punctuation and numeric tokens, so no
        # separate punctuation check is needed.
        normalized_tokens = [
            token.lower() for token in tokens if token.isalpha()
        ]
        normalized.append(normalized_tokens)

        # 3. Stop-word filter
        filtered_tokens = [
            token for token in normalized_tokens if token not in stop_words
        ]
        filtered.append(filtered_tokens)

        # 4. Stemming (applied to the filtered tokens)
        stemmed.append([stemmer.stem(token) for token in filtered_tokens])

        # 5. Lemmatization (also applied to the filtered tokens, not to
        # the stems)
        lemmatized.append(
            [lemmatizer.lemmatize(token) for token in filtered_tokens]
        )

    if return_all:
        return tokenized, normalized, filtered, stemmed, lemmatized
    return stemmed, lemmatized


def _as_text(doc):
    """Return *doc* as a string, mapping missing values to ``''``."""
    if isinstance(doc, str):
        return doc
    # is_scalar guards pd.isna, which returns an array for list-like input.
    if pd.api.types.is_scalar(doc) and pd.isna(doc):
        return ""
    return str(doc)


def preprocess_dataframe(df, lang='english', return_all=False):
    """Preprocess ``df['raw']`` and store the results as columns of *df*.

    The DataFrame is modified in place and also returned.

    Args:
        df: DataFrame with a ``'raw'`` column holding the documents.
        lang: Forwarded to ``preprocess``.
        return_all: When true, the columns ``'tokenized'``,
            ``'normalized'``, ``'filtered'``, ``'stemmed'`` and
            ``'lemmatized'`` are written; otherwise only ``'stemmed'`` and
            ``'lemmatized'``.

    Returns:
        The same ``df`` object, with the new columns added.
    """
    processed_docs = preprocess(df['raw'], lang=lang, return_all=return_all)

    if return_all:
        columns = ('tokenized', 'normalized', 'filtered', 'stemmed', 'lemmatized')
    else:
        columns = ('stemmed', 'lemmatized')

    for column, values in zip(columns, processed_docs):
        # Bind the results to df.index explicitly. Assigning an object with
        # a fresh 0..n-1 index would be aligned by label and produce NaN or
        # misplaced rows whenever df has a non-default index.
        df[column] = pd.Series(values, index=df.index, dtype=object)

    return df


Writing /content/drive/MyDrive/preprocessing.py
