## 2. Preprocesado

In [1]:
!pip install beautifulsoup4
!pip install spacy
!pip install nltk



In [2]:
import pandas as pd
import numpy as np
from bs4 import BeautifulSoup
from spacy.lang.en.stop_words import STOP_WORDS
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')  #  para lematización
from nltk.stem import WordNetLemmatizer
import unicodedata
import re
import string

[nltk_data] Downloading package wordnet to /Users/maru/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /Users/maru/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


### 2.1 Cargo el dataset

In [4]:
# Cargar el dataset balanceado desde el Notebook 1
df = pd.read_pickle('Outputs/data/df_beauty_balanced.pkl')
print(f"{len(df)} reviews")
print(f"Columnas: {df.columns.tolist()}")
df.head()

6000 reviews
Columnas: ['review', 'rating']


Unnamed: 0,review,rating
0,Sculpting Crean Use this product and find that...,5.0
1,Keep your money Foe the price one expects more...,1.0
2,Fell apart after a year Was good while it last...,1.0
3,Five Stars Works beautifully. Great for my cli...,5.0
4,Worst Product I recently purchased this produc...,1.0


### 2.2 Etiqueto las rating en positivo [1] y negativo [0]

- `0`: las menores de 3
- `1`: las mayor o igual a 3

In [5]:
def label_sentiment(rating):
    #Convierte a etiqueta binaria
    if rating < 3:
        return 1  
    else:
        return 0  

### 2.3 Convierto a minúsculas

In [6]:
def a_minusculas(texto):
    return texto.lower()

In [7]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
print(f"Procesado: {a_minusculas(df.iloc[0]['review'])}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: sculpting crean use this product and find that when i run out, i notice the difference in the tautness of my skin, especially around mouth and neck.


### 2.4 Aplico beatifulSoup para eliminar las etiquetas HTLM que pueda haber en el texto

In [8]:
def remove_html_tags(text):
    soup = BeautifulSoup(text, 'html.parser')
    return soup.get_text(separator=' ')

In [9]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
print(f"Procesado: {remove_html_tags(df.iloc[0]['review'])}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.


### 2.5 Elimino signos de puntuación del texto

In [10]:
def eliminar_puntuacion(texto):
    translator = str.maketrans('', '', string.punctuation)
    return texto.translate(translator)

In [11]:
# Ejemplo
print(f"Original: {df.iloc[62]['review']}")
print(f"Procesado: {eliminar_puntuacion(df.iloc[62]['review'])}")

Original: Nothing Special Didn’t work no where near like it seemed to for others. Did nothing but weigh my hair down with greasy product
Procesado: Nothing Special Didn’t work no where near like it seemed to for others Did nothing but weigh my hair down with greasy product


### 2.6 Estandarizo caracteres especiales y elimino tildes (á→a, é→e, etc)

In [12]:
def normalizar_unicode(texto):
    return texto.encode('ascii', 'ignore').decode('ascii')

In [13]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
print(f"Procesado: {normalizar_unicode(df.iloc[0]['review'])}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.


### 2.7 Eliminar números del texto 

- Los números en reviews de productos de belleza suelen ser poco informativos para análisis de sentimiento y no aportan valor semantico al modelo. 
- Reducimos ruido, simplificamos vocabulario

In [14]:
def eliminar_numeros(texto):
    return re.sub(r'\d+', '', texto)

In [15]:
# Ejemplo
print(f"Original: {df.iloc[2025]['review']}")
print(f"Procesado: {eliminar_numeros(df.iloc[2025]['review'])}")

Original: Worked great ONE time Item worked great the first time, but the next day when I tried to turn it on nothing worked. returned
Procesado: Worked great ONE time Item worked great the first time, but the next day when I tried to turn it on nothing worked. returned


### 2.8 Eliminar espacios blancos y espacios al inicio/final del texto.

In [16]:
def normalizar_espacios(texto):
    texto = re.sub(r'\s+', ' ', texto)
    return texto.strip()

In [17]:
# Ejemplo
print(f"Original: {df.iloc[5520]['review']}")
print(f"Procesado: {normalizar_espacios(df.iloc[5520]['review'])}")

Original: Works GREAT It's very stretchy
Procesado: Works GREAT It's very stretchy


### 2.9 Eliminar stopwords

Pero **manteniendo palabras de sentimiento negativo** como 'not', 'no', 'never', etc.

In [18]:
from nltk.corpus import stopwords

# Descargar stopwords si no están disponibles
import nltk
nltk.download('stopwords', quiet=True)

def eliminar_stopwords(texto):
    # Obtener stopwords en inglés
    stop_words = set(stopwords.words('english'))
    
    # Palabras de negación a mantener
    negation_words = {'not', "n't", 'no', 'never', 'neither', 'nobody', 'nothing', 
                      'nowhere', 'none', 'nor', "don't", "doesn't", "didn't", 
                      "won't", "wouldn't", "shouldn't", "can't", "cannot", "couldn't"}
                      
    stop_words = stop_words - negation_words
    
    # Filtrar palabras
    palabras = texto.split()
    palabras_filtradas = [palabra for palabra in palabras if palabra.lower() not in stop_words]
    
    return ' '.join(palabras_filtradas)

In [19]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
print(f"Procesado: {eliminar_stopwords(df.iloc[0]['review'])}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: Sculpting Crean Use product find run out, notice difference tautness skin, especially around mouth neck.


### 2.10 Tokenización separando el texto en palabras

In [20]:
from nltk.tokenize import word_tokenize

def tokenizar(texto):
    return word_tokenize(texto)

In [21]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
print(f"Procesado: {tokenizar(df.iloc[0]['review'])}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: ['Sculpting', 'Crean', 'Use', 'this', 'product', 'and', 'find', 'that', 'when', 'I', 'run', 'out', ',', 'I', 'notice', 'the', 'difference', 'in', 'the', 'tautness', 'of', 'my', 'skin', ',', 'especially', 'around', 'mouth', 'and', 'neck', '.']


### 2.11 Lematización para usar en los modelos de ML con TF-IDF (Reduzco la dimensionalidad y ayudo al modelo a aprender mejor los datos aumentando la coincidencia de palabras)

In [22]:
from nltk.stem import WordNetLemmatizer

def lematizar_texto(tokens):
    lemmatizer = WordNetLemmatizer()
    return [lemmatizer.lemmatize(token) for token in tokens]

In [23]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
tokens = tokenizar(df.iloc[0]['review'])
print(f"Procesado: {lematizar_texto(tokens)}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: ['Sculpting', 'Crean', 'Use', 'this', 'product', 'and', 'find', 'that', 'when', 'I', 'run', 'out', ',', 'I', 'notice', 'the', 'difference', 'in', 'the', 'tautness', 'of', 'my', 'skin', ',', 'especially', 'around', 'mouth', 'and', 'neck', '.']


### 2.12 Eliminar tokens de menos de 3 caracteres (tokens cortos)

- Tokens de 1-2 caracteres suelen ser poco informativos (artículos, preposiciones)
- Mantiene palabras significativas como 'not', 'bad', 'buy', 'use'

In [24]:
def filtrar_tokens_cortos(tokens, min_length=3):
    return [token for token in tokens if len(token) >= min_length]

In [25]:
# Ejemplo
print(f"Original: {df.iloc[0]['review']}")
tokens = tokenizar(df.iloc[0]['review'])
print(f"Procesado: {filtrar_tokens_cortos(tokens)}")

Original: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
Procesado: ['Sculpting', 'Crean', 'Use', 'this', 'product', 'and', 'find', 'that', 'when', 'run', 'out', 'notice', 'the', 'difference', 'the', 'tautness', 'skin', 'especially', 'around', 'mouth', 'and', 'neck']


## 2.2 Pipeline completo de preprocesado

In [26]:
def preprocesado(texto, 
                   usar_stopwords=True, 
                   usar_lematizacion=True, 
                   filtrar_cortos=True,
                   min_length=3):
    
    # Verificar que la review no esté vacía
    if not texto or not isinstance(texto, str):
        return ""

    # Pasar a minusculas
    texto = a_minusculas(texto)
    
    # Eliminar etiquetas
    texto = remove_html_tags(texto)
    
    # Eliminar caracteres especiales
    texto = normalizar_unicode(texto)
    
    # Eliminar puntuación
    texto = eliminar_puntuacion(texto)
    
    # Eliminar números
    texto = eliminar_numeros(texto)
    
    # Normalizar espacios
    texto = normalizar_espacios(texto)
    
    # Eliminar stopwords (opcional)
    if usar_stopwords:
        texto = eliminar_stopwords(texto)
    
    # Tokenización
    tokens = tokenizar(texto)
    
    # Lematización (opcional)
    if usar_lematizacion:
        tokens = lematizar_texto(tokens)
    
    # Filtrar tokens cortos (opcional)
    if filtrar_cortos:
        tokens = filtrar_tokens_cortos(tokens, min_length)
    
    # Unir tokens en texto
    return ' '.join(tokens)

In [27]:
# Ejemplo completo del pipeline
review_ejemplo=df.iloc[2025]['review']

print("TEXTO ORIGINAL:")
print(review_ejemplo)

print("TEXTO PREPROCESADO:")
print(preprocesado(review_ejemplo))

print("SIN STOPWORDS NI LEMATIZACIÓN:")
print(preprocesado(review_ejemplo, usar_stopwords=False, usar_lematizacion=False))

TEXTO ORIGINAL:
Worked great ONE time Item worked great the first time, but the next day when I tried to turn it on nothing worked. returned
TEXTO PREPROCESADO:
worked great one time item worked great first time next day tried turn nothing worked returned
SIN STOPWORDS NI LEMATIZACIÓN:
worked great one time item worked great the first time but the next day when tried turn nothing worked returned


## 2.3 Aplicar preprocesado al dataset completo

In [28]:
# Aplicar la función de preprocesado
df['review_processed_ML'] = df['review'].apply(
    lambda x: preprocesado(x, usar_stopwords=True, usar_lematizacion=True, filtrar_cortos=True)
)

In [29]:
#Ejemplos:
print(f"ORIGINAL: {df.iloc[0]['review'][:150]}")
print(f"PROCESADO: {df.iloc[0]['review_processed_ML'][:150]}")

ORIGINAL: Sculpting Crean Use this product and find that when I run out, I notice the difference in the tautness of my skin, especially around mouth and neck.
PROCESADO: sculpting crean use product find run notice difference tautness skin especially around mouth neck


#### Revisamos si hay alguna review vacia y eliminamos

In [30]:
empty_reviews = df[df['review_processed_ML'].str.strip() == '']
print(f"\nReviews vacías después del preprocesado: {len(empty_reviews)}")



Reviews vacías después del preprocesado: 6


In [31]:
df = df[df['review_processed_ML'].notna() & (df['review_processed_ML'].str.strip() != '')]
print(f"Total de reviews después del filtrado: {len(df)}")


Total de reviews después del filtrado: 5994


In [32]:
# Calcular reducción de vocabulario
from collections import Counter

# Vocabulario original
vocab_original = set()
for text in df['review']:
    vocab_original.update(str(text).lower().split())

# Vocabulario procesado
vocab_procesado = set()
for text in df['review_processed_ML']:
    vocab_procesado.update(str(text).split())

print(f"Vocabulario original: {len(vocab_original):,} palabras únicas")
print(f"Vocabulario procesado: {len(vocab_procesado):,} palabras únicas")
print(f"Reducción: {(1 - len(vocab_procesado)/len(vocab_original))*100:.1f}% palabras únicas")

Vocabulario original: 19,121 palabras únicas
Vocabulario procesado: 9,569 palabras únicas
Reducción: 50.0% palabras únicas


In [33]:
# Aplicar etiquetado al dataframe usando la función label_sentiment
df['label_sentiment'] = df['rating'].apply(label_sentiment)

# Quedarse solo con las columnas especificadas
df_ML = df[['review_processed_ML','label_sentiment']].copy()


In [34]:
df_ML.head()

Unnamed: 0,review_processed_ML,label_sentiment
0,sculpting crean use product find run notice di...,0
1,keep money foe price one expects eye shadowyou...,1
2,fell apart year good lasted wasnt long brush f...,1
3,five star work beautifully great client sensit...,0
4,worst product recently purchased product terri...,1


## 2.4 Guardar dataset preprocesado

Guardamos el dataset con las reviews preprocesadas para usar en el Notebook 3.

In [35]:
# Guardar dataset preprocesado
output_path = 'Outputs/data/df_beauty_preprocessed_ML.pkl'
df_ML.to_pickle(output_path)

print(f"Dataset preprocesado guardado en: {output_path}")

Dataset preprocesado guardado en: Outputs/data/df_beauty_preprocessed_ML.pkl
