In [2]:
import pandas as pd

# URL converted to the raw (raw.githubusercontent.com) format
url = 'https://raw.githubusercontent.com/nferrucho/NPL/main/proyecto/amazon_reviews.csv'

# Read the CSV file directly from GitHub
df = pd.read_csv(url)

# Show the first rows of the DataFrame
df.head()


Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion
0,75a98b82-b222-442b-b8e6-dba1dff5eec0,Jason,Would be nice if the search function actually ...,1,3,28.16.0.100,2024-09-08 18:48:32,28.16.0.100
1,cd5e35df-eda2-43e7-a78c-76ae5c8b880a,Rashid Ali,Good,5,0,28.16.0.100,2024-09-08 18:44:39,28.16.0.100
2,62dba633-af93-4fd8-a42b-3d03f085b99d,Octávio Viana,Uma aldrabice o desconto de 10% para compras a...,1,0,,2024-09-08 18:15:12,
3,846f512f-428a-4147-a790-27e20a0af439,Christiaan Burger,Do not use Amazon,1,0,28.13.6.100,2024-09-08 18:01:03,28.13.6.100
4,bebf8fc4-6640-47e6-b3f1-d674c244c309,Diane L,Missing words when you want to go to account s...,3,261,28.16.0.100,2024-09-08 17:46:29,28.16.0.100


In [3]:
# Keep a handle on the review text column ('content') and preview it
datos = df.content
print(datos.head())

0    Would be nice if the search function actually ...
1                                                 Good
2    Uma aldrabice o desconto de 10% para compras a...
3                                    Do not use Amazon
4    Missing words when you want to go to account s...
Name: content, dtype: object


In [10]:
import re

# Función mejorada para normalizar y tokenizar texto
def normalizar_tokenizar(texto):
  """Normalize a text and split it into lowercase tokens.

  The text is lowercased, special characters are removed and the result is
  split on whitespace. Numbers and percentages (e.g. '123', '45.67', '23.5%')
  are kept intact and in their original position: a '.' is preserved only
  when it sits between two digits, and '%' is always preserved.

  Parameters
  ----------
  texto : str or missing value
      Raw text. Missing values (NaN/None) and blank strings are accepted.

  Returns
  -------
  list of str
      The tokens in order of appearance, or an empty list when the input is
      missing, blank, or cannot be processed.

  Notes
  -----
  Any exception raised while processing is reported with ``print`` and an
  empty list is returned, so the function can be used with ``Series.apply``
  without aborting on a malformed row.
  """
  try:
    # Missing values and whitespace-only strings produce no tokens.
    if pd.isna(texto) or texto.strip() == '':
      return []

    # Lowercasing never changes digits or '%', so numbers and percentages can
    # stay where they are instead of being cut out and re-appended at the end
    # (which lost their position and duplicated the number inside each
    # percentage).
    texto = texto.lower()

    # Remove special characters but keep word characters, whitespace, '%' and
    # any '.' placed between two digits (decimal point). The first two
    # alternatives delete every '.' that is not surrounded by digits.
    texto = re.sub(r'(?<!\d)\.|\.(?!\d)|[^\w\s%.]', '', texto)

    # Tokenize on whitespace.
    return texto.split()

  except Exception as e:
    # Best effort: report the offending text and keep going with no tokens.
    print(f"Error procesando el texto: {texto}")
    print(f"Error: {e}")
    return []

In [None]:
# Apply 'normalizar_tokenizar' to the 'content' column
df['review_text_tokenizado'] = df['content'].apply(normalizar_tokenizar)

# Show the first rows with the new column
print(df[['review_text_tokenizado']].head())

In [4]:
import spacy

# Load the spaCy model
# NOTE(review): "en_core_web_sm" is an English pipeline, while the data also
# contains non-English reviews (e.g. row 2) — confirm this is intended.
nlp = spacy.load("en_core_web_sm")
# Run the pipeline on the first review as a smoke test and print the result
doc = nlp(datos[0])
print(doc)

Would be nice if the search function actually worked correctly. Shows very little of what I actually searched


In [5]:

#TEST_CELL
# Install the unidecode package, which a later cell imports.
!pip install unidecode

Collecting unidecode
  Downloading Unidecode-1.3.8-py3-none-any.whl.metadata (13 kB)
Downloading Unidecode-1.3.8-py3-none-any.whl (235 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m235.5/235.5 kB[0m [31m3.6 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: unidecode
Successfully installed unidecode-1.3.8


In [6]:
import pandas as pd
import re
from unidecode import unidecode

def normalizar_tokenizarSpacy(texto):
  """Tokenize a text with spaCy, keeping only non-stop-word nouns and verbs.

  The text is run through the module-level ``nlp`` pipeline. A token is kept
  when it is not flagged as a stop word and its part-of-speech tag is 'NOUN'
  or 'VERB'; kept tokens are returned lowercased, in order of appearance.

  Parameters
  ----------
  texto : str or missing value
      Raw text. Missing values (NaN/None) and blank strings are accepted.

  Returns
  -------
  list of str
      The selected lowercase tokens, or an empty list when the input is
      missing, blank, or an exception occurs (the error is printed).
  """
  try:
    # Missing values and whitespace-only strings produce no tokens.
    sin_contenido = pd.isna(texto) or texto.strip() == ''
    if sin_contenido:
      return []

    categorias_validas = {'NOUN', 'VERB'}
    seleccionados = []
    for tok in nlp(texto):
      # Skip stop words first, then anything that is not a noun or a verb.
      if tok.is_stop:
        continue
      if tok.pos_ not in categorias_validas:
        continue
      seleccionados.append(tok.text.lower())
    return seleccionados
  except Exception as err:
    # Best effort: report the offending text and keep going with no tokens.
    print(f"Error procesando el texto: {texto}")
    print(f"Error: {err}")
    return []


In [7]:
# Apply 'normalizar_tokenizarSpacy' to the 'content' column
df['content_tokenizado'] = df['content'].apply(normalizar_tokenizarSpacy)

# Show the first rows with the new column
print(df[['content', 'content_tokenizado']].head())

                                             content  \
0  Would be nice if the search function actually ...   
1                                               Good   
2  Uma aldrabice o desconto de 10% para compras a...   
3                                  Do not use Amazon   
4  Missing words when you want to go to account s...   

                                  content_tokenizado  
0        [search, function, worked, shows, searched]  
1                                                 []  
2                  [aldrabice, o, %, para, euros, o]  
3                                              [use]  
4  [missing, words, want, account, settings, hit,...  


In [8]:
# Show only the tokenized column for the first rows
print(df[['content_tokenizado']].head())

                                  content_tokenizado
0        [search, function, worked, shows, searched]
1                                                 []
2                  [aldrabice, o, %, para, euros, o]
3                                              [use]
4  [missing, words, want, account, settings, hit,...


In [18]:
def unir_lista(lista):
  """Join the strings in ``lista`` into a single space-separated string."""
  separador = ' '
  return separador.join(lista)

In [19]:
# Join each token list back into a single string and show the first rows
df['normalizado'] = df['content_tokenizado'].apply(unir_lista)
print(df[['normalizado']].head())


                                         normalizado
0              search function worked shows searched
1                                                   
2                         aldrabice o % para euros o
3                                                use
4  missing words want account settings hit body p...


In [20]:
# Preview the DataFrame, including the columns added by the cells above
df.head()

Unnamed: 0,reviewId,userName,content,score,thumbsUpCount,reviewCreatedVersion,at,appVersion,content_tokenizado,normalizado
0,75a98b82-b222-442b-b8e6-dba1dff5eec0,Jason,Would be nice if the search function actually ...,1,3,28.16.0.100,2024-09-08 18:48:32,28.16.0.100,"[search, function, worked, shows, searched]",search function worked shows searched
1,cd5e35df-eda2-43e7-a78c-76ae5c8b880a,Rashid Ali,Good,5,0,28.16.0.100,2024-09-08 18:44:39,28.16.0.100,[],
2,62dba633-af93-4fd8-a42b-3d03f085b99d,Octávio Viana,Uma aldrabice o desconto de 10% para compras a...,1,0,,2024-09-08 18:15:12,,"[aldrabice, o, %, para, euros, o]",aldrabice o % para euros o
3,846f512f-428a-4147-a790-27e20a0af439,Christiaan Burger,Do not use Amazon,1,0,28.13.6.100,2024-09-08 18:01:03,28.13.6.100,[use],use
4,bebf8fc4-6640-47e6-b3f1-d674c244c309,Diane L,Missing words when you want to go to account s...,3,261,28.16.0.100,2024-09-08 17:46:29,28.16.0.100,"[missing, words, want, account, settings, hit,...",missing words want account settings hit body p...
