# Etapa de preprocesado de texto

In [28]:
!pip install num2words



In [29]:
# Librerias
import pandas as pd
import string
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from num2words import num2words
from google.colab import drive

In [30]:
# Montamos GDrive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [31]:
# Cargar dataset
df = pd.read_csv('drive/MyDrive/NLP/sampled_data_file.csv')

In [32]:
 #Descargar la lista de palabras vacías
nltk.download('stopwords')
stopwords = stopwords.words('english')
stopwords = set(stopwords) - set(['not', 'no', 'nor', 'but'])

[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [33]:
# Descarga el recurso punkt
nltk.download('punkt')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [34]:
# Descargar la base de datos de WordNet
nltk.download('wordnet')
nltk.download('omw-1.4')
# Crear el lematizador
lemmatizer = WordNetLemmatizer()

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


In [35]:
# Crear la función para preprocesar el texto

def preprocess(text):
    # Obtener todo el texto en minúsculas:
    text = text.lower()
    # Eliminar puntuación:
    text = text.translate(str.maketrans("", "", string.punctuation))
    # tokenizar el texto:
    text = nltk.word_tokenize(text)
    # Inicializar lematizador:
    lemmatizer = WordNetLemmatizer()
    # Obtener lista de palabras vacías (stopwords):
    stopwords = nltk.corpus.stopwords.words("english")
    # Inicializar lista para almacenar texto limpio:
    clean_text = []
    for word in text:
        #Eliminar palabras vacías:
        if word not in stopwords:
            #Lematizar el texto:
            token = lemmatizer.lemmatize(word)
            # Convertir dígitos a palabras:
            if token.isdigit():
                token = num2words(token, lang='en')
            clean_text.append(token)
    return clean_text

In [36]:
df['reviewText'][0]

'The helicopter stopped working after 10 minutes, the tail motor burn out making the helicopter useless, It took 4 weeks to get here, was shipped in a Styrofoam box that looked like it was taken out of the garage wrapped with shipping tape, the helicopter and other parts raddled around inside of the box did not even fit inside the cut outs.'

In [37]:
review = df['reviewText'][0]
preprocess(review)

['helicopter',
 'stopped',
 'working',
 'ten',
 'minute',
 'tail',
 'motor',
 'burn',
 'making',
 'helicopter',
 'useless',
 'took',
 'four',
 'week',
 'get',
 'shipped',
 'styrofoam',
 'box',
 'looked',
 'like',
 'taken',
 'garage',
 'wrapped',
 'shipping',
 'tape',
 'helicopter',
 'part',
 'raddled',
 'around',
 'inside',
 'box',
 'even',
 'fit',
 'inside',
 'cut',
 'out']

In [38]:
# Aplicar el preproceso en los datos
df['reviewText'] = df['reviewText'].apply(preprocess)

In [39]:
# Crear nuevo csv con el texto preprocesado
df_preprocessed = df[['overall', 'reviewText']]
df_preprocessed.to_csv('drive/MyDrive/NLP/df_preprocessed.csv', index=False)