In [39]:
import pandas as pd
import numpy as np
import re
import nltk
from nltk import TreebankWordTokenizer
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer

In [40]:
# Lemmatizer instance used by the lemmatization helper defined further down.
lemmatizer = WordNetLemmatizer()
# English stop-word list taken from the NLTK stopwords corpus.
sw_list = stopwords.words('english')

In [42]:
# Keep the negation words in the text: drop them from the stop-word list.
# The membership guard makes this cell safe to re-run; an unguarded
# list.remove() raises ValueError once the word is already gone.
for negation in ('not', 'no', 'nor'):
    if negation in sw_list:
        sw_list.remove(negation)

Se han eliminado de la lista de stop words las palabras 'not', 'nor' y 'no', ya que pueden aportar mucho significado a un modelo de Sentiment Analysis.
No es lo mismo decir:
* No me gusta
* Me gusta

In [43]:
# Load the reviews; lines=True makes pandas parse the file as one JSON object per line.
df = pd.read_json(path_or_buf='Sports_and_Outdoors_5.json', lines=True)

In [44]:
def lower_words(x):
    """
    Return a lowercase copy of the given text.

    Input:
    - x: string to convert

    Output:
    - the same text with every character lowercased
    """
    return x.lower()

In [45]:
def remove_punctuation(x):
    """
    Strip punctuation by keeping only runs of word characters.

    Every maximal run of letters, digits and underscores is extracted and
    the runs are re-joined with single spaces. Anything else (punctuation,
    symbols, whitespace) acts as a separator, so "don't" becomes "don t"
    and "3/32" becomes "3 32".

    Input:
    - x: string to clean

    Output:
    - string containing only the word-character runs, space separated
    """
    # Raw string: a plain '\w' literal is an invalid escape sequence and
    # triggers a DeprecationWarning/SyntaxWarning on recent Python versions.
    tokens = re.findall(r'\w+', x)

    return " ".join(tokens)

In [46]:
def remove_stopWords(x, stop_words):
    """
    Drop every whitespace-separated token that appears in stop_words.

    Input:
    - x: string to filter
    - stop_words: collection of tokens to discard (membership is tested
      with `in`, and the comparison is case-sensitive)

    Output:
    - the surviving tokens joined with single spaces
    """
    kept = []
    for token in x.split():
        if token in stop_words:
            continue
        kept.append(token)

    return ' '.join(kept)

In [47]:
def lemmmatization(x):
    """
    Lemmatize each whitespace-separated token of the text.

    Uses the module-level `lemmatizer` and calls it without a
    part-of-speech argument. Mapping inflected forms to a common lemma
    shrinks the vocabulary.

    Input:
    - x: string to lemmatize

    Output:
    - the lemmatized tokens joined with single spaces
    """
    lemmas = [lemmatizer.lemmatize(token) for token in x.split()]

    return ' '.join(lemmas)

In [48]:
def preprocessing(df, stop_words, lemmatize = False):
    """
    Preprocess a pandas Series of text.

    Steps, applied element-wise and in this order: lowercase, strip
    punctuation, remove stop words and, optionally, lemmatize.

    Input:
    - df: pandas Series of strings to preprocess
    - stop_words: iterable of stop words (hashable items, e.g. a list of str)
    - lemmatize: if True, lemmatize the tokens as a final step
      (default False)

    Output:
    - df: new pandas Series with the preprocessed text
    """
    # Build the lookup set once. Testing membership against a list costs a
    # linear scan per token, repeated for every token of every row.
    stop_set = frozenset(stop_words)

    df = df.apply(lower_words)
    df = df.apply(remove_punctuation)
    df = df.apply(lambda text: remove_stopWords(text, stop_set))

    if lemmatize:
        df = df.apply(lemmmatization)

    return df

In [49]:
# Clean the raw review text; lemmatization stays off (the function's default).
df_preprocessed = preprocessing(
    df['reviewText'],
    stop_words=sw_list,
    lemmatize=False,
)

In [50]:
# Attach the cleaned text to the loaded frame as a new column (modifies df in place).
df['text_processed'] = df_preprocessed

In [51]:
# Preview the first five cleaned reviews.
df_preprocessed.head(n=5)

0    came time veru happy haved used already makes ...
1    factory glock tool using glock 26 27 17 since ...
2    3 32 punch would like one glock bag okay butt ...
3    works no better 3 32 punch would find hardware...
4    purchased thinking maybe need special tool eas...
Name: reviewText, dtype: object

In [52]:
# Preview the first five rows, now including the text_processed column.
df.head(n=5)

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime,text_processed
0,1881509818,"[0, 0]",5,This came in on time and I am veru happy with ...,"01 26, 2014",AIXZKN4ACSKI,David Briner,Woks very good,1390694400,came time veru happy haved used already makes ...
1,1881509818,"[1, 1]",5,I had a factory Glock tool that I was using fo...,"02 2, 2012",A1L5P841VIO02V,Jason A. Kramer,Works as well as the factory tool,1328140800,factory glock tool using glock 26 27 17 since ...
2,1881509818,"[2, 2]",4,If you don't have a 3/32 punch or would like t...,"02 28, 2012",AB2W04NI4OEAD,J. Fernald,"It's a punch, that's all.",1330387200,3 32 punch would like one glock bag okay butt ...
3,1881509818,"[0, 0]",4,This works no better than any 3/32 punch you w...,"02 5, 2012",A148SVSWKTJKU6,"Jusitn A. Watts ""Maverick9614""",It's a punch with a Glock logo.,1328400000,works no better 3 32 punch would find hardware...
4,1881509818,"[0, 0]",4,I purchased this thinking maybe I need a speci...,"04 23, 2013",AAAWJ6LW9WMOO,Material Man,"Ok,tool does what a regular punch does.",1366675200,purchased thinking maybe need special tool eas...


In [53]:
# Save the frame with the new column. The row index is written as well
# (index=True is the default), so the CSV's first column has no header.
df.to_csv('reviews_preprocessed.csv', index=True)