# Text Preprocessing Review

In [1]:
# importing libraries & packages
import nltk
import string
import pandas as pd

from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import *
from nltk.stem import WordNetLemmatizer 
  
# Fetch the NLTK resources this notebook relies on so it runs on a fresh
# environment; packages that are already installed are reported as up-to-date.
# NOTE(review): recent NLTK releases look for 'punkt_tab' rather than 'punkt' --
# download that one as well if word_tokenize() raises a LookupError.
nltk.download('punkt')      # tokenizer models needed by sent_tokenize() / word_tokenize()
nltk.download('stopwords')  # stop word lists read through stopwords.words('english')
nltk.download('wordnet')    # WordNet data backing WordNetLemmatizer

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/msonjap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /Users/msonjap/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


True

## Text Data

In [2]:
# loading the Yelp ratings CSV (44530 samples) into a DataFrame
yelp_ratings_path = "./data/yelp_ratings.csv"
yelp_ratings = pd.read_csv(yelp_ratings_path)

In [3]:
# previewing the first five rows of the DataFrame
yelp_ratings.head(n=5)

Unnamed: 0,text,stars,sentiment
0,Total bill for this horrible service? Over $8G...,1.0,0
1,I *adore* Travis at the Hard Rock's new Kelly ...,5.0,1
2,I have to say that this office really has it t...,5.0,1
3,Went in for a lunch. Steak sandwich was delici...,5.0,1
4,Today was my second out of three sessions I ha...,1.0,0


In [4]:
# pulling the review text, sentiment label and star rating columns out of the
# DataFrame as three separate arrays
yelp_reviews, yelp_reviews_labels, yelp_reviews_stars = (
    yelp_ratings[column].values for column in ("text", "sentiment", "stars")
)

In [5]:
# previewing the first five raw review strings
yelp_reviews[:5]

array(['Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH! Avoid Hospital ERs at all costs.',
       "I *adore* Travis at the Hard Rock's new Kelly Cardenas Salon!  I'm always a fan of a great blowout and no stranger to the chains that offer this service; however, Travis has taken the flawless blowout to a whole new level!  \n\nTravis's greets you with his perfectly green swoosh in his otherwise perfectly styled black hair and a Vegas-worthy rockstar outfit.  Next comes the most relaxing and incredible shampoo -- where you get a full head message that could cure even the very worst migraine in minutes --- and the scented shampoo room.  Travis has freakishly strong fingers (in a good way) and use the perfect amount of pressure.  That was superb!  Then starts the glorious blowout... where not one, not two, but THREE people were involved in doing the best round-brush ac

In [6]:
# sample review used throughout the walkthrough, written one sentence per line;
# adjacent string literals are joined into a single string
raw_yelp = (
    "Total bill for this horrible service? "
    "Over $8Gs. "
    "These crooks actually had the nerve to charge us $69 for 3 pills. "
    "I checked online the pills can be had for 19 cents EACH!"
)
raw_yelp

'Total bill for this horrible service? Over $8Gs. These crooks actually had the nerve to charge us $69 for 3 pills. I checked online the pills can be had for 19 cents EACH!'

## Tokenize sentences

In [7]:
# splitting the sample review into a list with one entry per sentence
tokenized_sentences = sent_tokenize(raw_yelp, language="english")
tokenized_sentences

['Total bill for this horrible service?',
 'Over $8Gs.',
 'These crooks actually had the nerve to charge us $69 for 3 pills.',
 'I checked online the pills can be had for 19 cents EACH!']

## Convert words to lowercase

In [8]:
# converting the review to lowercase with the built-in str.lower() method
# (a method of str objects; the `string` module is not involved here)
lowercase_yelp = raw_yelp.lower()
lowercase_yelp

'total bill for this horrible service? over $8gs. these crooks actually had the nerve to charge us $69 for 3 pills. i checked online the pills can be had for 19 cents each!'

## Tokenize words

In [9]:
# breaking the lowercased review into word-level tokens
# (punctuation marks come out as tokens of their own)
tokenized_yelp = word_tokenize(lowercase_yelp, language="english")
tokenized_yelp

['total',
 'bill',
 'for',
 'this',
 'horrible',
 'service',
 '?',
 'over',
 '$',
 '8gs',
 '.',
 'these',
 'crooks',
 'actually',
 'had',
 'the',
 'nerve',
 'to',
 'charge',
 'us',
 '$',
 '69',
 'for',
 '3',
 'pills',
 '.',
 'i',
 'checked',
 'online',
 'the',
 'pills',
 'can',
 'be',
 'had',
 'for',
 '19',
 'cents',
 'each',
 '!']

## Remove stop words and punctuation

In [10]:
# building one list of tokens to discard: nltk's English stop words followed by
# every single character in string.punctuation
stpwrds = [*stopwords.words('english'), *string.punctuation]

In [11]:
# keeping only the tokens that are neither stop words nor punctuation marks
stpwrds_removed = [
    token
    for token in tokenized_yelp
    if token not in stpwrds
]
stpwrds_removed

['total',
 'bill',
 'horrible',
 'service',
 '8gs',
 'crooks',
 'actually',
 'nerve',
 'charge',
 'us',
 '69',
 '3',
 'pills',
 'checked',
 'online',
 'pills',
 '19',
 'cents']

### Stemming

In [12]:
# creating a PorterStemmer instance; its stem() method is applied to each
# remaining token in the next cell
stemmer = PorterStemmer()

In [13]:
# stemming every token that survived stop word removal
stemmed_sentence = list(map(stemmer.stem, stpwrds_removed))
stemmed_sentence

['total',
 'bill',
 'horribl',
 'servic',
 '8g',
 'crook',
 'actual',
 'nerv',
 'charg',
 'us',
 '69',
 '3',
 'pill',
 'check',
 'onlin',
 'pill',
 '19',
 'cent']

### Lemmatizing

In [14]:
# creating a WordNetLemmatizer and lemmatizing two sample words with an explicit part of speech
lemmatizer = WordNetLemmatizer()
# pos tags: "n": noun (the default when pos is omitted), "v": verb, "a": adjective
print(lemmatizer.lemmatize("was", pos="v"))     # treated as a verb -> prints "be"
print(lemmatizer.lemmatize("better", pos="a"))  # treated as an adjective -> prints "good"

be
good


In [15]:
# lemmatizing every remaining token as a verb (pos="v"); the resulting list is
# the cell's displayed output
[
    lemmatizer.lemmatize(token, pos="v")
    for token in stpwrds_removed
]

['total',
 'bill',
 'horrible',
 'service',
 '8gs',
 'crook',
 'actually',
 'nerve',
 'charge',
 'us',
 '69',
 '3',
 'pills',
 'check',
 'online',
 'pills',
 '19',
 'cents']

## Processing the full corpus

- Make the text lowercase, tokenize it, and remove punctuation and stop words from the corpus.

- Use sent_tokenize() if you want sentences separated.


```
processed_sentences = []
for s in sent_tokenize(sentences):
     low_tokens = word_tokenize(s.lower())
     processed_sentences.append([w for w in low_tokens if w not in stpwrds])
```

In [16]:
# don't use sent_tokenize() if you want the full string to be processed/tokenized as a whole.
# `stpwrds` is a list, so testing each token against it rescans the list every
# time; a set built once up front gives the same answers with constant-time
# lookups, which matters when looping over the whole corpus.
stpwrds_lookup = set(stpwrds)
processed_sentences = []
for s in yelp_reviews:
    # lowercase the review, then split it into word-level tokens
    low_tokens = word_tokenize(s.lower())
    # keep only the tokens that are neither stop words nor punctuation
    processed_sentences.append([w for w in low_tokens if w not in stpwrds_lookup])

In [17]:
# inspecting the filtered tokens of the review at index 10
processed_sentences[10]

['wow',
 'surprised',
 'one',
 'two',
 'star',
 'reviews',
 'started',
 'tender',
 'calamari',
 'although',
 'marinara',
 'sauce',
 'bit',
 'bland',
 'touch',
 'salt',
 'made',
 'right',
 'husband',
 'veal',
 'peppers',
 'said',
 'delicious',
 'tender',
 'mashed',
 'potatoes',
 'perfect',
 'salmon',
 'diablo',
 'also',
 'delicious',
 'salad',
 'beautiful',
 'dressing',
 'served',
 'salad',
 'nice',
 'amount',
 'ended',
 'delicious',
 'meal',
 'piece',
 'tiramisu',
 'server',
 'matt',
 'right',
 'pleasant',
 'knowledgeable',
 'menu',
 'appetizer',
 'salad',
 'entrees',
 'timed',
 'perfectly',
 'love',
 'salad',
 'mind',
 'entree',
 'served',
 'still',
 'eating',
 'problem',
 'let',
 'dinner',
 'cool',
 'right',
 'temp',
 'eat',
 'comfortably',
 'wonder',
 'sometimes',
 'people',
 "n't",
 'appreciate',
 'relaxing',
 'taking',
 'time',
 'eat',
 'wonderful',
 'beautifully',
 'prepared',
 'meal',
 'wonderful',
 'atmosphere',
 'relaxing',
 'chairs',
 'super',
 'comfortable',
 'certainly',
 '