In [44]:
import pandas as pd
import string
import re
import nltk
from nltk.stem import PorterStemmer

nltk.download('stopwords')
ps = PorterStemmer()

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/Patron/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
yelp = pd.read_csv("yelp_review.csv")
yelp.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100000 entries, 0 to 99999
Data columns (total 9 columns):
 #   Column       Non-Null Count   Dtype  
---  ------       --------------   -----  
 0   stars        100000 non-null  float64
 1   useful       100000 non-null  int64  
 2   text         100000 non-null  object 
 3   funny        100000 non-null  int64  
 4   review_id    100000 non-null  object 
 5   cool         100000 non-null  int64  
 6   date         100000 non-null  object 
 7   user_id      100000 non-null  object 
 8   business_id  100000 non-null  object 
dtypes: float64(1), int64(3), object(5)
memory usage: 6.9+ MB


In [4]:
yelp.text.head()

0    First time waxing. I read all the other review...
1    I love my manicure! Such an affordable place. ...
2    Great food, fast service, they try to crank pe...
3    Save your money and go across the street the f...
4    Oops...this was for the Main St location:\n\n-...
Name: text, dtype: object

In [16]:
# Change all text into lower-case
yelp['text_lower'] = yelp.text.apply(lambda x:x.lower())
yelp.text_lower.head()

0    first time waxing. i read all the other review...
1    i love my manicure! such an affordable place. ...
2    great food, fast service, they try to crank pe...
3    save your money and go across the street the f...
4    oops...this was for the main st location:\n\n-...
Name: text_lower, dtype: object

In [32]:
# Remove digits
text_lower_no_digits = []

# Iterate over all letters in each word and replace all digits with an empty string
for word in yelp.text_lower:
    letters = [letter for letter in word if not letter.isdigit()]
    text_lower_no_digits.append(''.join(letters))

yelp['text_lower_no_digits'] = pd.Series(text_lower_no_digits)
yelp.text_lower_no_digits.head()
# yelp.iloc[22,:].text_lower_no_digits

0    first time waxing. i read all the other review...
1    i love my manicure! such an affordable place. ...
2    great food, fast service, they try to crank pe...
3    save your money and go across the street the f...
4    oops...this was for the main st location:\n\n-...
Name: text_lower_no_digits, dtype: object

In [35]:
# Remove the punctuation
text_lower_no_digits_non_punct = []

# Iterate over all letters in each word and replace all punctuations with an empty string
for word in yelp.text_lower_no_digits:
    letters = [letter for letter in word if letter not in string.punctuation]
    text_lower_no_digits_non_punct.append(''.join(letters))


yelp['text_lower_no_digits_non_punct'] = pd.Series(text_lower_no_digits_non_punct)
yelp.text_lower_no_digits_non_punct.head()
# yelp.iloc[22,:].text_lower_no_digits_non_punct

0    first time waxing i read all the other reviews...
1    i love my manicure such an affordable place th...
2    great food fast service they try to crank peop...
3    save your money and go across the street the f...
4    oopsthis was for the main st location\n\n\n\nw...
Name: text_lower_no_digits_non_punct, dtype: object

In [38]:
# Tokenization

tokens = []

# Iterate over all reviews and split it into a list of words
for review in yelp.text_lower_no_digits_non_punct:
    split_words = re.split("\W+",review)
    tokens.append([word.lower() for word in split_words])


yelp['text_lower_no_digits_non_punct_split'] = pd.Series(tokens)
yelp.text_lower_no_digits_non_punct_split.head()

  split_words = re.split("\W+",word)


0    [first, time, waxing, i, read, all, the, other...
1    [i, love, my, manicure, such, an, affordable, ...
2    [great, food, fast, service, they, try, to, cr...
3    [save, your, money, and, go, across, the, stre...
4    [oopsthis, was, for, the, main, st, location, ...
Name: text_lower_no_digits_non_punct_split, dtype: object

In [40]:
# Remove stop words
nltk_stopwords = nltk.corpus.stopwords.words('english')

stopwords = []

# Iterate over each splitted review and remove all the stop words
for review in yelp.text_lower_no_digits_non_punct_split:
    non_stop = [word for word in review if word not in nltk_stopwords]
    stopwords.append(non_stop)

yelp['text_lower_no_digits_non_punct_split_non_stopwords'] = pd.Series(stopwords)
yelp.text_lower_no_digits_non_punct_split_non_stopwords.head()

0    [first, time, waxing, read, reviews, decided, ...
1    [love, manicure, affordable, place, manicurist...
2    [great, food, fast, service, try, crank, peopl...
3    [save, money, go, across, street, food, way, b...
4    [oopsthis, main, st, location, visited, aladdi...
Name: text_lower_no_digits_non_punct_split_non_stopwords, dtype: object

In [42]:
# Lemmatize/Stem
stemmed_list = []

# Iterate over each splitted review and remove and stem every word to its root form
for review in yelp['text_lower_no_digits_non_punct_split_non_stopwords']:
    stemmed = [ps.stem(word) for word in review]
    stemmed_list.append(stemmed)

yelp['text_lower_no_digits_non_punct_split_non_stopwords_stemmed'] = pd.Series(stemmed_list)
yelp.text_lower_no_digits_non_punct_split_non_stopwords_stemmed.head()

0    [first, time, wax, read, review, decid, trust,...
1    [love, manicur, afford, place, manicurist, kin...
2    [great, food, fast, servic, tri, crank, peopl,...
3    [save, money, go, across, street, food, way, b...
4    [oopsthi, main, st, locat, visit, aladdin, way...
Name: text_lower_no_digits_non_punct_split_non_stopwords_stemmed, dtype: object

## List five words that got removed after removing stop words.

+ all
+ the
+ other
+ my
+ such

## Please use two examples to briefly describe the purpose of ‘stemming’.

Stemming produces morphological variants of root words. For example, `location` changes to `locat`, This helps in grouping the words for further analysis, because if a review contains the word `located`, it should should fall under the same word tree as `location`. Same goes for `try` and `tried`, which fall under the root `tr`. This helps in classifying text into related clusters which can eventually help in deriving sentiment of the text.

In [43]:
# Store your pre-processed data in your Google Drive and name it as ‘yelp_review_cleaned.csv’
yelp.to_csv("yelp_review_cleaned.csv")