In [1]:
# Load the libraries
import pandas as pd
import numpy as np
import re

# Import from nltk
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk import bigrams, FreqDist

In [2]:
# Read the CSV at the path below into the `data` DataFrame
data = pd.read_csv(filepath_or_buffer='../../Datasets/yelp_labelled_processed/yelp_labelled_raw.csv')

In [3]:
# Preview the first 5 review texts
data['text'].head(5)

0    The new rule is - \r\nif you are waiting for a...
1    Flirted with giving this two stars, but that's...
2    I was staying at planet Hollywood across the s...
3    Food is good but prices are super expensive.  ...
4    Worse company to deal with they do horrible wo...
Name: text, dtype: object

## Using NLTK - the old-fashioned way

#### Get the stop words

In [4]:
# Get the stop words

# Words that must survive stop-word removal (mostly negations and intensifiers),
# so they are taken out of the stop-word list.
words_to_remove = ["not", "too", "very", "don", "don't", "should", "should've", "aren", "aren't",
                   "couldn", "couldn't", "didn", "didn't", "doesn", "doesn't", "hadn", "hadn't",
                   "hasn", "hasn't", "haven", "haven't", "isn", "isn't", "shouldn", "shouldn't",
                   "wasn", "wasn't", "weren", "weren't", "won", "won't", "wouldn", "wouldn't",
                   "shan", "shan't", "mightn", "mightn't", "mustn", "mustn't", "needn", "needn't"]

# NLTK English stop words minus the words above, kept as a list in the original order.
# Filtering (rather than calling list.remove() per word) avoids a ValueError that
# would abort the cell if a word is absent from the installed stop-word corpus.
stop_words = [word for word in stopwords.words('english') if word not in words_to_remove]

#### Set the lemmatizer

In [5]:
# Set the lemmatizer: a single WordNetLemmatizer instance that tokenize() reuses for every token
lemmatizer = WordNetLemmatizer()

#### Create the tokenizer

In [6]:
# Create the tokenizer
def tokenize(review):
    """Normalize one review into a space-separated string of lemmatized tokens.

    Steps: lower-case the text, split it with NLTK's ``word_tokenize``, drop
    tokens that contain no letter or digit, drop tokens found in the
    module-level ``stop_words``, and lemmatize what is left with the
    module-level ``lemmatizer``.

    Parameters
    ----------
    review : str
        Raw review text. Any non-string value (e.g. ``None``, or the float NaN
        pandas uses for a missing cell) is treated as an empty review.

    Returns
    -------
    str
        The surviving tokens joined by single spaces; ``''`` when nothing
        survives.
    """
    if not isinstance(review, str):
        # A non-string has no .lower(); return an empty review instead of crashing.
        return ''

    # Set membership is O(1) per token; building the set on each call keeps the
    # function correct even if stop_words is changed after this cell has run.
    excluded = set(stop_words)

    kept = []
    for token in word_tokenize(review.lower()):
        # [^\W_] matches a letter or digit only; plain \w would also accept
        # underscores and let tokens such as '___' through.
        if re.search(r'[^\W_]', token) is None:
            continue
        if token in excluded:
            continue
        kept.append(lemmatizer.lemmatize(token))  # reduce each token to its base form

    return ' '.join(kept)  # return the tokenized review as a string

#### Tokenize the reviews

In [7]:
# Tokenize the reviews
# NOTE: this overwrites the 'text' column with its tokenized form, so the raw
# text is no longer available in `data` afterwards, and re-running this cell
# feeds already-tokenized text back through tokenize().
data['text'] = data['text'].apply(tokenize)

In [8]:
# Preview the first 5 tokenized reviews
data['text'].head(5)

0    new rule waiting table almost always cant wait...
1    flirted giving two star 's pretty damning rati...
2    staying planet hollywood across street saw goo...
3    food good price super expensive 8 buck extra l...
4    worse company deal horrible work bring truck b...
Name: text, dtype: object