In [26]:
import re
import pandas as pd
import numpy as np
import nltk

In [133]:
df = pd.read_csv("attraction_point_reviews.csv")
df.tail()

Unnamed: 0,attraction_name,attraction_id,user_name,user_profile_link,review_date,helpful_votes,rating,review_link,review_text,review_title,experience_date
6970,Vernal Fall,g61000-d483481,Fairport Travelers,/Profile/Clarkvara,Jul 2008,560.0,4,/ShowUserReviews-g61000-d483481-r17854214-Vern...,Vernal Falls is sort of like the first leg of ...,Nice not too long hike,
6971,Vernal Fall,g61000-d483481,CAtravelfamily,/Profile/CAtravelfamily,Jun 2008,16.0,5,/ShowUserReviews-g61000-d483481-r17268549-Vern...,"Whew, it was a tough climb at times, but once ...",Worth the effort!,
6972,Vernal Fall,g61000-d483481,doodlebugakj,/Profile/doodlebugakj,Jul 2007,,5,/ShowUserReviews-g61000-d483481-r8255121-Verna...,This was a really fun hike but when we came ar...,Wow that was a lot of stairs!,
6973,Vernal Fall,g61000-d483481,Jase2153,/Profile/Jase2153,Sep 2005,9.0,5,/ShowUserReviews-g61000-d483481-r3910762-Verna...,I was visiting Yosemite from Australia and wen...,Worth The Trip,
6974,Vernal Fall,g61000-d483481,booradley2,/Profile/booradley2,Sep 2004,214.0,5,/ShowUserReviews-g61000-d483481-r2512847-Verna...,I've never been especially enthusiastic about ...,Do Not Miss Vernal Fall,


### Tokenization

In [None]:
from nltk.tokenize import word_tokenize

my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from \
the store. Should I pick up some black-eyed peas as well?"

print(word_tokenize(my_text))

### Tokenization (N-Grams)

In [134]:
from nltk.util import ngrams
my_words = word_tokenize(my_text) # This is the list of all words
twograms = list(ngrams(my_words,2)) # This is for two-word combos, but can pick any n
print(twograms)

[(&#39;James&#39;, &#39;Smith&#39;), (&#39;Smith&#39;, &#39;lives&#39;), (&#39;lives&#39;, &#39;in&#39;), (&#39;in&#39;, &#39;the&#39;), (&#39;the&#39;, &#39;United&#39;), (&#39;United&#39;, &#39;States&#39;), (&#39;States&#39;, &#39;.&#39;)]


### Tokenization (Regular Expressions)

In [135]:
from nltk.tokenize import RegexpTokenizer

# RegexpTokenizer with whitespace delimiter
whitespace_tokenizer = RegexpTokenizer("\s+", gaps=True)
print(whitespace_tokenizer.tokenize(my_text))


# RegexpTokenizer to match only capitalized words
cap_tokenizer = RegexpTokenizer("[A-Z]['\w]+")
print(cap_tokenizer.tokenize(my_text))

[&#39;James&#39;, &#39;Smith&#39;, &#39;lives&#39;, &#39;in&#39;, &#39;the&#39;, &#39;United&#39;, &#39;States.&#39;]
[&#39;James&#39;, &#39;Smith&#39;, &#39;United&#39;, &#39;States&#39;]


### Other Misc. Cleanups

In [None]:
import re # Regular expression library
import string

# Replace punctuations with a white space
clean_text = re.sub('[%s]' % re.escape(string.punctuation), ' ', my_text)
clean_text

clean_text = clean_text.lower()
clean_text

# Removes all words containing digits
clean_text = re.sub('\w*\d\w*', ' ', clean_text)
clean_text

### Preprocessing: Stop Words

In [137]:
from nltk.corpus import stopwords
set(stopwords.words('english'))

#Example impact with code

my_text = ["Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from \
the store. Should I pick up some black-eyed peas as well?"]

# Incorporate stop words when creating the count vectorizer
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(my_text)
pd.DataFrame(X.toarray(), columns=cv.get_feature_names())

### Preprocessing: Stemming & Lemmatizing

Can use PorterStemmer, LancasterStemmer, SnowballStemmer, WordNetLemmatizer

In [None]:
from nltk.stem.lancaster import LancasterStemmer

stemmer = LancasterStemmer()
# Try some stems
print('drive: {}’.format(stemmer.stem('drive')))
print('drives: {}'.format(stemmer.stem('drives')))
print('driver: {}'.format(stemmer.stem('driver')))
print('drivers: {}'.format(stemmer.stem('drivers')))
print('driven: {}'.format(stemmer.stem('driven')))

### POS Tagging With NLTK

In [128]:
from nltk.tag import pos_tag
my_text = "James Smith lives in the United States."
tokens = pos_tag(word_tokenize(my_text))
print(tokens)

#For help on the codes, use the below
# nltk.help.upenn_tagset()

[(&#39;James&#39;, &#39;NNP&#39;), (&#39;Smith&#39;, &#39;NNP&#39;), (&#39;lives&#39;, &#39;VBZ&#39;), (&#39;in&#39;, &#39;IN&#39;), (&#39;the&#39;, &#39;DT&#39;), (&#39;United&#39;, &#39;NNP&#39;), (&#39;States&#39;, &#39;NNPS&#39;), (&#39;.&#39;, &#39;.&#39;)]


### Named Entity Recognition

In [132]:
from nltk.chunk import ne_chunk
my_text = "James Smith lives in the United States."
tokens = pos_tag(word_tokenize(my_text)) # this labels each word as a part of speech
entities = ne_chunk(tokens) # this extracts entities from the list of words
# help(entities)

### Compound Term Extraction

In [None]:
from nltk.tokenize import MWETokenizer # multi-word expression
my_text = "You all are the greatest students of all time."
mwe_tokenizer = MWETokenizer([('You','all'), ('of', 'all', 'time')])
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
mwe_tokens