In [354]:
import re
import pandas as pd
import numpy as np
from nltk.tokenize import word_tokenize, wordpunct_tokenize, sent_tokenize, RegexpTokenizer, regexp_tokenize, WhitespaceTokenizer
from nltk.tokenize import MWETokenizer
from nltk.stem import WordNetLemmatizer 
from nltk.corpus import stopwords, wordnet
from nltk.tag import pos_tag
from nltk.chunk import ne_chunk
from nltk.book import *
import spacy
import string
from collections import Counter

from nlp_cleaning import *

# Notebook display settings: show at most 50 rows, but never hide columns
# and never truncate the text inside a cell (reviews are long strings).
pd.set_option("display.max_rows", 50)
pd.set_option("display.max_columns", None)
# None means "no limit". The old -1 sentinel has been deprecated since
# pandas 1.0 and is no longer accepted by pandas 2.
pd.set_option("display.max_colwidth", None)
# Enable IPython's autoreload extension in mode 2, which re-imports modules
# before each cell executes so that edits to imported modules (e.g. the
# star-imported nlp_cleaning) are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [159]:
# Read the attraction-point reviews CSV (path relative to the notebook) into `df`.
df = pd.read_csv(filepath_or_buffer="attraction_point_reviews.csv")
# df.tail()  # uncomment to preview the last rows

# Cleaning the Text 

As part of cleaning the text before tokenizing it, various steps will be undertaken. 

#### But first, to get an initial feel for the data, the following steps are being undertaken:
1. All reviews are combined into a giant single string corpus.
1. All words are converted to lowercase.
1. Website links and email addresses are dropped.
1. Many words are joined to their neighbours only by punctuation (with no surrounding spaces), so simply deleting the punctuation would fuse those words together. The following characters are therefore replaced with whitespace instead: `< . * ? > ; - ! ( ) / , : & — \`
1. Everything except for letters & whitespace is dropped with no substitutions in between.
1. The string is tokenized on whitespace.
1. It is then converted into a word counter using FreqDist to explore frequencies and look at the various words used.

In [312]:
# Build one lowercase corpus string from every review. Missing reviews (NaN)
# are skipped and values are coerced to str so that str.join cannot fail on
# a non-string entry.
all_reviews = ' '.join(df.review_text.dropna().astype(str).tolist())
all_reviews = all_reviews.lower()

# Drop web links and e-mail addresses. Raw strings keep `\S` a regex escape
# rather than an invalid Python string escape (a warning on modern Python).
all_reviews = re.sub(r'http\S+', '', all_reviews)
all_reviews = re.sub(r'\S*@\S+', '', all_reviews)

# Punctuation that tends to glue two words together is replaced by a space ...
all_reviews = re.sub(r'[<.*?>;\-!()/,:&—\\]+', ' ', all_reviews)
# ... and every remaining character that is not a letter or whitespace is deleted.
all_reviews = re.sub(r'[^A-Za-z\s]', '', all_reviews)

# Split on whitespace and count how often each word occurs.
words = WhitespaceTokenizer().tokenize(all_reviews)
words_count = FreqDist(words)

# Distinct words at the length extremes, collected for manual inspection:
# more than 15 characters, and fewer than 4 characters.
long_words = [w for w in words_count if len(w) > 15]
small_words = [w for w in words_count if len(w) < 4]

**The above process was carried out iteratively so that as many long words as possible are captured as real words, rather than as separate words arbitrarily fused together once the punctuation between them is removed.**

## COME BACK TO TRY MORE CLEANING FOR:
1. letter repeats
2. Spell Check
1. Different languages
1. Named Entity Extraction
3. Identifying actual hyphenated words instead of separating them 
1. Removing words that are less than 4 letters perhaps - but selectively

The cleaning strategy was applied to the `review_text` of every review, and the result was stored in a new `review_clean` column:

In [348]:
# Pass every raw review through `cleaning` (not defined in this notebook's
# visible cells — presumably provided by nlp_cleaning) and keep the result in
# a new column, then show one randomly drawn cleaned review as a spot check.
df['review_clean'] = df['review_text'].map(cleaning)
df['review_clean'].sample()

Name: review_clean, dtype: object

In [353]:
# nlp = spacy.load('en', disable=[ 'parser', 'ner'])

In [352]:
# Row label of the review to inspect.
n = 7
# NOTE(review): `sp_nlp` is not defined in the visible cells — presumably a
# loaded spaCy pipeline exported by nlp_cleaning; confirm.
spacy_text = sp_nlp(df['review_clean'][n])

# Lemmas of every token except those tagged as pronouns.
# NOTE(review): tokens with a tag other than PRON can still carry the
# '-PRON-' placeholder lemma (e.g. possessives under spaCy 2); filter on the
# lemma as well if that is unwanted.
text_lemma = [token.lemma_ for token in spacy_text if not token.pos_ == 'PRON']

# Lemmatized text first, the cleaned original second, for side-by-side reading.
print(*text_lemma)
print(df['review_clean'][n])

# Coarse part-of-speech tag of each token, in order.
print([token.pos_ for token in spacy_text])

yes   have be here many time    once propose to -PRON- now wife at sunrise    be one of the most amazing view in the world   up there with the grand canyon    s not to like note    in either   or   the park service will close glacier point road   be postpone from    the only vehicle access   for a year or more in order to renovate    so go now
yes  weve been here many times   once i proposed to my now wife at sunrise   it is one of the most amazing views in the world  up there with the grand canyon   whats not to like note   in either  or  the park service will close glacier point road  it was postponed from   the only vehicle access  for a year or more in order to renovate it   so go now 
[&#39;INTJ&#39;, &#39;SPACE&#39;, &#39;PRON&#39;, &#39;VERB&#39;, &#39;AUX&#39;, &#39;ADV&#39;, &#39;ADJ&#39;, &#39;NOUN&#39;, &#39;SPACE&#39;, &#39;SCONJ&#39;, &#39;PRON&#39;, &#39;VERB&#39;, &#39;ADP&#39;, &#39;DET&#39;, &#39;ADV&#39;, &#39;NOUN&#39;, &#39;ADP&#39;, &#39;NOUN&#39;, &#39;SPACE&#39;,

In [330]:
# English stop words from the NLTK stopwords corpus, as a list.
stopwords_list = stopwords.words(fileids='english')

### Tokenization 

In [138]:
# Everything this demo cell needs, imported once at the top of the cell.
from nltk.tokenize import (
    RegexpTokenizer,
    blankline_tokenize,
    regexp_tokenize,
    word_tokenize,
    wordpunct_tokenize,
)
from nltk.util import ngrams

my_text = "Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from \
the store. Should I pick up some black-eyed peas as well?"

# Word-level tokenization; computed once and reused for the n-grams below.
my_words = word_tokenize(my_text)  # This is the list of all words
print(my_words)

# (N-Grams)
twograms = list(ngrams(my_words, 2))  # This is for two-word combos, but can pick any n
print(twograms)

# Regular Expressions
# The patterns are raw strings so that `\s`, `\w`, `\d` reach the regex engine
# unchanged instead of being treated as (invalid) Python string escapes.

# RegexpTokenizer with whitespace delimiter (gaps=True: the pattern matches
# the separators, not the tokens)
whitespace_tokenizer = RegexpTokenizer(r"\s+", gaps=True)
print(whitespace_tokenizer.tokenize(my_text))

# RegexpTokenizer to match only capitalized words; the pattern needs at least
# one character after the capital, so a lone "I" is not matched.
cap_tokenizer = RegexpTokenizer(r"[A-Z]['\w]+")
print(cap_tokenizer.tokenize(my_text))

s = "Good muffins cost $3.88\nin New York.  Please buy me\ntwo of them.\n\nThanks."

# Only the last expression of a cell is displayed automatically, so the
# intermediate results are printed explicitly.
print(regexp_tokenize(s, pattern=r'\w+|\$[\d\.]+|\S+'))
print(wordpunct_tokenize(s))
blankline_tokenize(s)


[&#39;Hi&#39;, &#39;Mr.&#39;, &#39;Smith&#39;, &#39;!&#39;, &#39;I&#39;, &#39;’&#39;, &#39;m&#39;, &#39;going&#39;, &#39;to&#39;, &#39;buy&#39;, &#39;some&#39;, &#39;vegetables&#39;, &#39;(&#39;, &#39;tomatoes&#39;, &#39;and&#39;, &#39;cucumbers&#39;, &#39;)&#39;, &#39;from&#39;, &#39;the&#39;, &#39;store&#39;, &#39;.&#39;, &#39;Should&#39;, &#39;I&#39;, &#39;pick&#39;, &#39;up&#39;, &#39;some&#39;, &#39;black-eyed&#39;, &#39;peas&#39;, &#39;as&#39;, &#39;well&#39;, &#39;?&#39;]
[(&#39;Hi&#39;, &#39;Mr.&#39;), (&#39;Mr.&#39;, &#39;Smith&#39;), (&#39;Smith&#39;, &#39;!&#39;), (&#39;!&#39;, &#39;I&#39;), (&#39;I&#39;, &#39;’&#39;), (&#39;’&#39;, &#39;m&#39;), (&#39;m&#39;, &#39;going&#39;), (&#39;going&#39;, &#39;to&#39;), (&#39;to&#39;, &#39;buy&#39;), (&#39;buy&#39;, &#39;some&#39;), (&#39;some&#39;, &#39;vegetables&#39;), (&#39;vegetables&#39;, &#39;(&#39;), (&#39;(&#39;, &#39;tomatoes&#39;), (&#39;tomatoes&#39;, &#39;and&#39;), (&#39;and&#39;, &#39;cucumbers&#39;), (&#39;cucumbers&#3

### Preprocessing: Stop Words

In [137]:
from nltk.corpus import stopwords

# NLTK's English stop-word list, bound to a name so it can be inspected
# (a bare expression in the middle of a cell is computed and then discarded).
english_stop_words = set(stopwords.words('english'))

# Example impact with code

my_text = ["Hi Mr. Smith! I’m going to buy some vegetables (tomatoes and cucumbers) from \
the store. Should I pick up some black-eyed peas as well?"]

# Incorporate stop words when creating the count vectorizer.
# `stop_words='english'` selects scikit-learn's own built-in list, which is
# independent of the NLTK list above.
# NOTE(review): CountVectorizer is not imported in any cell visible here —
# presumably it arrives via `from nlp_cleaning import *`; confirm, or import
# it explicitly from sklearn.feature_extraction.text in the imports cell.
cv = CountVectorizer(stop_words='english')
X = cv.fit_transform(my_text)

# get_feature_names() was removed in scikit-learn 1.2; use its replacement
# when available and fall back to the old name on older installations.
if hasattr(cv, "get_feature_names_out"):
    feature_names = cv.get_feature_names_out()
else:
    feature_names = cv.get_feature_names()
pd.DataFrame(X.toarray(), columns=feature_names)

### POS Tagging With NLTK

In [128]:
from nltk.tag import pos_tag

my_text = "James Smith lives in the United States."

# Split the sentence into word tokens, then pair each token with its
# part-of-speech tag.
tokens = pos_tag(tokens=word_tokenize(text=my_text))
print(tokens)

# The tag abbreviations are explained by:
# nltk.help.upenn_tagset()

[(&#39;James&#39;, &#39;NNP&#39;), (&#39;Smith&#39;, &#39;NNP&#39;), (&#39;lives&#39;, &#39;VBZ&#39;), (&#39;in&#39;, &#39;IN&#39;), (&#39;the&#39;, &#39;DT&#39;), (&#39;United&#39;, &#39;NNP&#39;), (&#39;States&#39;, &#39;NNPS&#39;), (&#39;.&#39;, &#39;.&#39;)]


### Named Entity Recognition

In [132]:
from nltk.chunk import ne_chunk

my_text = "James Smith lives in the United States."

# Step 1: tokenize the sentence and label each word as a part of speech.
tokens = pos_tag(word_tokenize(my_text))
# Step 2: extract the named entities from the list of tagged words.
entities = ne_chunk(tagged_tokens=tokens)
# help(entities)

### Compound Term Extraction

In [None]:
from nltk.tokenize import MWETokenizer  # multi-word expression tokenizer

my_text = "You all are the greatest students of all time."

# Start with an empty tokenizer and register each multi-word expression that
# should be merged into a single token.
mwe_tokenizer = MWETokenizer()
mwe_tokenizer.add_mwe(('You', 'all'))
mwe_tokenizer.add_mwe(('of', 'all', 'time'))

# Word-tokenize first, then merge the registered expressions.
mwe_tokens = mwe_tokenizer.tokenize(word_tokenize(my_text))
mwe_tokens